In [1]:
# Import Dependencies
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import concurrent.futures
import sys
import pandas as pd
import json
import queue
import threading



In [2]:
def crawl_web(start, success_file_path, error_file_path):
    unseen = set([start])
    seen = set([])

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        while unseen:
            url = unseen.pop()
            seen.add(url)
            executor.submit(crawl_website, url, seen, unseen, success_file_path, error_file_path)

    print(f"Finished crawling {start}")
    return seen 

def crawl_website(url, seen, unseen, success_file_path, error_file_path):
    try:
        new_links = getNewLinks(url, seen, unseen, success_file_path)
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            for link in new_links:
                print(f"Adding: {link}")
                unseen.add(link)  
                executor.submit(crawl_website, link, seen, unseen, success_file_path, error_file_path)
    except:
        print(f"Error crawling {url}")
        print(f"Error due to: {sys.exc_info()[0]}")
        with open(error_file_path, 'a') as f:
            f.write(f"{url}\n")

def save_links_to_file(links, file_path):
    with open(file_path, 'a') as f:
        for link in links:
            if not is_duplicate_link(link, file_path):
                f.write(f"{link}\n")
                
def getNewLinks(url, seen, unseen, success_file_path):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        links = [link['href'] for link in soup.find_all('a') if link.has_attr('href') and not any(ext in link['href'] for ext in ['.png', '.pdf', '.jpg', '.jpeg', '~json/', 'javascript:;', 'mailto:', 'webcal:'])]
        links = [urljoin(url, link) for link in links]
        links = [link for link in links if link not in seen and link not in unseen and urlparse(link).netloc.endswith('ventures.jhu.edu')] # Change this to limit domain
        save_links_to_file(links, success_file_path)
        return links
    else:
        print(f"Error getting {url}")
        print(f"Error due to: {response.status_code}")
        # Print error reason
        print(response.reason)
        # Print all information regarding error
        print(response.text)
    return []

def is_duplicate_link(link, file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if link.strip() == line.strip():
                return True
    return False

def domain(url1, url2):
    return urlparse(url1).netloc == urlparse(url2).netloc

In [3]:
# crawled_links = crawl_web('https://ventures.jhu.edu/', "../data/ventures-jhu-edu/success_links.txt", "../data/ventures-jhu-edu/error_links.txt")

In [4]:
# Load crawled links from file into a dataframe (in case of termination)
df = pd.read_csv("../backend/data/ventures-jhu-edu/success_links.txt", header=None, error_bad_lines=False)

# Rename column
df.rename(columns={0: "Link"}, inplace=True)

# Normalize dataframe (remove any links that are not jhu.edu in base url or no https:// in the link)
df = df[df['Link'].str.contains("https://ventures.jhu.edu")]
# Remove any links that has "email-protection"
df = df[~df['Link'].str.contains("email-protection")]
df




  df = pd.read_csv("../backend/data/ventures-jhu-edu/success_links.txt", header=None, error_bad_lines=False)


Unnamed: 0,Link
0,https://ventures.jhu.edu/#skip_content
1,https://ventures.jhu.edu/innovations/
2,https://ventures.jhu.edu/companies
3,https://ventures.jhu.edu/innovations/technolog...
4,https://ventures.jhu.edu/technology-transfer/
...,...
5825,https://ventures.jhu.edu/working-jhtv/#staff1
6295,https://ventures.jhu.edu/accessibility/#skip_c...
6296,https://ventures.jhu.edu/accessibility/#menu
6298,https://ventures.jhu.edu/jhtv-legal-disclaimer...


In [5]:
def clean_text(text):
    # Remove leading/trailing whitespace, condense multiple whitespaces to single
    return ' '.join(text.split())

def scrape_page(url, q):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')

    tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'li', 'div', 'table', 'th', 'td', 'tr', 'ol', 'ul', 'blockquote', 'pre', 'code', 'caption', 'dt', 'dd']

    text_data = []
    for tag in tags:
        elements = soup.find_all(tag)
        for element in elements:
            text = clean_text(element.get_text())
            if text:
                text_data.append(text)

    json_data = []
    for i in range(len(text_data) - 1):
        json_data.append({
            'prompt': text_data[i],
            'completion': text_data[i + 1]
        })
        
    # add to queue
    q.put(json_data)
    
def write_to_file(q):
    with open('../backend/data/ventures-jhu-edu/data.json', 'a') as f:
        while True:
            data = q.get()
            if data is None:
                break
            json.dump(data, f)
            f.write('\n')
            q.task_done()

In [6]:
# # List of URLs to scrape
# urls = df['Link'].tolist()

# # create a queue
# q = queue.Queue()

# # create a separate thread to write data to file
# file_writer = threading.Thread(target=write_to_file, args=(q,))
# file_writer.start()

# with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#     futures = {executor.submit(scrape_page, url, q) for url in urls}
#     concurrent.futures.wait(futures)

# # stop the file writing thread
# q.put(None)
# file_writer.join()


In [7]:
# Load pairs into a dataframe
with open('../backend/data/ventures-jhu-edu/data.json', 'r') as f:
    data = []
    for line in f:
        # Error handling
        try:
            data.extend(json.loads(line))
        except:
            pass
        
pc_pairs_df = pd.DataFrame(data)

print(f"Number of Prompt/Completion pairs before cleaning: {len(pc_pairs_df)}")
print(f"Number of words scraped (prompt): {pc_pairs_df['prompt'].str.split().str.len().sum()}")
print(f"Number of words scraped (completion): {pc_pairs_df['completion'].str.split().str.len().sum()}")

# Remove any prompts/completion pairs that contains characters less than 15
pc_pairs_df = pc_pairs_df[pc_pairs_df['prompt'].str.len() > 15]
pc_pairs_df = pc_pairs_df[pc_pairs_df['completion'].str.len() > 15]

# Remove Cloudfare blocked data
pc_pairs_df = pc_pairs_df[~pc_pairs_df['prompt'].str.contains("malicious bots | Cloudflare | Your IP: | Email Protection | Performance & security | prevent this in the future | please complete the captcha below | JavaScript needs to be enabled | This process is automatic | JavaScript is required | Please stand by, while we are checking your browser | Checking your browser | DDoS protection by Cloudflare | This page is having a slideshow that uses Javascript | Please enable Javascript to view this page | This page uses Javascript | Please enable Javascript")]

# Remove duplicates
pc_pairs_df.drop_duplicates(subset='prompt', keep='last', inplace=True)

# Save to JSON file in the format of {"prompt": "prompt text", "completion": "completion text"}
pc_pairs_df.to_json('../backend/data/ventures-jhu-edu/prompt-completion-pairs.json', orient='records', lines=True)

print(f"Number of Prompt/Completion pairs after cleaning: {len(pc_pairs_df)}")

pc_pairs_df

Number of Prompt/Completion pairs before cleaning: 635472
Number of words scraped (prompt): 11600537
Number of words scraped (completion): 11547221
Number of Prompt/Completion pairs after cleaning: 12610


Unnamed: 0,prompt,completion
6923,Skip to Main Content Rivier University Search....,Skip to Main Content Rivier University Search....
6951,FastForward startups are fulfilling the foundi...,FastForward startups are fulfilling the foundi...
6956,Startups The entrepreneurs and technologies th...,Startups The entrepreneurs and technologies th...
6988,startup news Watch Session 8 of JHTV's 'Emergi...,startup news Watch Session 8 of JHTV's 'Emergi...
7061,Watch Session 8 of JHTV's 'Emerging CEO Panel ...,News Support JHTV Events Staff Directory Worki...
...,...,...
635467,The Licensing Process Reports of Invention Mat...,Industry-Sponsored Research Translational Fund...
635468,Industry-Sponsored Research Translational Fund...,Corporate Partnerships Licensing Investments S...
635469,Corporate Partnerships Licensing Investments S...,FastForward FastForward U Social Innovation La...
635470,FastForward FastForward U Social Innovation La...,Faculty & Inventors Students Industry Investors


In [8]:
# Get dataset statistics
# Amount of links (from links.txt)
print(f"Number of links: {len(df)}")

Number of links: 3136
Number of words in dataset: 1137355
