In [1]:
# Import Dependencies
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import concurrent.futures
import sys
import pandas as pd
import json
import queue
import threading



In [2]:
def crawl_web(start, success_file_path, error_file_path):
    unseen = set([start])
    seen = set([])

    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
        while unseen:
            url = unseen.pop()
            seen.add(url)
            executor.submit(crawl_website, url, seen, unseen, success_file_path, error_file_path)

    print(f"Finished crawling {start}")
    return seen 

def crawl_website(url, seen, unseen, success_file_path, error_file_path):
    try:
        new_links = getNewLinks(url, seen, unseen, success_file_path)
        with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
            for link in new_links:
                print(f"Adding: {link}")
                unseen.add(link)  
                executor.submit(crawl_website, link, seen, unseen, success_file_path, error_file_path)
    except:
        print(f"Error crawling {url} due to {sys.exc_info()[0]}")
        with open(error_file_path, 'a') as f:
            f.write(f"{url}\n")

def save_links_to_file(links, file_path):
    with open(file_path, 'a') as f:
        for link in links:
            if not is_duplicate_link(link, file_path):
                f.write(f"{link}\n")
                
def getNewLinks(url, seen, unseen, success_file_path):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        links = [link['href'] for link in soup.find_all('a') if link.has_attr('href') and not any(ext in link['href'] for ext in ['.png', '.pdf', '.jpg', '.jpeg', '~json/', 'javascript:;', 'mailto:', 'webcal:'])]
        links = [urljoin(url, link) for link in links]
        links = [link for link in links if link not in seen and link not in unseen and urlparse(link).netloc.endswith('e-catalogue.jhu.edu')] # Change this to limit domain
        save_links_to_file(links, success_file_path)
        return links
    else:
        print(f"Error getting {url} with status code {response.status_code} and reason {response.reason}")
        print(f"Error full response: {response.text}")
    return []

def is_duplicate_link(link, file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if link.strip() == line.strip():
                return True
    return False

def domain(url1, url2):
    return urlparse(url1).netloc == urlparse(url2).netloc

In [3]:
# crawled_links = crawl_web('https://e-catalogue.jhu.edu/', "../data/e-catalogue-jhu-edu/success_links.txt", "../data/e-catalogue-jhu-edu/error_links.txt")

In [4]:
# Load crawled links from file into a dataframe (in case of termination)
df = pd.read_csv("../backend/data/e-catalogue-jhu-edu/success_links.txt", header=None, error_bad_lines=False)

# Rename column
df.rename(columns={0: "Link"}, inplace=True)

# Normalize dataframe (remove any links that are not jhu.edu in base url or no https:// in the link)
df = df[df['Link'].str.contains("https://e-catalogue.jhu.edu")]
df




  df = pd.read_csv("../backend/data/e-catalogue-jhu-edu/success_links.txt", header=None, error_bad_lines=False)


Unnamed: 0,Link
0,https://e-catalogue.jhu.edu//#contentarea
1,https://e-catalogue.jhu.edu/azindex/
2,https://e-catalogue.jhu.edu/
3,https://e-catalogue.jhu.edu/
4,https://e-catalogue.jhu.edu/university-wide-po...
...,...
13394,https://e-catalogue.jhu.edu/search/?P=AS.020.346
13395,https://e-catalogue.jhu.edu/search/?P=AS.020.347
13398,https://e-catalogue.jhu.edu/archive/2015-16/de...
13399,https://e-catalogue.jhu.edu/archive/2015-16/de...


In [5]:
def clean_text(text):
    # Remove leading/trailing whitespace, condense multiple whitespaces to single
    return ' '.join(text.split())

def scrape_page(url, q):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')

    tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'li', 'div', 'table', 'th', 'td', 'tr', 'ol', 'ul', 'blockquote', 'pre', 'code', 'caption', 'dt', 'dd']

    text_data = []
    for tag in tags:
        elements = soup.find_all(tag)
        for element in elements:
            text = clean_text(element.get_text())
            if text:
                text_data.append(text)

    json_data = []
    for i in range(len(text_data) - 1):
        json_data.append({
            'prompt': text_data[i],
            'completion': text_data[i + 1]
        })
        
    # add to queue
    q.put(json_data)
    
def write_to_file(q):
    with open('../backend/data/e-catalogue-jhu-edu/data.json', 'a') as f:
        while True:
            data = q.get()
            if data is None:
                break
            json.dump(data, f)
            f.write('\n')
            q.task_done()

In [6]:
# # List of URLs to scrape
# urls = df['Link'].tolist()

# # create a queue
# q = queue.Queue()

# # create a separate thread to write data to file
# file_writer = threading.Thread(target=write_to_file, args=(q,))
# file_writer.start()

# with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#     futures = {executor.submit(scrape_page, url, q) for url in urls}
#     concurrent.futures.wait(futures)

# # stop the file writing thread
# q.put(None)
# file_writer.join()


In [7]:
# Load pairs into a dataframe
with open('../backend/data/e-catalogue-jhu-edu/data.json', 'r') as f:
    data = []
    for line in f:
        # Error handling
        try:
            data.extend(json.loads(line))
        except:
            pass
        
pc_pairs_df = pd.DataFrame(data)

print(f"Number of Prompt/Completion pairs before cleaning: {len(pc_pairs_df)}")
print(f"Number of words scraped (prompt): {pc_pairs_df['prompt'].str.split().str.len().sum()}")
print(f"Number of words scraped (completion): {pc_pairs_df['completion'].str.split().str.len().sum()}")

# Remove any prompts/completion pairs that contains characters less than 15
pc_pairs_df = pc_pairs_df[pc_pairs_df['prompt'].str.len() > 15]
pc_pairs_df = pc_pairs_df[pc_pairs_df['completion'].str.len() > 15]

# Remove duplicates
pc_pairs_df.drop_duplicates(subset='prompt', keep='last', inplace=True)

# Save to JSON file in the format of {"prompt": "prompt text", "completion": "completion text"}
pc_pairs_df.to_json('../backend/data/e-catalogue-jhu-edu/prompt-completion-pairs.json', orient='records', lines=True)

print(f"Number of Prompt/Completion pairs after cleaning: {len(pc_pairs_df)}")

pc_pairs_df

Number of Prompt/Completion pairs before cleaning: 21429394
Number of words scraped (prompt): 903011934
Number of words scraped (completion): 902721946
Number of Prompt/Completion pairs after cleaning: 186804


Unnamed: 0,prompt,completion
308,Search Courses Keyword Academic Year Term 2022...,Search Courses Keyword Academic Year Term 2022...
311,Keyword Academic Year Term 2022-23 2020-21 202...,Keyword Academic Year Term 2022-23 2020-21 202...
312,Keyword Academic Year Term 2022-23 2020-21 202...,Keyword Academic Year Term 2022-23 2020-21 202...
319,ADVANCED SEARCH Subject Any Subject Applied Ec...,Subject Any Subject Applied Economics - AS.440...
321,Subject Any Subject Applied Economics - AS.440...,Welcome to Course Search Use the search panel ...
...,...,...
21429387,"Teaching Writing, Certificate",AS.010 (History of Art) AS.020 (Biology) AS.03...
21429388,AS.010 (History of Art) AS.020 (Biology) AS.03...,/​course-​search/​api/​
21429391,Home›/search/›Search Results,"Johns Hopkins University Baltimore, MD 410-516..."
21429392,"Johns Hopkins University Baltimore, MD 410-516...",About Us Academics Schools & Divisions Admissi...


In [8]:
# Get dataset statistics
# Amount of links (from links.txt)
print(f"Number of links: {len(df)}")

Number of links: 11761
