In [11]:
from bs4 import BeautifulSoup
import requests

In [12]:
# Base URL of the Cornell class roster site; the roster and class page URLs
# requested by the scraping code are built by appending paths to it.
ROOT_URL = "https://classes.cornell.edu"

In [13]:
def current_semester(timeout=30):
    """Return the roster code of the semester the site currently serves.

    Requests ROOT_URL and takes the last path segment of the URL the response
    finally came from (``requests`` follows redirects by default), e.g. "SP23".

    Args:
        timeout: Seconds to wait for the server before giving up. ``requests``
            has no default timeout, so without one a stalled connection would
            block the notebook indefinitely.

    Returns:
        The last "/"-separated segment of the final response URL, as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(ROOT_URL, timeout=timeout)
    response.raise_for_status()
    # Drop any trailing slash first so the last segment is never empty.
    return response.url.rstrip("/").rsplit("/", 1)[-1]

In [14]:
current_semester()

'SP23'

In [46]:
def all_subject_codes(timeout=30):
    """Return the subject codes listed on the roster landing page.

    Downloads ROOT_URL, parses it, and collects the text of every element
    carrying the ``browse-subjectcode`` CSS class (e.g. "AAP", "AAS", ...).

    Args:
        timeout: Seconds to wait for the server before giving up; ``requests``
            would otherwise wait indefinitely.

    Returns:
        A list of subject-code strings in page order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(ROOT_URL, timeout=timeout)
    # Fail loudly on an error response (as current_semester does) instead of
    # parsing an error page and silently returning an empty list.
    response.raise_for_status()
    roster_soup = BeautifulSoup(response.text, "html.parser")
    return [tag.get_text() for tag in roster_soup.select(".browse-subjectcode")]

In [47]:
all_subject_codes()[:10]

['AAP', 'AAS', 'AEM', 'AEP', 'AGSCI', 'AIIS', 'AIRS', 'ALS', 'AMST', 'ANSC']

In [48]:
semester = current_semester()
# One [subject code, course number, course title, course description] list
# per course; reset here so re-running the cell never duplicates records.
course_catalog = []

# Seconds to wait on any single HTTP request. `requests` has no default
# timeout, so without this one stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 30


def _fetch_soup(session, url):
    """Download `url` through `session` and return the parsed page.

    Raises requests.HTTPError on a 4xx/5xx response so an error page is never
    parsed as if it were roster content.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _text_or_empty(node):
    """Return the text of a parsed tag, or "" when the tag was not found."""
    return node.get_text() if node is not None else ""


# The crawl issues one request per subject plus one per course, so a single
# Session is shared to reuse the underlying connection instead of reconnecting
# for every page.
with requests.Session() as session:
    # Iterate through each subject
    for subject_code in all_subject_codes():
        subject_url = "{}/browse/roster/{}/subject/{}".format(
            ROOT_URL, semester, subject_code
        )
        subject_bs4 = _fetch_soup(session, subject_url)
        # The first "node" div is skipped; a subject page with no nodes simply
        # yields an empty list here.
        # NOTE(review): presumably the first node is a subject-level header
        # rather than a course -- confirm against the page markup.
        course_tags = subject_bs4.find_all("div", class_="node")[1:]

        # Iterate through each course and get relevant information
        for tag in course_tags:
            course_num = tag["data-catalog-nbr"]
            course_title = _text_or_empty(tag.find("div", class_="title-coursedescr"))
            # The description is not read from the subject listing; it is
            # taken from the course's own page.
            course_url = "{}/browse/roster/{}/class/{}/{}".format(
                ROOT_URL, semester, subject_code, course_num
            )
            desc_bs4 = _fetch_soup(session, course_url)
            # A page without a catalog description is recorded with an empty
            # description rather than aborting the crawl with AttributeError.
            course_description = _text_or_empty(
                desc_bs4.find("p", class_="catalog-descr")
            )
            course_catalog.append(
                [subject_code, course_num, course_title, course_description]
            )


In [55]:
print(len(course_catalog))

4619


In [56]:
import pandas as pd

In [58]:
# Tabulate the scraped records: one row per course, one labelled column per field.
df = pd.DataFrame(
    course_catalog,
    columns=[
        "subject code",
        "course number",
        "course title",
        "course description",
    ],
)

In [59]:
df.head(10)

Unnamed: 0,subject code,course number,course title,course description
0,AAP,3099,Special Topics in AAP,\n Topics TBA.\n
1,AAS,1100,Introduction to Asian American Studies,\n This interdisciplinary course of...
2,AAS,2620,Introduction to Asian American Literature,\n This course will introduce both ...
3,AAS,3312,Afro-Asia: Futurism and Feminism,\n This course explores cultural re...
4,AAS,3378,Korean American Literature,\n The rapidly growing literature o...
5,AAS,4950,Independent Study,\n Independent reading course in to...
6,AAS,6630,Asian American Theory and Literature,\n This graduate seminar focuses on...
7,AAS,7200,Directed Graduate Individual Study,\n No description available.\n
8,AEM,1200,Introduction to Business Management,\n Provides an overview of manageme...
9,AEM,1500,An Introduction to the Economics of Environmen...,\n This course provides an introduc...


In [60]:
print(df.shape)

(4619, 4)


In [61]:
import pickle

In [62]:
# Persist the raw scraped records (the plain Python list) so they can be
# reloaded later without repeating the crawl.
with open(file='catalog.pkl', mode='wb') as catalog_file:
    pickle.dump(course_catalog, catalog_file)


In [63]:
# Persist the tabular (DataFrame) form of the catalog alongside the raw list.
with open(file='data.pkl', mode='wb') as frame_file:
    pickle.dump(df, frame_file)