In [None]:
import requests
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import pprint
import pickle

In [None]:
# This cell takes about 5 mins to run

# Exploratory sample request for a single roster/subject pair. `res_json` is
# not used by the code below; it is kept so the name stays available for
# manual inspection of the response shape.
res = requests.get("https://classes.cornell.edu/api/2.0/search/classes.json?roster=SP18&subject=AEM")

res_json = res.json() # Is now a dict

# Maps course id (as str) -> course record. Every record starts with three
# empty sets and later gains the scalar fields listed in `col_mappings`.
class_roster_api_dict = defaultdict(lambda : {"professors":set(), 
                                              "subject-number": set(),
                                              "semesters": set()})

# API field name -> key used in our course record.
col_mappings = {"crseId":"id",
               "titleLong" : "title",
               "description" : "description",
               "catalogOutcomes" : "outcomes"}

# Rosters to scrape. Other lists that have been used with this cell:
#   ["FA20"]
#   ["SP20", "FA19", "SP19", "FA18", "SP18"]
semesters = ["SP20"]



# Get list of subjects (majors) taught during these semesters

subjects = set() 

for semester in semesters:
    subjects_payload = requests.get(
        "https://classes.cornell.edu/api/2.0/config/subjects.json?roster="+semester).json()
    for subject in subjects_payload["data"]["subjects"]:
        subjects.add(subject["value"])

        
for semester in semesters:            
    for subject in tqdm(subjects):
        res = requests.get("https://classes.cornell.edu/api/2.0/search/classes.json?roster=" 
                           + semester + "&subject=" + subject)
        # Decode the body once; calling res.json() repeatedly re-parses it every time.
        payload = res.json()
        data = payload["data"]
        if(data is None or data["classes"] is None):
            # Nothing offered for this subject in this roster.
            continue

        for res_class in data["classes"]:
            course_id = str(res_class["crseId"])
            course = class_roster_api_dict[course_id]

            # Only the first enroll group is inspected for instructors.
            if(res_class["enrollGroups"] is not None):
                for section in res_class["enrollGroups"][0]["classSections"]:
                    for meeting in section["meetings"]:
                        for professor in meeting["instructors"]:
                            # An empty middle name leaves a double space, as before.
                            # `or ""` guards against a null name part, which would
                            # otherwise make join() raise (assumes the API may
                            # return null here -- TODO confirm).
                            course["professors"].add(" ".join([professor["firstName"] or "",
                                                               professor["middleName"] or "",
                                                               professor["lastName"] or ""]))

            course["subject-number"].add(" ".join([res_class["subject"],
                                                   str(res_class["catalogNbr"])]))
            course["semesters"].add(semester)

            # Fill the scalar fields the first time the course is seen, or again
            # if the description stored so far is empty.
            if(course.get("description", "") == ""):
                for catalogKey, dictKey in col_mappings.items():
                    course[dictKey] = res_class[catalogKey]

                # Outcomes are joined into a single string; a missing value becomes "".
                if(course["outcomes"] is None):
                    course["outcomes"] = ""
                else:
                    course["outcomes"] = " ".join(course["outcomes"])

                if(course["description"] is None):
                    course["description"] = ""
    
def get_latest_sem(sems):
    """Return the most recent roster code from ``sems``.

    Each code is a two-letter season prefix followed by a year number,
    e.g. ``"SP20"`` or ``"FA19"``. Codes are ordered by year (highest first)
    and then by season within the year.

    Args:
        sems: non-empty sequence of roster code strings.

    Returns:
        The roster code judged to be the latest.

    Raises:
        IndexError: if ``sems`` is empty.
        ValueError: if the part after the two-letter prefix is not an integer.
    """
    # Latest-first rank of each season within a year. Plain alphabetical order
    # happens to be right for FA vs SP but would rank SP ahead of SU.
    # NOTE(review): assumes the roster prefixes are WI/SP/SU/FA -- confirm.
    season_rank = {"FA": 0, "SU": 1, "SP": 2, "WI": 3}

    def sort_key(sem):
        season = sem[0:2]
        # Unknown prefixes sort after the known ones, alphabetically.
        return (-int(sem[2:]), season_rank.get(season, len(season_rank)), season)

    return sorted(sems, key=sort_key)[0]

# Attach a Class Roster URL to every course, pointing at its latest semester.
for key in class_roster_api_dict.keys():
    c = class_roster_api_dict[key]
    latest_sem = get_latest_sem(list(c["semesters"]))
    # A course can carry several "SUBJECT NUMBER" listings in a set. Taking
    # min() makes the choice deterministic, whereas list(set)[0] depends on
    # set iteration order.
    subject, number = min(c["subject-number"]).split()
    url = "https://classes.cornell.edu/browse/roster/"+ latest_sem+"/class/"+subject.upper()+"/"+number
    class_roster_api_dict[key]["roster_url"] = url


In [28]:
# Condense cross-listed upper level classes
# E.g CS 4670 and CS 5670
#
# Courses are grouped purely by identical title, so any two courses that share
# a title are merged. The record with the smallest id (ids are strings, so this
# is a string comparison) is kept and absorbs the professors, listings and
# semesters of the others; its own id/title/description/outcomes are retained.
title_dict = defaultdict(set)

for course_id, course in class_roster_api_dict.items():
    title_dict[course["title"]].add(course_id)

for same_title_ids in title_dict.values():
    min_id = min(same_title_ids)
    # Loop variable is not called `id`, which would shadow the builtin for the
    # rest of the notebook session.
    for other_id in same_title_ids:
        if(other_id != min_id):
            for s in ["professors", "subject-number", "semesters"]:
                class_roster_api_dict[min_id][s] = class_roster_api_dict[min_id][s].union(
                    class_roster_api_dict[other_id][s])
            del class_roster_api_dict[other_id]

In [29]:
# Generate URLs for each class

def get_latest_sem(sems):
    """Pick the latest roster code (e.g. ``"SP20"``) out of ``sems``.

    A code is a two-letter season prefix plus a year number. The result is the
    code with the highest year; within one year the season decides.

    Args:
        sems: non-empty sequence of roster code strings.

    Returns:
        The latest roster code.

    Raises:
        IndexError: if ``sems`` is empty.
        ValueError: if the text after the prefix is not an integer.
    """
    # Rank of each season within a year, latest first. Sorting the prefixes
    # alphabetically is only correct for FA vs SP (it puts SP ahead of SU).
    # NOTE(review): assumes the roster prefixes are WI/SP/SU/FA -- confirm.
    season_rank = {"FA": 0, "SU": 1, "SP": 2, "WI": 3}

    def sort_key(sem):
        season = sem[0:2]
        # Prefixes we do not know go last, in alphabetical order.
        return (-int(sem[2:]), season_rank.get(season, len(season_rank)), season)

    return sorted(sems, key=sort_key)[0]

# (Re)compute the roster URL of every remaining course from its latest semester.
for key in class_roster_api_dict.keys():
    c = class_roster_api_dict[key]
    latest_sem = get_latest_sem(list(c["semesters"]))
    # Cross-listed courses hold several listings in a set; min() picks the same
    # one on every run, while list(set)[0] follows set iteration order.
    subject, number = min(c["subject-number"]).split()
    url = "https://classes.cornell.edu/browse/roster/"+ latest_sem+"/class/"+subject.upper()+"/"+number
    class_roster_api_dict[key]["roster_url"] = url

In [30]:
# Quick test for CS 2110 and CS 4670:
# print the first course record whose listings include each catalog number.

cs_2110_matches = [course for course in class_roster_api_dict.values()
                   if "CS 2110" in course["subject-number"]]
print(cs_2110_matches[0])

print()

cs_4670_matches = [course for course in class_roster_api_dict.values()
                   if "CS 4670" in course["subject-number"]]
print(cs_4670_matches[0])

{'professors': {'David Joseph Gries', 'Anne  Bracy', 'Eleanor Jane Birrell', 'Michael Ryan Clarkson'}, 'subject-number': {'ENGRD 2110', 'CS 2110'}, 'semesters': {'SP19', 'SP18', 'FA19', 'SP20', 'FA18'}, 'id': 358546, 'title': 'Object-Oriented Programming and Data Structures', 'description': 'Intermediate programming in a high-level language and introduction to computer science. Topics include object-oriented programming (classes, objects, subclasses, types), graphical user interfaces, algorithm analysis (asymptotic complexity, big "O" notation), recursion, testing, program correctness (loop invariants), searching/sorting, data structures (lists, trees, stacks, queues, heaps, search trees, hash tables, graphs), graph algorithms. Java is the principal programming language.', 'outcomes': 'Be fluent in the use of recursion and object-oriented programming concepts (e.g. classes, objects, inheritance, and interfaces). Be able to design and implement nontrivial Java programs (roughly 1000 lin

In [31]:
#Write classes to pickle

# Convert defaultdict to dict: the defaultdict's factory is a lambda, which
# pickle cannot serialize. The `with` block closes the file once the dump is
# done instead of leaving the handle open.
with open("class_roster_api_dict.pickle", "wb") as pickle_file:
    pickle.dump(dict(class_roster_api_dict), pickle_file)

In [None]:
# Generate average length of descriptions by major
# DEPRECATED CODE, was used just for P01 stats
# NOTE(review): `classes_df` is not created anywhere in the visible notebook,
# so this cell presumably needs a frame from an older version -- confirm
# before running.

subject_desc_len = dict()

for subject in subjects:
    # All rows belonging to this subject.
    subject_rows = classes_df[classes_df["subject"] == subject]

    # Total number of whitespace-separated words over the subject's descriptions.
    sum_words = 0
    for description in subject_rows["description"]:
        if(description is not None):
            sum_words += len(description.split())

    # Mean words per class; subjects without any class get 0.
    n_classes = len(subject_rows)
    subject_desc_len[subject] = sum_words / n_classes if n_classes > 0 else 0
    
# for key in subject_desc_len.keys():
#     print(key + "," + str(subject_desc_len[key]))

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
def get_professor_page(professor):
    """Fetch the RateMyProfessors profile page for a professor name.

    The name is searched on ratemyprofessors.com (school id 298, presumably
    Cornell -- confirm). If the search result contains a
    ``ShowRatings.jsp?tid=<digits>`` link, the first such profile is fetched
    and returned. Otherwise shorter variants of the name are tried:

    * more than three words: first + fourth, then first + second + fourth,
      then first + third + fourth;
    * exactly three words: first + third.

    Args:
        professor: full name; a ``", Professor"`` suffix is removed first.

    Returns:
        The ``requests`` response of the profile page, or ``None`` when no
        variant of the name yields a profile link.
    """
    professor = professor.replace(", Professor", "")
    name_parts = professor.split()

    url = "https://www.ratemyprofessors.com/search.jsp?queryoption=HEADER&" \
          "queryBy=teacherName&schoolID=%s&query=" % str(298) + "+".join(name_parts)
    search_page = requests.get(url=url)

    # Search the result once and reuse the matches.
    profile_links = re.findall(r'ShowRatings\.jsp\?tid=\d+', search_page.text)
    if profile_links:
        return requests.get("https://www.ratemyprofessors.com/" + profile_links[0])

    # No hit for the full name: build the shorter variants to try, in order.
    if len(name_parts) > 3:
        first, second, third, fourth = name_parts[:4]
        variants = [(first, fourth), (first, second, fourth), (first, third, fourth)]
    elif len(name_parts) > 2:
        # The original retried this exact query a second time; a single
        # attempt gives the same answer with one request less.
        variants = [(name_parts[0], name_parts[2])]
    else:
        return None

    for variant in variants:
        page = get_professor_page(" ".join(variant))
        if page is not None:
            return page

    return None

def get_professor_tags(page, tags_default):
    """Extract the tag labels from a RateMyProfessors profile page.

    Args:
        page: response object whose ``content`` holds the profile HTML.
        tags_default: value returned when the page is for another school or
            carries no tags.

    Returns:
        A list with the text of every ``<span>`` inside the tags container,
        or ``tags_default`` if the profile's school is not Cornell University,
        the container is missing, or it holds no spans.
    """
    soup = BeautifulSoup(page.content, 'html.parser')

    # The title element text is split on "at "; if a second piece exists and it
    # is not Cornell University, the profile belongs to someone elsewhere.
    school = soup.find(class_="NameTitle__Title-dowf0z-1")
    if(school is not None):
        school_split = school.text.split("at ")
        if(len(school_split) > 1 and school_split[1] != "Cornell University"):
            return tags_default

    tags_container = soup.find(class_='TeacherTags__TagsContainer-sc-16vmh1y-0')
    if(tags_container is None):
        return tags_default

    tags = [span.text for span in tags_container.find_all("span")]
    # Honour the caller's default instead of always answering [] when empty.
    return tags if tags else tags_default
        
        
def get_professor_top_review(page, review_default):
    """Extract a representative review text from a profile page.

    The "most helpful" review is preferred. When there is none (or it is
    empty), the first five comments on the page are joined with spaces.

    Args:
        page: response object whose ``content`` holds the profile HTML.
        review_default: value returned when the page is for another school or
            no review text can be found.

    Returns:
        The review text, or ``review_default`` as described above.
    """
    review = review_default
    soup = BeautifulSoup(page.content, 'html.parser')

    # Skip profiles whose title names a school other than Cornell University.
    school = soup.find(class_="NameTitle__Title-dowf0z-1")
    if(school is not None):
        school_split = school.text.split("at ")
        if(len(school_split) > 1 and school_split[1] != "Cornell University"):
            return review

    most_helpful_review = soup.find(class_='HelpfulRating__StyledComments-sc-4ngnti-1')
    if(most_helpful_review is not None):
        review = most_helpful_review.text

    # Fall back to recent comments whenever nothing usable was found so far.
    # Comparing against "" alone never matched a None default, so the fallback
    # could not run for callers passing None.
    if(not review):
        recent_comments = soup.find_all(class_='Comments__StyledComments-dzzyvm-0')[:5]
        if(recent_comments):
            review = " ".join([comment.text for comment in recent_comments])

    return review
        
def get_professor_rating(page, rating_default):
    """Extract the numeric overall rating from a profile page.

    Args:
        page: response object whose ``content`` holds the profile HTML.
        rating_default: value returned when no usable rating is available.

    Returns:
        The rating as a ``float``, or ``rating_default`` if the profile's
        school is not Cornell University, the rating element is missing, or
        its text is ``"N/A"`` or otherwise not a number.
    """
    soup = BeautifulSoup(page.content, 'html.parser')

    # Skip profiles whose title names a school other than Cornell University.
    school = soup.find(class_="NameTitle__Title-dowf0z-1")
    if(school is not None):
        school_split = school.text.split("at ")
        if(len(school_split) > 1 and school_split[1] != "Cornell University"):
            return rating_default

    numerator = soup.find(class_='RatingValue__Numerator-qw8sqy-2')
    if(numerator is None):
        # Page has no rating element; previously this raised AttributeError.
        return rating_default

    rating_text = str(numerator.text).strip()
    if(rating_text == "N/A"):
        return rating_default

    try:
        return float(rating_text)
    except ValueError:
        # Unexpected non-numeric text: treat it like a missing rating.
        return rating_default

def get_professor_stat(page, default_return, scraping_function):
    """Run one scraping function on a profile page, with common guards.

    Args:
        page: response object whose ``content`` holds the profile HTML, or
            ``None`` when no profile was found.
        default_return: value returned when the page is unusable; also handed
            to ``scraping_function`` as its default.
        scraping_function: callable taking ``(page, default)`` and returning
            the scraped value.

    Returns:
        ``default_return`` if ``page`` is ``None`` or the profile's school is
        not Cornell University; otherwise whatever ``scraping_function``
        returns.
    """
    # Identity check: `== None` would defer to the page object's own __eq__.
    if(page is None):
        return default_return

    soup = BeautifulSoup(page.content, 'html.parser')

    # The title text is split on "at "; a second piece other than
    # "Cornell University" marks a profile from a different school.
    school = soup.find(class_="NameTitle__Title-dowf0z-1")
    if(school is not None):
        school_split = school.text.split("at ")
        if(len(school_split) > 1 and school_split[1] != "Cornell University"):
            return default_return

    return scraping_function(page, default_return)

In [None]:
# Every distinct professor name appearing on any course record.
prof_set = {prof
            for course in class_roster_api_dict.values()
            for prof in course["professors"]}
    

In [None]:
# NOTE: This cell takes 2-2.5 hours to run!!

# It scrapes from RateMyProfessor.com
# Result: professor name -> {"review", "rating", "tags"}.

ratemyprofessor_api_dict = {}

for prof in tqdm(list(prof_set)):
    page = get_professor_page(prof)
    rating = get_professor_stat(page, None, get_professor_rating)

    # Without a rating the page is dropped, so the review and the tags fall
    # back to their defaults as well.
    page = None if rating is None else page

    ratemyprofessor_api_dict[prof] = {
        "review": get_professor_stat(page, None, get_professor_top_review),
        "rating": rating,
        "tags": get_professor_stat(page, [], get_professor_tags),
    }
    

In [None]:
# Display the scraped RateMyProfessor data. The dict has one entry per
# professor in prof_set, each with "review", "rating" and "tags"; entries for
# professors without a usable profile hold the defaults (None, None, []).

ratemyprofessor_api_dict

In [None]:
#Write ratemyprofessor data to pickle

# ratemyprofessor_api_dict is already a plain dict, so it is dumped as is.
# The `with` block closes the file once the dump is done instead of leaving
# the handle open.
with open("ratemyprofessor_api_dict.pickle", "wb") as pickle_file:
    pickle.dump(ratemyprofessor_api_dict, pickle_file)