In [None]:
import requests
import pandas as pd
from collections import defaultdict

In [None]:
# NOTE: running this cell takes roughly five minutes.

# Sample request for a single roster/subject pair; the decoded JSON body is
# kept in `res_json` as a plain dict.
res = requests.get("https://classes.cornell.edu/api/2.0/search/classes.json?roster=SP18&subject=AEM")
res_json = res.json()

# print(res_json.keys())

# API field name (key) -> column name used in `classes_df` (value).
col_mappings = dict(
    crseId="id",
    subject="subject",
    catalogNbr="number",
    titleLong="title",
    description="description",
    catalogOutcomes="outcomes",
)

# Rosters that are queried, in the order they are visited.
semesters = [
    "SP20",
    "FA19",
    "SP19",
    "FA18",
    "SP18",
]

# Empty frame whose columns are the renamed API fields.
classes_df = pd.DataFrame(columns=col_mappings.values())

# Collect every subject code (major) offered in at least one of the semesters

subjects = set()

for semester in semesters:
    subjects_payload = requests.get(
        "https://classes.cornell.edu/api/2.0/config/subjects.json?roster=" + semester
    ).json()
    # Each entry of data.subjects carries the subject code under "value".
    subjects.update(entry["value"] for entry in subjects_payload["data"]["subjects"])

# `tqdm` draws the progress bar below but is not imported anywhere in this
# notebook, so a fresh kernel would raise NameError. Import it here and fall
# back to a pass-through wrapper when the package is not installed.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is unavailable: iterate without a progress bar."""
        return iterable

# Course id -> set of instructor names collected across all semesters.
professor_dict = defaultdict(set)

# One record per distinct course id (first occurrence wins, following the
# order of `semesters`). Records are gathered in a list and turned into a
# DataFrame once at the end: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
class_records = []
seen_class_ids = set()

for semester in semesters:
    for subject in tqdm(subjects):
        res = requests.get("https://classes.cornell.edu/api/2.0/search/classes.json",
                           params={"roster": semester, "subject": subject})
        payload = res.json()  # decode the body once and reuse it
        data = payload["data"]
        if data is None or data["classes"] is None:
            continue

        for res_class in data["classes"]:
            class_id = res_class["crseId"]

            # Only the first enroll group is inspected. The truthiness check
            # covers both None and an empty list (which would fail on [0]).
            enroll_groups = res_class["enrollGroups"]
            if enroll_groups:
                for section in enroll_groups[0]["classSections"]:
                    for meeting in section["meetings"]:
                        for professor in meeting["instructors"]:
                            name_parts = (professor["firstName"],
                                          professor["middleName"],
                                          professor["lastName"])
                            # Skip missing/empty parts: a None middle name would
                            # make join() raise, an empty one would leave a
                            # double space inside the name.
                            professor_dict[class_id].add(
                                " ".join(part for part in name_parts if part))

            if class_id not in seen_class_ids:
                seen_class_ids.add(class_id)
                row_dict = {value: res_class[key] for key, value in col_mappings.items()}
                if row_dict["outcomes"] is None:
                    row_dict["outcomes"] = ""
                class_records.append(row_dict)

classes_df = pd.DataFrame(class_records, columns=list(col_mappings.values()))

def add_prof_col(row):
    """Attach the instructors of a course to its row.

    Looks up ``row["id"]`` in the module-level ``professor_dict`` and stores
    the names as a list under ``"professors"``. The names are sorted so the
    result does not depend on set iteration order, which keeps the exported
    CSV reproducible between runs.

    Parameters
    ----------
    row : pandas.Series
        One row of ``classes_df``; must contain an ``"id"`` entry and every
        column named in ``col_mappings``.

    Returns
    -------
    pandas.Series
        The row restricted to the ``col_mappings`` columns followed by
        ``"professors"``.
    """
    # professor_dict is a defaultdict(set): an id without recorded instructors
    # yields an empty list rather than a KeyError.
    row["professors"] = sorted(professor_dict[row["id"]])
    # No per-row print here: this function runs once per course, so printing
    # the row floods the cell output.
    return row[list(col_mappings.values()) + ["professors"]]

# Run add_prof_col on every row (axis=1) to append the "professors" column.
classes_df = classes_df.apply(add_prof_col, axis=1)



In [143]:
# Save the course table as CSV, leaving out the DataFrame index
classes_df.to_csv(path_or_buf="roster_api_data.csv", index=False)

In [None]:
# Average description length (in words) per subject

subject_desc_len = {}

for subject in subjects:
    subject_classes = classes_df[classes_df["subject"] == subject]
    class_count = len(subject_classes)

    if class_count == 0:
        # No course rows for this subject
        subject_desc_len[subject] = 0
        continue

    # Courses whose description is None add no words but still count in the
    # denominator.
    word_total = sum(len(text.split())
                     for text in subject_classes["description"]
                     if text is not None)
    subject_desc_len[subject] = word_total / class_count

# for key in subject_desc_len.keys():
#     print(key + "," + str(subject_desc_len[key]))

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
def _search_rmp_profile(professor):
    """Search RateMyProfessors for `professor` and parse the first hit.

    Returns the BeautifulSoup of the first profile linked from the search
    page, or None when the search page contains no profile link.
    """
    search_page = requests.get(
        "https://www.ratemyprofessors.com/search.jsp",
        # Letting requests build the query string URL-encodes names that
        # contain characters such as "&", "#" or "'"; whitespace between name
        # parts is still sent as "+".
        params={"queryoption": "HEADER",
                "queryBy": "teacherName",
                "schoolID": 298,  # presumably Cornell's id on the site -- TODO confirm
                "query": " ".join(professor.split())})
    profile_links = re.findall(r'ShowRatings\.jsp\?tid=\d+', search_page.text)
    if not profile_links:
        return None
    profile_page = requests.get("https://www.ratemyprofessors.com/" + profile_links[0])
    return BeautifulSoup(profile_page.content, 'html.parser')


def _is_other_school(soup):
    """Return True when the profile names a school other than Cornell University."""
    school = soup.find(class_="NameTitle__Title-dowf0z-1")
    if school is None:
        # No school element on the page: the profile is not rejected.
        return False
    school_split = school.text.split("at ")
    return len(school_split) > 1 and school_split[1] != "Cornell University"


def _fallback_queries(professor):
    """Return the shortened name variants to try when the full name has no hit."""
    parts = professor.split()
    if len(parts) > 3:
        return [" ".join([parts[0], parts[3]]),
                " ".join([parts[0], parts[1], parts[3]]),
                " ".join([parts[0], parts[2], parts[3]])]
    if len(parts) > 2:
        # Only one variant exists for a three-part name, so it is queried once.
        return [" ".join([parts[0], parts[2]])]
    return []


def _lookup_professor(professor, extract):
    """Find a professor's profile and return `extract(soup)`, or None if nothing is found.

    `extract` receives the parsed profile page and returns the wanted value;
    an empty value means "nothing found".
    """
    professor = professor.replace(", Professor", "")
    soup = _search_rmp_profile(professor)
    if soup is not None:
        # A hit for the full name is final: shortened variants are only tried
        # when the search itself returns nothing.
        return None if _is_other_school(soup) else extract(soup)

    for query in _fallback_queries(professor):
        soup = _search_rmp_profile(query)
        if soup is None or _is_other_school(soup):
            continue
        result = extract(soup)
        if result:
            return result
    return None


def _extract_tags(soup):
    """Return the text of every <span> inside the profile's tag container."""
    tags_list = soup.find(class_='TeacherTags__TagsContainer-sc-16vmh1y-0')
    if tags_list is None:
        return []
    return [span.text for span in tags_list.find_all("span")]


def _extract_top_review(soup):
    """Return the highlighted review, else the first five comments joined by spaces."""
    most_helpful_review = soup.find(class_='HelpfulRating__StyledComments-sc-4ngnti-1')
    review = most_helpful_review.text if most_helpful_review is not None else ""
    if review == "":
        review_list = soup.find_all(class_='Comments__StyledComments-dzzyvm-0')[:5]
        review = " ".join(comment.text for comment in review_list)
    return review


def get_professor_tags(professor):
    """Return the RateMyProfessors tags for `professor` as a list of strings.

    A trailing ", Professor" is stripped from the name first. If the search
    for the full name has no hit, shortened variants of names with three or
    more parts are tried. Returns an empty list when no profile is found, the
    profile belongs to another school, or the profile shows no tags.
    """
    return _lookup_professor(professor, _extract_tags) or []


def get_professor_top_review(professor):
    """Return review text for `professor` from RateMyProfessors.

    Uses the same name handling as get_professor_tags. The highlighted review
    is preferred; without one, the first five comments on the page are joined
    with spaces. Returns "" when no profile or review text is found, or the
    profile belongs to another school.
    """
    return _lookup_professor(professor, _extract_top_review) or ""
        


In [None]:
# Distinct instructor names across all courses
prof_set = {
    prof
    for prof_list in classes_df["professors"]
    for prof in prof_list
}

In [None]:
# NOTE: This cell takes 1-2 hours to run!!

# It scrapes from RateMyProfessor.com

# Gather one record per professor and build the frame once afterwards:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every
# call. Iterating in sorted order makes the row order reproducible.
professor_records = []

for prof in sorted(prof_set):
    review = get_professor_top_review(prof)
    tags = get_professor_tags(prof)
    professor_records.append({"professor": prof, "review": review, "tags": tags})

professors_df = pd.DataFrame(professor_records, columns=["professor", "review", "tags"])

professors_df.head()
    

In [None]:
# Show the professors for whom a RateMyProfessor review was found
# (the filter looks at the "review" column only; tags are not checked)

has_review = professors_df["review"] != ""
professors_df[has_review]

In [None]:
# Save the scraped professor table as CSV, leaving out the DataFrame index

professors_df.to_csv(path_or_buf="ratemyprofessor_api_data.csv", index=False)