# Rate My Professors review scraper

Scraping data from https://www.ratemyprofessors.com/
    
Output: every review for each teacher, written to `data/output/reviews.csv`.

In [1]:
# Data manipulation libraries
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
# Common web-scraping libraries
from bs4 import BeautifulSoup as bs
import requests

tqdm.pandas()

In [11]:
from time import sleep
import json

# GraphQL document posted to the ratings endpoint. It is built from adjacent
# string literals (one per fragment) so that no literal spans a physical line
# break; the concatenated text is the same query the notebook always sent.
_RATINGS_LIST_QUERY = (
    "query RatingsListQuery(\n  $count: Int!\n  $id: ID!\n  $courseFilter: String\n  $cursor: String\n) {\n  node(id: $id) {\n    __typename\n    ... on Teacher {\n      ...RatingsList_teacher_4pguUW\n    }\n    id\n  }\n}\n\n"
    "fragment RatingsList_teacher_4pguUW on Teacher {\n  id\n  lastName\n  numRatings\n  school {\n    id\n    legacyId\n    name\n    city\n    state\n    avgRating\n    numRatings\n  }\n  ...Rating_teacher\n  ...NoRatingsArea_teacher\n  ratings(first: $count, after: $cursor, courseFilter: $courseFilter) {\n    edges {\n      cursor\n      node {\n        ...Rating_rating\n        id\n        __typename\n      }\n    }\n    pageInfo {\n      hasNextPage\n      endCursor\n    }\n  }\n}\n\n"
    "fragment Rating_teacher on Teacher {\n  ...RatingFooter_teacher\n  ...RatingSuperHeader_teacher\n  ...ProfessorNoteSection_teacher\n}\n\n"
    "fragment NoRatingsArea_teacher on Teacher {\n  lastName\n  ...RateTeacherLink_teacher\n}\n\n"
    "fragment Rating_rating on Rating {\n  comment\n  teacherNote {\n    id\n  }\n  ...RatingHeader_rating\n  ...RatingSuperHeader_rating\n  ...RatingValues_rating\n  ...CourseMeta_rating\n  ...RatingTags_rating\n  ...RatingFooter_rating\n  ...ProfessorNoteSection_rating\n}\n\n"
    "fragment RatingHeader_rating on Rating {\n  date\n  class\n  helpfulRating\n  clarityRating\n  isForOnlineClass\n}\n\n"
    "fragment RatingSuperHeader_rating on Rating {\n  legacyId\n}\n\n"
    "fragment RatingValues_rating on Rating {\n  helpfulRating\n  clarityRating\n  difficultyRating\n}\n\n"
    "fragment CourseMeta_rating on Rating {\n  attendanceMandatory\n  wouldTakeAgain\n  grade\n  textbookUse\n  isForOnlineClass\n  isForCredit\n}\n\n"
    "fragment RatingTags_rating on Rating {\n  ratingTags\n}\n\n"
    "fragment RatingFooter_rating on Rating {\n  id\n  comment\n  adminReviewedAt\n  flagStatus\n  legacyId\n  thumbsUpTotal\n  thumbsDownTotal\n  thumbs {\n    userId\n    thumbsUp\n    thumbsDown\n    id\n  }\n  teacherNote {\n    id\n  }\n}\n\n"
    "fragment ProfessorNoteSection_rating on Rating {\n  teacherNote {\n    ...ProfessorNote_note\n    id\n  }\n  ...ProfessorNoteEditor_rating\n}\n\n"
    "fragment ProfessorNote_note on TeacherNotes {\n  comment\n  ...ProfessorNoteHeader_note\n  ...ProfessorNoteFooter_note\n}\n\n"
    "fragment ProfessorNoteEditor_rating on Rating {\n  id\n  legacyId\n  class\n  teacherNote {\n    id\n    teacherId\n    comment\n  }\n}\n\n"
    "fragment ProfessorNoteHeader_note on TeacherNotes {\n  createdAt\n  updatedAt\n}\n\n"
    "fragment ProfessorNoteFooter_note on TeacherNotes {\n  legacyId\n  flagStatus\n}\n\n"
    "fragment RateTeacherLink_teacher on Teacher {\n  legacyId\n  numRatings\n  lockStatus\n}\n\n"
    "fragment RatingFooter_teacher on Teacher {\n  id\n  legacyId\n  lockStatus\n  isProfCurrentUser\n}\n\n"
    "fragment RatingSuperHeader_teacher on Teacher {\n  firstName\n  lastName\n  legacyId\n  school {\n    name\n    id\n  }\n}\n\n"
    "fragment ProfessorNoteSection_teacher on Teacher {\n  ...ProfessorNote_teacher\n  ...ProfessorNoteEditor_teacher\n}\n\n"
    "fragment ProfessorNote_teacher on Teacher {\n  ...ProfessorNoteHeader_teacher\n  ...ProfessorNoteFooter_teacher\n}\n\n"
    "fragment ProfessorNoteEditor_teacher on Teacher {\n  id\n}\n\n"
    "fragment ProfessorNoteHeader_teacher on Teacher {\n  lastName\n}\n\n"
    "fragment ProfessorNoteFooter_teacher on Teacher {\n  legacyId\n  isProfCurrentUser\n}\n"
)


def _extract_ratings(payload):
    """Return the ``data.node.ratings`` mapping of a GraphQL response, or None if it is absent."""
    if not isinstance(payload, dict):
        return None
    node = (payload.get('data') or {}).get('node') or {}
    ratings = node.get('ratings') if isinstance(node, dict) else None
    return ratings if isinstance(ratings, dict) else None


def get_review_data(teacher_id, increment=200, timeout=30, delay=1):
    """Download every review of one teacher from the Rate My Professors GraphQL endpoint.

    The ratings connection is paged: each request asks for ``increment``
    reviews after the current cursor, and paging stops as soon as the server
    reports ``hasNextPage`` false, a page comes back empty, or the cursor
    stops advancing.

    Parameters
    ----------
    teacher_id : str
        Value sent as the GraphQL ``id`` variable (the teacher's node id).
    increment : int, default 200
        Page size, sent as the GraphQL ``count`` variable.
    timeout : float, default 30
        Seconds passed to ``requests.post`` so a stalled connection cannot
        hang the scrape forever.
    delay : float, default 1
        Seconds to sleep before every request (simple rate limiting).

    Returns
    -------
    list of dict
        One dict per review: the edge's ``node`` fields plus a ``teacherId``
        key holding ``teacher_id``. Edges without a ``node`` are skipped.
        If a request fails with a non-200 status, or the response carries no
        ratings payload, the reviews collected so far are returned.

    Raises
    ------
    requests.RequestException
        On connection errors or timeouts (propagated from ``requests``).
    ValueError
        If a 200 response body is not valid JSON.
    """
    query = {
        "query": _RATINGS_LIST_QUERY,
        "variables": {"count": increment, "id": teacher_id},
    }
    # Content-Length is deliberately not set here: the previous hard-coded
    # '1161' did not describe the real body, and requests computes the
    # correct value from the serialized JSON payload.
    headers = {
        'Host': 'www.ratemyprofessors.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Accept': '*/*',
        'Accept-Language': 'en-CA,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json',
        # base64 of "test:test"
        'Authorization':'Basic dGVzdDp0ZXN0',
        'Origin': 'https://www.ratemyprofessors.com',
        'DNT': '1',
        'Connection': 'keep-alive',
        # NOTE(review): presumably copied from a browser session - confirm it is still required.
        'Cookie': '_scid=786e8c25-e183-4670-a4bf-14c4e2f886f7; __browsiUID=5bc15d35-c2d0-44a9-854f-d74d758417c4; previousSchoolID=12502; promotionIndex=0; ad_blocker_overlay_2019=true; ccpa-notice-viewed-02=true; __browsiSessionID=8b668abf-46aa-465a-96d2-0f624e32a868&true&false&DEFAULT&gb&desktop-3.8.1&false'
    }
    url = 'https://www.ratemyprofessors.com/graphql'

    review_data = []
    has_next_page = True
    while has_next_page:
        if delay:
            sleep(delay)
        r = requests.post(url, json=query, headers=headers, timeout=timeout)
        # Check the status before touching the body: error pages are not
        # guaranteed to be JSON, let alone to contain data.node.ratings.
        if r.status_code != 200:
            print(r)
            break
        ratings = _extract_ratings(r.json())
        if ratings is None:
            print(f"No ratings payload returned for teacher {teacher_id!r}")
            break

        edges = ratings.get('edges') or []
        review_data += edges
        if not edges:
            break

        page_info = ratings.get('pageInfo') or {}
        # Trust the server's flag when present; otherwise keep paging until
        # the cursor stalls, as the original loop did.
        has_next_page = page_info.get('hasNextPage', True)

        # Advance the cursor so the next request starts after this page.
        cursor = edges[-1].get('cursor') or page_info.get('endCursor')
        if cursor is None or cursor == query['variables'].get('cursor'):
            break
        query['variables']['cursor'] = cursor

    return [{**x['node'], 'teacherId': teacher_id} for x in review_data if 'node' in x]


In [12]:
# Load the teacher overview; .head() limits the scrape to the first five teachers.
teacher_overview_df = pd.read_csv("data/output/teachers.csv").head()

# Fetch the reviews of every teacher row (one list of review dicts per row).
review_data = teacher_overview_df.progress_apply(
    lambda teacher_row: get_review_data(teacher_row['id']),
    axis=1,
).tolist()

# Collapse the per-teacher lists into one flat list of reviews.
flattened = [review for teacher_reviews in review_data for review in teacher_reviews]

# Build the review table, attach the teacher's name/department/school, then
# tidy up: both frames carry an `id` column, so the merge yields `id_x`
# (the review's id, kept as `reviewId`) and `id_y` (the teacher's id, which
# duplicates `teacherId` and is dropped).
review_df = (
    pd.DataFrame.from_records(flattened)
    .merge(
        teacher_overview_df[["id", "firstName", "lastName", "department", "schoolName"]],
        how="left",
        left_on="teacherId",
        right_on="id",
    )
    .drop(columns=['__typename', 'adminReviewedAt', 'id_y', 'flagStatus'])
    .rename(columns={'id_x': 'reviewId'})
)
review_df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,attendanceMandatory,clarityRating,class,comment,date,difficultyRating,grade,helpfulRating,reviewId,isForCredit,...,textbookUse,thumbs,thumbsDownTotal,thumbsUpTotal,wouldTakeAgain,teacherId,firstName,lastName,department,schoolName
0,,5,PIA210,Professor Gold&#39;s greatest strength is in h...,2019-05-16 02:14:47 +0000 UTC,3,A,5,UmF0aW5nLTMxODg2OTEy,True,...,5.0,[],0,2,1.0,VGVhY2hlci01MTM3MA==,Jerry,Gold,Psychology,Adelphi University
1,mandatory,5,PSY502001,I took professor Gold for MHC1+MHC2. He is won...,2019-05-13 20:57:38 +0000 UTC,4,A,5,UmF0aW5nLTMxODYxMDc2,True,...,5.0,[],0,1,1.0,VGVhY2hlci01MTM3MA==,Jerry,Gold,Psychology,Adelphi University
2,mandatory,4,PMH501001,Great Class! Be prepared to work hard though. ...,2018-12-11 15:32:59 +0000 UTC,4,A,4,UmF0aW5nLTMwOTY3MTkw,True,...,5.0,[],0,0,1.0,VGVhY2hlci01MTM3MA==,Jerry,Gold,Psychology,Adelphi University
3,mandatory,5,PSY283,Doctor Gold is hilarious and overall a great g...,2018-10-16 12:53:04 +0000 UTC,3,A,5,UmF0aW5nLTMwNTc0NDI3,True,...,5.0,[],0,0,1.0,VGVhY2hlci01MTM3MA==,Jerry,Gold,Psychology,Adelphi University
4,mandatory,5,PIA379,Gold is great and does care if you put in the ...,2018-04-19 09:06:38 +0000 UTC,3,A,5,UmF0aW5nLTI5ODQwMjM4,True,...,5.0,[],0,1,1.0,VGVhY2hlci01MTM3MA==,Jerry,Gold,Psychology,Adelphi University
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,,3,MGM666,Great professor. Maybe at the beginning you ar...,2008-05-28 23:58:00 +0000 UTC,3,,5,UmF0aW5nLTE0NTMzOTYw,False,...,5.0,[],1,0,,VGVhY2hlci01NzA3Mg==,Jeffrey,Goldstein,Philosophy,Adelphi University
154,,5,ORG366,I LUV this professor,2006-12-08 18:23:43 +0000 UTC,1,,5,UmF0aW5nLTEyNTU0MDI2,False,...,5.0,[],0,0,,VGVhY2hlci01NzA3Mg==,Jeffrey,Goldstein,Philosophy,Adelphi University
155,,1,BUS666,"Very, very, very confusing and vague teacher.....",2005-12-22 22:50:20 +0000 UTC,4,,3,UmF0aW5nLTExMjQ2NDAz,False,...,,[],0,1,,VGVhY2hlci01NzA3Mg==,Jeffrey,Goldstein,Philosophy,Adelphi University
156,,5,MGT666,"Interesting guy, likes to incorporate a lot of...",2004-11-22 12:01:46 +0000 UTC,1,,4,UmF0aW5nLTI4MTE3MTk=,False,...,,[],1,0,,VGVhY2hlci01NzA3Mg==,Jeffrey,Goldstein,Philosophy,Adelphi University


In [16]:
# Keep only the columns to export, in their final order, then write the CSV.
review_df = review_df.loc[:, [
    # who the review is about
    "firstName", "lastName", 'teacherId', "department", "schoolName",
    # which class / when / which review
    'class', 'date', 'reviewId',
    # numeric ratings
    'clarityRating', 'difficultyRating', 'helpfulRating',
    'wouldTakeAgain', 'textbookUse',
    # free text and tags
    'comment', 'ratingTags', 'teacherNote',
    # course metadata
    'grade', 'attendanceMandatory', 'isForCredit', 'isForOnlineClass',
    # votes on the review
    'thumbs', 'thumbsDownTotal', 'thumbsUpTotal',
]]
review_df.to_csv("data/output/reviews.csv", index=False)

23