# Rate my professor scraper

Scraping data from https://www.ratemyprofessors.com/
    
This notebook produces two outputs:
1. Teacher profile including teacher name, university, average score, average difficulty rating, and top tags
2. For each teacher, all their reviews

In [1]:
# Data manipulation libraries
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
# Common web-scraping libraries
from bs4 import BeautifulSoup as bs
import requests

# Enable tqdm's pandas integration (registers `progress_apply` and related
# progress-bar methods on pandas objects).
tqdm.pandas()

In [24]:
from time import sleep
import json

def get_teacher_data(school_id="U2Nob29sLTE0NjY=",increment=100):
    """Download every teacher record listed for one school.

    Pages through the GraphQL teacher search `increment` results at a time,
    following the cursor returned with each page, and pauses one second before
    every request.

    The request is sent with the notebook-level names `url` and `headers`.
    NOTE(review): neither is defined in the cells visible here; they must be
    set in an earlier cell before this function is called.

    Args:
        school_id: Opaque school identifier sent as the `schoolID` search
            variable. The default is the ID that this notebook's output shows
            for "Queen's University at Kingston".
        increment: Page size, sent as the `count` variable of the query.

    Returns:
        A list with one `node` dict per teacher (id, legacyId, avgRating,
        numRatings, wouldTakeAgainPercent, avgDifficulty, department, school,
        firstName, lastName, isSaved, __typename - the fields the query asks
        for). If a request comes back with a non-200 status, the response is
        printed and the teachers collected so far are returned.
    """
    query = {
        'query': 'query TeacherSearchPaginationQuery(\n  $count: Int!\n  $cursor: String\n  $query: TeacherSearchQuery!\n) {\n  search: newSearch {\n    ...TeacherSearchPagination_search_1jWD3d\n  }\n}\n\nfragment TeacherSearchPagination_search_1jWD3d on newSearch {\n  teachers(query: $query, first: $count, after: $cursor) {\n    edges {\n      cursor\n      node {\n        ...TeacherCard_teacher\n        id\n        __typename\n      }\n    }\n    pageInfo {\n      hasNextPage\n      endCursor\n    }\n    resultCount\n  }\n}\n\nfragment TeacherCard_teacher on Teacher {\n  id\n  legacyId\n  avgRating\n  numRatings\n  ...CardFeedback_teacher\n  ...CardSchool_teacher\n  ...CardName_teacher\n  ...TeacherBookmark_teacher\n}\n\nfragment CardFeedback_teacher on Teacher {\n  wouldTakeAgainPercent\n  avgDifficulty\n}\n\nfragment CardSchool_teacher on Teacher {\n  department\n  school {\n    name\n    id\n  }\n}\n\nfragment CardName_teacher on Teacher {\n  firstName\n  lastName\n}\n\nfragment TeacherBookmark_teacher on Teacher {\n  id\n  isSaved\n}\n',
        'variables': {
            'count': increment,
            'query': {'text': '', 'schoolID': school_id},
        },
    }
    teacher_data = []
    cursor = None
    while True:
        # Pause between requests so the site is not hit in a tight loop.
        sleep(1)
        r = requests.post(url,json=query,headers = headers)
        # Check the status BEFORE parsing: an error response may not be JSON or
        # may lack the `data.search.teachers` path, and indexing into it would
        # raise instead of reaching this best-effort exit.
        if r.status_code != 200:
            print(r)
            break
        json_data = json.loads(r.text)['data']['search']['teachers']
        teacher_page = json_data['edges']
        teacher_data += teacher_page
        # The next page starts after the last edge of this one.
        if teacher_page:
            cursor = teacher_page[-1]['cursor']
        # Stop as soon as the server reports there is nothing left, which saves
        # one extra request (and sleep) per school.
        page_info = json_data.get('pageInfo') or {}
        if page_info.get('hasNextPage') is False:
            break
        # Fallback guard: an empty page or a cursor that did not advance would
        # otherwise make the loop request the same page forever.
        if query['variables'].get('cursor') == cursor:
            break
        query['variables']['cursor'] = cursor
    return [edge['node'] for edge in teacher_data]

# Schools to scrape. In this notebook's output the first ID appears next to
# "Oxford University" and the second next to "Queen's University at Kingston".
school_ids = [
    'U2Nob29sLTEyNTAy',
    'U2Nob29sLTE0NjY=',
]

# Collect the teacher records of every school into one flat list.
teacher_data = []
for sid in tqdm(school_ids):
    print(sid)
    teacher_data.extend(get_teacher_data(sid))
    
teacher_df = pd.DataFrame.from_records(teacher_data)

# Expand the nested `school` dict of every teacher into flat columns.
# The fields are picked by key name rather than by position, so the mapping
# cannot silently swap id and name if the payload's key order changes
# (the query itself lists `name` before `id`).
school_df = pd.json_normalize(teacher_df['school'].tolist())
teacher_df['schoolId'] = school_df['id'].to_numpy()
teacher_df['schoolName'] = school_df['name'].to_numpy()

# Drop the bookkeeping fields and the now-redundant nested column. Reassigning
# (instead of `inplace=True`) keeps the statement chainable.
teacher_df = teacher_df.drop(columns=['__typename','isSaved','school'])
teacher_df

  0%|          | 0/2 [00:00<?, ?it/s]

U2Nob29sLTEyNTAy
U2Nob29sLTE0NjY=


Unnamed: 0,avgDifficulty,avgRating,department,firstName,id,lastName,legacyId,numRatings,wouldTakeAgainPercent,schoolId,schoolName
0,3.3,4.5,Law,Federico,VGVhY2hlci04MjQzODM=,Varese,824383,6,100.0000,U2Nob29sLTEyNTAy,Oxford University
1,0.0,0.0,Education,Chris,VGVhY2hlci04MjY1ODk=,Davies,826589,0,-1.0000,U2Nob29sLTEyNTAy,Oxford University
2,4.3,4.0,Geography,Barbara,VGVhY2hlci04MzI0Mjk=,Kennedy,832429,3,100.0000,U2Nob29sLTEyNTAy,Oxford University
3,2.5,4.5,Mathematics,David,VGVhY2hlci04NTUxMjU=,Stirzaker,855125,2,-1.0000,U2Nob29sLTEyNTAy,Oxford University
4,4.0,3.8,Physics,Julia,VGVhY2hlci04NTc2ODU=,Yeomans,857685,4,100.0000,U2Nob29sLTEyNTAy,Oxford University
...,...,...,...,...,...,...,...,...,...,...,...
1678,2.6,3.5,Chemistry,Stephen,VGVhY2hlci0yMjUyNTg=,Brown,225258,23,100.0000,U2Nob29sLTE0NjY=,Queen's University at Kingston
1679,4.5,2.0,Psychology,Meghan,VGVhY2hlci0yMzY5NTk2,Norris,2369596,20,30.0000,U2Nob29sLTE0NjY=,Queen's University at Kingston
1680,2.7,3.3,Computer Science,Wendy L,VGVhY2hlci0xNjM3OTk=,Powley,163799,42,63.6364,U2Nob29sLTE0NjY=,Queen's University at Kingston
1681,2.8,4.7,Film,Philippe,VGVhY2hlci0yMzAxNzQy,Gauthier,2301742,377,94.6950,U2Nob29sLTE0NjY=,Queen's University at Kingston


In [29]:
from pathlib import Path

# File name is built from the distinct school names present in the frame;
# missing names are skipped so the join cannot fail on a non-string value.
schools = ",".join(teacher_df['schoolName'].dropna().astype(str).unique())
output_path = Path(f"data/output/{schools}-teachers.csv")

# `to_csv` does not create missing parent folders, so make sure the output
# directory exists (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
teacher_df.to_csv(output_path)