In [1]:
import pandas as pd
import numpy as np
import requests
import os
from ratelimit import limits, sleep_and_retry
from pathlib import Path
from backoff import on_exception, expo
from math import inf

DATA_DIR = Path("./data").resolve()
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)

In [6]:
SALARIES_CSV_PATH = os.path.join(DATA_DIR, "salaries.csv")

if os.path.exists(SALARIES_CSV_PATH):
    print("[i] salaries.csv exists, skipping...")
    raise

# max_pages = 3
max_pages = float("inf")

salaries = []

# Diamondback has data from 2013 to 2022
for year in range(2013, 2023):
    page = 1
    page_data = [0]

    while len(page_data) and page <= max_pages:
        print(f"[i] Getting page {page} for year {year}")
        r = requests.get(
            f"https://api.dbknews.com/salary/year/{year}", params={"page": page}
        )

        page_data = list(map(lambda x: {"year": year, **x}, r.json()["data"]))
        page += 1

        salaries += page_data

df = pd.DataFrame.from_records(salaries)
df.columns = df.columns.str.lower()

df.to_csv(SALARIES_CSV_PATH, index=False)
print(f"[i] Writing to {SALARIES_CSV_PATH}")

[i] salaries.csv exists, skipping...


Exception: 

In [7]:
df = pd.read_csv(SALARIES_CSV_PATH)
df["employee"] = df["employee"].str.replace("\n", " ")

for i, row in df.iterrows():
    parts = row["employee"].split(", ")
    # assert len(parts) == 2

    df.at[i, "name"] = ' '.join(parts[::-1])
df.head()

# df[(df["name"].str.split(" ").str.len() > 2) & (df["department"].str.lower().str.contains("professor"))]
# df[(df["employee"].str.split(", ").str.len() > 2)]

Unnamed: 0,year,employee,department,division,title,salary,name
0,2013,"Abed, Eyad H",ENGR-Electrical & Computer Engineering,A. James Clark School of Engineering,Prof,"$216,648.00",Eyad H Abed
1,2013,"Abshire, Pamela A.",ENGR-Electrical & Computer Engineering,A. James Clark School of Engineering,Assoc Prof,"$82,872.96",Pamela A. Abshire
2,2013,"Abshire, Pamela A.",ENGR-Institute for Systems Research,A. James Clark School of Engineering,Assoc Prof,"$55,149.36",Pamela A. Abshire
3,2013,"Abts, Leigh R",ENGR-Fischell Department of Bioengineering,A. James Clark School of Engineering,Res Assoc Prof,"$126,334.14",Leigh R Abts
4,2013,"Adams, Douglas J",ENGR-Continuing & Distance Learning in Engr,A. James Clark School of Engineering,Engineer,"$64,260.00",Douglas J Adams


In [2]:
def scrape_planetterp(max_pages=inf):
    @sleep_and_retry
    @on_exception(
        expo,
        [
            requests.exceptions.JSONDecodeError,
            requests.exceptions.Timeout,
            requests.exceptions.ConnectTimeout,
            requests.exceptions.ConnectionError,
        ],
    )
    @limits(calls=2, period=1)
    def call_api(page: int):
        r = requests.get(
            "https://planetterp.com/api/v1/professors",
            params={"offset": page, "reviews": "true"},
        )
        return r.json()

    rows = []
    page = 0
    page_data = [0]

    while len(page_data) and page < max_pages:
        print(f"[i] Getting page {page}")
        page_data = call_api(page)
        page += 1
        rows += page_data

    print("[i] Creating dataframe...")
    df = pd.DataFrame(rows)
    return df


pt_df = scrape_planetterp()


[i] Getting page 0
[i] Getting page 1
[i] Getting page 2
[i] Getting page 3
[i] Getting page 4
[i] Getting page 5
[i] Getting page 6
[i] Getting page 7
[i] Getting page 8
[i] Getting page 9
[i] Getting page 10
[i] Getting page 11
[i] Getting page 12
[i] Getting page 13
[i] Getting page 14
[i] Getting page 15
[i] Getting page 16
[i] Getting page 17
[i] Getting page 18
[i] Getting page 19
[i] Getting page 20
[i] Getting page 21
[i] Getting page 22
[i] Getting page 23
[i] Getting page 24
[i] Getting page 25
[i] Getting page 26
[i] Getting page 27
[i] Getting page 28
[i] Getting page 29
[i] Getting page 30
[i] Getting page 31
[i] Getting page 32
[i] Getting page 33
[i] Getting page 34
[i] Getting page 35
[i] Getting page 36
[i] Getting page 37
[i] Getting page 38
[i] Getting page 39
[i] Getting page 40
[i] Getting page 41
[i] Getting page 42
[i] Getting page 43
[i] Getting page 44
[i] Getting page 45
[i] Getting page 46
[i] Getting page 47
[i] Getting page 48
[i] Getting page 49
[i] Gettin

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [30]:
pt_df

Unnamed: 0,courses,average_rating,type,reviews,name,slug
0,[],,professor,[],Chen Feng,feng_chen
1,[],,professor,[],Esther Wood,wood_esther
2,[],,professor,[],Liangjun Shi,shi_liangjun
3,[],,professor,[],Rochelle Ford,ford_rochelle
4,[],,professor,[],Mark Stephen Graham,graham
5,[],,professor,[],Lena Morreale Scott,scott
6,[],,professor,[],Theresa Nebel Robinson,robinson
7,[],,professor,[],Esther Wood,wood_esther
8,[],,professor,[],Liangjun Shi,shi_liangjun
9,[],,professor,[],Rochelle Ford,ford_rochelle
