In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry

# Resolve the directory that holds this notebook/script.
# `__file__` only exists when the code runs as a plain .py script; inside a
# Jupyter kernel it is undefined, so fall back to the current working directory.
try:
    SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
except NameError:
    SCRIPT_PATH = os.getcwd()

# All scraped CSV files are stored under <SCRIPT_PATH>/data.
DATA_DIR = os.path.join(SCRIPT_PATH, "data")

# Create the data directory (and any missing parents); no error if it already exists.
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)

: 

In [None]:
SALARIES_CSV_PATH = os.path.join(DATA_DIR, "salaries.csv")

# Upper bound on pages fetched per year. Set to a small integer (e.g. 3) for a
# quick test run; infinity means "keep going until the API returns no rows".
max_pages = float("inf")


def fetch_salaries(first_year=2013, last_year=2022, page_limit=float("inf")):
    """Download salary records from the Diamondback salary API.

    Pages are requested one by one for every year in
    ``first_year..last_year`` (inclusive) until a page comes back with an
    empty ``data`` list or ``page_limit`` pages have been fetched.

    Parameters:
        first_year: first year to request (the default range 2013-2022 is the
            span the original notebook notes as available).
        last_year: last year to request, inclusive.
        page_limit: maximum number of pages to request per year.

    Returns:
        A list of dicts, one per salary row, each with a leading ``"year"``
        key followed by the fields returned by the API.
    """
    records = []

    for year in range(first_year, last_year + 1):
        page = 1

        while page <= page_limit:
            print(f"[i] Getting page {page} for year {year}")
            r = requests.get(
                f"https://api.dbknews.com/salary/year/{year}",
                params={"page": page},
                timeout=30,  # do not hang forever on a stalled connection
            )

            page_data = r.json()["data"]
            if not page_data:
                # An empty page marks the end of this year's data.
                break

            # Tag each row with the year it was requested for.
            records.extend({"year": year, **row} for row in page_data)
            page += 1

    return records


# The scrape is slow, so it only runs when the cached CSV is missing.
# A plain if/else (instead of raising) lets "Run All" continue to later cells.
if os.path.exists(SALARIES_CSV_PATH):
    print("[i] salaries.csv exists, skipping...")
else:
    salaries = fetch_salaries(page_limit=max_pages)

    df = pd.DataFrame.from_records(salaries)
    df.columns = df.columns.str.lower()

    print(f"[i] Writing to {SALARIES_CSV_PATH}")
    df.to_csv(SALARIES_CSV_PATH, index=False)

In [5]:
df = pd.read_csv("./data/salaries.csv")

# Flatten any line breaks inside the employee field to single spaces.
# regex=False makes the literal replacement explicit across pandas versions.
df["employee"] = df["employee"].str.replace("\n", " ", regex=False)


def _to_display_name(employee):
    """Reverse the ", "-separated parts of a name and join them with spaces.

    Example: "Abed, Eyad H" -> "Eyad H Abed". Entries with more than two
    parts are reversed part by part as well.
    """
    return " ".join(employee.split(", ")[::-1])


# Build the whole column in one pass instead of writing cell by cell inside
# an iterrows() loop, which is very slow on a frame of this size.
df["name"] = df["employee"].map(_to_display_name)

# Exploratory checks kept for reference:
# df[(df["name"].str.split(" ").str.len() > 2) & (df["department"].str.lower().str.contains("professor"))]
# df[(df["employee"].str.split(", ").str.len() > 2)]

df.head()

Unnamed: 0,year,employee,department,division,title,salary,name
0,2013,"Abed, Eyad H",ENGR-Electrical & Computer Engineering,A. James Clark School of Engineering,Prof,"$216,648.00",Eyad H Abed
1,2013,"Abshire, Pamela A.",ENGR-Electrical & Computer Engineering,A. James Clark School of Engineering,Assoc Prof,"$82,872.96",Pamela A. Abshire
2,2013,"Abshire, Pamela A.",ENGR-Institute for Systems Research,A. James Clark School of Engineering,Assoc Prof,"$55,149.36",Pamela A. Abshire
3,2013,"Abts, Leigh R",ENGR-Fischell Department of Bioengineering,A. James Clark School of Engineering,Res Assoc Prof,"$126,334.14",Leigh R Abts
4,2013,"Adams, Douglas J",ENGR-Continuing & Distance Learning in Engr,A. James Clark School of Engineering,Engineer,"$64,260.00",Douglas J Adams
...,...,...,...,...,...,...,...
107390,2022,"McIntyre, Michael Flannery",VPUR-VP University Relations,VP University\nRelations,Research Asst,"$69,366.68",Michael Flannery McIntyre
107391,2022,"Traeger, Christa J.",VPUR-VP University Relations,VP University\nRelations,Research Asst,"$50,673.71",Christa J. Traeger
107392,2022,"Williams, Aaron J",VPUR-VP University Relations,VP University\nRelations,Research Asst,"$57,426.38",Aaron J Williams
107393,2022,"Kalkanis, Sophia J.",VPUR-VP University Relations,VP University\nRelations,Sr Res Asst,"$72,069.29",Sophia J. Kalkanis


In [21]:
@sleep_and_retry
@limits(calls=5, period=1)
def _fetch_professor(name):
    """Request one professor record (including reviews) from PlanetTerp.

    The rate limit is applied here, per HTTP request: at most 5 calls per
    second, sleeping until the window resets when the limit is hit.

    Returns:
        The decoded JSON body of the response.
    """
    r = requests.get(
        "https://planetterp.com/api/v1/professor",
        params={"name": name, "reviews": "true"},
        timeout=30,  # do not hang forever on a stalled connection
    )
    return r.json()


def scrape_planetterp(names=None):
    """Look up every name on PlanetTerp and collect the successful responses.

    Parameters:
        names: iterable of names to search for. Defaults to the unique values
            of the ``name`` column of the notebook-level ``df``.

    Returns:
        A list with the decoded JSON of every response that does not contain
        an ``"error"`` key. Names whose request fails or returns a body that
        is not valid JSON are reported and skipped, so one bad response does
        not discard the rows already collected.
    """
    if names is None:
        names = df["name"].unique()

    rows = []
    num_names = len(names)

    for i, name in enumerate(names):
        print(f"[{i + 1}/{num_names}] Searching for {name}")

        try:
            data = _fetch_professor(name)
        except (requests.RequestException, ValueError) as exc:
            # Network failure or non-JSON body: report it and move on.
            print(f"[!] Skipping {name}: {exc}")
            continue

        if "error" not in data:
            rows.append(data)

    return rows


pt_data = scrape_planetterp()

[1/26440] Searching for Eyad H Abed
[2/26440] Searching for Pamela A. Abshire
[3/26440] Searching for Leigh R Abts
[4/26440] Searching for Douglas J Adams
[5/26440] Searching for Vincent G. Adams
[6/26440] Searching for Kofi Frimpong Addo
[7/26440] Searching for Raymond A. Adomaitis
[8/26440] Searching for Mohamed Sherif Aggour
[9/26440] Searching for Carol-Ann Wood Agustin
[10/26440] Searching for Farshad Ahdi
[11/26440] Searching for Bulent Akgun
[12/26440] Searching for David L. Akin
[13/26440] Searching for Akin Akturk
[14/26440] Searching for Janet L. Alessandrini
[15/26440] Searching for India T Alexander
[16/26440] Searching for Paul Allenza
[17/26440] Searching for Mohamad I. Al-Sheikhly
[18/26440] Searching for Amde M. Amde
[19/26440] Searching for Mehdi Amiri Darehbidi
[20/26440] Searching for Davinder K. Anand
[21/26440] Searching for Russell Edwards Anderson
[22/26440] Searching for Weston Anderson
[23/26440] Searching for Mikhail A. Anisimov
[24/26440] Searching for Sreera

In [22]:
# Number of PlanetTerp responses kept by scrape_planetterp(), i.e. those without an "error" key.
len(pt_data)

2720

In [25]:
# Put the collected PlanetTerp responses into a DataFrame and save them
# as CSV (without the index column) next to the salary data.
pt_df = pd.DataFrame(data=pt_data)
pt_df.to_csv(path_or_buf="./data/pt_ratings.csv", index=False)