# Salaries over Time from 2013 - 2019

https://salaryguide.dbknews.com/#/salGuide

This URL hosts a database of the salaries of all faculty at UMD from 2013 to 2019. At first, we were going to scrape this data from its web pages, but after talking to the staff who work on "The Diamondback", we learned that there is an API endpoint at `https://api.dbknews.com/`.

In [1]:
import numpy
import pandas as pd
import json
import pickle
import time

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

This code is updated as of 2019. If you would like to query data beyond 2019 (or potentially before 2013), you can modify the parameters shown directly below:

In [2]:
# First and last year to request from the API. Both ends are inclusive:
# the year list is built later with range(start_year, end_year + 1).
start_year, end_year = 2013, 2019

Since there are thousands of faculty, and therefore thousands of data points in a single year, all the data cannot be queried at once. Each query returns 10 faculty salaries, so the query is repeated page by page until all the salaries for the year have been retrieved.

In [3]:
# One shared HTTP session for every API call. The adapter retries failed
# connection attempts (connect=3) and uses a back-off factor of 0.5 between
# attempts; it is mounted for both plain and TLS URLs.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)

session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [4]:
# Every year in the inclusive range [start_year, end_year].
years = list(range(start_year, end_year + 1))

# The same years as strings: they are used both as URL path segments and as
# the keys of json_responses.
yearQueries = [str(year_number) for year_number in years]

# One (initially empty) list of raw JSON pages per year.
json_responses = {year_key: [] for year_key in yearQueries}

In [None]:
# Download every page of salary records for each requested year.
#
# For each year the API is first asked for the year's summary, whose "count"
# field is used to work out how many pages to request; each page is then
# fetched in turn and appended, in order, to json_responses[year].

SALARY_YEAR_URL = 'https://api.dbknews.com/salary/year/'
PAGE_SIZE = 10        # records per page, as assumed by the page-count formula
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the cell


def _get_json(url):
    """Return the decoded JSON body of *url*, or None if the request failed.

    A request counts as failed when it raises a requests exception (timeout,
    connection error after the session's retries, ...) or when the response
    status code is anything other than 200.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        return None
    if response.status_code != 200:
        return None
    return json.loads(response.content)


for query in yearQueries:
    data_raw = _get_json(SALARY_YEAR_URL + query)
    if data_raw is None or "count" not in data_raw:
        # Without the record count the number of pages is unknown, so report
        # the year and move on instead of aborting the whole download.
        print('Error ->\tYear: ', query, "\tcould not read record count")
        continue

    # NOTE(review): the "+ 2" requests more pages than count / PAGE_SIZE;
    # presumably slack for the partial last page and for not knowing whether
    # the API numbers pages from 0 or 1 -- confirm against the API. Any
    # repeated rows are removed by the drop_duplicates() step further down.
    page_total = int(data_raw["count"] / PAGE_SIZE + 2)
    for i in range(page_total):
        data = _get_json(SALARY_YEAR_URL + query + '/?page=' + str(i))
        if data is not None:
            json_responses[query].append(data)
        else:
            print('Error ->\tYear: ', query, "\tPage #: ", i)

This data is then merged together and concatenated.

In [None]:
# Flatten the downloaded pages into one list of records per year.

# Years that were requested, in ascending numeric order.
years = sorted(int(year_key) for year_key in json_responses)

# One empty record list per year, keyed by the year as a string.
salary_dfs = {str(year_number): [] for year_number in years}

for year, pages in json_responses.items():
    if year not in salary_dfs:
        continue
    for page in pages:
        # Each page stores its records under the 'data' key.
        salary_dfs[year].extend(page['data'])

In [None]:
# Turn each year's record list into a DataFrame tagged with its year, then
# stack all the yearly frames into a single table.

for year_label, records in salary_dfs.items():
    year_frame = pd.DataFrame(records)
    year_frame['Year'] = year_label  # still a string here; cast to int later
    salary_dfs[year_label] = year_frame

salaries = pd.concat(salary_dfs.values(), sort=True)

We then dropped any duplicates and cast the columns to the correct types.

In [None]:
# Remove rows that are exact duplicates, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous index is kept
# as an 'index' column.
salaries = salaries.drop_duplicates().reset_index()

In [None]:
# Cast columns to numeric types. 'Year' was filled in from the string
# dictionary keys, so it is converted to int.
salaries['Year'] = salaries['Year'].astype(int)

# Strip '$' and ',' from the salary text before converting to float
# (presumably values look like "$123,456.78" -- confirm against the API).
# The pattern is a raw string: '\$' inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
salaries['Salary'] = salaries['Salary'].replace(r'[\$,]', '', regex=True).astype(float)

In [None]:
def _school_of(department):
    """Return the part of *department* that precedes its first '-'."""
    return department.split('-', 1)[0]


# Derive the 'School' column from the prefix of each department name.
salaries['School'] = salaries['Department'].map(_school_of)

In [None]:
# Keep only the columns of interest, in presentation order; every other
# column is dropped.
column_order = [
    'Year',
    'School',
    'Department',
    'Division',
    'Title',
    'Employee',
    'Salary',
]
salaries = salaries[column_order]

In [None]:
# Display the cleaned DataFrame as the notebook cell's output.
salaries

In [None]:
# Save the cleaned DataFrame to 'df/salaries' so it can be reloaded without
# querying the API again.
import os

# to_pickle does not create missing directories, so make sure 'df' exists.
os.makedirs('df', exist_ok=True)
salaries.to_pickle('df/salaries')

In [None]:
# Spot check: the 2019 rows for one employee, with duplicate rows removed.
# NOTE(review): `salaries1` is not defined in any visible cell -- presumably
# left over from an earlier kernel session; confirm, or point this at
# `salaries`.
salaries1.loc[(salaries1['Employee'] == 'Varshney, Amitabh') & (salaries1['Year'] == 2019)].drop_duplicates()

In [None]:
# Spot check: every row for one employee, across all years.
# NOTE(review): `salaries2` is not defined in any visible cell -- presumably
# left over from an earlier kernel session; confirm, or point this at
# `salaries`.
salaries2.loc[(salaries2['Employee'] == 'Varshney, Amitabh')]