In [39]:
import pandas as pd
import numpy as np
from functools import reduce
import json

In [40]:
# 1. Extract time-invariant vars (locations, adm criteria, tuition) from the most recent year
#
# Every university that appears in any of the 2016-2019 files is included,
# represented by its record from the most recent year in which it appears.
#
# E.g., a university present in 2019 contributes its 2019 record; one that is
#       absent in 2019 but present in 2018 contributes its 2018 record.

col_geo = ['UNITID', 'INSTNM', 'ADDR', 'CITY', 'STABBR', 'LONGITUD', 'LATITUDE']
col_adm = ['UNITID', *(f'ADMCON{n}' for n in range(1, 10))]
col_fee = ['UNITID', 'TUITION2', 'TUITION3']

# One [geo, adm, fee] triple per year; each frame is tagged with its YEAR so
# the latest record per UNITID can be picked once the years are stacked.
df_lists = []
for i in range(2016, 2020):
    geo = (
        pd.read_csv(f"./University_data/hd{i}.csv", encoding='ISO-8859-1')
        .loc[:, col_geo]
        .assign(YEAR=i)
    )
    adm = (
        pd.read_csv(f"./University_data/adm{i}.csv")
        .loc[:, col_adm]
        .assign(YEAR=i)
    )
    fee = (
        pd.read_csv(f"./University_data/ic{i}_ay.csv")
        .loc[:, col_fee]
        .replace('.', np.nan)   # "." is treated as a missing value
        .dropna()               # keep only rows with every fee column present
        .astype(int)
        .assign(YEAR=i)
    )
    df_lists.append([geo, adm, fee])

def append_dfs(pos):
    """Stack the yearly frames at slot ``pos`` and keep each UNITID's latest row.

    Parameters
    ----------
    pos : int
        Position inside each per-year entry of the module-level ``df_lists``
        whose frames should be combined.

    Returns
    -------
    pandas.DataFrame
        One row per UNITID (the row with the greatest YEAR), indexed by
        UNITID. UNITID is also kept as a regular column in its original
        position, because the merge step slices columns positionally.
    """
    # DataFrame.append was removed in pandas 2.0; concatenate all years at once.
    stacked = pd.concat([df_list[pos] for df_list in df_lists])
    # After sorting by (UNITID, YEAR) the last row of each UNITID is its most
    # recent record; drop_duplicates(keep='last') selects exactly that row
    # without going through groupby.apply.
    latest = stacked.sort_values(['UNITID', 'YEAR']).drop_duplicates('UNITID', keep='last')
    return latest.set_index('UNITID', drop=False)

# Latest-year snapshot of each table, in df_lists slot order (geo, adm, fee).
df_geo, df_admc, df_fee = (append_dfs(slot) for slot in (0, 1, 2))

In [41]:
# 2. Calculate the average admission rates (disaggregated by gender) in the past 10 years
#
# For every year 2010-2019 the rate is admissions / applications (total, men,
# women); the yearly rates are then averaged per UNITID.

col_rate = ["UNITID", "APPLCN", "ADMSSN", "APPLCNM", "ADMSSNM", "APPLCNW", "ADMSSNW"]
admr_frames = []
for i in range(2010, 2020):
    admr = pd.read_csv(f"./University_data/adm{i}.csv")
    # "." is treated as a missing value. astype(float) also converts UNITID,
    # so the group index produced below is float-valued.
    admr = admr.loc[:, col_rate].replace(".", np.nan).astype(float)
    admr["ADMR"] = admr["ADMSSN"] / admr["APPLCN"]
    admr["ADMRM"] = admr["ADMSSNM"] / admr["APPLCNM"]
    admr["ADMRW"] = admr["ADMSSNW"] / admr["APPLCNW"]
    admr_frames.append(admr)

# DataFrame.append was removed in pandas 2.0 (and re-copied the accumulated
# frame on every iteration); concatenate the yearly frames once instead.
df_admr = pd.concat(admr_frames)

rate_cols = ["ADMR", "ADMRM", "ADMRW"]
# A zero applicant count yields +/-inf, which would survive dropna() and turn
# the institution's mean into inf; treat such years as missing instead.
df_admr[rate_cols] = df_admr[rate_cols].replace([np.inf, -np.inf], np.nan)
df_admr = df_admr.groupby("UNITID")[rate_cols].mean().dropna().round(4)

In [42]:
# 3. Average the parameters (mean, SD) of SAT/ACT scores in the past 10 years (assuming normal distribution)
#
# For each test section the 25th/75th percentile scores are turned into a
# mean (their midpoint) and a standard deviation, per year; the yearly
# parameters are then averaged per UNITID.

col_sco = ["SATVR25", "SATVR75", "SATMT25", "SATMT75", 
           "ACTCM25", "ACTCM75", "ACTEN25", "ACTEN75", "ACTMT25", "ACTMT75"]
adms_frames = []
for i in range(2010, 2020):
    adms = pd.read_csv(f"./University_data/adm{i}.csv", index_col="UNITID")
    # Strip stray whitespace from the headers before selecting by name.
    adms.columns = [col.strip() for col in adms.columns]
    # "." is treated as a missing value.
    adms = adms.loc[:, col_sco].replace(".", np.nan).astype(float)
    adms["YEAR"] = i
    for col in ["SATVR", "SATMT", "ACTCM", "ACTEN", "ACTMT"]:
        # Midpoint of the interquartile range serves as the mean.
        adms[f"{col}M"] = adms[f"{col}25"] + (adms[f"{col}75"] - adms[f"{col}25"]) / 2
        adms[f"{col}SD"] = (adms[f"{col}75"] - adms[f"{col}M"]) / 0.675  #z-score for 75% percentile
    adms_frames.append(adms)

# DataFrame.append was removed in pandas 2.0 (and re-copied the accumulated
# frame on every iteration); concatenate the yearly frames once instead.
df_adms = pd.concat(adms_frames)

# Drop the raw percentile columns by name (they are the first ten columns),
# leaving YEAR followed by the derived M/SD columns, then average per UNITID.
df_adms = df_adms.drop(columns=col_sco).groupby(level='UNITID').mean().dropna().round(4)

In [43]:
# 4. Merge
# Each positional slice removes the leading column (and, for the latest-year
# tables, the trailing one as well), so that only the payload columns are
# joined on the shared UNITID index.
dfls = [
    df_geo.iloc[:, 1:-1],
    df_fee.iloc[:, 1:-1],
    df_admc.iloc[:, 1:-1],
    df_admr,
    df_adms.iloc[:, 1:],
]

# Inner-join the tables one after another on their index, then discard any
# row that still contains a missing value.
final = dfls[0]
for right in dfls[1:]:
    final = final.merge(right, how='inner', left_index=True, right_index=True)
final = final.dropna()

In [45]:
# 5. University ranking
#
# Attach each institution's national rank from the CWUR file by name. Names
# are normalised first (", " and en dashes become "-", then the explicit
# renames from correct_data.json are applied) so they can match INSTNM.

with open("./University_data/correct_data.json") as datafile:
    correct_data = json.load(datafile)
    
rank = pd.read_csv("./University_data/cwurData.csv").rename(columns={'institution':'INSTNM','national_rank':'rank'})
# Keep the first row per institution name, then restrict to US institutions.
rank = rank.drop_duplicates(subset='INSTNM').loc[lambda d: d['country'] == 'USA', ['INSTNM', 'rank']]
# Assign the cleaned names back instead of calling replace(inplace=True) on
# the selected column: a chained in-place call does not update the parent
# frame under pandas Copy-on-Write.
rank = rank.assign(
    INSTNM=rank['INSTNM']
    .replace({", ": "-", '–': '-'}, regex=True)
    .replace(correct_data['rename'])
)

final_rank = pd.merge(final.reset_index(), rank, on='INSTNM', how='left').sort_values('rank')
# Unranked institutions get the sentinel 9999 (same reasoning as above for
# assigning the result rather than filling in place).
final_rank['rank'] = final_rank['rank'].fillna(9999)

# handle some empty address
for school, address in correct_data['addr'].items():
    final_rank.loc[final_rank['INSTNM'] == school, 'ADDR'] = address

In [46]:
# Export data
#
# Join the supplementary sheet onto the ranked table by UNITID and write the
# result to CSV.
sup = pd.read_excel("./University_data/suppliment.xlsx", index_col='UNITID').iloc[:, 2:]
# Fill missing image URLs with the placeholder first, then every other gap
# with the sentinel 9999. The results are assigned back rather than filled
# via sup['image_url'].fillna(..., inplace=True): that chained in-place call
# does not update `sup` under pandas Copy-on-Write, in which case the blanket
# fill would overwrite missing URLs with 9999.
sup = sup.assign(image_url=sup['image_url'].fillna('https://imgur.com/tH33XHm')).fillna(9999)
final_sup = pd.merge(final_rank.set_index('UNITID'), sup, how='left', left_index=True, right_index=True)
final_sup.to_csv('./University_data/ranked_merged_data_sup.csv')