In [87]:
import pandas as pd
import numpy as np
from functools import reduce

In [88]:
# Build one row per institution (keyed by UNITID) by combining 2019
# attributes with multi-year averages of admission rates and test scores.

# 1. Extract time-invariant attributes from the most recent year (2019)
col_geo = ["UNITID", "INSTNM", "ADDR", "CITY", "STABBR", "LONGITUD", "LATITUDE"]
col_adm = ["UNITID"] + [f"ADMCON{n}" for n in range(1, 10)]
col_fee = ["UNITID", "TUITION2", "TUITION3"]

geo = pd.read_csv("./University_data/hd2019.csv", encoding="ISO-8859-1").loc[:, col_geo]
adm = pd.read_csv("./University_data/adm2019.csv").loc[:, col_adm]
# "." entries in the fee file are treated as missing; rows with any missing
# fee are dropped so the remaining values can be cast to int.
fee = (
    pd.read_csv("./University_data/ic2019_ay.csv")
    .loc[:, col_fee]
    .replace(".", np.nan)
    .dropna()
    .astype(int)
)


def _read_adm(year):
    """Read one year's admissions file and strip whitespace from its column names."""
    frame = pd.read_csv(f"./University_data/adm{year}.csv")
    frame.columns = [col.strip() for col in frame.columns]
    return frame


# Read every yearly admissions file exactly once (2014-2019 inclusive);
# steps 2 and 3 both select their columns from these frames.
adm_by_year = {year: _read_adm(year) for year in range(2014, 2020)}

# 2. Calculate gender-specific admission rates, averaged over 2014-2019
col_gen = ["UNITID", "APPLCNM", "ADMSSNM", "APPLCNW", "ADMSSNW"]
gen_frames = []
for year, adm_year in adm_by_year.items():
    # .copy() so the column assignments below never write into a view.
    gen = adm_year.loc[:, col_gen].copy()
    gen["YEAR"] = year
    gen["ADMRM"] = gen["ADMSSNM"] / gen["APPLCNM"]
    gen["ADMRW"] = gen["ADMSSNW"] / gen["APPLCNW"]
    gen_frames.append(gen)

# Concatenate once instead of DataFrame.append in a loop (removed in pandas 2.0).
adm_gender = pd.concat(gen_frames)
# A non-zero admit count over zero applicants yields +/-inf, which dropna()
# would not remove and which would make the per-school mean infinite;
# treat those years as missing instead.
adm_gender[["ADMRM", "ADMRW"]] = adm_gender[["ADMRM", "ADMRW"]].replace(
    [np.inf, -np.inf], np.nan
)
# Mean over available years; schools with no usable rate for either gender are dropped.
adm_gender = adm_gender.groupby("UNITID")[["ADMRM", "ADMRW"]].mean().dropna()

# 3. Calculate SAT/ACT scores, averaged over 2014-2019
col_sco = ["UNITID", "SATVR25", "SATVR75", "SATMT25", "SATMT75",
           "ACTCM25", "ACTCM75", "ACTEN25", "ACTEN75", "ACTMT25", "ACTMT75"]
adm_sco = pd.concat([adm_year.loc[:, col_sco] for adm_year in adm_by_year.values()])
# Schools missing any of the score columns in every year are dropped.
adm_sco = adm_sco.groupby("UNITID").mean().dropna()

# 4. Merge: successive inner joins on UNITID keep only schools present in every table
df = reduce(lambda left, right: pd.merge(left, right, how="inner", on="UNITID"),
            [geo, adm, fee, adm_gender, adm_sco])

In [89]:
# List the schools whose TUITION3 exceeds 50000, keeping one row per institution name
high_tuition = df["TUITION3"] > 50000
df.loc[high_tuition].drop_duplicates(subset="INSTNM")

Unnamed: 0,UNITID,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,ADMCON1,ADMCON2,ADMCON3,...,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,ACTEN25,ACTEN75,ACTMT25,ACTMT75
52,110404,California Institute of Technology,1200 E California Blvd,Pasadena,CA,-118.125878,34.137349,2,2,1,...,738.333333,788.333333,778.333333,800.000000,34.333333,35.500000,34.333333,35.666667,34.833333,36.000000
72,111948,Chapman University,One University Dr.,Orange,CA,-117.852518,33.793020,1,2,1,...,575.000000,661.666667,573.333333,670.000000,25.000000,30.000000,25.166667,32.333333,24.000000,28.833333
76,112260,Claremont McKenna College,500 E 9th St,Claremont,CA,-117.711188,34.102154,2,2,1,...,663.333333,740.000000,680.000000,770.000000,30.333333,33.500000,30.666667,34.333333,28.500000,33.666667
79,115409,Harvey Mudd College,301 Platt Blvd.,Claremont,CA,-117.709837,34.106515,5,5,1,...,696.333333,768.333333,751.666667,800.000000,32.833333,35.000000,33.166667,35.500000,33.166667,35.166667
90,120254,Occidental College,1600 Campus Rd,Los Angeles,CA,-118.210908,34.127124,5,5,1,...,623.333333,706.666667,620.000000,720.000000,27.666667,31.500000,28.833333,34.000000,26.000000,30.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,237057,Whitman College,345 Boyer Ave,Walla Walla,WA,-118.331546,46.069815,5,5,1,...,601.666667,711.666667,600.000000,713.333333,27.666667,32.166667,28.000000,35.000000,25.000000,31.000000
1134,238333,Beloit College,700 College St,Beloit,WI,-89.029285,42.504022,1,5,1,...,546.666667,676.666667,541.666667,693.333333,23.000000,29.666667,23.333333,31.333333,21.333333,27.666667
1157,243744,Stanford University,,Stanford,CA,-122.167359,37.429434,2,3,1,...,690.000000,773.333333,710.000000,795.000000,31.500000,34.833333,32.833333,35.500000,30.000000,35.000000
1178,441982,Franklin W Olin College of Engineering,1000 Olin Way,Needham,MA,-71.263378,42.293527,1,2,1,...,701.666667,771.666667,735.000000,796.666667,32.833333,34.833333,33.000000,35.000000,32.500000,35.000000


In [90]:
# Look up the merged row(s) for a single school by exact institution name
school_name = "University of Pennsylvania"
df.loc[df["INSTNM"] == school_name]

Unnamed: 0,UNITID,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,ADMCON1,ADMCON2,ADMCON3,...,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,ACTEN25,ACTEN75,ACTMT25,ACTMT75
889,215062,University of Pennsylvania,34th & Spruce Street,Philadelphia,PA,-75.19391,39.950929,1,3,1,...,686.666667,765.0,715.0,791.666667,31.833333,34.666667,32.833333,35.333333,30.5,34.833333


In [91]:
# Write the merged table to CSV (index is left at its default, so the row index is written too)
merged_path = "./University_data/merged_data.csv"
df.to_csv(merged_path)