In [3]:
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# 1. Extract time-invariant from the most recent year
col_geo = ["UNITID", "INSTNM", "ADDR", "CITY", "STABBR", "LONGITUD", "LATITUDE"]
col_adm = ["UNITID"] + [f"ADMCON{n}" for n in range(1, 10)]
col_fee = ["UNITID", "TUITION2", "TUITION3"]

geo = pd.read_csv("./University_data/hd2019.csv", encoding="ISO-8859-1").loc[:, col_geo]
adm = pd.read_csv("./University_data/adm2019.csv").loc[:, col_adm]
fee = pd.read_csv("./University_data/ic2019_ay.csv").loc[:, col_fee].replace(".", np.nan).dropna().astype(int)

# 2. Calculate gender-specific admission rates in the past 10 years
col_gen = ["UNITID", "APPLCNM", "ADMSSNM", "APPLCNW", "ADMSSNW"]
adm_gender = pd.DataFrame()
for i in range(2014, 2020):
    gen = pd.read_csv(f"./University_data/adm{i}.csv").loc[:, col_gen]
    gen["ADMRM"] = gen["ADMSSNM"] / gen["APPLCNM"]
    gen["ADMRW"] = gen["ADMSSNW"] / gen["APPLCNW"]
    adm_gender = adm_gender.append(gen)
    
adm_gender = adm_gender.groupby("UNITID")[["ADMRM", "ADMRW"]].mean().dropna().round(4)

# 3. Calculate SAT/ACT scores
col_sco = ["UNITID", "SATVR25", "SATVR75", "SATMT25", "SATMT75", 
           "ACTCM25", "ACTCM75", "ACTEN25", "ACTEN75", "ACTMT25", "ACTMT75"]
adm_sco = pd.DataFrame()
for i in range(2014, 2020):
    sco = pd.read_csv(f"./University_data/adm{i}.csv")
    sco.columns = [col.strip() for col in sco.columns]
    sco = sco.loc[:, col_sco]
    adm_sco = adm_sco.append(sco)
    
adm_sco = adm_sco.groupby("UNITID").mean().dropna().round(4)

# 4. Merge
df = reduce(lambda left, right: pd.merge(left, right, how="inner", on="UNITID"), 
            [geo, adm, fee, adm_gender, adm_sco]).set_index("UNITID")

In [5]:
# Check the school with annual tuition more than 50000
df[df["TUITION3"] > 50000].drop_duplicates(subset="INSTNM")

Unnamed: 0_level_0,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,ADMCON1,ADMCON2,ADMCON3,ADMCON4,...,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,ACTEN25,ACTEN75,ACTMT25,ACTMT75
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110404,California Institute of Technology,1200 E California Blvd,Pasadena,CA,-118.125878,34.137349,2,2,1,1,...,738.3333,788.3333,778.3333,800.0000,34.3333,35.5000,34.3333,35.6667,34.8333,36.0000
111948,Chapman University,One University Dr.,Orange,CA,-117.852518,33.793020,1,2,1,1,...,575.0000,661.6667,573.3333,670.0000,25.0000,30.0000,25.1667,32.3333,24.0000,28.8333
112260,Claremont McKenna College,500 E 9th St,Claremont,CA,-117.711188,34.102154,2,2,1,1,...,663.3333,740.0000,680.0000,770.0000,30.3333,33.5000,30.6667,34.3333,28.5000,33.6667
115409,Harvey Mudd College,301 Platt Blvd.,Claremont,CA,-117.709837,34.106515,5,5,1,1,...,696.3333,768.3333,751.6667,800.0000,32.8333,35.0000,33.1667,35.5000,33.1667,35.1667
120254,Occidental College,1600 Campus Rd,Los Angeles,CA,-118.210908,34.127124,5,5,1,2,...,623.3333,706.6667,620.0000,720.0000,27.6667,31.5000,28.8333,34.0000,26.0000,30.8333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237057,Whitman College,345 Boyer Ave,Walla Walla,WA,-118.331546,46.069815,5,5,1,2,...,601.6667,711.6667,600.0000,713.3333,27.6667,32.1667,28.0000,35.0000,25.0000,31.0000
238333,Beloit College,700 College St,Beloit,WI,-89.029285,42.504022,1,5,1,2,...,546.6667,676.6667,541.6667,693.3333,23.0000,29.6667,23.3333,31.3333,21.3333,27.6667
243744,Stanford University,,Stanford,CA,-122.167359,37.429434,2,3,1,2,...,690.0000,773.3333,710.0000,795.0000,31.5000,34.8333,32.8333,35.5000,30.0000,35.0000
441982,Franklin W Olin College of Engineering,1000 Olin Way,Needham,MA,-71.263378,42.293527,1,2,1,1,...,701.6667,771.6667,735.0000,796.6667,32.8333,34.8333,33.0000,35.0000,32.5000,35.0000


In [6]:
# Check info by specific schools
df[df["INSTNM"] == "University of Pennsylvania"]

Unnamed: 0_level_0,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,ADMCON1,ADMCON2,ADMCON3,ADMCON4,...,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,ACTEN25,ACTEN75,ACTMT25,ACTMT75
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
215062,University of Pennsylvania,34th & Spruce Street,Philadelphia,PA,-75.19391,39.950929,1,3,1,2,...,686.6667,765.0,715.0,791.6667,31.8333,34.6667,32.8333,35.3333,30.5,34.8333


In [7]:
# Export data
df.to_csv("./University_data/merged_data.csv")

In [8]:
rank = pd.read_csv("cwurData.csv")
merged = df

In [10]:
# data wrangling 
rank = rank[rank['country'] == 'USA']
rank = rank[['institution','national_rank']]
rank = rank.drop_duplicates(subset='institution')

In [11]:
full = pd.merge(merged, rank, left_on='INSTNM', right_on='institution', how='left')

In [12]:
# drop duplicate column & rearrange
full = full.drop(columns=['institution'])
full = full[ ['national_rank'] + [ col for col in full.columns if col != 'national_rank' ] ]

In [13]:
# sort by rank
full = full.sort_values(by='national_rank').reset_index(drop=True)
full

Unnamed: 0,national_rank,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,ADMCON1,ADMCON2,ADMCON3,...,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,ACTEN25,ACTEN75,ACTMT25,ACTMT75
0,1.0,Harvard University,Massachusetts Hall,Cambridge,MA,-71.118313,42.374471,2,2,1,...,711.6667,790.0000,725.0000,800.0000,32.3333,35.0000,33.5000,35.5000,31.1667,35.1667
1,2.0,Massachusetts Institute of Technology,77 Massachusetts Avenue,Cambridge,MA,-71.093226,42.359243,2,3,1,...,705.0000,775.0000,763.3333,800.0000,33.3333,35.3333,33.6667,35.5000,34.0000,36.0000
2,3.0,Stanford University,,Stanford,CA,-122.167359,37.429434,2,3,1,...,690.0000,773.3333,710.0000,795.0000,31.5000,34.8333,32.8333,35.5000,30.0000,35.0000
3,4.0,California Institute of Technology,1200 E California Blvd,Pasadena,CA,-118.125878,34.137349,2,2,1,...,738.3333,788.3333,778.3333,800.0000,34.3333,35.5000,34.3333,35.6667,34.8333,36.0000
4,5.0,Princeton University,1 Nassau Hall,Princeton,NJ,-74.659365,40.348732,2,2,1,...,700.0000,783.3333,720.0000,798.3333,31.8333,35.0000,33.0000,35.3333,30.5000,35.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,,Emory University-Oxford College,801 Emory Street,Oxford,GA,-83.871141,33.619100,2,5,1,...,640.0000,713.0000,658.0000,776.0000,28.8000,32.6000,28.0000,33.0000,27.0000,33.0000
1210,,Husson University,1 College Circle,Bangor,ME,-68.791476,44.829002,1,2,1,...,456.0000,556.0000,456.0000,554.0000,18.2000,23.2000,16.8000,23.0000,17.0000,23.6000
1211,,University of Saint Katherine,1637 Capalina Road,San Marcos,CA,-117.196552,33.151283,1,5,1,...,448.0000,545.0000,448.0000,563.0000,17.0000,18.0000,15.0000,19.0000,17.0000,20.0000
1212,,Piedmont International University,420 S. Broad St.,Winston-Salem,NC,-80.250153,36.087962,1,3,3,...,402.5000,538.7500,392.5000,526.2500,15.5000,19.5000,14.0000,19.5000,15.7500,19.0000


In [14]:
full[full['ADDR'] == " "]['INSTNM'].drop_duplicates().tolist()

['Stanford University',
 'Michigan State University',
 'Baylor University',
 'Auburn University',
 'University of Maryland-College Park',
 'Hiram College',
 'Abilene Christian University']

In [15]:
# handle some empty address
full.loc[full['INSTNM'] == 'Stanford University','ADDR'] = "450 Serra Mall"
full.loc[full['INSTNM'] == 'University of Florida','ADDR'] = "Tigert Hall"
full.loc[full['INSTNM'] == 'Michigan State University','ADDR'] = "220 Trowbridge Rd"
full.loc[full['INSTNM'] == 'Baylor University','ADDR'] = "500 Speight Ave"
full.loc[full['INSTNM'] == 'Auburn University','ADDR']= "1161 West Samford Avenue"
full.loc[full['INSTNM'] == 'University of Maryland-College Park','ADDR'] = "College Park"
full.loc[full['INSTNM'] == 'Hiram College','ADDR'] = "11715 Garfield Road"
full.loc[full['INSTNM'] == 'Abilene Christian University','ADDR'] = "1600 Campus Ct"

In [16]:
# Export data
full.to_csv('ranked_merged_data.csv')