In [462]:
import json
import pandas as pd

In [463]:
# Before you run this cell, unzip the data in the data-raw directory.
#
# The strings 'NULL' and 'PrivacySuppressed' are treated as missing values
# (NaN) while the CSV is parsed.

fname = '../data-raw/Most+Recent+Cohorts+(All+Data+Elements).csv'
df = pd.read_csv(fname, na_values=['NULL', 'PrivacySuppressed'])

# Quick sanity check: (rows, columns) followed by the first record.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df.shape)
print(df.iloc[0])

(7804, 1728)
UNITID                                                     100654
OPEID                                                      100200
opeid6                                                       1002
INSTNM                                   Alabama A & M University
CITY                                                       Normal
STABBR                                                         AL
ZIP                                                         35762
AccredAgency    Southern Association of Colleges and Schools C...
INSTURL                                             www.aamu.edu/
NPCURL             galileo.aamu.edu/netpricecalculator/npcalc.htm
HCM2                                                            0
main                                                            1
NUMBRANCH                                                       1
PREDDEG                                                         3
HIGHDEG                                                        

In [464]:
# Narrow the full dataset down to the schools of interest. The number of
# remaining rows is printed after every step so each filter's effect is visible.
#
#   1. Predominantly bachelor's degree granting (PREDDEG == 3)
#   2. No Special Focus Institutions: CCBASIC must be present and must not be
#      one of the codes 24, 25, 26
#   3. Name does not contain 'maritime' or 'marine ' (case-insensitive)
#   4. 4-year schools, all sizes (CCSIZSET >= 6); rows with a missing CCSIZSET
#      are kept as well

dg = df.copy()
print(len(dg))

dg = dg[dg.PREDDEG == 3]
print(len(dg))

dg = dg[(~(dg.CCBASIC.isin([24, 25, 26]))) & (dg.CCBASIC.notnull())]
print(len(dg))

dg = dg[~dg.INSTNM.str.lower().str.contains('maritime|marine ')]
print(len(dg))

dg = dg[(dg.CCSIZSET >= 6) | (pd.isnull(dg.CCSIZSET))]
print(len(dg))

7804
2133
1786
1781
1775


In [465]:
# % Who Graduate In 6 years = C150_4_POOLED
# % Of Students Who Are First Generation College Students = PAR_ED_PCT_1STGEN
# % Of Students Receiving Federal Loans = PCTFLOAN
# % Of Students Receiving Pell Grants = PCTPELL

# Default Rate = CDR3

# Average Wage, 6 Years After Entry = md_earn_wne_p6 (median), mn_earn_wne_p6 (mean), sd_earn_wne_p6 (sd)
# Median Wage, 10 years After Entry = md_earn_wne_p10 (median), mn_earn_wne_p10 (mean), sd_earn_wne_p10 (sd)

# Average Net Price = COALESCE(NPT4_PUB, NPT4_PRIV)
# Net Price For Students Whose Families Earn less than $48,000 = COALESCE(NPT4_048_PUB, NPT4_048_PRIV)

In [466]:
# Identification columns plus the raw metrics used downstream.
cols_to_keep = [
    'UNITID', 'INSTNM', 'CITY', 'STABBR', 'ZIP', 'INSTURL',
    'C150_4_POOLED', 'PAR_ED_PCT_1STGEN', 'PCTFLOAN', 'PCTPELL', 'CDR3',
    'md_earn_wne_p6', 'mn_earn_wne_p6', 'sd_earn_wne_p6',
    'md_earn_wne_p10', 'mn_earn_wne_p10', 'sd_earn_wne_p10',
    'NPT4_PUB', 'NPT4_PRIV', 'NPT4_048_PUB', 'NPT4_048_PRIV',
]

dh = dg[cols_to_keep].copy()

# lowercase all column names
dh.columns = [c.lower() for c in dh.columns]

# Net price comes in separate public / private columns; merge each pair into
# one column as COALESCE(pub, priv): take the public value and fall back to the
# private one where the public value is missing.
# combine_first is used instead of sum(axis=1) because on pandas >= 0.22 the sum
# of an all-NaN row is 0 rather than NaN, which would record a fake $0 net price
# for schools with no data, and a sum would add the values if both were present.
dh["npt4_pub_priv"] = dh["npt4_pub"].combine_first(dh["npt4_priv"])
dh["npt4_048_pub_priv"] = dh["npt4_048_pub"].combine_first(dh["npt4_048_priv"])

# The four source columns are no longer needed once merged.
dh = dh.drop(['npt4_pub', 'npt4_priv', 'npt4_048_pub', 'npt4_048_priv'], axis=1)

# convert CDR3 (default rate) to a rate (between 0 and 1)
dh['cdr3'] = dh['cdr3'] / 100.0

# Display the first record to eyeball the result.
dh.iloc[0]

unitid                                 100654
instnm               Alabama A & M University
city                                   Normal
stabbr                                     AL
zip                                     35762
insturl                         www.aamu.edu/
c150_4_pooled                       0.3087183
par_ed_pct_1stgen                   0.3899018
pctfloan                               0.8204
pctpell                                0.7115
cdr3                                    0.163
md_earn_wne_p6                          22800
mn_earn_wne_p6                          26100
sd_earn_wne_p6                          21100
md_earn_wne_p10                         31400
mn_earn_wne_p10                         35300
sd_earn_wne_p10                         27800
npt4_pub_priv                           13415
npt4_048_pub_priv                       12807
Name: 0, dtype: object

In [467]:
# summary stats of numeric cols

# Each entry is (metric_id, higher_is_better). The flag decides the sign of the
# z-score computed for that metric later on: False means a lower raw value is
# treated as better.
numeric_cols = [
    ('c150_4_pooled', True),
    ('par_ed_pct_1stgen', True),
    ('pctfloan', True),
    ('pctpell', True),
    ('cdr3', False),
    ('md_earn_wne_p6', True),
    ('mn_earn_wne_p6', True),
    ('sd_earn_wne_p6', True),
    ('md_earn_wne_p10', True),
    ('mn_earn_wne_p10', True),
    ('sd_earn_wne_p10', True),
    ('npt4_pub_priv', False),
    ('npt4_048_pub_priv', False),
]

numeric_cols_names = [col[0] for col in numeric_cols]

# get summary stats, limited to the rows count, mean, and standard deviation
dh_ss = dh[numeric_cols_names].describe().loc['count':'std']

# Round-trip through JSON to get a plain nested dict:
#   {column_name: {'count': ..., 'mean': ..., 'std': ...}}
stats = json.loads(dh_ss.to_json())

# print() with a single argument behaves the same on Python 2 and Python 3.
print(stats)

{u'par_ed_pct_1stgen': {u'count': 1718.0, u'std': 0.114615799, u'mean': 0.3539928224}, u'sd_earn_wne_p6': {u'count': 1725.0, u'std': 9273.1625063746, u'mean': 25200.231884058}, u'c150_4_pooled': {u'count': 1715.0, u'std': 0.2001952273, u'mean': 0.5047869299}, u'cdr3': {u'count': 1769.0, u'std': 0.0629652896, u'mean': 0.0920864895}, u'mn_earn_wne_p10': {u'count': 1705.0, u'std': 12631.635453404, u'mean': 46822.7565982405}, u'md_earn_wne_p6': {u'count': 1725.0, u'std': 7969.3617048492, u'mean': 32724.0579710145}, u'mn_earn_wne_p6': {u'count': 1725.0, u'std': 9194.0706327188, u'mean': 36009.2753623188}, u'pctpell': {u'count': 1774.0, u'std': 0.1773811107, u'mean': 0.4050651635}, u'md_earn_wne_p10': {u'count': 1705.0, u'std': 10097.5280988055, u'mean': 41775.1319648094}, u'pctfloan': {u'count': 1774.0, u'std': 0.1804809532, u'mean': 0.6068392897}, u'npt4_048_pub_priv': {u'count': 1726.0, u'std': 6446.2170752131, u'mean': 15797.6477404403}, u'npt4_pub_priv': {u'count': 1726.0, u'std': 7084.

In [468]:
# add normalized z-score for each numeric col

def _z_score(value, mean, std, sign):
    """Return the signed z-score of `value`, or 0 when `value` is missing."""
    if pd.isnull(value):
        return 0
    return (value - mean) / std * sign


for metric, higher_is_better in numeric_cols:
    # if lower val is better than high (i.e., default rate), the z-score is
    # multiplied by -1 so that a larger z-score always means "better"
    sign = 1 if higher_is_better else -1
    metric_stats = stats[metric]
    dh['{}_z'.format(metric)] = dh[metric].apply(
        _z_score,
        args=(metric_stats['mean'], metric_stats['std'], sign)
    )

# Display the first record, now including the new *_z columns.
dh.iloc[0]

unitid                                   100654
instnm                 Alabama A & M University
city                                     Normal
stabbr                                       AL
zip                                       35762
insturl                           www.aamu.edu/
c150_4_pooled                         0.3087183
par_ed_pct_1stgen                     0.3899018
pctfloan                                 0.8204
pctpell                                  0.7115
cdr3                                      0.163
md_earn_wne_p6                            22800
mn_earn_wne_p6                            26100
sd_earn_wne_p6                            21100
md_earn_wne_p10                           31400
mn_earn_wne_p10                           35300
sd_earn_wne_p10                           27800
npt4_pub_priv                             13415
npt4_048_pub_priv                         12807
c150_4_pooled_z                      -0.9793871
par_ed_pct_1stgen_z                   0.

In [469]:
# output to json: the file holds a JSON array with one object per row of dh

output_path = '../data-clean/college-data.json'
dh.to_json(output_path, orient='records')