In [1]:
from collections import defaultdict
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Create a data dictionary we can use for reference.

We load `data_dict.csv` to get the meaning and data type of each column, as well as possible value-label pairings.

In [2]:
dd = pd.read_csv("data_dict.csv")
dd = dd.drop(["NOTES", "SOURCE"], axis=1)
# clean up the column names
dd.columns = ["_".join(re.split("-| ", col.lower())) for col in dd.columns]
dd = dd.set_index("variable_name")
# remove all value-label info; we'll deal with that next
data_dict = dd[~pd.isna(dd.index)].drop(["value", "label"], axis=1)

In [3]:
data_dict.loc['UNITID']

name_of_data_element       Unit ID for institution
dev_category                                  root
developer_friendly_name                         id
api_data_type                              integer
Name: UNITID, dtype: object

## Create a mapping of integer factors to their actual meanings

In [4]:
label_dict = defaultdict(dict)
last_col = dd.index[0]
for row in dd.iterrows():
    index = row[0]
    if pd.isna(index):
        index = last_col
    else:
        last_col = index
    if not pd.isna(row[1].value):
        label_dict[index][row[1].value] = row[1].label

In [5]:
label_dict['PREDDEG']

{0.0: 'Not classified',
 1.0: 'Predominantly certificate-degree granting',
 2.0: "Predominantly associate's-degree granting",
 3.0: "Predominantly bachelor's-degree granting",
 4.0: 'Entirely graduate-degree granting'}

## First attept of choosing candidate columns

In [6]:
cols = [
    "UNITID",  # maybe concatenate with year
    "INSTNM",
    "STABBR",
    "ACCREDAGENCY",
    "HCM2",
    "MAIN",
    "NUMBRANCH",
    "PREDDEG",  # filter out 4
    "HIGHDEG",  # filter out 0, 1
    "CONTROL",
    "REGION",  # possibly instead of state
    "LOCALE",  # could backfill
   # "LOCALE2",  # was blank in 2017-18
    "CCBASIC",  # could backfill
    "CCUGPROF",  # could backfill
    "CCSIZSET",  # could backfill
    "HBCU",
    "MENONLY",
    "WOMENONLY",
    "RELAFFIL",  # could bin this
    "ADM_RATE",
    "SATVR25",
    "SATVR75",
    "SATMT25",
    "SATMT75",
    "SATWR25",
    "SATWR75",
    "SATVRMID",
    "SATMTMID",
    "SATWRMID",
    "ACTCM25",
    "ACTCM75",
    "ACTEN25",
    "ACTEN75",
    "ACTMT25",
    "ACTMT75",
    "ACTWR25",
    "ACTWR75",
    "ACTCMMID",
    "ACTENMID",
    "ACTMTMID",
    "ACTWRMID",
    "PCIP01",
    "PCIP03",
    "PCIP04",
    "PCIP05",
    "PCIP09",
    "PCIP10",
    "PCIP11",
    "PCIP12",
    "PCIP13",
    "PCIP14",
    "PCIP15",
    "PCIP16",
    "PCIP19",
    "PCIP22",
    "PCIP23",
    "PCIP24",
    "PCIP25",
    "PCIP26",
    "PCIP27",
    "PCIP29",
    "PCIP30",
    "PCIP31",
    "PCIP38",
    "PCIP39",
    "PCIP40",
    "PCIP41",
    "PCIP42",
    "PCIP43",
    "PCIP44",
    "PCIP45",
    "PCIP46",
    "PCIP47",
    "PCIP48",
    "PCIP49",
    "PCIP50",
    "PCIP51",
    "PCIP52",
    "PCIP54",
    "DISTANCEONLY",
    "UGDS",
    "PPTUG_EF",
    "CURROPER",  # filter
    "COSTT4_A",  # collapse with below
    "COSTT4_P",  # collapse with above
    "NUM4_PUB",  # collapse with below; use as filter
    "NUM4_PRIV",  # collapse with above; use as filter
    "COSTT4_A",  # collapse with below
    "COSTT4_P",  # collapse with above
    "TUITIONFEE_IN",  # use as flag for "in-state is different"?
    "TUITIONFEE_OUT",  # see above
    "TUITIONFEE_PROG",  # see above
    "INEXPFTE",
    "AVGFACSAL",
    "PFTFAC",
    "PCTPELL",
    "C150_4",
    "PCTFLOAN",
    "UG25ABV",
    "COMPL_RPY_1YR_RT",
    "COMPL_RPY_3YR_RT",
    "COMPL_RPY_5YR_RT",
    "COMPL_RPY_7YR_RT",
    "GRAD_DEBT_MDN",  # potential part of target
    "COUNT_NWNE_P6",  # potentially filter? /analysis
    "COUNT_WNE_P6",  # see above
    "MN_EARN_WNE_P6",  # target?
    "MD_EARN_WNE_P6",  # target 2?
    "ICLEVEL",  # potentially redundant
    "PRGMOFR",
]

In [7]:
print(f'{len(cols)} candidate columns')

110 candidate columns


## Make a version of our data dictionary just for the columns we're using

In [8]:
df_cols = pd.DataFrame(cols)
df_cols.columns = ["variable"]
df_cols["category"] = df_cols.variable.apply(lambda x: data_dict.loc[x, "dev_category"])
df_cols["type"] = df_cols.variable.apply(lambda x: data_dict.loc[x, "api_data_type"])
df_cols["friendly_name"] = df_cols.variable.apply(
    lambda x: data_dict.loc[x, "developer_friendly_name"]
)
df_cols["explanation"] = df_cols.variable.apply(
    lambda x: data_dict.loc[x, "name_of_data_element"]
)
df_cols.set_index("variable", inplace=True)
df_cols.head()

Unnamed: 0_level_0,category,type,friendly_name,explanation
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
UNITID,root,integer,id,Unit ID for institution
INSTNM,school,autocomplete,name,Institution name
STABBR,school,string,state,State postcode
ACCREDAGENCY,school,string,accreditor,Accreditor for institution
HCM2,school,integer,under_investigation,Schools that are on Heightened Cash Monitoring...


## Ingest Data

In [9]:
df = pd.read_csv(
    "data/MERGED2017_18_PP.csv", usecols=cols, na_values=["NU", "PrivacySuppressed"]
)

In [35]:
df.CCBASIC.value_counts()

-2.0     2343
 18.0     349
 22.0     331
 24.0     301
 10.0     265
 26.0     264
 21.0     240
 19.0     196
 23.0     157
 17.0     152
 20.0     139
 7.0      139
 16.0     135
 15.0     131
 4.0      123
 1.0      122
 30.0     121
 14.0     121
 2.0      118
 6.0      111
 5.0      106
 9.0      102
 8.0       97
 3.0       82
 29.0      77
 11.0      68
 13.0      66
 25.0      56
 32.0      35
 31.0      35
 33.0      34
 12.0      31
 28.0      14
 27.0       7
Name: CCBASIC, dtype: int64

Drop null columns

In [None]:
old_cols = df.columns
new_cols = df.dropna(axis="columns", how="all").columns
df = df[new_cols].set_index("UNITID")
df_cols = df_cols.loc[df.columns]
label_dict_old = label_dict
label_dict = {
    key: label_dict_old[key]
    for key in (set(df.columns).intersection(set(label_dict_old.keys())))
}
print(f"{len(list(set(old_cols) - set(new_cols)))} columns dropped")

In [None]:
label_dict

In [None]:
top_earners = df.sort_values("MN_EARN_WNE_P6", ascending=False)[
    ["INSTNM", "MN_EARN_WNE_P6"]
].dropna()
top_earners

In [None]:
i = 0
df_cols[i * 10 : i * 10 + 10]

In [None]:
cols_drop_pre = ['INSTNM']
cols_drop_post = ['CCUGPROF']

In [None]:
# drop columns deemed unneeded
X = df.drop(cols_drop_pre, axis=1)
# must have at least 50 undergrads
X = X.query("UGDS >= 50")

In [None]:
X

In [None]:
regions = [v.split('(')[0].rstrip() for v in label_dict['REGION'].values()]

In [None]:
label_dict['REGION']

In [None]:
X["REGION"] = X.REGION.apply(lambda x: label_dict["REGION"][x].split("(")[0])

In [None]:
X["CONTROL"] = X.CONTROL.apply(lambda x: label_dict["CONTROL"][x])

In [None]:
X["MAIN"] = X.MAIN.apply(lambda x: label_dict["MAIN"][x])

In [None]:
X["HIGHDEG"] = X.HIGHDEG.apply(lambda x: label_dict["HIGHDEG"][x].split(" ")[0])

In [None]:
X["PREDDEG"] = X.PREDDEG.apply(
    lambda x: ["not_classified", "certificate", "associate", "bachelor", "grad"][x]
)

In [None]:
X = pd.get_dummies(
    X,
    dummy_na=True,
    columns=[*categorical_keys, "STABBR"],
    drop_first=True,
)

In [None]:
X

In [None]:
list(X.columns)

In [None]:
df.HIGHDEG.value_counts()

In [None]:
df.REGION.value_counts()

In [None]:
df.LOCALE

In [None]:
df = pd.read_csv("MERGED2007_08_PP.csv", na_values=["NU","PrivacySuppressed"])