In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [30]:
# Input CSV, resolved relative to the notebook's working directory.
path = "data/MERGED2015_16_PP.csv"

# low_memory=False makes pandas parse the file in one pass instead of in
# internal chunks, which avoids per-chunk dtype guessing (mixed-type columns)
# at the cost of higher peak memory.
data = pd.read_csv(
    filepath_or_buffer=path,
    low_memory=False,
)

# Data Preprocessing

### Listing out columns of importance

In [31]:
# Columns to keep from the raw file, in the order they will appear in `df`.
# The list has two parts:
#   1. institution-level fields, written out explicitly;
#   2. per-CIP-code fields, generated from a single list of codes so that the
#      "CIP<code>BACHL" names (all of them first) and the "PCIP<code>" names
#      (all of them second) always cover exactly the same codes.
cols = [
    "OPEID", "UNITID", "INSTNM", "CITY", "STABBR", "ZIP",
    "CURROPER", "MAIN", "PREDDEG", "HIGHDEG",
    "CONTROL", "RELAFFIL", "DISTANCEONLY",
    "ADM_RATE", "SAT_AVG", "ACTCMMID",
    "UGDS", "UGDS_WHITE", "UGDS_BLACK", "UGDS_HISP", "UGDS_ASIAN",
    "UGDS_AIAN", "UGDS_NHPI", "UGDS_2MOR", "UGDS_NRA", "UGDS_UNKN",
    "HBCU", "PBI", "ANNHI", "TRIBAL", "HSI", "NANTI",
    "MENONLY", "WOMENONLY",
    "PPTUG_EF", "UG25ABV",
    "INC_PCT_LO", "INC_PCT_M1", "INC_PCT_M2", "INC_PCT_H1", "INC_PCT_H2",
    "PAR_ED_PCT_1STGEN",
    "NPT4_PUB", "NPT4_PRIV",
    "COSTT4_A", "TUITIONFEE_IN", "TUITIONFEE_OUT",
    "NPT41_PUB", "NPT42_PUB", "NPT43_PUB", "NPT44_PUB", "NPT45_PUB",
    "NPT41_PRIV", "NPT42_PRIV", "NPT43_PRIV", "NPT44_PRIV", "NPT45_PRIV",
    "PCTFLOAN", "PCTPELL",
    "GRAD_DEBT_MDN", "WDRAW_DEBT_MDN", "GRAD_DEBT_MDN10YR",
    "CDR3", "RPY_3YR_RT", "RPY_5YR_RT", "RPY_7YR_RT",
    "C150_4", "D150_4",
] + [
    name_pattern.format(cip_code)
    for name_pattern in ("CIP{}BACHL", "PCIP{}")
    for cip_code in (
        "01 03 04 05 09 10 11 12 13 14 15 16 19 22 23 24 25 26 27 29 "
        "30 31 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 54"
    ).split()
]

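### Filtering data to currently operating institutions (`CURROPER == 1`) with predominantly undergraduate programs (`PREDDEG == 3`), replacing `PrivacySuppressed` placeholders with missing values, and converting the affected columns to numeric types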

In [32]:
# Restrict the raw frame to the columns of interest, then keep only the rows
# with PREDDEG == 3 and CURROPER == 1.
df = data[cols]
df = df.loc[(df['PREDDEG'] == 3) & (df['CURROPER'] == 1)]

# Both filter columns are constant after the filter, so they are dropped.
# drop() is used without inplace=True so that a fresh frame is returned instead
# of mutating a slice of `data` (which raises SettingWithCopyWarning).
df = df.drop(["PREDDEG", "CURROPER"], axis=1)

# Turn the 'PrivacySuppressed' placeholder into a real missing value.
# np.nan is used as the replacement rather than None because
# replace(<scalar>, None) has had version-dependent behaviour in pandas.
df = df.replace('PrivacySuppressed', np.nan)

# Columns that must be numeric for the fills below.
# NOTE(review): presumably these were read as strings because they contained
# the placeholder text -- confirm against the raw file.
# pd.to_numeric raises if any other non-numeric text is left in them.
wrong_data_type_cols = ["INC_PCT_LO", "INC_PCT_M1", "INC_PCT_M2", "INC_PCT_H1", "INC_PCT_H2",
                        "PAR_ED_PCT_1STGEN", "GRAD_DEBT_MDN", "WDRAW_DEBT_MDN",
                        "GRAD_DEBT_MDN10YR", "RPY_3YR_RT", "RPY_5YR_RT", "RPY_7YR_RT"]
df[wrong_data_type_cols] = df[wrong_data_type_cols].apply(pd.to_numeric)

## Filling NaN values

In [33]:
def fill_nan(col):
    """Fill missing values in ``df[col]`` with that column's median.

    Operates on the notebook-level DataFrame ``df``. Columns whose dtype is not
    numeric are left untouched, since a median cannot be computed for them.

    Parameters
    ----------
    col : label
        Name of the column in ``df`` to fill.

    Returns
    -------
    None
        ``df`` is updated by assigning the filled column back to it.
    """
    if pd.api.types.is_numeric_dtype(df[col]):
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col]: the in-place form acts on an intermediate Series, which
        # pandas warns about and which does not update df under copy-on-write.
        df[col] = df[col].fillna(df[col].median())

### Filling in the SAT_AVGs from merged Scorecard.csv

In [34]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load "final_sat.p" if it was produced by a trusted step of this project.
# The context manager guarantees the file handle is closed after loading.
with open("final_sat.p", "rb") as sat_file:
    sat_scores = pickle.load(sat_file)

# Keep only the entries whose key contains no alphabetic characters and re-key
# them by the integer value of that key.
# Assumes the keys are strings (they are iterated character by character).
final_sat = {
    int(key): sat_scores[key]
    for key in sat_scores
    if not any(c.isalpha() for c in key)
}

In [35]:
# Overwrite SAT_AVG for every row whose OPEID has an entry in final_sat.
# This is done in one vectorised pass (map + isin) instead of one full-frame
# boolean scan per dictionary key.
sat_by_opeid = df['OPEID'].map(final_sat)
has_score = df['OPEID'].isin(list(final_sat))

# Rows without an entry keep their existing SAT_AVG. .values assigns by
# position within the masked rows, so index alignment plays no part.
df.loc[has_score, 'SAT_AVG'] = sat_by_opeid[has_score].values

### Merging the financial data columns for Public and Private Institutions and updating column names

In [36]:
# The two lists are parallel: position k in cols_pub pairs with position k in
# cols_priv.
cols_pub = ["NPT4_PUB", "NPT41_PUB", "NPT42_PUB", "NPT43_PUB", "NPT44_PUB", "NPT45_PUB"]
cols_priv = ["NPT4_PRIV", "NPT41_PRIV", "NPT42_PRIV", "NPT43_PRIV", "NPT44_PRIV", "NPT45_PRIV"]

# Wherever a *_PUB value is missing, take the value from its *_PRIV counterpart.
for pub_col, priv_col in zip(cols_pub, cols_priv):
    df[pub_col] = df[pub_col].fillna(df[priv_col])

# The merged columns lose their "_PUB" suffix (e.g. "NPT41_PUB" -> "NPT41"),
# and the *_PRIV columns, now folded in, are removed.
df = df.rename(columns={name: name[:-len("_PUB")] for name in cols_pub})
df = df.drop(cols_priv, axis=1)

### Filling NaN values for all financial data columns with mean values based on CONTROL type

In [37]:
# Financial columns whose missing values are filled per CONTROL group.
financial_cols = ['COSTT4_A', 'TUITIONFEE_IN', 'TUITIONFEE_OUT',
                  'NPT4', 'NPT41', 'NPT42', 'NPT43', 'NPT44', 'NPT45']

for control in range(1, 4):
    # Row mask for this CONTROL value, computed once and reused for every column.
    in_group = df['CONTROL'] == control
    for col in financial_cols:
        group_mean = df.loc[in_group, col].mean()
        # The mean is NaN when the group has no rows or no observed values in
        # this column; int(NaN) would raise ValueError, so leave it unfilled.
        if pd.isnull(group_mean):
            continue
        # int() truncates the mean toward zero, so the fill value is a whole
        # number.
        df.loc[in_group, col] = df.loc[in_group, col].fillna(int(group_mean))

### Filling the remaining NaN values with median values

In [38]:
# Missing religious affiliation is recorded as the sentinel -1 ("Not Reported").
# This runs before the generic fill below, so RELAFFIL has no NaN left for
# that fill to replace.
relaffil = df['RELAFFIL']
df['RELAFFIL'] = relaffil.fillna(value=-1)

# Apply fill_nan to every column of the frame, one column at a time.
for column_name in df.columns:
    fill_nan(column_name)

### Cleaning CIP columns

In [39]:
# Names of the per-CIP-code "CIP<code>BACHL" columns, built from the list of
# two-digit codes in the order they appear in the frame.
cip_columns = [
    "CIP{}BACHL".format(cip_code)
    for cip_code in (
        "01 03 04 05 09 10 11 12 13 14 15 16 19 22 23 24 25 26 27 29 "
        "30 31 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 54"
    ).split()
]

In [41]:
# Cap every CIP column at 1; values at or below 1 (and NaN) are unchanged.
# clip(upper=1) replaces Series.clip_upper, which was deprecated in
# pandas 0.24 and removed in pandas 1.0.
df[cip_columns] = df[cip_columns].clip(upper=1)

### Saving the cleaned data to a new file

In [31]:
# Write the cleaned frame to disk as UTF-8 CSV; index=False leaves the row
# index out of the file.
df.to_csv(
    path_or_buf="cleaned_data.csv",
    index=False,
    encoding='utf-8',
)