# College Scorecard EDA
## Set Up & Loading

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date

In [2]:
# file path
dataPath = "./"

# file name format MERGEDYYYY_yy_PP.csv

# create a list of tuples with years for filename
# example [('1996', '97'), ('1997', '98'), ...]
yrs = [(date(year = x, month = 1, day = 1).strftime("%Y"),
       date(year = x+1, month = 1, day = 1).strftime("%y")) 
       for x in range(2010,2020)]

# df_dict will map each year to it's dataframe
    # key - 2 digit year
    # value - dataframe for year
    # example: df_dict['05'] returns the dataframe for 2004-05
df_dict = {}

# initialize memory counter
total_memory = 0

for y in yrs:
    start_yr = y[0]
    end_yr = y[1]
    
    # read in dataframe
    df = pd.read_csv(f"MERGED{start_yr}_{end_yr}_PP.csv", low_memory=False)
    
    # add to dictionary
    df_dict.update({end_yr : df})
    
    #sum memory
    total_memory += df.memory_usage(deep = True, index = True).sum()
    
    
print(f"Total Memory Usage: {total_memory/1000000000} GB")

Total Memory Usage: 7.156509104 GB


In [3]:
# check that each dataframe was added to df_dict
df_dict.keys()

dict_keys(['11', '12', '13', '14', '15', '16', '17', '18', '19', '20'])

---
We have added each dataframe to a dictionary, indexable by year (11-20)

Create union set and intersection set of institutions in each dataframe. This is potentially challeninging if institution names differe between dataframes.



In [4]:
'''
inst_union is the set of institutions that ever appear in the dataset.
inst_intersection is the set of institutions that appear every year in the dataset.
'''

inst_union = set()
inst_intersection = set()

for k, df in df_dict.items():
    inst_set = set(df['UNITID'])
    if k == '11':
        inst_union = set(inst_set)
        inst_intersection = set(inst_set)
    else:
        inst_union = inst_union.union(inst_set)
        inst_intersection = inst_intersection.intersection(inst_set)
        
print(f"Total number of institutions in dataset {len(inst_union)}")
print(f"Number of institutions present every year {len(inst_intersection)}")

Total number of institutions in dataset 9390
Number of institutions present every year 5337


In [5]:
for key in df_dict.keys():
    df_dict[key].drop(df_dict[key][~df_dict[key]['UNITID'].isin(inst_intersection)].index, inplace = True)

#for key in df_dict.keys():
#    print(key)
#    print(len(df_dict[key]['MAIN']))

In [6]:
df_dict.keys()

dict_keys(['11', '12', '13', '14', '15', '16', '17', '18', '19', '20'])

In [7]:
grouped_df = df_dict['11'].groupby('INSTNM').count().sort_values(by = ['REGION'], ascending=False)
grouped_df[grouped_df['REGION'] > 1]

Unnamed: 0_level_0,UNITID,OPEID,OPEID6,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,SCH_DEG,...,COUNT_WNE_MALE1_P8,MD_EARN_WNE_MALE1_P8,GT_THRESHOLD_P10,MD_EARN_WNE_INC1_P10,MD_EARN_WNE_INC2_P10,MD_EARN_WNE_INC3_P10,MD_EARN_WNE_INDEP1_P10,MD_EARN_WNE_INDEP0_P10,MD_EARN_WNE_MALE0_P10,MD_EARN_WNE_MALE1_P10
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Brittany Beauty Academy,4,4,4,4,4,4,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
Stevens-Henager College,4,4,4,4,4,4,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
Columbia College,4,4,4,4,4,4,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
Union College,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Arthur's Beauty College,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Bryan University,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Lincoln University,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Interactive College of Technology,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
Western Technical College,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
The Beauty Institute,3,3,3,3,3,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# This is super slow!!!

'''
null_dict is a dictionary that maps a year to its
set of completely null columns
    key - 2 digit year
    value - set of null columns
'''
null_dict = {}

for k, df in df_dict.items():
    null_cols = set()
    for c in df.columns:
        if df[c].notna().sum() == 0:
            null_cols.add(c)
    
    null_dict.update({k : null_cols})

In [9]:
# get count of null cols per df
for k, n in null_dict.items():
    print(f"Year {k} number of null columns: {len(n)}")

Year 11 number of null columns: 1370
Year 12 number of null columns: 1330
Year 13 number of null columns: 1341
Year 14 number of null columns: 1307
Year 15 number of null columns: 1260
Year 16 number of null columns: 1298
Year 17 number of null columns: 1297
Year 18 number of null columns: 1300
Year 19 number of null columns: 1099
Year 20 number of null columns: 1421


In [10]:
'''
total_null_cols is a set of columns that are
completely null in the entire dataset.
There are only 11 such columns.
'''
total_null_cols = set()
for k, n in null_dict.items():
    if k == '11':
        total_null_cols = n
    else:
        total_null_cols = total_null_cols.intersection(n)
        
# convert to list
#total_null_cols = list(total_null_cols)

# print number of completely
print(f"There are {len(total_null_cols)} completely null columns.")

There are 37 completely null columns.


In [11]:
# How  much memory is saved by dropping 11 cols?
total_memory2 = 0

for k, df in df_dict.items():
    df_temp = df.drop(columns = total_null_cols)
    #df_dict.update({k : df})
    
    # compute memory size now
    total_memory2 += df_temp.memory_usage(deep = True, index = True).sum()
    
print(f"Total Memory Usage: {total_memory2/1000000000} GB")
print(f"Memory reduction: {(1-(total_memory2/total_memory))*100}%")

Total Memory Usage: 5.155484497 GB
Memory reduction: 27.96090353440008%


In [12]:
'''
union_null_cols is the set of columns
that are completely null in any given dataframe.
'''
union_null_cols = set()
for k, n in null_dict.items():
    if k == '97':
        union_null_cols = n
    else:
        union_null_cols = union_null_cols.union(n)
        
# convert to list
#total_null_cols = list(total_null_cols)

# print number of completely
print(f"There are {len(union_null_cols)} elements in the union of completely null columns.")

There are 2541 elements in the union of completely null columns.


In [13]:
total_memory3 = 0

for k, df in df_dict.items():
    df_temp = df.drop(columns = union_null_cols)
    #df_dict.update({k : df})
    
    # compute memory size now
    total_memory3 += df_temp.memory_usage(deep = True, index = True).sum()
    
print(f"Total Memory Usage: {total_memory3/1000000000} GB")
print(f"Memory reduction: {(1-(total_memory3/total_memory))*100}%")

Total Memory Usage: 0.307629563 GB
Memory reduction: 95.70140191915559%


In [14]:
len(union_null_cols)

2541

In [15]:
#Actually dropping the union of all null columns
for key in df_dict.keys():
    df_dict[key].drop(columns = union_null_cols, inplace = True)


In [16]:
for item in df_dict['11'].columns:
    print(item)

UNITID
OPEID
OPEID6
INSTNM
CITY
STABBR
ZIP
MAIN
NUMBRANCH
PREDDEG
HIGHDEG
CONTROL
ST_FIPS
REGION
ADM_RATE
ADM_RATE_ALL
SATVR25
SATVR75
SATMT25
SATMT75
SATVRMID
SATMTMID
ACTCM25
ACTCM75
ACTEN25
ACTEN75
ACTMT25
ACTMT75
ACTCMMID
ACTENMID
ACTMTMID
SAT_AVG
SAT_AVG_ALL
PCIP01
PCIP03
PCIP04
PCIP05
PCIP09
PCIP10
PCIP11
PCIP12
PCIP13
PCIP14
PCIP15
PCIP16
PCIP19
PCIP22
PCIP23
PCIP24
PCIP25
PCIP26
PCIP27
PCIP29
PCIP30
PCIP31
PCIP38
PCIP39
PCIP40
PCIP41
PCIP42
PCIP43
PCIP44
PCIP45
PCIP46
PCIP47
PCIP48
PCIP49
PCIP50
PCIP51
PCIP52
PCIP54
CIP01CERT1
CIP01CERT2
CIP01ASSOC
CIP01CERT4
CIP01BACHL
CIP03CERT1
CIP03CERT2
CIP03ASSOC
CIP03CERT4
CIP03BACHL
CIP04CERT1
CIP04CERT2
CIP04ASSOC
CIP04CERT4
CIP04BACHL
CIP05CERT1
CIP05CERT2
CIP05ASSOC
CIP05CERT4
CIP05BACHL
CIP09CERT1
CIP09CERT2
CIP09ASSOC
CIP09CERT4
CIP09BACHL
CIP10CERT1
CIP10CERT2
CIP10ASSOC
CIP10CERT4
CIP10BACHL
CIP11CERT1
CIP11CERT2
CIP11ASSOC
CIP11CERT4
CIP11BACHL
CIP12CERT1
CIP12CERT2
CIP12ASSOC
CIP12CERT4
CIP12BACHL
CIP13CERT1
CIP13CERT2
CIP13ASS

In [17]:
len(df_dict['11'].columns)

448

In [18]:
# compute nullness of dataframes with remaining columns
for k, df in df_dict.items():
    nullness = df.isna().sum().sum()/df.size
    print(f"For year {k}, the nullness is {round(nullness*100,2)}%")

For year 11, the nullness is 27.17%
For year 12, the nullness is 25.05%
For year 13, the nullness is 24.85%
For year 14, the nullness is 24.74%
For year 15, the nullness is 24.87%
For year 16, the nullness is 24.84%
For year 17, the nullness is 24.87%
For year 18, the nullness is 24.89%
For year 19, the nullness is 24.91%
For year 20, the nullness is 24.97%


## Columns I'd like to look at:
* Admissions Rate
* SATVRMid
* SATMMid (we'll have to look at correlation between these, may only need one or two)
* UGDS_White/Black/His
* HIGHDEG
* PREDDEG
* CONTROL
* LOCALE
* PCIP41 Percentage of degrees awarded in technology
* COSTT4_A
* AVGFACSAL
* DEBT MEDIAN


In [19]:
df_dict_shortened={}
for key in df_dict.keys():
    df_dict_shortened[key] = df_dict[key][['ADM_RATE','SATVRMID','SATMTMID','UGDS_WHITE','UGDS_BLACK','UGDS_HISP','HIGHDEG','PREDDEG','CONTROL','PCIP41','COSTT4_A','AVGFACSAL','DEBT_MDN']]
        

In [20]:
# compute nullness of dataframes with interesting columns
for k, df in df_dict_shortened.items():
    nullness = df.isna().sum().sum()/df.size
    print(f"For year {k}, the nullness is {round(nullness*100,2)}%")

For year 11, the nullness is 25.05%
For year 12, the nullness is 24.86%
For year 13, the nullness is 24.91%
For year 14, the nullness is 24.75%
For year 15, the nullness is 25.02%
For year 16, the nullness is 25.06%
For year 17, the nullness is 24.98%
For year 18, the nullness is 25.01%
For year 19, the nullness is 25.0%
For year 20, the nullness is 25.19%


In [21]:
#Need to look at are certain rows more null than others

In [22]:
df_dict_shortened['11'].isnull().sum(axis=1).tolist()

[0,
 0,
 2,
 0,
 0,
 0,
 3,
 4,
 2,
 0,
 0,
 3,
 2,
 3,
 3,
 0,
 3,
 3,
 3,
 3,
 2,
 0,
 3,
 3,
 5,
 0,
 3,
 3,
 0,
 3,
 2,
 3,
 0,
 3,
 0,
 2,
 3,
 0,
 3,
 0,
 3,
 3,
 0,
 3,
 3,
 3,
 2,
 0,
 0,
 3,
 3,
 2,
 0,
 4,
 3,
 3,
 5,
 3,
 3,
 0,
 4,
 3,
 5,
 5,
 3,
 4,
 5,
 5,
 4,
 0,
 3,
 0,
 4,
 3,
 5,
 3,
 5,
 5,
 5,
 3,
 0,
 4,
 3,
 2,
 3,
 3,
 3,
 5,
 3,
 0,
 3,
 4,
 3,
 3,
 4,
 4,
 0,
 4,
 3,
 5,
 3,
 3,
 0,
 4,
 5,
 3,
 2,
 4,
 3,
 5,
 5,
 0,
 5,
 5,
 0,
 0,
 3,
 0,
 0,
 3,
 5,
 3,
 3,
 2,
 0,
 3,
 3,
 3,
 5,
 5,
 3,
 3,
 0,
 0,
 0,
 5,
 4,
 0,
 5,
 5,
 3,
 3,
 5,
 3,
 4,
 0,
 3,
 3,
 0,
 3,
 0,
 3,
 3,
 5,
 3,
 3,
 3,
 5,
 5,
 2,
 3,
 0,
 3,
 5,
 0,
 3,
 8,
 5,
 3,
 3,
 3,
 8,
 8,
 5,
 4,
 4,
 3,
 3,
 2,
 4,
 5,
 0,
 3,
 3,
 5,
 2,
 0,
 5,
 3,
 3,
 4,
 3,
 0,
 2,
 8,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8,
 0,
 0,
 8,
 5,
 3,
 5,
 2,
 0,
 8,
 3,
 3,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 3,
 3,
 3,
 0,
 3,
 0,
 0,
 8,
 3,
 3,
 8,


In [23]:
len([x for x in df_dict_shortened['11'].isnull().sum(axis=1).tolist() if x > 0])

4119

In [24]:
#Rows missing at least one of the 13 values listed above:
for key in df_dict_shortened.keys():
    print(len([x for x in df_dict_shortened[key].isnull().sum(axis=1).tolist() if x > 0]))

4119
4122
4098
4088
4183
4184
4151
4144
4139
4157


In [26]:
# How many are missing debt_median:
df_dict_response={}
for key in df_dict_shortened.keys():
    df_dict_response[key] = df_dict_shortened[key][['DEBT_MDN']]
        

In [27]:
for key in df_dict_response.keys():
    print(len([x for x in df_dict_response[key].isnull().sum(axis=1).tolist() if x > 0]))

53
38
18
16
23
21
21
58
53
66


In [None]:
#Next thing I can try to look at is are these the same 66 schools. I'm growing worried this
#dataset may be too sparse.