In [211]:
import pandas as pd
import pkg_resources
#load pickled data

import pickle

# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load 'raw_df.pickle' if it was produced by a trusted step of this project.
# The context manager guarantees the file handle is closed after loading.
with open('raw_df.pickle', 'rb') as raw_file:
    data = pickle.load(raw_file)

Earnings are reported for a different year than the college data they describe, so each earnings year has to be matched to the college-data years it corresponds to. In addition, each earnings figure is pooled over two college-data years, so a college-data year that is covered by two earnings years gets a weighted average of the two earnings figures, weighted by the number of students behind each figure. Here is how the earnings data maps to the college data:

|Earnings data year|College Data years|
|---|---|
|2009|2002, 2003|
|2011|2004, 2005|
|2012|2005, 2006|
|2013|2006, 2007|
|2014|2007, 2008|


In [212]:
# Show the column labels of the loaded frame as a quick check of what is available.
data.columns

Index(['INSTNM', 'STABBR', 'ACCREDAGENCY', 'HCM2', 'MAIN', 'NUMBRANCH',
       'PREDDEG', 'HIGHDEG', 'CONTROL', 'REGION',
       ...
       'COMPL_RPY_1YR_RT', 'COMPL_RPY_3YR_RT', 'COMPL_RPY_5YR_RT',
       'COMPL_RPY_7YR_RT', 'GRAD_DEBT_MDN', 'COUNT_WNE_P6', 'MD_EARN_WNE_P6',
       'PRGMOFR', 'YEAR', 'COSTT4'],
      dtype='object', length=101)

In [213]:
# Filter the raw data down to the years needed for the earnings <-> college mapping.
college_data_years = [2002, 2003, 2004, 2005, 2006, 2007, 2008]
earnings_data_years = [2009, 2011, 2012, 2013, 2014]
earnings_columns = ['INSTNM', 'YEAR', 'COUNT_WNE_P6', 'MD_EARN_WNE_P6']

# .copy() makes each result an independent frame rather than a slice of `data`.
# Later cells add columns to `earnings_data`; doing that on a slice triggers
# SettingWithCopyWarning and is not guaranteed to take effect.
college_data = data[data['YEAR'].isin(college_data_years)].copy()
# A single .loc call selects rows and columns together instead of chaining two indexers.
earnings_data = data.loc[data['YEAR'].isin(earnings_data_years), earnings_columns].copy()
print(college_data.shape)
print(earnings_data.shape)

(10792, 101)
(7819, 4)


#### Inflation adjustment

In [188]:
# Express all earnings figures in dollars of this target year.
year_to_adjust_to = 2018

# CPI data obtained from the U.S. Bureau of Labor Statistics; only the annual value is kept.
CPI = pd.read_csv('CPI_data.csv', index_col='Year')[['Annual']]
target_cpi = CPI.loc[year_to_adjust_to, 'Annual']
# Multiplier that converts a value stated in a given year's dollars into target-year dollars.
CPI['rate'] = target_cpi / CPI['Annual']

# Some earnings data are already inflation adjusted per the cohort map, so the
# multiplier has to start from the year they are already adjusted to.
# earnings data year -> year that it is already adjusted to
already_adjusted_to = {2009: 2014, 2011: 2014, 2012: 2015, 2013: 2016, 2014: 2017}

CPI_rates = pd.DataFrame({
    'years': list(already_adjusted_to),
    'rates': [CPI.loc[base_year, 'rate'] for base_year in already_adjusted_to.values()],
})
CPI_rates

Unnamed: 0,years,rates
0,2009,1.060705
1,2011,1.060705
2,2012,1.059447
3,2013,1.046249
4,2014,1.024425


In [181]:
# NOTE(review): this cell's execution count (181) is lower than that of the CPI
# cell (188) and of the filtering cell (213) that re-creates `earnings_data`.
# Confirm it runs after both under Restart & Run All; otherwise the adjustment
# is not applied to the data that gets merged below.

# Guard so that re-running the cell cannot apply the inflation multiplier twice.
if 'MD_EARN_WNE_P6_unadjusted' not in earnings_data.columns:
    # Store a copy of the unaltered data.
    earnings_data_unaltered = earnings_data.copy()

    # Merge the CPI rates into the earnings table; each earnings row must match
    # at most one rate row, which validate= enforces.
    earnings_data = earnings_data.merge(CPI_rates,
                                        left_on='YEAR',
                                        right_on='years',
                                        validate='many_to_one')
    earnings_data['MD_EARN_WNE_P6_unadjusted'] = earnings_data['MD_EARN_WNE_P6']
    earnings_data['MD_EARN_WNE_P6'] = earnings_data['MD_EARN_WNE_P6'] * earnings_data['rates']
    # Assign the result: drop() returns a new frame, so calling it without
    # assignment only affected the displayed output and left the helper columns in place.
    earnings_data = earnings_data.drop(columns=['years', 'rates'])

earnings_data

Unnamed: 0,INSTNM,YEAR,COUNT_WNE_P6,MD_EARN_WNE_P6,MD_EARN_WNE_P6_unadjusted
0,Alabama A & M University,2009,1291.0,26517.618782,25000.0
1,University of Alabama at Birmingham,2009,2882.0,36806.454870,34700.0
2,University of Alabama in Huntsville,2009,1082.0,41049.273875,38700.0
3,Alabama State University,2009,1520.0,21850.517876,20600.0
4,The University of Alabama,2009,2917.0,37548.948195,35400.0
...,...,...,...,...,...
7814,Central Georgia Technical College,2014,2255.0,20488.495431,20000.0
7815,Arizona State University-Skysong,2014,15223.0,39235.468750,38300.0
7816,Louisiana Delta Community College,2014,354.0,20898.265339,20400.0
7817,University of Phoenix-Arizona,2014,245947.0,29093.663512,28400.0


In [214]:
# Attach to every earnings row the two college-data years it maps to,
# i.e. earnings data from 2009 maps to 2002 (year 1) and 2003 (year 2),
# earnings data from 2011 maps to 2004 (year 1) and 2005 (year 2), and so on.
year_1_merge_map = {2009: 2002, 2011: 2004, 2012: 2005, 2013: 2006, 2014: 2007}
year_2_merge_map = {2009: 2003, 2011: 2005, 2012: 2006, 2013: 2007, 2014: 2008}

# assign() returns a new frame, so this works the same whether or not
# `earnings_data` is a slice of another frame, and re-running is harmless.
earnings_data = earnings_data.assign(
    college_year_1=earnings_data['YEAR'].map(year_1_merge_map),
    college_year_2=earnings_data['YEAR'].map(year_2_merge_map),
)

# Sanity check of the mapping. Select the two numeric columns BEFORE calling
# mean(): averaging the whole frame would include the string column INSTNM,
# which raises TypeError on pandas >= 2.0.
earnings_data.groupby('YEAR')[['college_year_1', 'college_year_2']].mean()

Unnamed: 0_level_0,college_year_1,college_year_2
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2009,2002,2003
2011,2004,2005
2012,2005,2006
2013,2006,2007
2014,2007,2008


I then merge the college data and the earnings data with `pd.merge`. The join keys are 'INSTNM' together with 'YEAR' on the college-data side and 'college_year_1' or 'college_year_2' on the earnings side, because that seemed like the best available option to merge on. I'll output statistics on how successful the merge was.

I had to do two merges, first with the first year earnings and then with the second year earnings.

In [215]:
# NOTE(review): the recorded output reports more matched rows (11332) than there
# are college rows (10792), which suggests (INSTNM, YEAR) is not a unique key and
# some rows are duplicated by the join - TODO confirm, and consider merging on a
# unique institution id if one is available.

# First merge: earnings records whose first mapped college year equals the row's YEAR.
cols = ['INSTNM', 'COUNT_WNE_P6', 'MD_EARN_WNE_P6', 'college_year_1', 'YEAR']
merged_data = college_data.merge(earnings_data[cols],
                                 how='outer',
                                 left_on=['INSTNM', 'YEAR'],
                                 right_on=['INSTNM', 'college_year_1'],
                                 suffixes=('', '_year_1'),
                                 indicator='_merged_1')

# Second merge: earnings records whose second mapped college year equals the row's YEAR.
cols = ['INSTNM', 'COUNT_WNE_P6', 'MD_EARN_WNE_P6', 'college_year_2', 'YEAR']
merged_data = merged_data.merge(earnings_data[cols],
                                how='outer',
                                left_on=['INSTNM', 'YEAR'],
                                right_on=['INSTNM', 'college_year_2'],
                                suffixes=('', '_year_2'),
                                indicator='_merged_2')

# The join-key helper columns are no longer needed once both merges are done.
merged_data = merged_data.drop(
    columns=['YEAR_year_1', 'YEAR_year_2', 'college_year_1', 'college_year_2'])

# Check how many entries merged on at least one of the two earnings years.
# Vectorized comparison instead of a row-wise apply; a missing indicator
# value compares unequal to 'both' and therefore counts as not merged.
merged_data['merged'] = (merged_data['_merged_1'] == 'both') | (merged_data['_merged_2'] == 'both')
print(merged_data.groupby('merged').count()['INSTNM'])

# Drop the rows that did not match on either year. .copy() hands an
# independent frame to the following cells, which modify it.
merged_data = merged_data[merged_data['merged']].copy()

merged_data.columns

merged
False      273
True     11332
Name: INSTNM, dtype: int64


Index(['INSTNM', 'STABBR', 'ACCREDAGENCY', 'HCM2', 'MAIN', 'NUMBRANCH',
       'PREDDEG', 'HIGHDEG', 'CONTROL', 'REGION',
       ...
       'PRGMOFR', 'YEAR', 'COSTT4', 'COUNT_WNE_P6_year_1',
       'MD_EARN_WNE_P6_year_1', '_merged_1', 'COUNT_WNE_P6_year_2',
       'MD_EARN_WNE_P6_year_2', '_merged_2', 'merged'],
      dtype='object', length=108)

In [216]:
# Fill the NaN values with 0s so the weighted average can treat 0 as "no data".
columns = ['COUNT_WNE_P6_year_1', 'COUNT_WNE_P6_year_2', 'MD_EARN_WNE_P6_year_1', 'MD_EARN_WNE_P6_year_2']
# Fill through the frame and rebind the result. Calling
# `merged_data[column].fillna(0, inplace=True)` operates on a temporary column
# object: it is deprecated chained assignment and leaves the frame unchanged
# when pandas Copy-on-Write is active.
merged_data = merged_data.fillna({column: 0 for column in columns})
# NOTE(review): YEAR is not among the columns filled above, so this filter only
# removes rows whose YEAR is literally 0 - confirm that is the intent.
merged_data = merged_data[merged_data['YEAR'] != 0]


Because some entries have a count of 0 even though their median earnings value is not 0, I wrote the weighted average function so that, in that case, it weights both years evenly (provided both years have earnings). If only one year has earnings, that year's value is used as is. Otherwise, each year's earnings are weighted by the count given for that year.

In [217]:

def weighted_average(year_1_earnings,year_1_count,year_2_earnings,year_2_count):
    """Combine two yearly earnings figures into one count-weighted value.

    An earnings value of 0 means "no data for that year".

    Parameters:
        year_1_earnings: earnings figure for the first year (0 if absent).
        year_1_count: number of students behind the first figure.
        year_2_earnings: earnings figure for the second year (0 if absent).
        year_2_count: number of students behind the second figure.

    Returns:
        0 when neither year has earnings; the single available figure when
        exactly one year has earnings; otherwise the average of both figures
        weighted by their counts. If both years have earnings but either
        count is 0, the two figures are weighted equally.
    """
    has_year_1 = year_1_earnings != 0
    has_year_2 = year_2_earnings != 0

    # Nothing to average.
    if not has_year_1 and not has_year_2:
        return 0

    # Only one year carries data: pass it through untouched.
    if not has_year_1:
        return year_2_earnings
    if not has_year_2:
        return year_1_earnings

    # Both years have earnings; a zero count cannot serve as a weight,
    # so fall back to equal weights for the pair.
    if year_1_count == 0 or year_2_count == 0:
        year_1_count = year_2_count = 1

    total_count = year_1_count + year_2_count
    year_1_weight = year_1_count / total_count
    year_2_weight = year_2_count / total_count
    return year_1_earnings * year_1_weight + year_2_earnings * year_2_weight

# Inputs of weighted_average, listed in the order of its positional parameters.
average_inputs = ['MD_EARN_WNE_P6_year_1',
                  'COUNT_WNE_P6_year_1',
                  'MD_EARN_WNE_P6_year_2',
                  'COUNT_WNE_P6_year_2']

# Compute the combined earnings figure row by row.
merged_data['MD_EARN_AVG'] = merged_data[average_inputs].apply(
    lambda row: weighted_average(*row),
    axis = 1)

In [218]:
# Per-year means of the merged earnings columns. The columns are selected by
# name before aggregating: averaging the whole frame would include string
# columns such as INSTNM, which raises TypeError on pandas >= 2.0, and
# positional slicing with iloc breaks as soon as a column is added or removed.
summary_columns = ['COUNT_WNE_P6_year_1', 'MD_EARN_WNE_P6_year_1',
                   'COUNT_WNE_P6_year_2', 'MD_EARN_WNE_P6_year_2',
                   'merged', 'MD_EARN_AVG']
merged_data.groupby('YEAR')[summary_columns].mean()

Unnamed: 0_level_0,COUNT_WNE_P6_year_1,MD_EARN_WNE_P6_year_1,COUNT_WNE_P6_year_2,MD_EARN_WNE_P6_year_2,merged,MD_EARN_AVG
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002.0,2141.405594,31997.902098,0.0,0.0,True,31997.902098
2003.0,0.0,0.0,2136.257758,31958.644712,True,31958.644712
2004.0,2087.631111,30267.111111,0.0,0.0,True,30267.111111
2005.0,2037.761705,30197.29892,2025.35054,30175.810324,True,30228.048061
2006.0,2159.811456,30843.019093,2037.577566,30144.093079,True,30556.24576
2007.0,2438.751342,32371.317829,2166.744186,30860.584377,True,31702.873762
2008.0,0.0,0.0,2523.989281,32421.752837,True,32421.752837


In [219]:
# Show the last 20 column labels, which include the columns added by the merge steps.
merged_data.columns[-20:]

Index(['PCTFLOAN', 'UG25ABV', 'COMPL_RPY_1YR_RT', 'COMPL_RPY_3YR_RT',
       'COMPL_RPY_5YR_RT', 'COMPL_RPY_7YR_RT', 'GRAD_DEBT_MDN', 'COUNT_WNE_P6',
       'MD_EARN_WNE_P6', 'PRGMOFR', 'YEAR', 'COSTT4', 'COUNT_WNE_P6_year_1',
       'MD_EARN_WNE_P6_year_1', '_merged_1', 'COUNT_WNE_P6_year_2',
       'MD_EARN_WNE_P6_year_2', '_merged_2', 'merged', 'MD_EARN_AVG'],
      dtype='object')

In [220]:
# Drop columns not needed for analysis. Rebinding the result avoids mutating a
# filtered frame in place, and errors='ignore' keeps the cell re-runnable
# (a second run would otherwise raise KeyError for the already-removed columns).
merged_data = merged_data.drop(columns=['COUNT_WNE_P6', 'MD_EARN_WNE_P6', 'merged'],
                               errors='ignore')

In [221]:
# Persist the merged frame to disk; the handle is closed when the block exits.
with open("merged_earnings_df.pickle", "wb") as output_file:
    pickle.dump(merged_data, output_file)