# Final Cleaning

This notebook combines the cleaned tables for each year and adds the poverty data, which will be the target (y) variable in modeling.

In [1]:
import pandas as pd

In [2]:
# 2010 membership and wage table, indexed by state; the 'Unnamed: 0' column is dropped
mmwages2010 = (
    pd.read_csv('../../data/cleaned_data/model_data/2010/mean_medwages2010.csv', index_col='State')
    .drop(columns='Unnamed: 0')
)

In [3]:
# 2010 fatalities table, indexed by state
fatal2010 = pd.read_csv(
    '../../data/cleaned_data/model_data/2010/fatal_2010.csv',
    index_col='State',
)

In [4]:
# join the wage/membership table with the fatalities table; both are indexed by 'State'
combined2010 = pd.merge(mmwages2010, fatal2010, on='State')
combined2010.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,38590,29570,1808807,10.1,35010,31750,100593,4.8,92.0,15.0
Alaska,50350,41640,295063,22.9,59980,59660,17440,21.4,39.0,10.0
Arizona,42390,33040,2506723,6.4,39060,36380,163760,5.3,77.0,13.0
Arkansas,35460,27860,1081711,4.0,34270,31820,47617,6.2,88.0,18.0
California,50730,37870,13891632,17.5,51880,48980,707158,15.7,326.0,45.0


In [5]:
# 2010 poverty table, indexed by state; the two population-count columns are dropped,
# leaving the remaining poverty column(s)
pov2010 = (
    pd.read_csv('../../data/cleaned_data/poverty/state_pov2010.csv', index_col='State')
    .drop(columns=['2010_Total_Pop', '2010_Pop_Below_Pov'])
)

In [6]:
# attach the poverty column to the combined 2010 table, matching rows on 'State'
final_2010 = pd.merge(combined2010, pov2010, on='State')
final_2010.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2010_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,38590,29570,1808807,10.1,35010,31750,100593,4.8,92.0,15.0,19.0
Alaska,50350,41640,295063,22.9,59980,59660,17440,21.4,39.0,10.0,9.9
Arizona,42390,33040,2506723,6.4,39060,36380,163760,5.3,77.0,13.0,17.4
Arkansas,35460,27860,1081711,4.0,34270,31820,47617,6.2,88.0,18.0,18.8
California,50730,37870,13891632,17.5,51880,48980,707158,15.7,326.0,45.0,15.8


In [7]:
# Load the wage, fatality and poverty tables for the remaining years.
# Every table is registered as a module-level variable named <prefix><year>
# (e.g. mmwages2011, fatal2011, pov2011) so later cells can refer to it by name.

years = list(range(2011, 2018))

# wages: indexed by state, 'Unnamed: 0' column dropped
for year in years:
    wages_path = f'../../data/cleaned_data/model_data/{year}/mean_medwages{year}.csv'
    globals()[f'mmwages{year}'] = pd.read_csv(wages_path, index_col='State').drop(columns='Unnamed: 0')

# fatalities: indexed by state
for year in years:
    fatal_path = f'../../data/cleaned_data/model_data/{year}/fatal_{year}.csv'
    globals()[f'fatal{year}'] = pd.read_csv(fatal_path, index_col='State')

# poverty: indexed by state, population-count columns dropped
for year in years:
    pov_path = f'../../data/cleaned_data/poverty/state_pov{year}.csv'
    pov_drop = [f'{year}_Total_Pop', f'{year}_Pop_Below_Pov']
    globals()[f'pov{year}'] = pd.read_csv(pov_path, index_col='State').drop(columns=pov_drop)

In [8]:
# 2011: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2011 = pd.merge(mmwages2011, fatal2011, on='State')
final_2011 = pd.merge(combined2011, pov2011, on='State')
final_2011.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2011_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,39180,29850,1781885,10.0,35530,32520,86771,9.4,92.0,8.0,19.0
Alaska,51590,42960,305485,22.1,60830,59560,18231,21.3,39.0,3.0,10.5
Arizona,43670,34110,2499746,6.0,40770,37620,139825,5.2,69.0,9.0,19.0
Arkansas,36340,28460,1116844,4.2,34730,32400,54557,5.7,93.0,14.0,19.5
California,51910,38530,13927053,17.1,53160,50140,686035,16.9,390.0,56.0,16.6


In [9]:
# 2012: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2012 = pd.merge(mmwages2012, fatal2012, on='State')
final_2012 = pd.merge(combined2012, pov2012, on='State')
final_2012.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2012_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,39550,29950,1812340,9.2,36360,33390,77560,7.1,92.0,14.0,19.0
Alaska,52050,43510,298283,22.4,61120,59690,14486,21.8,31.0,3.0,10.1
Arizona,43950,33980,2433824,5.2,39770,36750,110741,2.7,60.0,7.0,18.7
Arkansas,36850,28690,1155140,3.2,34630,32200,61336,1.0,63.0,12.0,19.8
California,52350,38770,14488778,17.2,53820,50750,746875,15.9,375.0,58.0,17.0


In [10]:
# 2013: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2013 = pd.merge(mmwages2013, fatal2013, on='State')
final_2013 = pd.merge(combined2013, pov2013, on='State')
final_2013.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2013_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,40240,30390,1891629,10.8,37290,34340,91112,3.1,92.0,20.0,18.7
Alaska,53110,44350,306322,23.1,61780,60200,17040,22.3,32.0,0.0,9.3
Arizona,44370,34170,2455146,5.0,40760,37200,127729,4.1,95.0,13.0,18.6
Arkansas,37340,28910,1072119,3.5,35210,32760,43561,2.6,63.0,13.0,19.7
California,53030,38920,14840395,16.4,54130,51090,761854,16.5,396.0,61.0,16.8


In [11]:
# 2014: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2014 = pd.merge(mmwages2014, fatal2014, on='State')
final_2014 = pd.merge(combined2014, pov2014, on='State')
final_2014.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2014_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,40890,30850,1889069,10.8,38210,35040,108463,2.0,92.0,11.0,19.3
Alaska,54040,45200,307285,22.8,63590,62220,17417,32.8,30.0,0.0,11.2
Arizona,44580,34230,2594000,5.3,41690,37630,141828,3.3,88.0,17.0,18.2
Arkansas,37940,29140,1108001,4.7,35530,32980,60688,2.5,67.0,15.0,18.9
California,53890,39190,15134895,16.3,54200,50520,791563,16.3,344.0,49.0,16.4


In [12]:
# 2015: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2015 = pd.merge(mmwages2015, fatal2015, on='State')
final_2015 = pd.merge(combined2015, pov2015, on='State')
final_2015.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2015_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,41920,31550,1863152,10.2,38990,35710,97730,6.8,92.0,14.0,18.5
Alaska,55760,46420,304538,19.6,65300,63530,18874,26.6,14.0,3.0,10.3
Arizona,45310,34680,2661422,5.2,42770,38580,145217,6.8,69.0,12.0,17.4
Arkansas,38540,29420,1156533,5.1,36110,33220,57628,2.3,74.0,8.0,19.1
California,55260,39830,15662847,15.9,55240,51160,820015,18.1,388.0,75.0,15.3


In [13]:
# 2016: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2016 = pd.merge(mmwages2016, fatal2016, on='State')
final_2016 = pd.merge(combined2016, pov2016, on='State')
final_2016.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2016_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,42510,32100,1893731,8.1,39990,36520,96538,11.3,92.0,20.0,17.1
Alaska,56710,47170,297008,18.5,67210,65180,18673,26.6,35.0,2.0,9.9
Arizona,46290,35470,2727866,4.5,43800,40020,150775,7.1,77.0,15.0,16.4
Arkansas,39590,30130,1185496,4.0,36450,33630,69123,1.7,68.0,13.0,17.2
California,56840,40920,16007244,15.9,56770,52280,857489,18.4,376.0,55.0,14.3


In [14]:
# 2017: wages/membership + fatalities, then + poverty, all matched on 'State'

combined2017 = pd.merge(mmwages2017, fatal2017, on='State')
final_2017 = pd.merge(combined2017, pov2017, on='State')
final_2017.head()

Unnamed: 0_level_0,Total_Avg_$,Total_Med_$,Total_Employment,Total_Mem_%,Construction_Avg_$,Construction_Med_$,Construction_Employment,Construction_Mem_%,Total_Fatalities,Construction_Fatalities,2017_Pct_Below_Pov
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,43170,32800,1868005,7.4,40770,37380,112835,7.0,92.0,18.0,16.9
Alaska,57750,47560,304027,18.1,66600,64550,16584,23.4,33.0,0.0,11.1
Arizona,48160,36270,2804949,4.0,44020,40490,163853,1.2,90.0,11.0,14.9
Arkansas,40530,30810,1207917,5.1,37660,34710,60846,2.5,76.0,9.0,16.4
California,57190,40980,16063818,15.5,58010,53440,882269,17.9,376.0,69.0,13.3


Some values contain commas, which would interfere with modeling, so we remove them and cast the feature columns to integers.

In [15]:
# feature columns that rmv_commas cleans and casts to int
cols = ['Total_Avg_$',
'Total_Med_$',
'Total_Employment',
'Total_Mem_%',
'Construction_Avg_$',
'Construction_Med_$',
'Construction_Employment',
'Construction_Mem_%',
'Total_Fatalities',
'Construction_Fatalities']

def rmv_commas(df):
    """Strip commas from every column listed in ``cols`` and cast it to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing all of the columns named in ``cols``. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``cols`` columns converted to integers.

    Raises
    ------
    KeyError
        If one of the ``cols`` columns is missing from ``df``.
    ValueError
        If a value still cannot be converted to int after commas are removed
        (for example a missing value).
    """
    for c in cols:
        # Assign the cleaned column back to the frame. Calling
        # replace(..., inplace=True) on df[c] acts on a temporary selection:
        # pandas warns about it and, with Copy-on-Write enabled, it leaves df
        # untouched, so the commas would never actually be removed.
        # NOTE(review): astype(int) truncates fractional values, so float
        # columns such as the '*_Mem_%' percentages lose their decimals --
        # confirm this is intended.
        df[c] = df[c].replace(',', '', regex=True).astype(int)
    return df

In [16]:
# The poverty column of each table carries its year as a prefix. Strip the year so
# every yearly table shares the same column names, then clean the feature columns.

def _standardize(table, year):
    """Rename '<year>_Pct_Below_Pov' to 'Pct_Below_Pov' and pass the result to rmv_commas."""
    renamed = table.rename(columns={f'{year}_Pct_Below_Pov': 'Pct_Below_Pov'})
    return rmv_commas(renamed)

final_2010 = _standardize(final_2010, 2010)
final_2011 = _standardize(final_2011, 2011)
final_2012 = _standardize(final_2012, 2012)
final_2013 = _standardize(final_2013, 2013)
final_2014 = _standardize(final_2014, 2014)
final_2015 = _standardize(final_2015, 2015)
final_2016 = _standardize(final_2016, 2016)
final_2017 = _standardize(final_2017, 2017)

In [17]:
# save the cleaned 2010 table (poverty column still in percent, not yet binarized)
final_2010.to_csv(path_or_buf='../../data/cleaned_data/model_data/2010/final_2010.csv')

In [18]:
# save the cleaned tables for the remaining years, one CSV per year folder
for year in years:
    final_path = f'../../data/cleaned_data/model_data/{year}/final_{year}.csv'
    globals()[f'final_{year}'].to_csv(final_path)

#### Because we are creating a classification model, we need to binarize the 'Pct_Below_Pov' column. The cutoff for each year is based on the U.S. national poverty rate for that year: states above the cutoff are coded 1, all others 0.

In [19]:
# 1 where the 2010 value exceeds 15.1, else 0 (overwrites the column)
final_2010['Pct_Below_Pov'] = final_2010['Pct_Below_Pov'].gt(15.1).astype(int)

In [22]:
# 1 where the 2011 value exceeds 15, else 0 (overwrites the column)
final_2011['Pct_Below_Pov'] = final_2011['Pct_Below_Pov'].gt(15).astype(int)

In [23]:
# 1 where the 2012 value exceeds 15, else 0 (overwrites the column)
final_2012['Pct_Below_Pov'] = final_2012['Pct_Below_Pov'].gt(15).astype(int)

In [24]:
# 1 where the 2013 value exceeds 14.5, else 0 (overwrites the column)
final_2013['Pct_Below_Pov'] = final_2013['Pct_Below_Pov'].gt(14.5).astype(int)

In [25]:
# 1 where the 2014 value exceeds 14.8, else 0 (overwrites the column)
final_2014['Pct_Below_Pov'] = final_2014['Pct_Below_Pov'].gt(14.8).astype(int)

In [26]:
# 1 where the 2015 value exceeds 13.5, else 0 (overwrites the column)
final_2015['Pct_Below_Pov'] = final_2015['Pct_Below_Pov'].gt(13.5).astype(int)

In [27]:
# 1 where the 2016 value exceeds 12.7, else 0 (overwrites the column)
final_2016['Pct_Below_Pov'] = final_2016['Pct_Below_Pov'].gt(12.7).astype(int)

In [28]:
# 1 where the 2017 value exceeds 12.3, else 0 (overwrites the column)
final_2017['Pct_Below_Pov'] = final_2017['Pct_Below_Pov'].gt(12.3).astype(int)

In [29]:
# write the tables again to separate '_bin' files, one per year folder
final_2010.to_csv(path_or_buf='../../data/cleaned_data/model_data/2010/final_2010_bin.csv')

for year in years:
    bin_path = f'../../data/cleaned_data/model_data/{year}/final_{year}_bin.csv'
    globals()[f'final_{year}'].to_csv(bin_path)