# Data Cleaning (Continued)

- This notebook contains additional cleaning steps that convert many of the categorical columns from numeric codes back to their category labels, so that the columns can be dummy-encoded for modeling. Each categorical column stores a number representing the category a person falls under; using those numbers directly would treat the codes as quantities and would not make for an easily interpretable model.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the partially cleaned survey data that the rest of this notebook relabels.
df = pd.read_csv(filepath_or_buffer='./data/cleaned_data.csv')

In [3]:
# Preview the first 20 rows of the freshly loaded frame.
df.head(n=20)

Unnamed: 0,HEFAMINC,HRNUMHOU,GEREG,GEDIV,GCFIP,GTMETSTA,GCTCS,PERRP,PEPARENT,PRTAGE,...,HESP7,HESP7A,HESP8,HESS1,HESH4,HESC1,HESC2,HESC3,HESC3A,food_secure
0,14,4,4,9,2,1,0,1,-1,37,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
1,14,4,4,9,2,1,0,3,-1,35,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
2,14,4,4,9,2,1,0,4,1,16,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
3,14,4,4,9,2,1,0,4,1,13,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
4,13,3,3,7,5,1,0,1,-1,28,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
5,13,3,3,7,5,1,0,3,-1,27,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
6,13,3,3,7,5,1,0,4,1,1,...,-1,-1,-1,0,-1,-1,-1,-1,-1,0
7,-1,0,3,7,5,1,340,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,-1,0,3,7,5,1,340,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,7,3,3,7,5,1,340,1,-1,22,...,-1,2,2,1,2,-1,-1,2,1,1


- Changing up the state/region/division/metro columns to their corresponding real values so that I can dummy the columns up for modeling to get a better sense of food insecurity per state

In [4]:
# Translate the numeric state code stored in GCFIP into a two-letter postal
# abbreviation so the column can be dummy-encoded per state.
# Codes that are not keys of this mapping are left unchanged by replace().
FIPS_TO_STATE = {
    1: 'AL', 2: 'AK', 4: 'AZ', 5: 'AR', 6: 'CA', 8: 'CO', 9: 'CT', 10: 'DE',
    11: 'DC', 12: 'FL', 13: 'GA', 15: 'HI', 16: 'ID', 17: 'IL', 18: 'IN',
    19: 'IA', 20: 'KS', 21: 'KY', 22: 'LA', 23: 'ME', 24: 'MD', 25: 'MA',
    26: 'MI', 27: 'MN', 28: 'MS', 29: 'MO', 30: 'MT', 31: 'NE', 32: 'NV',
    33: 'NH', 34: 'NJ',
    35: 'NM',  # previously 'MN', which merged New Mexico into Minnesota (27)
    36: 'NY', 37: 'NC', 38: 'ND', 39: 'OH', 40: 'OK', 41: 'OR', 42: 'PA',
    44: 'RI', 45: 'SC', 46: 'SD', 47: 'TN', 48: 'TX', 49: 'UT', 50: 'VT',
    51: 'VA', 53: 'WA', 54: 'WV', 55: 'WI', 56: 'WY',
}

# Assign the result instead of calling replace(..., inplace=True) on
# df['GCFIP']: the in-place form acts on a temporary Series and no longer
# updates df once pandas Copy-on-Write is active.
df['state'] = df['GCFIP'].replace(FIPS_TO_STATE)
df = df.drop(columns=['GCFIP'])

In [12]:
# GTMETSTA -> is_metro: code 1 becomes 'Metro'; codes 2 and 3 both become 'No'.
# The relabelled Series is assigned (not replaced in place on df['GTMETSTA']),
# because the chained in-place form stops updating df under pandas
# Copy-on-Write.
df['is_metro'] = df['GTMETSTA'].replace({1: 'Metro', 2: 'No', 3: 'No'})
df = df.drop(columns=['GTMETSTA'])

In [15]:
# Check that the new state and is_metro columns were appended.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,GEREG,GEDIV,GCTCS,PERRP,PEPARENT,PRTAGE,PRTFAGE,PEMARITL,...,HESP8,HESS1,HESH4,HESC1,HESC2,HESC3,HESC3A,food_secure,state,is_metro
0,14,4,4,9,0,1,-1,37,0,1,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
1,14,4,4,9,0,3,-1,35,0,1,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
2,14,4,4,9,0,4,1,16,0,6,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
3,14,4,4,9,0,4,1,13,0,-1,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
4,13,3,3,7,0,1,-1,28,0,1,...,-1,0,-1,-1,-1,-1,-1,0,AR,Metro


In [16]:
# Relabel the numeric region and division codes with short names.
REGION_LABELS = {1: 'North', 2: 'MidW', 3: 'South', 4: 'West'}
DIVISION_LABELS = {
    1: 'NewEng', 2: 'MidATL', 3: 'ENC', 4: 'WNC', 5: 'SouthATL',
    6: 'ESC', 7: 'WSC', 8: 'MNTN', 9: 'Pacif',
}

# Assign back rather than using replace(..., inplace=True) on the column
# selection; the chained in-place form no longer updates df under pandas
# Copy-on-Write.
df['GEREG'] = df['GEREG'].replace(REGION_LABELS)
df['GEDIV'] = df['GEDIV'].replace(DIVISION_LABELS)

In [18]:
# Confirm GEREG and GEDIV now hold labels instead of numeric codes.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,GEREG,GEDIV,GCTCS,PERRP,PEPARENT,PRTAGE,PRTFAGE,PEMARITL,...,HESP8,HESS1,HESH4,HESC1,HESC2,HESC3,HESC3A,food_secure,state,is_metro
0,14,4,West,Pacif,0,1,-1,37,0,1,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
1,14,4,West,Pacif,0,3,-1,35,0,1,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
2,14,4,West,Pacif,0,4,1,16,0,6,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
3,14,4,West,Pacif,0,4,1,13,0,-1,...,-1,0,-1,-1,-1,-1,-1,0,AK,Metro
4,13,3,South,WSC,0,1,-1,28,0,1,...,-1,0,-1,-1,-1,-1,-1,0,AR,Metro


In [19]:
# Copy the relabelled geography columns to readable names (appended at the
# end of the frame), then discard the survey-coded originals.
for new_name, old_name in (('region', 'GEREG'), ('division', 'GEDIV')):
    df[new_name] = df[old_name]
df = df.drop(columns=['GEREG', 'GEDIV'])

In [21]:
# Relabel PESEX codes. 'NaN' here is a literal string label for code -1,
# not a missing value. Assigned back because replace(..., inplace=True) on a
# column selection no longer updates df under pandas Copy-on-Write.
df['PESEX'] = df['PESEX'].replace({1: 'Male', 2: 'Female', -1: 'NaN'})

In [22]:
# Frequency of each sex label after relabelling.
df.loc[:, 'PESEX'].value_counts()

Female    60640
Male      57292
NaN       21032
Name: PESEX, dtype: int64

In [23]:
# Keep the relabelled values under a readable name and remove the coded column.
df['sex'] = df.loc[:, 'PESEX']
df = df.drop(columns=['PESEX'])

- Dropping additional columns that ask the same questions over and over about a person's job status, disability status, etc.

In [25]:
# Columns removed because they repeat questions already covered elsewhere
# (job status, disability status, etc.). 'PRTFAGE' was listed twice in the
# original drop list; it appears once here. The two consecutive drops are
# combined into a single call.
REDUNDANT_COLUMNS = [
    'PERRP', 'PEPARENT', 'PRTFAGE', 'PESPOUSE', 'PEAFNOW', 'PENATVTY',
    'PEMNTVTY', 'PEFNTVTY', 'PEDADTYP', 'PEMOMTYP', 'PEMLR', 'PEHRUSL1',
    'PEHRUSL2', 'PEHRFTPT', 'PRFAMREL', 'PRCITSHP', 'PRINUYER', 'PUSLFPRX',
    'PRAGNA', 'PRCOW1', 'PRCOW2', 'PRDTIND1', 'PRDTIND2', 'PRNAGWS',
    'PRSJMJ', 'PRERELG', 'PRERNWA', 'PRDTHSP', 'PRDASIAN',
    'PUCHINHH', 'PULINENO', 'PUWK',
]
df = df.drop(columns=REDUNDANT_COLUMNS)

- The education column has values from grades k - college so I will be changing this to represent elementary, high school, and college to dummy up for modeling as well.

In [27]:
# Collapse the PEEDUCA codes into coarse education levels.
# 'NaN' is a literal string label for code -1, not a missing value.
EDUCATION_LABELS = {
    31: 'elem', 32: 'elem', 33: 'elem', 34: 'elem',
    35: 'HS', 36: 'HS', 37: 'HS', 38: 'HS', 39: 'HS',
    40: 'Col',
    41: 'ASDeg', 42: 'ASDeg',
    43: 'BachDeg',
    44: 'MasD',
    45: 'DocD', 46: 'DocD',
    -1: 'NaN',
}

# Assign back rather than using replace(..., inplace=True) on the column
# selection; the chained in-place form no longer updates df under pandas
# Copy-on-Write.
df['PEEDUCA'] = df['PEEDUCA'].replace(EDUCATION_LABELS)

In [30]:
# Frequency of each education label after relabelling.
df.loc[:, 'PEEDUCA'].value_counts()

NaN        41838
HS         36428
BachDeg    19850
Col        16501
ASDeg       9419
MasD        8448
elem        3434
DocD        3046
Name: PEEDUCA, dtype: int64

In [31]:
# Keep the relabelled values under a readable name and remove the coded column.
df['education'] = df.loc[:, 'PEEDUCA']
df = df.drop(columns=['PEEDUCA'])

In [33]:
# Check the columns added so far (state through education).
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,GCTCS,PRTAGE,PEMARITL,PEAFEVER,PTDTRACE,PEHSPNON,PUDIS,PEMJOT,...,HESC2,HESC3,HESC3A,food_secure,state,is_metro,region,division,sex,education
0,14,4,0,37,1,2,1,2,-1,2,...,-1,-1,-1,0,AK,Metro,West,Pacif,Male,Col
1,14,4,0,35,1,2,1,2,1,-1,...,-1,-1,-1,0,AK,Metro,West,Pacif,Female,ASDeg
2,14,4,0,16,6,-1,1,2,-1,-1,...,-1,-1,-1,0,AK,Metro,West,Pacif,Male,HS
3,14,4,0,13,-1,-1,1,2,-1,-1,...,-1,-1,-1,0,AK,Metro,West,Pacif,Female,
4,13,3,0,28,1,2,1,2,-1,2,...,-1,-1,-1,0,AR,Metro,South,WSC,Male,Col


- Changing Race numerical values back to categorical values to dummy up for modeling

In [34]:
# Inspect how often each raw PTDTRACE code occurs before relabelling.
df.loc[:, 'PTDTRACE'].value_counts()

 1     94830
-1     21032
 2     12251
 4      6279
 3      1557
 6       770
 7       731
 8       504
 5       472
 10       99
 9        96
 16       86
 21       72
 15       62
 11       37
 17       24
 20       17
 19       15
 13       15
 12        6
 18        4
 14        3
 26        1
 25        1
Name: PTDTRACE, dtype: int64

In [35]:
# Relabel PTDTRACE: codes 1-5 get their own label, every code from 6 to 26
# is grouped as 'mix', and -1 becomes the literal string 'NaN'.
RACE_LABELS = {
    1: 'White',
    2: 'Black',
    3: 'AmerInd',
    # NOTE(review): 'Asain' is a misspelling of 'Asian'. It is kept verbatim
    # because it becomes a category value that later notebooks may match on;
    # fix it together with any consumer of this file.
    4: 'Asain',
    5: 'Hawaiian',
    -1: 'NaN',
}
RACE_LABELS.update({code: 'mix' for code in range(6, 27)})

# Assign the result instead of calling replace(..., inplace=True) on
# df['PTDTRACE']; the chained in-place form no longer updates df under
# pandas Copy-on-Write.
df['race'] = df['PTDTRACE'].replace(RACE_LABELS)
df = df.drop(columns=['PTDTRACE'])

In [39]:
# List the remaining column names to decide which coded columns to handle next.
df.columns

Index(['HEFAMINC', 'HRNUMHOU', 'GCTCS', 'PRTAGE', 'PEMARITL', 'PEAFEVER',
       'PEHSPNON', 'PUDIS', 'PEMJOT', 'PEMJNUM', 'PEHRUSLT', 'PEHRRSN1',
       'PULK', 'PELKLL1O', 'PELKLL2O', 'PEJHRSN', 'PRHRUSL', 'PEIO1COW',
       'PUIO1MFG', 'PEIO2COW', 'PUIO2MFG', 'PRDTOCC1', 'PRDTOCC2', 'PEERNUOT',
       'PEERNPER', 'PEERNRT', 'PEERNHRY', 'PUERNH1C', 'PEERNH2', 'PEERNHRO',
       'PEERN', 'PEERNWKP', 'PEERNLAB', 'PEERNCOV', 'PENLFACT', 'PESCHFT',
       'PESCHLVL', 'PRNLFSCH', 'PRNMCHLD', 'QSTNUM', 'OCCURNUM', 'PEDIPGED',
       'PECYC', 'PEAFWHN1', 'PRDISFLG', 'HXFAMINC', 'HETSP3O', 'HESP6',
       'HESP7', 'HESP7A', 'HESP8', 'HESS1', 'HESH4', 'HESC1', 'HESC2', 'HESC3',
       'HESC3A', 'food_secure', 'state', 'is_metro', 'region', 'division',
       'sex', 'education', 'race'],
      dtype='object')

- Switching marital status back to categorical values to dummy up for modeling

In [40]:
# Inspect how often each raw PEMARITL code occurs before relabelling.
df.loc[:, 'PEMARITL'].value_counts()

 1    48834
-1    41838
 6    28643
 4    10445
 3     6341
 5     1542
 2     1321
Name: PEMARITL, dtype: int64

In [41]:
# Relabel PEMARITL: codes 1 and 2 -> 'Yes', 3 -> 'Widow', 4 -> 'Divorced',
# 5 -> 'Separate', and both 6 and -1 -> 'No'.
MARITAL_LABELS = {
    -1: 'No', 1: 'Yes', 2: 'Yes', 3: 'Widow',
    4: 'Divorced', 5: 'Separate', 6: 'No',
}

# Assign the result instead of calling replace(..., inplace=True) on
# df['PEMARITL']; the chained in-place form no longer updates df under
# pandas Copy-on-Write.
df['marital_status'] = df['PEMARITL'].replace(MARITAL_LABELS)
df = df.drop(columns=['PEMARITL'])

In [44]:
# GCTCS is removed: region/division/state already describe location, and the
# original author notes this column does not say where its codes come from.
# NOTE(review): the author describes GCTCS as a zipcode column - confirm
# against the codebook.
df = df.drop(columns=['GCTCS'])

In [45]:
# Inspect how often each raw HESC3A code occurs before relabelling.
df.loc[:, 'HESC3A'].value_counts()

-1    109001
 1     19364
 2      7587
-2      2989
-3        19
-9         4
Name: HESC3A, dtype: int64

In [46]:
# Relabel HESC3A: 1 -> 'Yes', 2 -> 'No', every negative code -> 'Unknown'.
# Code -9 was previously mapped to 'No'. It is a non-response code like -2
# and -3, so it is now grouped with them instead of being counted as a real
# "No" answer (confirm the code meanings against the survey codebook).
FOOD_PANTRY_LABELS = {
    1: 'Yes', 2: 'No',
    -1: 'Unknown', -2: 'Unknown', -3: 'Unknown', -9: 'Unknown',
}

# Assign the result instead of calling replace(..., inplace=True) on
# df['HESC3A']; the chained in-place form no longer updates df under pandas
# Copy-on-Write.
df['food_pantry'] = df['HESC3A'].replace(FOOD_PANTRY_LABELS)
df = df.drop(columns=['HESC3A'])

In [49]:
# Check the columns added so far (state through food_pantry).
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEAFEVER,PEHSPNON,PUDIS,PEMJOT,PEMJNUM,PEHRUSLT,PEHRRSN1,...,food_secure,state,is_metro,region,division,sex,education,race,marital_status,food_pantry
0,14,4,37,2,2,-1,2,-1,60,-1,...,0,AK,Metro,West,Pacif,Male,Col,White,Yes,Unknown
1,14,4,35,2,2,1,-1,-1,-1,-1,...,0,AK,Metro,West,Pacif,Female,ASDeg,White,Yes,Unknown
2,14,4,16,-1,2,-1,-1,-1,-1,-1,...,0,AK,Metro,West,Pacif,Male,HS,White,No,Unknown
3,14,4,13,-1,2,-1,-1,-1,-1,-1,...,0,AK,Metro,West,Pacif,Female,,White,No,Unknown
4,13,3,28,2,2,-1,2,-1,40,-1,...,0,AR,Metro,South,WSC,Male,Col,White,Yes,Unknown


In [50]:
# PUDIS -> has_dis: only code 1 counts as 'Yes'; codes 2, 3 and -1 are 'No'.
# Assigned rather than replaced in place on df['PUDIS'], because the chained
# in-place form no longer updates df under pandas Copy-on-Write.
df['has_dis'] = df['PUDIS'].replace({1: 'Yes', 2: 'No', 3: 'No', -1: 'No'})
df = df.drop(columns=['PUDIS'])

In [71]:
# Preview the frame after adding has_dis.
# NOTE(review): the saved output of this cell already lists mul_jobs,
# service_status and is_hispanic, which are only created in later cells, so
# it was last executed out of order. Re-run the notebook top to bottom to
# refresh it.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEHRUSLT,PEHRRSN1,PULK,PELKLL1O,PELKLL2O,PEJHRSN,PRHRUSL,...,division,sex,education,race,marital_status,food_pantry,has_dis,mul_jobs,service_status,is_hispanic
0,14,4,37,60,-1,-1,-1,-1,-1,6,...,Pacif,Male,Col,White,Yes,Unknown,No,No,No,No
1,14,4,35,-1,-1,-1,-1,-1,-1,-1,...,Pacif,Female,ASDeg,White,Yes,Unknown,Yes,,No,No
2,14,4,16,-1,-1,2,-1,-1,-1,-1,...,Pacif,Male,HS,White,No,Unknown,No,,No,No
3,14,4,13,-1,-1,-1,-1,-1,-1,-1,...,Pacif,Female,,White,No,Unknown,No,,No,No
4,13,3,28,40,-1,-1,-1,-1,-1,4,...,WSC,Male,Col,White,Yes,Unknown,No,No,No,No


In [54]:
# Frequency of each PEMJNUM code.
df.loc[:, 'PEMJNUM'].value_counts()

-1    135812
 2      2876
 3       225
 4        51
Name: PEMJNUM, dtype: int64

In [58]:
# Frequency of each PEMJOT value.
# NOTE(review): this cell sits before the cell that relabels PEMJOT, yet its
# saved output shows the labels. On a fresh top-to-bottom run it will show
# the raw numeric codes instead.
df.loc[:, 'PEMJOT'].value_counts()

NaN    81960
No     53852
Yes     3152
Name: PEMJOT, dtype: int64

In [57]:
# PEMJOT -> mul_jobs: 1 -> 'Yes', 2 -> 'No', -1 -> the literal string 'NaN'.
# Assigned rather than replaced in place on df['PEMJOT'], because the chained
# in-place form no longer updates df under pandas Copy-on-Write.
df['mul_jobs'] = df['PEMJOT'].replace({1: 'Yes', 2: 'No', -1: 'NaN'})

# PEMJNUM is dropped alongside the coded source column.
df = df.drop(columns=['PEMJOT', 'PEMJNUM'])

In [76]:
# Preview the frame after adding mul_jobs.
# NOTE(review): the saved output already lists service_status and
# is_hispanic, which are created in later cells, so this cell was last
# executed out of order.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEHRUSLT,PEHRRSN1,PULK,PELKLL1O,PELKLL2O,PEJHRSN,PRHRUSL,...,division,sex,education,race,marital_status,food_pantry,has_dis,mul_jobs,service_status,is_hispanic
0,14,4,37,60,-1,-1,-1,-1,-1,6,...,Pacif,Male,Col,White,Yes,Unknown,No,No,No,No
1,14,4,35,-1,-1,-1,-1,-1,-1,-1,...,Pacif,Female,ASDeg,White,Yes,Unknown,Yes,,No,No
2,14,4,16,-1,-1,2,-1,-1,-1,-1,...,Pacif,Male,HS,White,No,Unknown,No,,No,No
3,14,4,13,-1,-1,-1,-1,-1,-1,-1,...,Pacif,Female,,White,No,Unknown,No,,No,No
4,13,3,28,40,-1,-1,-1,-1,-1,4,...,WSC,Male,Col,White,Yes,Unknown,No,No,No,No


In [63]:
# PEAFEVER -> service_status: 1 -> 'Yes'; 2 and -1 -> 'No'.
# Assigned rather than replaced in place on df['PEAFEVER'], because the
# chained in-place form no longer updates df under pandas Copy-on-Write.
df['service_status'] = df['PEAFEVER'].replace({1: 'Yes', 2: 'No', -1: 'No'})
df = df.drop(columns=['PEAFEVER'])

In [67]:
# Inspect how often each raw PEHSPNON code occurs before relabelling.
df.loc[:, 'PEHSPNON'].value_counts()

 2    100673
-1     21032
 1     17259
Name: PEHSPNON, dtype: int64

In [68]:
# PEHSPNON -> is_hispanic: 1 -> 'Yes'; 2 and -1 -> 'No'.
# Assigned rather than replaced in place on df['PEHSPNON'], because the
# chained in-place form no longer updates df under pandas Copy-on-Write.
df['is_hispanic'] = df['PEHSPNON'].replace({1: 'Yes', 2: 'No', -1: 'No'})
df = df.drop(columns=['PEHSPNON'])

In [81]:
# Frequency of each PELKLL2O value.
# NOTE(review): this cell sits before the cell that relabels PELKLL2O, yet
# its saved output shows the labels. On a fresh top-to-bottom run it will
# show the raw numeric codes instead.
df.loc[:, 'PELKLL2O'].value_counts()

NaN     138057
Lost       434
Quit       277
Temp       196
Name: PELKLL2O, dtype: int64

In [80]:
# PELKLL2O -> job_loss: 1 -> 'Lost', 2 -> 'Quit', 3 -> 'Temp',
# -1 -> the literal string 'NaN'.
# Assigned rather than replaced in place on df['PELKLL2O'], because the
# chained in-place form no longer updates df under pandas Copy-on-Write.
df['job_loss'] = df['PELKLL2O'].replace(
    {-1: 'NaN', 1: 'Lost', 2: 'Quit', 3: 'Temp'}
)

# The coded source column is dropped together with three other coded columns.
df = df.drop(columns=['PEHRRSN1', 'PULK', 'PELKLL1O', 'PELKLL2O'])

In [84]:
# Check that job_loss was appended and the coded columns are gone.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEHRUSLT,PEJHRSN,PRHRUSL,PEIO1COW,PUIO1MFG,PEIO2COW,PUIO2MFG,...,sex,education,race,marital_status,food_pantry,has_dis,mul_jobs,service_status,is_hispanic,job_loss
0,14,4,37,60,-1,6,4,1,-1,-1,...,Male,Col,White,Yes,Unknown,No,No,No,No,
1,14,4,35,-1,-1,-1,-1,-1,-1,-1,...,Female,ASDeg,White,Yes,Unknown,Yes,,No,No,
2,14,4,16,-1,-1,-1,-1,-1,-1,-1,...,Male,HS,White,No,Unknown,No,,No,No,
3,14,4,13,-1,-1,-1,-1,-1,-1,-1,...,Female,,White,No,Unknown,No,,No,No,
4,13,3,28,40,-1,4,4,2,-1,-1,...,Male,Col,White,Yes,Unknown,No,No,No,No,


In [87]:
# Frequency of each PEIO1COW value.
# NOTE(review): this cell sits before the cell that relabels PEIO1COW, yet
# its saved output shows the labels. On a fresh top-to-bottom run it will
# show the raw numeric codes instead.
df.loc[:, 'PEIO1COW'].value_counts()

NaN         79564
ForProf     40432
Gov          8478
Self-emp     6291
NonProf      4166
w/o pay        33
Name: PEIO1COW, dtype: int64

In [86]:
# PEIO1COW -> type_job: codes 1-3 -> 'Gov', 4 -> 'ForProf', 5 -> 'NonProf',
# 6-7 -> 'Self-emp', 8 -> 'w/o pay', -1 -> the literal string 'NaN'.
JOB_TYPE_LABELS = {
    -1: 'NaN',
    1: 'Gov', 2: 'Gov', 3: 'Gov',
    4: 'ForProf',
    5: 'NonProf',
    6: 'Self-emp', 7: 'Self-emp',
    8: 'w/o pay',
}

# Assign the result instead of calling replace(..., inplace=True) on
# df['PEIO1COW']; the chained in-place form no longer updates df under
# pandas Copy-on-Write.
df['type_job'] = df['PEIO1COW'].replace(JOB_TYPE_LABELS)
df = df.drop(columns=['PEJHRSN', 'PRHRUSL', 'PEIO1COW', 'PUIO1MFG',
                      'PEIO2COW', 'PUIO2MFG'])

In [93]:
# Preview the frame after adding type_job.
# NOTE(review): the saved output already shows pay_period and a relabelled
# PEERNPER, both of which are produced by cells further down, so this cell
# was last executed out of order.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEHRUSLT,PRDTOCC1,PRDTOCC2,PEERNUOT,PEERNPER,PEERNRT,PEERNHRY,...,race,marital_status,food_pantry,has_dis,mul_jobs,service_status,is_hispanic,job_loss,type_job,pay_period
0,14,4,37,60,19,-1,-1,,-1,-1,...,White,Yes,Unknown,No,No,No,No,,ForProf,
1,14,4,35,-1,-1,-1,-1,,-1,-1,...,White,Yes,Unknown,Yes,,No,No,,,
2,14,4,16,-1,-1,-1,-1,,-1,-1,...,White,No,Unknown,No,,No,No,,,
3,14,4,13,-1,-1,-1,-1,,-1,-1,...,White,No,Unknown,No,,No,No,,,
4,13,3,28,40,16,-1,2,bi-week,1,1,...,White,Yes,Unknown,No,No,No,No,,ForProf,bi-week


In [91]:
# PEERNPER -> pay_period: 1 -> 'hour', 2 -> 'week', 3 -> 'bi-week',
# 4 and 5 -> 'month', 6 -> 'year', 7 -> 'other',
# -1 -> the literal string 'NaN'.
PAY_PERIOD_LABELS = {
    -1: 'NaN', 1: 'hour', 2: 'week', 3: 'bi-week',
    4: 'month', 5: 'month', 6: 'year', 7: 'other',
}

# Assign the result instead of calling replace(..., inplace=True) on
# df['PEERNPER']; the chained in-place form no longer updates df under
# pandas Copy-on-Write.
df['pay_period'] = df['PEERNPER'].replace(PAY_PERIOD_LABELS)
df = df.drop(columns=['PRDTOCC1', 'PRDTOCC2', 'PEERNUOT', 'PEERNPER',
                      'PEERNRT', 'PEERNHRY'])

In [95]:
# Check that pay_period was appended and the coded earnings columns are gone.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEHRUSLT,PUERNH1C,PEERNH2,PEERNHRO,PEERN,PEERNWKP,PEERNLAB,...,race,marital_status,food_pantry,has_dis,mul_jobs,service_status,is_hispanic,job_loss,type_job,pay_period
0,14,4,37,60,-1,-1,-1,-1,-1,-1,...,White,Yes,Unknown,No,No,No,No,,ForProf,
1,14,4,35,-1,-1,-1,-1,-1,-1,-1,...,White,Yes,Unknown,Yes,,No,No,,,
2,14,4,16,-1,-1,-1,-1,-1,-1,-1,...,White,No,Unknown,No,,No,No,,,
3,14,4,13,-1,-1,-1,-1,-1,-1,-1,...,White,No,Unknown,No,,No,No,,,
4,13,3,28,40,-1,1545,-1,-1,-1,2,...,White,Yes,Unknown,No,No,No,No,,ForProf,bi-week


In [99]:
# Frequency of each PEERNLAB value.
# NOTE(review): this cell sits before the cell that relabels PEERNLAB, yet
# its saved output shows the labels. On a fresh top-to-bottom run it will
# show the raw numeric codes instead.
df.loc[:, 'PEERNLAB'].value_counts()

No     137685
Yes      1279
Name: PEERNLAB, dtype: int64

In [97]:
# PEERNLAB -> in_union: 1 -> 'Yes'; 2 and -1 -> 'No'.
# Assigned rather than replaced in place on df['PEERNLAB'], because the
# chained in-place form no longer updates df under pandas Copy-on-Write.
df['in_union'] = df['PEERNLAB'].replace({-1: 'No', 1: 'Yes', 2: 'No'})
df = df.drop(columns=['PEERNH2', 'PUERNH1C', 'PEERNHRO', 'PEERN',
                      'PEERNWKP', 'PEERNLAB'])

In [107]:
# PESCHLVL -> in_school: 1 -> 'HS', 2 -> 'College',
# -1 -> the literal string 'NaN'.
# Assigned rather than replaced in place on df['PESCHLVL'], because the
# chained in-place form no longer updates df under pandas Copy-on-Write.
df['in_school'] = df['PESCHLVL'].replace({-1: 'NaN', 1: 'HS', 2: 'College'})

# Remove the coded source column and the remaining coded columns that are
# not carried into modeling (previously three separate drop cells).
df = df.drop(columns=[
    'PEERNCOV', 'PENLFACT', 'PESCHFT', 'PESCHLVL', 'PRNLFSCH',
    'OCCURNUM', 'PEDIPGED', 'PECYC', 'PEAFWHN1',
    'PRDISFLG', 'HXFAMINC',
])

In [118]:
# Final look at the frame before it is written out.
df.head(n=5)

Unnamed: 0,HEFAMINC,HRNUMHOU,PRTAGE,PEHRUSLT,PRNMCHLD,QSTNUM,HETSP3O,HESP6,HESP7,HESP7A,...,food_pantry,has_dis,mul_jobs,service_status,is_hispanic,job_loss,type_job,pay_period,in_union,in_school
0,14,4,37,60,2,1,-1,-1,-1,-1,...,Unknown,No,No,No,No,,ForProf,,No,
1,14,4,35,-1,2,1,-1,-1,-1,-1,...,Unknown,Yes,,No,No,,,,No,
2,14,4,16,-1,0,1,-1,-1,-1,-1,...,Unknown,No,,No,No,,,,No,HS
3,14,4,13,-1,0,1,-1,-1,-1,-1,...,Unknown,No,,No,No,,,,No,
4,13,3,28,40,1,2,-1,-1,-1,-1,...,Unknown,No,No,No,No,,ForProf,bi-week,No,College


In [119]:
# The frame now holds labelled categorical columns plus the numeric columns
# kept for modeling; write it out without the row index.
# NOTE(review): this is the same path the notebook reads at the top, so the
# input is overwritten. A second run will fail because the coded columns
# (e.g. GCFIP) are no longer in the file. Consider a separate output file.
# NOTE(review): the literal 'NaN' labels are parsed as missing values by
# pd.read_csv's default settings when this file is loaded again.
df.to_csv(path_or_buf='./data/cleaned_data.csv', index=False)