# Parse 2018 Data

Per the data dictionary, household, family, and person records are all stored in the same file, so they need to be split out before they can be imported into the main notebook.

[Useful link on how to use Pandas to parse fixed-width files.](https://towardsdatascience.com/parsing-fixed-width-text-files-with-pandas-f1db8f737276)

Records are organized by:

- Household: 92,139 records, 1,076 characters each - **identified by first value = 1**
- Family: 79,236 records, 1,076 characters each - **identified by first value = 2**
- Person: 180,084 records, 1,076 characters each - **identified by first value = 3**


In [86]:
# grab the imports needed for the project
import pandas as pd
import glob

In [127]:
# Location of the raw 2018 fixed-width file; the CSV exports are written
# next to it, under the same directory prefix.
data_path = '~/Documents/CNM/DataScience/'
file_name = 'asec2018_pubuse.dat'
full_file_name = f"{data_path}{file_name}"

# Value of the first character of a line that marks the record type:
# household, family and person respectively.
hh_rec_type, ff_rec_type, pp_rec_type = 1, 2, 3

# 2018

## Household Record - 2018

In [145]:
# Household record layout. Each entry pairs a column name with the
# (start, end) character positions handed to pandas as its colspec, so a
# name can never drift out of step with its position.
_hh_key_layout = [
    ('REC_TYPE', (0, 1)),
    ('H_IDNUM1', (343, 358)),
    ('H_IDNUM2', (319, 324)),
    ('H_SEQ', (1, 6)),
]
_hh_field_layout = [
    ('GTMETSTA', (52, 53)),
    ('GEDIV', (328, 329)),
    ('GESTFIPS', (41, 43)),
    ('HHINC', (271, 273)),
    ('H_TENURE', (34, 35)),
    ('H_LIVQRT', (30, 32)),
]

# Household variables of interest (excluding the record-type/ID columns)
household_cols = [col for col, _ in _hh_field_layout]

# Start and end positions, in the same order as all_hh_cols
hh_specs = [span for _, span in _hh_key_layout + _hh_field_layout]

# Full list of household column names: keys first, then the variables
all_hh_cols = [col for col, _ in _hh_key_layout] + household_cols

In [146]:
# Parse every line of the raw fixed-width file using the household layout.
# All record types live in the same file, so the resulting frame also holds
# rows that are not household records; REC_TYPE identifies which are.
#
# The two household-ID fragments are read as text rather than with an
# inferred dtype: numeric inference drops leading zeros (and yields floats
# when a field is blank on some row), which would corrupt any ID built by
# concatenating the fragments as strings.
hh_data = pd.read_fwf(full_file_name, skiprows=0,
                      skipfooter=0, colspecs=hh_specs, names=all_hh_cols,
                      dtype={'H_IDNUM1': str, 'H_IDNUM2': str})

In [147]:
# Post processing: keep only household rows, build the combined household
# ID, tag the survey year and export to CSV.
household_rows = hh_data['REC_TYPE'] == hh_rec_type
hh_data_only = hh_data.loc[household_rows].copy()

# Concatenate the two ID fragments as strings, then discard the fragments
id_head = hh_data_only['H_IDNUM1'].map(str)
id_tail = hh_data_only['H_IDNUM2'].map(str)
hh_data_only['H_IDNUM'] = id_head + id_tail
hh_data_only = hh_data_only.drop(columns=['H_IDNUM1', 'H_IDNUM2'])

hh_data_only['DATA_YEAR'] = '2018'

hh_csv_path = data_path + 'hhpub18.csv'
hh_data_only.to_csv(hh_csv_path)
# hh_data_only.shape

## Family Record - 2018

In [148]:
# Note: FKINDEX, FINC_ANN, FINC_DST and FINC_PEN are not in 2018, so they
# are not part of this layout.

# Family record layout. Each entry pairs a column name with the
# (start, end) character positions handed to pandas as its colspec.
_ff_key_layout = [
    ('REC_TYPE', (0, 1)),
    ('FH_SEQ', (1, 5)),
]
_ff_field_layout = [
    ('FINC_FR', (62, 63)),
    ('FINC_SE', (54, 55)),
    ('FINC_WS', (46, 47)),
    ('FINC_CSP', (172, 173)),
    ('FINC_DIS', (124, 125)),
    ('FINC_DIV', (148, 149)),
    ('FINC_RNT', (156, 157)),
    ('FINC_ED', (164, 165)),
    ('FINC_SS', (86, 87)),
    ('FINC_SSI', (94, 95)),
    ('FINC_FIN', (188, 189)),
    ('FINC_SUR', (116, 117)),
    ('FINC_INT', (140, 141)),
    ('FINC_UC', (70, 71)),
    ('FINC_OI', (196, 197)),
    ('FINC_VET', (108, 109)),
    ('FINC_PAW', (101, 102)),
    ('FINC_WC', (78, 79)),
]

# Family variables of interest (excluding the record-type/sequence columns)
family_cols = [col for col, _ in _ff_field_layout]

# Start and end positions, in the same order as all_ff_cols
ff_specs = [span for _, span in _ff_key_layout + _ff_field_layout]

# Full list of family column names: keys first, then the variables
all_ff_cols = [col for col, _ in _ff_key_layout] + family_cols

In [149]:
# Parse every line of the raw fixed-width file using the family layout.
# The frame holds all record types at this point; REC_TYPE tells them apart.
ff_data = pd.read_fwf(
    full_file_name,
    colspecs=ff_specs,
    names=all_ff_cols,
    skiprows=0,
    skipfooter=0,
)

In [150]:
# Post processing: keep only family rows, tag the survey year and export
# the result to CSV.
family_rows = ff_data['REC_TYPE'] == ff_rec_type
ff_data_only = ff_data.loc[family_rows].copy()
ff_data_only = ff_data_only.assign(DATA_YEAR='2018')

ff_csv_path = data_path + 'ffpub18.csv'
ff_data_only.to_csv(ff_csv_path)
# ff_data_only.shape

## Person Record - 2018

In [160]:
# Person record layout. Each entry pairs a column name with the
# (start, end) character positions handed to pandas as its colspec.
_pp_key_layout = [
    ('REC_TYPE', (0, 1)),
    ('PERIDNUM', (95, 117)),
]
_pp_field_layout = [
    ('OCCUP', (295, 298)),
    ('A_MJOCC', (210, 212)),
    ('A_DTOCC', (212, 214)),
    ('AGE1', (43, 45)),
    ('A_SEX', (23, 24)),
    ('PRDTRACE', (26, 28)),
    ('PXRACE1', (858, 860)),
    ('PRCITSHP', (94, 95)),
    ('A_HGA', (24, 26)),
    ('PRERELG', (182, 183)),
    ('A_GRSWK', (190, 193)),
    ('HRCHECK', (269, 270)),
    ('HRSWK', (267, 269)),
    ('PEARNVAL', (587, 594)),
    ('A_CLSWKR', (175, 176)),
    ('WEIND', (286, 288)),
    ('A_MARITL', (20, 21)),
    ('A_HSCOL', (197, 198)),
    ('A_WKSTAT', (201, 202)),
    ('HEA', (690, 691)),
    ('PEINUSYR', (92, 94)),
]

# Person variables of interest (excluding the record-type/ID columns)
person_cols = [col for col, _ in _pp_field_layout]

# Start and end positions, in the same order as all_pp_cols
pp_specs = [span for _, span in _pp_key_layout + _pp_field_layout]

# Full list of person column names: keys first, then the variables
all_pp_cols = [col for col, _ in _pp_key_layout] + person_cols

In [161]:
# Parse every line of the raw fixed-width file using the person layout.
# The frame holds all record types at this point; REC_TYPE tells them apart.
pp_data = pd.read_fwf(
    full_file_name,
    colspecs=pp_specs,
    names=all_pp_cols,
    skiprows=0,
    skipfooter=0,
)

In [162]:
# Post processing: keep only person rows, tag the survey year and export
# the result to CSV.
person_rows = pp_data['REC_TYPE'] == pp_rec_type
pp_data_only = pp_data.loc[person_rows].copy()
pp_data_only = pp_data_only.assign(DATA_YEAR='2018')

pp_csv_path = data_path + 'pppub18.csv'
pp_data_only.to_csv(pp_csv_path)
# pp_data_only.shape

In [163]:
# Show the person-level frame as the cell output for a visual sanity check.
pp_data_only

Unnamed: 0,REC_TYPE,PERIDNUM,OCCUP,A_MJOCC,A_DTOCC,AGE1,A_SEX,PRDTRACE,PXRACE1,PRCITSHP,...,HRSWK,PEARNVAL,A_CLSWKR,WEIND,A_MARITL,A_HSCOL,A_WKSTAT,HEA,PEINUSYR,DATA_YEAR
3,3,6411300043201450801101,0,0,0,16,1,1,0,1,...,0,0,0,23,7,0,1,4,0,2018
7,3,0140236332005010811101,1,5,17,10,1,1,0,1,...,40,120,2,7,1,0,5,3,0,2018
8,3,0140236332005010811102,42,1,1,12,2,1,0,1,...,40,5600,1,15,1,0,2,2,0,2018
9,3,0140236332005010811103,0,0,0,0,1,1,0,1,...,0,0,0,0,7,0,0,1,0,2018
10,3,0140236332005010811104,0,0,0,0,2,2,0,1,...,0,0,0,0,7,0,0,1,0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351452,3,6102090711296250811101,470,4,16,8,2,5,0,1,...,40,2400,1,7,7,0,2,1,0,2018
351453,3,6102090711296250811102,364,3,11,7,2,5,0,1,...,40,2400,1,16,7,0,2,1,0,2018
351456,3,6061207225101990811101,201,2,6,8,2,21,0,1,...,39,400,1,16,5,0,2,4,0,2018
351457,3,6061207225101990811102,0,0,0,0,2,21,0,1,...,0,0,0,0,7,0,0,1,0,2018
