### bd econ CPS extract

bd_CPS_reader.ipynb

September 20, 2018

@bd_econ

Requires: `cps_basic_dd.pkl`, which is generated by `bd_CPS_dd.ipynb`.

In [1]:
# Import python packages
import pandas as pd
import numpy as np
import struct
import string
import os
import pickle

# Work from the CPS data directory: cps_to_feather() opens 'cps_basic_dd.pkl'
# and writes 'clean/cps<year>.ft' using paths relative to the current
# working directory, so this call must run before it.
os.chdir('/home/brian/Documents/CPS/data')


def id2_gen(np_mo):
    """Create HRHHID2 for pre May 2004 data

    The ID is built per record by concatenating three pieces and converting
    the resulting digit string to an integer:

    * characters 1-2 (zero-based slice ``[1:3]``) of ``HRSAMPLE``,
    * a two-digit code for ``HRSERSUF``: a single ASCII letter becomes its
      position in the alphabet (``A``/``a`` -> ``01`` ... ``Z``/``z`` ->
      ``26``); ``'-1'``, ``'-1.0'`` and ``'0'`` become ``00``,
    * the first character of ``HUHHNUM`` after negative values are set to 0.

    Parameters
    ----------
    np_mo : numpy structured array
        Must provide the fields ``HRSAMPLE``, ``HRSERSUF`` and ``HUHHNUM``.
        It is not modified.

    Returns
    -------
    numpy.ndarray
        ``int64`` array with one ID per record.

    Raises
    ------
    TypeError
        If an ``HRSERSUF`` value is neither a single ASCII letter nor one of
        the recognised "missing" codes (the lookup yields ``None``, which
        cannot be joined into the ID string).
    """
    hrsample = [x[1:3] for x in np_mo['HRSAMPLE']]
    hrsersuf = [x.strip() for x in np_mo['HRSERSUF']]
    # ord('a') == 97, so subtracting 96 gives the 1-based alphabet position.
    sersuf_d = {a: str(ord(a.lower()) - 96).zfill(2) for a in set(hrsersuf)
                if a in list(string.ascii_letters)}
    sersuf_d.update({'-1': '00', '-1.0': '00', '0': '00'})
    sersuf = list(map(sersuf_d.get, hrsersuf))
    # Clamp negatives on a copy: indexing a structured array by field name
    # returns a view, so assigning through it would overwrite the caller's
    # HUHHNUM values.
    huhhnum = np_mo['HUHHNUM'].copy()
    huhhnum[huhhnum < 0] = 0
    # 'U1' keeps only the first character of each number's text form.
    huhhnum = huhhnum.astype('U1')

    id2 = [''.join(i) for i in zip(hrsample, sersuf, huhhnum)]

    return np.array(id2, dtype='int64')

In [12]:
def cps_to_feather(year_list, data_path='/home/brian/Documents/CPS/data/'):
    """Annual partial extracts of monthly CPS files

    For each year in ``year_list`` this reads every file in ``data_path``
    whose name ends with ``'<yy>pub.dat'``, unpacks its fixed-width records
    using the layout stored in ``cps_basic_dd.pkl``, keeps records with
    ``PRTAGE > 15`` and ``PWSSWGT > 0``, adds the recoded columns FEMALE,
    STATE, EDUC, MARRIED, WBHAO, EMP and HRWAGE, and writes the year's
    combined records to ``clean/cps<year>.ft``. One summary line is printed
    per year.

    Parameters
    ----------
    year_list : iterable of int
        Four-digit years to process.
    data_path : str, optional
        Directory holding the monthly ``*pub.dat`` files. Defaults to the
        location originally hard-coded in this notebook.

    Notes
    -----
    * ``cps_basic_dd.pkl`` and the ``clean/`` output directory are resolved
      relative to the current working directory, not ``data_path``.
    * ``pickle.load`` can execute arbitrary code; only load a dictionary
      file you generated yourself.
    * Calls ``id2_gen`` (defined in this notebook) when the data dictionary
      flags that HRHHID2 has to be constructed.
    """
    # Recode helpers handed to DataFrame.assign; each receives the monthly
    # frame. `maps` is looked up when the helper is called, i.e. after the
    # matching data dictionary has been loaded inside the loop below.
    female = lambda x: np.where(x['PESEX'] == 2, 1, 0)
    state = lambda x: pd.Categorical(x['GESTFIPS'].map(maps['state']))
    educ = lambda x: pd.Categorical(x['PEEDUCA'].map(maps['educ']))
    married = lambda x: np.where(x['PRMARSTA'].isin([1,2,3]), 1, 0)
    # Hispanic-origin codes take precedence over the race mapping.
    wbhao = lambda x: pd.Categorical(np.where(x['PRDTHSP'].isin(maps['hisp']),
                                              'Hispanic',
                                              x['PRDTRACE'].map(maps['race'])))
    emp = lambda x: np.where(x['PREMPNOT']==1, 1, 0)
    # If PRERNHLY is negative but PEHRUSL1 and PRERNWA are both positive, use
    # PRERNWA / PEHRUSL1; otherwise use PRERNHLY when positive, else NaN.
    # The result is divided by 100 (presumably two implied decimals in the
    # raw fields -- confirm against the data dictionary).
    hrwage = lambda x: np.where((x['PRERNHLY'] < 0) & (x['PEHRUSL1'] > 0 ) &
                                (x['PRERNWA'] > 0), x['PRERNWA'] / x['PEHRUSL1'],
                                np.where(x['PRERNHLY'] > 0, x['PRERNHLY'],
                                         np.nan)) / 100
    # Close the pickle handle deterministically instead of leaking it.
    with open('cps_basic_dd.pkl', 'rb') as dd_pkl:
        cpsdd = pickle.load(dd_pkl)
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of each output file reproducible (alphabetical by file name).
    data_dir = sorted(os.listdir(data_path))
    dd_file = None
    for year in year_list:
        file_ending = f'{str(year)[2:]}pub.dat'
        mo_dat_files = [file for file in data_dir
                        if file.endswith(file_ending)]
        if not mo_dat_files:
            # pd.concat([]) would fail with "No objects to concatenate".
            print(f'{year} Skipped: no files ending in {file_ending}')
            continue
        combined_data = []
        for file in mo_dat_files:
            # Rebuild the unpacker only when this file uses a different data
            # dictionary than the previous one.
            if dd_file != cpsdd['matcher'][file]:
                dd_file = cpsdd['matcher'][file]
                dd = cpsdd[dd_file]['dd']
                # The last item of each dictionary entry is the numpy dtype.
                dtypes = [(k, v[-1]) for k, v in dd.items()]
                maps = cpsdd[dd_file]['map']
                unpack_fmt = cpsdd[dd_file]['unpack_fmt']
                unpacker = struct.Struct(unpack_fmt).unpack_from
            # One fixed-width record per line; the handle is closed as soon
            # as the month has been read.
            with open(os.path.join(data_path, file), 'rb') as raw_mo_data:
                mo_data = [unpacker(row) for row in raw_mo_data]
            np_mo = np.array(mo_data, dtype=dtypes)
            np_mo = np_mo[(np_mo['PRTAGE'] > 15) &
                          (np_mo['PWSSWGT'] > 0)]
            # Scale every weight listed in the dictionary by 1/10000
            # (presumably four implied decimal places -- confirm).
            for wgt in maps['wgt']:
                np_mo[wgt] = np.divide(np_mo[wgt], 10000)
            dfm = (pd.DataFrame(np_mo)
                     .rename({'PRTAGE':'AGE'}, axis=1)
                     .assign(FEMALE = female,
                             STATE = state,
                             EDUC = educ,
                             MARRIED = married,
                             WBHAO = wbhao,
                             EMP = emp,
                             HRWAGE = hrwage)
                     .drop(['PESEX', 'GESTFIPS'], axis=1))
            if year < 1998:
                # Before 1998 the year is read from HRYEAR and 1900 is added
                # to form HRYEAR4.
                dfm['HRYEAR4'] = dfm['HRYEAR'] + 1900
                dfm = dfm.drop(['HRYEAR'], axis=1)
            # Comparison kept exactly as written so that only a value equal
            # to True triggers ID construction.
            if maps['id2'] == True:
                dfm['HRHHID2'] = id2_gen(np_mo)
                dfm = dfm.drop(['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'], axis=1)
            combined_data.append(dfm)
        df = (pd.concat(combined_data, sort=False)
              .reset_index(drop=True)
              .assign(HRYEAR4 = lambda x: pd.Categorical(x['HRYEAR4'])))
        df.to_feather(f'clean/cps{year}.ft')
        print(f'{year} Done: ({len(df):,} records, {len(df.keys())} variables)')

In [13]:
# Build one extract per year for 1994 through 2018 (the range end is exclusive).
cps_to_feather(range(1994, 2019))

1994 Done: (1,264,881 records, 52 variables)
1995 Done: (1,245,737 records, 52 variables)
1996 Done: (1,103,811 records, 52 variables)
1997 Done: (1,109,347 records, 52 variables)
1998 Done: (1,112,012 records, 53 variables)
1999 Done: (1,119,277 records, 53 variables)
2000 Done: (1,116,447 records, 53 variables)
2001 Done: (1,197,384 records, 53 variables)
2002 Done: (1,306,879 records, 53 variables)
2003 Done: (1,297,060 records, 54 variables)
2004 Done: (1,277,835 records, 55 variables)
2005 Done: (1,273,399 records, 55 variables)
2006 Done: (1,266,304 records, 55 variables)
2007 Done: (1,255,147 records, 55 variables)
2008 Done: (1,252,180 records, 55 variables)
2009 Done: (1,268,277 records, 56 variables)
2010 Done: (1,271,629 records, 56 variables)
2011 Done: (1,260,276 records, 56 variables)
2012 Done: (1,253,486 records, 56 variables)
2013 Done: (1,248,291 records, 56 variables)
2014 Done: (1,256,358 records, 56 variables)
2015 Done: (1,240,426 records, 56 variables)
2016 Done: