### bd econ CPS extract

bd_CPS_reader.ipynb

September 20, 2018

@bd_econ

Requires: `cps_basic_dd.pkl`, which is generated by `bd_CPS_dd.ipynb`.

In [None]:
# Import python packages
import pandas as pd
import numpy as np
import struct
import string
import os
import pickle


def id2_gen(np_mo):
    """Create HRHHID2 for pre May 2004 data.

    The identifier is built per record by concatenating three text pieces
    and parsing the result as an integer:

    1. characters 2-3 of ``HRSAMPLE`` (``x[1:3]``);
    2. a two-digit code for the whitespace-stripped ``HRSERSUF``: a single
       ASCII letter maps to its alphabet position (a/A -> '01' ... z/Z ->
       '26'), and '-1', '-1.0' and '0' map to '00';
    3. the first character of ``HUHHNUM`` after negative values are
       replaced by 0.

    Parameters
    ----------
    np_mo : numpy structured array (or any mapping of column name to array)
        Must provide the ``HRSAMPLE``, ``HRSERSUF`` (string) and ``HUHHNUM``
        (numeric) columns. It is not modified.

    Returns
    -------
    numpy.ndarray
        One ``int64`` identifier per input record.

    Raises
    ------
    ValueError
        If ``HRSERSUF`` holds a value that is neither a single ASCII letter
        nor one of the recognized "no suffix" codes.
    """
    hrsample = [x[1:3] for x in np_mo['HRSAMPLE']]
    hrsersuf = [x.strip() for x in np_mo['HRSERSUF']]

    # Letter -> zero-padded alphabet position, case-insensitive.
    sersuf_d = {letter: str(ord(letter.lower()) - 96).zfill(2)
                for letter in string.ascii_letters}
    sersuf_d.update({'-1': '00', '-1.0': '00', '0': '00'})

    # Fail with a clear message instead of letting a None from dict.get
    # surface later as an opaque TypeError inside ''.join.
    unknown = set(hrsersuf).difference(sersuf_d)
    if unknown:
        raise ValueError(
            f'Unrecognized HRSERSUF value(s): {sorted(unknown, key=repr)}')
    sersuf = [sersuf_d[s] for s in hrsersuf]

    # Clamp negatives to 0 on a fresh array so the caller's data is left
    # untouched; 'U1' keeps only the first character of each number.
    huhhnum = np.maximum(np.asarray(np_mo['HUHHNUM']), 0).astype('U1')

    id2 = [''.join(parts) for parts in zip(hrsample, sersuf, huhhnum)]

    return np.array(id2, dtype='int64')

In [None]:
def cps_to_feather(year_list, data_path='E:/08_Other/Archive/data/',
                   dd_path='cps_basic_dd.pkl', out_dir='clean'):
    """Annual partial extracts of monthly CPS files.

    For every year in ``year_list``, each file in ``data_path`` whose name
    ends with ``<yy>pub.dat`` is unpacked with the fixed-width layout stored
    in the data dictionary pickle. Records are kept when ``PRTAGE > 15`` and
    ``PWSSWGT > 0``, derived columns (``EDUC``, ``WBHAO``, ``HRWAGE`` and,
    where applicable, ``HRYEAR4`` and ``HRHHID2``) are added, and the
    combined months are written to ``<out_dir>/cps_<year>x.ft``.

    Parameters
    ----------
    year_list : iterable of int
        Four-digit years to process.
    data_path : str, optional
        Directory holding the monthly ``*pub.dat`` files.
    dd_path : str, optional
        Path of the pickled data dictionary (``cps_basic_dd.pkl``).
    out_dir : str, optional
        Directory receiving the feather files; created if missing.

    Raises
    ------
    ValueError
        If no monthly file for a requested year is found in ``data_path``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # dd_path at a data dictionary that was generated locally.
    with open(dd_path, 'rb') as dd_pickle:
        cpsdd = pickle.load(dd_pickle)
    data_dir = os.listdir(data_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    dd_file = None
    for year in year_list:
        file_ending = f'{str(year)[2:]}pub.dat'
        mo_dat_files = [file for file in data_dir
                        if file.endswith(file_ending)]
        if not mo_dat_files:
            raise ValueError(f'No monthly files ending in {file_ending!r} '
                             f'found in {data_path!r}')
        combined_data = []
        for file in mo_dat_files:
            # Rebuild the layout-dependent objects only when this file uses
            # a different data dictionary than the previous one.
            if dd_file != cpsdd['matcher'][file]:
                dd_file = cpsdd['matcher'][file]
                dd = cpsdd[dd_file]['dd']
                dtypes = [(k, v[-1]) for k, v in dd.items()]
                maps = cpsdd[dd_file]['map']
                unpack_fmt = cpsdd[dd_file]['unpack_fmt']
                unpacker = struct.Struct(unpack_fmt).unpack_from
            # Context manager so each monthly file handle is closed promptly.
            with open(os.path.join(data_path, file), 'rb') as raw_mo_data:
                mo_data = [unpacker(row) for row in raw_mo_data]
            np_mo = np.array(mo_data, dtype=dtypes)
            np_mo = np_mo[(np_mo['PRTAGE'] > 15) &
                          (np_mo['PWSSWGT'] > 0)]
            # Rescale every weight column listed in the map by 1/10,000
            # (presumably four implied decimals in the raw file - TODO confirm).
            for wgt in maps['wgt']:
                np_mo[wgt] = np.divide(np_mo[wgt], 10000)
            dfm = pd.DataFrame(np_mo)
            dfm['EDUC'] = (pd.Categorical(dfm['PEEDUCA']
                                          .map(maps['educ'])))
            # Hispanic ethnicity takes precedence over the race recode.
            dfm['WBHAO'] = (pd.Categorical(
                np.where(dfm['PRDTHSP'].isin(maps['hisp']), 'Hispanic',
                         dfm['PRDTRACE'].map(maps['race']))))
            # Hourly wage: weekly earnings / usual hours when no hourly rate
            # is reported and both inputs are positive; otherwise the reported
            # hourly rate if positive; otherwise NaN. The result is divided
            # by 100 (presumably two implied decimals - TODO confirm).
            dfm['HRWAGE'] = np.where((dfm['PRERNHLY'] < 0) &
                                     (dfm['PEHRUSL1'] > 0) &
                                     (dfm['PRERNWA'] > 0),
                                     dfm['PRERNWA'] / dfm['PEHRUSL1'],
                                     np.where(dfm['PRERNHLY'] > 0,
                                              dfm['PRERNHLY'],
                                              np.nan)) / 100
            if year < 1998:
                # Earlier files carry a two-digit HRYEAR; expand it.
                dfm['HRYEAR4'] = dfm['HRYEAR'] + 1900
                dfm = dfm.drop(['HRYEAR'], axis=1)
            # Explicit comparison kept: the stored type of this flag is not
            # visible from here.
            if maps['id2'] == True:
                dfm['HRHHID2'] = id2_gen(np_mo)
                dfm = dfm.drop(['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'],
                               axis=1)
            # DataFrame.assign returns a new frame, so the result must be
            # bound back to dfm for the conversion to take effect.
            dfm = dfm.assign(HRYEAR4=lambda x: pd.Categorical(x['HRYEAR4']))
            if 'GTCBSA' in dfm.columns:
                dfm = dfm.assign(
                    GTCBSA=lambda x: pd.Categorical(x['GTCBSA']))
            combined_data.append(dfm)
        df = (pd.concat(combined_data, sort=False)
              .reset_index(drop=True))
        df.to_feather(os.path.join(out_dir, f'cps_{year}x.ft'))
        print(f'{year} Done: ({len(df):,} records, {len(df.keys())} variables)')

In [None]:
# Build the annual extract for 2018.
cps_to_feather(year_list=[2018])