### bd econ CPS extract

bd_CPS_reader.ipynb

October 15, 2018

Contact: Brian Dew, @bd_econ

Requires: `cps_basic_dd.pkl`, which is generated by `bd_CPS_dd.ipynb`; a CPI table at `clean/cpi.csv`; and the monthly CPS `*pub.dat` files in the data directory.

-----

See [readme](https://github.com/bdecon/econ_data/tree/master/bd_CPS) for documentation.

In [1]:
# Import python packages
import pandas as pd
import numpy as np
import struct
import string
import os
import pickle

# Relative paths used later in this notebook ('cps_basic_dd.pkl',
# 'clean/cpi.csv', 'clean/cps{year}.ft') resolve against this working directory.
os.chdir('/home/brian/Documents/CPS/data')


def id2_gen(np_mo):
    """Create HRHHID2 for pre May 2004 data.

    For every record the identifier is the concatenation of:
      * characters 2-3 of HRSAMPLE,
      * a two-digit code for HRSERSUF (letter -> alphabet position, so
        'A'/'a' -> '01'; '-1', '-1.0' and '0' -> '00'),
      * the first character of HUHHNUM (negative values count as 0),
    converted to an integer.

    Parameters
    ----------
    np_mo : numpy structured array
        Must expose the fields 'HRSAMPLE' and 'HRSERSUF' (strings) and
        'HUHHNUM' (numeric). The array is not modified.

    Returns
    -------
    numpy.ndarray
        int64 array with one HRHHID2 value per record.

    Raises
    ------
    TypeError
        If an HRSERSUF value is neither a single ASCII letter nor one of
        '-1', '-1.0', '0' (its lookup yields None, which cannot be joined).
    """
    hrsample = [x[1:3] for x in np_mo['HRSAMPLE']]
    hrsersuf = [x.strip() for x in np_mo['HRSERSUF']]

    # Only single ASCII letters get a positional code; membership is tested
    # against a set of individual characters so multi-character or empty
    # strings never match.
    letters = set(string.ascii_letters)
    sersuf_d = {a: str(ord(a.lower()) - 96).zfill(2)
                for a in set(hrsersuf) if a in letters}
    sersuf_d.update({'-1': '00', '-1.0': '00', '0': '00'})
    sersuf = [sersuf_d.get(s) for s in hrsersuf]

    # Clamp negatives to zero on a temporary array instead of writing into
    # the caller's data; 'U1' keeps only the first character of each number.
    huhhnum_raw = np_mo['HUHHNUM']
    huhhnum = np.where(huhhnum_raw < 0, 0, huhhnum_raw).astype('U1')

    id2 = [''.join(i) for i in zip(hrsample, sersuf, huhhnum)]

    return np.array(id2, dtype='int64')

In [2]:
def cps_to_feather(year_list):
    """Annual partial extracts of monthly CPS files.

    For each year in `year_list`:
      1. select the files in the data directory whose names end with
         '<yy>pub.dat' (yy = last two digits of the year);
      2. unpack every line of each file with the struct format stored in the
         data dictionary that `cpsdd['matcher']` assigns to that file;
      3. keep records with PRTAGE > -1 and PWSSWGT > 0, and divide the weight
         fields listed in `maps['wgt']` by 10000;
      4. add the derived columns (FEMALE, STATE, EDUC, ..., RWKWAGE) and drop
         the raw variables listed in `drop_vars`;
      5. concatenate the months, store HRYEAR4 as a categorical, write
         'clean/cps<year>.ft' and print a one-line summary.

    Parameters
    ----------
    year_list : iterable of int
        Years to process.

    Returns
    -------
    None
        Output is written to feather files; progress is printed.

    Notes
    -----
    'cps_basic_dd.pkl', 'clean/cpi.csv' and the output files are opened with
    relative paths, so the current working directory must be the data
    directory.
    """
    pr_goods = ['Manufacturing', 'Trade, transportation, and utilities', 
                'Construction and mining']
    pr_servs = ['Finance and business services', 'Education and health', 
                'Leisure and hospitality', 'Public administration']

    # Column builders passed to DataFrame.assign below. They read `maps` and
    # `cpi_vals` from the enclosing scope at call time, so they always use the
    # values set for the file currently being processed.
    female = lambda x: np.where(x['PESEX'] == 2, 1, 0)
    state = lambda x: pd.Categorical(x['GESTFIPS'].map(maps['state']))
    educ = lambda x: pd.Categorical(x['PEEDUCA'].map(maps['educ']))
    married = lambda x: np.where(x['PRMARSTA'].isin([1,2,3]), 1, 0)
    wbhao = lambda x: (    # If not hispanic, map race to racial groups
        pd.Categorical(np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                                x['PRDTRACE'].map(maps['race']))))
    veteran = lambda x: np.where(x['PEAFEVER'] == 1, 1,
                                 np.where(x['PEAFEVER'] == 2, 0, np.nan))
    # Only applied for years after 2016 (see the loop below)
    cert = lambda x: np.where(x['PECERT1'] == 1, 1, 
                              np.where(x['PRPERTYP']==2, 0, np.nan))
    forborn = lambda x: np.where(x['PRCITSHP'].isin([4, 5]), 1, 
                                 np.where(x['PRCITSHP'].isin([1, 2, 3]), 0, np.nan))
    indgrp =  lambda x: pd.Categorical(x['PRMJIND1'].map(maps['ind']))
    # Labor market status: the first matching condition wins, so the order of
    # the nested np.where calls matters. Depends on AGE and INDGRP, which are
    # created earlier in the same rename/assign chain.
    lmstat = lambda x: (
        pd.Categorical(np.where((x['AGE'] <= 15), 'Under 16', 
                       np.where(((x['PRWNTJOB']==2) & 
                                 ((x['PEMLR']==6) | (x['PENLFACT'].isin([1, 2])))), 
                                'NILF - Disabled/ill',
                       np.where(((x['PRWNTJOB']==2) & (x['PENLFACT']==4)), 
                                'NILF - Family',
                       np.where(((x['PRWNTJOB']==2) & 
                                 ((x['PEMLR']==5) | (x['PENLFACT']==5))), 
                                'NILF - Retired',
                       np.where(((x['PRWNTJOB']==2) & (x['PENLFACT']==3)), 
                                'NILF - School',
                       np.where((x['PEDWWNTO']==1), 'NILF - Discouraged',
                       np.where((x['PEMLR'].isin([5, 6, 7])), 'NILF - Other',
                       np.where((x['PEMLR'].isin([3, 4])), 'Unemployed',
                       np.where(((x['PEMLR'].isin([1, 2])) & 
                                 ((x['PRFTLF'] == 2) | (x['PEIO1COW'] == 8))),
                                'Employed - PT or unpaid',
                       np.where((x['PEIO1COW'].isin([1, 2, 3, 5])),
                                'Employed - FT - government or nonprofit',
                       np.where((x['PEIO1COW'].isin([6, 7])),
                                'Employed - FT - self-employed',
                       np.where(((x['PEIO1COW'] == 4) & (x['INDGRP'].isin(pr_goods))),
                                'Employed - FT - private goods producing',
                       np.where(((x['PEIO1COW'] == 4) & (x['INDGRP'].isin(pr_servs))),
                                'Employed - FT - private services producing', 'Other'
                               )))))))))))))))
    emp = lambda x: np.where(x['PREMPNOT']==1, 1, 0)
    unempdur = lambda x: np.where(x['PRUNEDUR'] > 0, x['PRUNEDUR'], np.nan)
    # NOTE(review): np.nan as the fallback of a string-valued np.where is
    # likely stored as the text 'nan' rather than a missing category - confirm
    # before relying on isna() for UNEMPTYPE.
    unemptype = lambda x: (
        pd.Categorical(np.where(x['PRUNTYPE'].isin([1, 2, 3]), 'Job Loser',
                       np.where(x['PRUNTYPE'] == 4, 'Job Leaver',
                       np.where(x['PRUNTYPE'] == 5, 'Re-entrant',
                       np.where(x['PRUNTYPE'] == 6, 'New Entrant', np.nan))))))
    # Codes 3 and 6 (presumably part-time for economic reasons - confirm
    # against the CPS data dictionary) must be tested before the 2-10 range:
    # they lie inside that range, so testing the range first would label them
    # 0 and the value 1 could never be produced.
    ptecon = lambda x: np.where(x['PRWKSTAT'].isin([3, 6]), 1, 
                                np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan))
    wkwage = lambda x: np.where(x['PRERNWA'] > 0, x['PRERNWA'] / 100.0, np.nan)
    # With no hourly rate (PRERNHLY < 0) but positive weekly earnings and
    # usual hours, use weekly earnings / usual hours; otherwise use the
    # positive hourly rate. The result is divided by 100 (presumably cents to
    # dollars - confirm).
    hrwage = lambda x: np.where((x['PRERNHLY'] < 0) & (x['PEHRUSL1'] > 0 ) & 
                                (x['PRERNWA'] > 0), x['PRERNWA'] / x['PEHRUSL1'], 
                                np.where(x['PRERNHLY'] > 0, x['PRERNHLY'], 
                                         np.nan)) / 100                       
    # Real wages: nominal wage times the CPI value looked up by the region
    # that the record's state maps to
    rhrwage = lambda x: (
        np.where(x['HRWAGE'] > 0, 
                 x['HRWAGE'] * x['STATE'].map(maps['region']).map(cpi_vals), 
                 np.nan))
    rwkwage = lambda x: (
        np.where(x['WKWAGE'] > 0, 
                 x['WKWAGE'] * x['STATE'].map(maps['region']).map(cpi_vals), 
                 np.nan))
    # Raw variables removed once the derived columns exist
    drop_vars = ['PESEX', 'PRUNTYPE', 'PRERNHLY', 'PRERNWA', 'PRMARSTA', 
                 'PRCITSHP', 'PENLFRET', 'PRWNTJOB', 'PEMLR','PREMPNOT',
                 'PRUNEDUR', 'PEAFEVER', 'PENLFACT', 'GESTFIPS']
    # pickle can execute arbitrary code on load: only use a data dictionary
    # file you generated yourself. The context manager closes the handle.
    with open('cps_basic_dd.pkl', 'rb') as dd_pkl:
        cpsdd = pickle.load(dd_pkl)
    cpi = pd.read_csv('clean/cpi.csv', index_col=[0], parse_dates=True)
    data_path = '/home/brian/Documents/CPS/data/'
    data_dir = os.listdir(data_path)
    dd_file = None
    for year in year_list:
        file_ending = f'{str(year)[2:]}pub.dat'
        mo_dat_files = [file for file in data_dir
                        if file.endswith(file_ending)]
        combined_data = []
        for file in mo_dat_files:
            # The first three characters of the file name supply the month
            # part of the date used to pick the CPI row
            cpi_vals = cpi.loc[pd.to_datetime(f'{year}-{file[:3]}-01')].to_dict()
            # Reload dictionary-dependent settings only when the data
            # dictionary changes between consecutive files
            if dd_file != cpsdd['matcher'][file]:
                dd_file = cpsdd['matcher'][file]
                dd = cpsdd[dd_file]['dd']
                dtypes = [(k, v[-1]) for k, v in dd.items()]
                maps = cpsdd[dd_file]['map']
                unpack_fmt = cpsdd[dd_file]['unpack_fmt']
                unpacker = struct.Struct(unpack_fmt).unpack_from
            # Close each monthly file as soon as its rows are unpacked
            with open(f'{data_path}{file}', 'rb') as raw_mo_data:
                mo_data = [unpacker(row) for row in raw_mo_data]
            np_mo = np.array(mo_data, dtype=dtypes)
            np_mo = np_mo[(np_mo['PRTAGE'] > -1) & 
                          (np_mo['PWSSWGT'] > 0)]
            for wgt in maps['wgt']:
                np_mo[wgt] = np.divide(np_mo[wgt], 10000)
            # assign() evaluates keyword arguments in order, so later columns
            # (LMSTAT, RHRWAGE, RWKWAGE) can use earlier ones (INDGRP, STATE,
            # HRWAGE, WKWAGE)
            dfm = (pd.DataFrame(np_mo)
                     .rename({'PRTAGE':'AGE'}, axis=1)
                     .assign(FEMALE = female,
                             STATE = state,
                             EDUC = educ,
                             MARRIED = married,
                             WBHAO = wbhao,
                             FORBORN = forborn,
                             VETERAN = veteran,
                             INDGRP = indgrp,
                             LMSTAT = lmstat,
                             EMP = emp,
                             UNEMPTYPE = unemptype,
                             UNEMPDUR = unempdur,
                             PTECON = ptecon,
                             WKWAGE = wkwage,
                             HRWAGE = hrwage,
                             RHRWAGE = rhrwage,
                             RWKWAGE = rwkwage)
                     .drop(drop_vars, axis=1))
            if year < 1998:
                # Before 1998 the year is stored in HRYEAR; adding 1900
                # produces the four-digit HRYEAR4 used for later years
                dfm['HRYEAR4'] = dfm['HRYEAR'] + 1900
                dfm = dfm.drop(['HRYEAR'], axis=1)
            if year > 2016:
                dfm = dfm.assign(CERT = cert).drop(['PECERT1'], axis=1)
            if maps['id2'] == True:
                # Build HRHHID2 from its components, then drop the components
                dfm['HRHHID2'] = id2_gen(np_mo)
                dfm = dfm.drop(['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'], axis=1)
            combined_data.append(dfm)
        df = (pd.concat(combined_data, sort=False)
              .reset_index(drop=True)
              .assign(HRYEAR4 = lambda x: pd.Categorical(x['HRYEAR4'])))
        df.to_feather(f'clean/cps{year}.ft')
        print(f'{year} Done: ({len(df):,} records, {len(df.keys())} variables)')

In [3]:
# Build the extract for 2018 only (the end of the range is exclusive)
cps_to_feather(range(2018, 2019))

2018 Done: (1,354,423 records, 65 variables)
