### bd econ CPS extract

bd_CPS_reader.ipynb

February 1, 2019

Contact: Brian Dew, @bd_econ

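Requires: `cps_basic_dd.pkl`, which is generated by bd_CPS_dd.ipynb. The code below also reads `country_of_birth.pkl`, `clean/cpi.csv`, and the revised-data feather files in `clean/` (`cps_dec07_rev.ft`, `cps_wgt_rev.ft`, `cps_union_rev.ft`).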

-----

See [readme](https://github.com/bdecon/econ_data/tree/master/bd_CPS) for documentation.

In [1]:
# Import python packages
import pandas as pd
import numpy as np
import struct
import string
import os
import pickle

# Work from the CPS data directory: relative paths used later in this
# notebook (e.g. 'cps_basic_dd.pkl', 'clean/cpi.csv', 'clean/cps{year}.ft')
# are resolved against the current working directory set here.
os.chdir('/home/brian/Documents/CPS/data')

def id2_gen(np_mo):
    """Create HRHHID2 for pre May 2004 data.

    For every record the identifier is the concatenation of three pieces:

    * characters 2-3 of ``HRSAMPLE``;
    * a two-digit code for the stripped ``HRSERSUF`` value -- a single
      letter maps to its position in the alphabet ('A'/'a' -> '01'), while
      '-1', '-1.0' and '0' map to '00';
    * ``HUHHNUM`` cast to a one-character string, with negative values
      treated as 0.

    Parameters
    ----------
    np_mo : structured array (or mapping of column name -> array)
        Must provide the fields 'HRSAMPLE', 'HRSERSUF' and 'HUHHNUM'.
        The input is NOT modified.

    Returns
    -------
    numpy.ndarray
        int64 array with one identifier per input record.

    Raises
    ------
    ValueError
        If an ``HRSERSUF`` value is neither a single ASCII letter nor one of
        the recognised "no suffix" codes.
    """
    hrsample = [x[1:3] for x in np_mo['HRSAMPLE']]
    hrsersuf = [x.strip() for x in np_mo['HRSERSUF']]

    # Membership is tested against the set of single letters (not the
    # ascii_letters string itself) so '' or multi-letter values never match.
    letters = set(string.ascii_letters)
    sersuf_d = {a: str(ord(a.lower()) - 96).zfill(2) for a in set(hrsersuf)
                if a in letters}
    sersuf_d.update({'-1': '00', '-1.0': '00', '0': '00'})

    # Fail with a clear message instead of letting a missing mapping turn
    # into None and blow up later inside ''.join.
    unknown = sorted(set(hrsersuf) - set(sersuf_d))
    if unknown:
        raise ValueError(f'Unmapped HRSERSUF value(s): {unknown}')
    sersuf = [sersuf_d[s] for s in hrsersuf]

    # Clamp negatives on a copy rather than writing into the caller's array.
    huhhnum = np.where(np_mo['HUHHNUM'] < 0, 0, np_mo['HUHHNUM']).astype('U1')

    id2 = [''.join(i) for i in zip(hrsample, sersuf, huhhnum)]

    return np.array(id2, dtype='int64')

In [2]:
def cps_to_feather(year_list):
    """Annual partial extracts of monthly CPS files.

    For each year in ``year_list`` the function:

    1. finds the files in ``data_path`` whose names end in ``'<yy>pub.dat'``
       (``<yy>`` being the last two digits of the year); the first three
       characters of each file name are parsed as the month;
    2. unpacks every fixed-width record with the ``struct`` format stored in
       the data dictionary (``cps_basic_dd.pkl``) for that file;
    3. keeps records with ``PRTAGE > -1`` and ``PWSSWGT > 0`` and divides
       the weight fields listed in ``maps['wgt']`` by 10000;
    4. derives the recoded variables defined by the lambdas below and drops
       the raw inputs listed in ``drop_vars``;
    5. concatenates the months, merges revised weights (2000-2002) and
       revised union data (2001-2002), and writes ``clean/cps<year>.ft``.

    Files read relative to the current working directory:
    ``cps_basic_dd.pkl``, ``country_of_birth.pkl``, ``clean/cpi.csv``.
    Monthly ``.dat`` files and the revised-data feather files are read from
    the absolute ``data_path`` defined in the body. The output feather file
    is written relative to the current working directory.

    Parameters
    ----------
    year_list : iterable of int
        Four-digit years to process.

    Returns
    -------
    None
        Results are written to disk; a one-line summary is printed per year.
    """
    pr_goods = ['Manufacturing', 'Trade, transportation, and utilities', 
                'Construction and mining']
    pr_servs = ['Finance and business services', 'Education and health', 
                'Leisure and hospitality', 'Public administration']

    # --- Recodes -----------------------------------------------------------
    # Each lambda receives the monthly DataFrame and is applied through
    # DataFrame.assign below. The names `maps`, `cob` and `cpi_vals` are
    # looked up when a lambda is *called*, so they refer to the values set
    # for the month being processed inside the loop.
    female = lambda x: np.where(x['PESEX'] == 2, 1, 0)
    state = lambda x: pd.Categorical(x['GESTFIPS'].map(maps['state']))
    region = lambda x: x['STATE'].map(maps['region'])
    educ = lambda x: pd.Categorical(x['PEEDUCA'].map(maps['educ']))
    schenr = lambda x: np.where(x['PESCHENR'] == 1, 1, 
                                np.where(x['PESCHENR'] == 2, 0, np.nan))
    married = lambda x: np.where(x['PRMARSTA'].isin([1, 2, 3]), 1, 0)
    wbhao = lambda x: (    # If not hispanic, map race to racial groups
        pd.Categorical(np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                                x['PRDTRACE'].map(maps['race']))))
    wbhaom = lambda x: (    # Same as wbhao but using the 'racem' race map
        pd.Categorical(np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                                x['PRDTRACE'].map(maps['racem']))))
    veteran = lambda x: np.where(x['PEAFEVER'] == 1, 1,
                                 np.where(x['PEAFEVER'] == 2, 0, np.nan))
    cert = lambda x: np.where(x['PECERT1'] == 1, 1, 
                              np.where(x['PECERT1']==2, 0, np.nan))
    forborn = lambda x: np.where(x['PRCITSHP'].isin([4, 5]), 1, 
                                 np.where(x['PRCITSHP'].isin([1, 2, 3]), 0, np.nan))
    # Country-of-birth labels; two code lists are kept (keys 1994 and 2007)
    # and the one used depends on the survey year (see the loop below).
    ctybirth94 = lambda x: pd.Categorical(x['PENATVTY'].map(cob[1994]), ordered=False)
    ctybirth07 = lambda x: pd.Categorical(x['PENATVTY'].map(cob[2007]), ordered=False)
    indgrp =  lambda x: pd.Categorical(x['PRMJIND1'].map(maps['ind']))
    # Labor market status: the first matching condition wins (nested
    # np.where). Uses the AGE and INDGRP columns, so it must run after the
    # PRTAGE rename and after INDGRP has been assigned.
    lmstat = lambda x: (
        pd.Categorical(np.where((x['AGE'] <= 15), 'Under 16', 
                       np.where(((x['PRWNTJOB']==2) & 
                                 ((x['PEMLR']==6) | (x['PENLFACT'].isin([1, 2])))), 
                                'NILF - Disabled/ill',
                       np.where(((x['PRWNTJOB']==2) & (x['PENLFACT']==4)), 
                                'NILF - Family',
                       np.where(((x['PRWNTJOB']==2) & 
                                 ((x['PEMLR']==5) | (x['PENLFACT']==5))), 
                                'NILF - Retired',
                       np.where(((x['PRWNTJOB']==2) & (x['PENLFACT']==3)), 
                                'NILF - School',
                       np.where((x['PEDWWNTO']==1), 'NILF - Discouraged',
                       np.where((x['PEMLR'].isin([5, 6, 7])), 'NILF - Other',
                       np.where((x['PEMLR'].isin([3, 4])), 'Unemployed',
                       np.where(((x['PEMLR'].isin([1, 2])) & 
                                 ((x['PRFTLF'] == 2) | (x['PEIO1COW'] == 8))),
                                'Employed - PT or unpaid',
                       np.where((x['PEIO1COW'].isin([1, 2, 3, 5])),
                                'Employed - FT - government or nonprofit',
                       np.where((x['PEIO1COW'].isin([6, 7])),
                                'Employed - FT - self-employed',
                       np.where(((x['PEIO1COW'] == 4) & (x['INDGRP'].isin(pr_goods))),
                                'Employed - FT - private goods producing',
                       np.where(((x['PEIO1COW'] == 4) & (x['INDGRP'].isin(pr_servs))),
                                'Employed - FT - private services producing', 'Other'
                               )))))))))))))))
    emp = lambda x: np.where(x['PREMPNOT']==1, 1, 0)
    mjh = lambda x: np.where(x['PRSJMJ']==2, 1, np.where(x['PRSJMJ']==1, 0, np.nan))
    unempdur = lambda x: x['PRUNEDUR']
    # NOTE(review): np.where mixes string labels with np.nan here; numpy
    # likely coerces the nan to the string 'nan', which would then appear as
    # a category instead of a missing value -- confirm before relying on it.
    unemptype = lambda x: (
        pd.Categorical(np.where(x['PRUNTYPE'].isin([1, 2, 3]), 'Job Loser',
                       np.where(x['PRUNTYPE'] == 4, 'Job Leaver',
                       np.where(x['PRUNTYPE'] == 5, 'Re-entrant',
                       np.where(x['PRUNTYPE'] == 6, 'New Entrant', np.nan))))))
    ptecon = lambda x: np.where(x['PRWKSTAT'].isin([3, 6]), 1, 
                               np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan))
    # Earnings fields are divided by 100 (presumably stored with two implied
    # decimal places -- confirm against the data dictionary).
    wkwage = lambda x: np.where(x['PRERNWA'] > 0, x['PRERNWA'] / 100.0, np.nan)
    # Hourly wage: when no hourly rate is reported (PRERNHLY < 0) but usual
    # hours and weekly earnings are positive, use weekly earnings / usual
    # hours; otherwise use PRERNHLY when positive; else missing.
    hrwage = lambda x: np.where((x['PRERNHLY'] < 0) & (x['PEHRUSL1'] > 0 ) & 
                                (x['PRERNWA'] > 0), x['PRERNWA'] / x['PEHRUSL1'], 
                                np.where(x['PRERNHLY'] > 0, x['PRERNHLY'], 
                                         np.nan)) / 100  
    ottcamt = lambda x: np.where(x['PUERN2'] > 0, x['PUERN2'] / 100.0, np.nan)
    # "R" variables multiply by the CPI value for the record's REGION (or the
    # 'ALL' column for RHRWAGE2) taken from the cpi.csv row for the month.
    rhrwage = lambda x: (np.where(x['HRWAGE'] > 0, 
                 x['HRWAGE'] * x['REGION'].map(cpi_vals), np.nan))
    rhrwage2 = lambda x: (
        np.where(x['HRWAGE'] > 0, x['HRWAGE'] * cpi_vals['ALL'], np.nan))
    rwkwage = lambda x: (np.where(x['WKWAGE'] > 0, 
                 x['WKWAGE'] * x['REGION'].map(cpi_vals), np.nan))
    rottcamt = lambda x: (np.where(x['OTTCAMT'] > 0, 
                 x['OTTCAMT'] * x['REGION'].map(cpi_vals), np.nan))
    # Raw columns dropped once the recodes above have been computed
    drop_vars = ['PESEX', 'PRUNTYPE', 'PRERNWA', 'PRWKSTAT', 'PEDWWNTO',
                 'PRCITSHP', 'PENLFRET', 'PESCHENR', 'GESTFIPS', 'PRWNTJOB',
                 'PRUNEDUR', 'PEAFEVER', 'PRSJMJ', 'PUERN2']

    # --- Inputs ------------------------------------------------------------
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # pickles that were generated locally / come from a trusted source.
    with open('cps_basic_dd.pkl', 'rb') as dd_pkl:
        cpsdd = pickle.load(dd_pkl)
    with open('country_of_birth.pkl', 'rb') as cob_pkl:
        cob = pickle.load(cob_pkl) # From CEPR program
    cpi = pd.read_csv('clean/cpi.csv', index_col=[0], parse_dates=True)
    data_path = '/home/brian/Documents/CPS/data/'
    data_dir = os.listdir(data_path)
    dd_file = None    # data dictionary currently loaded (reused across months)
    for year in year_list:
        file_ending = f'{str(year)[2:]}pub.dat'
        mo_dat_files = [file for file in data_dir
                        if file.endswith(file_ending)]   
        combined_data = [] 
        for file in mo_dat_files:
            # File names start with the three-letter month, e.g. 'mar04pub.dat'
            date = pd.to_datetime(f'{year}-{file[:3]}-01')
            cpi_vals = cpi.loc[date].to_dict()
            # Only rebuild the dtypes/maps/unpacker when the data dictionary
            # matched to this file differs from the one already loaded.
            if dd_file != cpsdd['matcher'][file]:
                dd_file = cpsdd['matcher'][file]
                dd = cpsdd[dd_file]['dd']
                dtypes = [(k, v[-1]) for k, v in dd.items()]
                maps = cpsdd[dd_file]['map']
                unpack_fmt = cpsdd[dd_file]['unpack_fmt']
                unpacker = struct.Struct(unpack_fmt).unpack_from
            # Read and unpack the month; the context manager closes the file
            # so handles do not accumulate across the many monthly files.
            with open(f'{data_path}{file}', 'rb') as raw_mo_data:
                if file == 'mar04pub.dat':
                    # This file gets special handling: rows containing '**'
                    # are skipped before unpacking.
                    mo_data = [unpacker(row) for row in raw_mo_data 
                               if b'**' not in row]
                else:
                    mo_data = [unpacker(row) for row in raw_mo_data]  
            np_mo = np.array(mo_data, dtype=dtypes)
            np_mo = np_mo[(np_mo['PRTAGE'] > -1) & 
                          (np_mo['PWSSWGT'] > 0)]
            for wgt in maps['wgt']:
                np_mo[wgt] = np.divide(np_mo[wgt], 10000)
            # assign() evaluates the keyword callables in the order given, so
            # later recodes may use earlier ones (REGION needs STATE, LMSTAT
            # needs INDGRP, RHRWAGE needs HRWAGE and REGION, ...).
            dfm = (pd.DataFrame(np_mo)
                     .rename({'PRTAGE':'AGE'}, axis=1)
                     .assign(FEMALE = female,
                             STATE = state,
                             REGION = region,
                             EDUC = educ,
                             SCHENR = schenr,
                             MARRIED = married,
                             WBHAO = wbhao,
                             FORBORN = forborn,
                             VETERAN = veteran,
                             INDGRP = indgrp,
                             LMSTAT = lmstat,
                             EMP = emp,
                             MJH = mjh,
                             UNEMPTYPE = unemptype,
                             UNEMPDUR = unempdur,
                             PTECON = ptecon,
                             WKWAGE = wkwage,
                             HRWAGE = hrwage,
                             OTTCAMT = ottcamt,
                             RHRWAGE = rhrwage,
                             RHRWAGE2 = rhrwage2,
                             RWKWAGE = rwkwage,
                             ROTTCAMT = rottcamt)
                     .drop(drop_vars, axis=1))
            # December 2007 revised weights
            if date == pd.to_datetime('2007-12-01'):
                df_rev = pd.read_feather(f'{data_path}clean/cps_dec07_rev.ft')
                # Default merge: inner join on all columns the frames share
                dfm = pd.merge(dfm, df_rev)
                dfm['PWSSWGT'] = dfm['NWSSWGT']
                dfm['PWCMPWGT'] = dfm['NWCMPWGT']
                dfm = dfm.drop(['NWSSWGT', 'NWCMPWGT'], axis=1)
            if date >= pd.to_datetime('1999-11-01'):
                resize_vars = ['PRNMCHLD', 'PRCHLD']
                dfm[resize_vars] = dfm[resize_vars].astype('int8')
            if year < 1997:
                dfm = dfm.assign(CTYBIRTH = ctybirth94)
            if year >= 1997:
                dfm = dfm.assign(CTYBIRTH = ctybirth07)
            dfm = dfm.drop(['PENATVTY'], axis=1)
            if year < 1998:
                # Before 1998 the year comes from HRYEAR plus 1900
                dfm['HRYEAR4'] = dfm['HRYEAR'] + 1900
                dfm = dfm.drop(['HRYEAR'], axis=1)
                dfm['BASICWGT'] = dfm['PWSSWGT']
            if year > 1997:
                dfm['BASICWGT'] = dfm['PWCMPWGT']
            if year > 2002:
                dfm = dfm.assign(WBHAOM = wbhaom)
            if year > 2016:
                dfm = dfm.assign(CERT = cert).drop(['PECERT1'], axis=1)
                dfm['CERT'] = dfm['CERT'].astype('category')
            if maps['id2'] == True:
                # Data dictionary flags that HRHHID2 must be constructed
                dfm['HRHHID2'] = id2_gen(np_mo)
                dfm = dfm.drop(['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'], axis=1)
            dfm = dfm.rename({'HRYEAR4':'YEAR', 'HRMONTH': 'MONTH'}, axis=1)
            # Shrink memory: low-cardinality recodes -> category, wages -> float32
            resize_vars = ['SCHENR', 'FORBORN', 'VETERAN', 'MJH', 'MARRIED',
                           'EMP', 'FEMALE', 'PTECON', 'REGION', 'UNEMPTYPE']
            dfm[resize_vars] = dfm[resize_vars].astype('category')
            wage_vars = ['WKWAGE', 'HRWAGE', 'OTTCAMT', 'RHRWAGE', 'RHRWAGE2', 
                         'RWKWAGE', 'ROTTCAMT']
            dfm[wage_vars] = dfm[wage_vars].astype('float32')
            combined_data.append(dfm)
        # Stack the months of this year into one frame
        df = (pd.concat(combined_data, sort=False)
              .reset_index(drop=True)
              .assign(YEAR = lambda x: pd.Categorical(x['YEAR'])))
        df['CTYBIRTH'] = df['CTYBIRTH'].astype('category')
        df = df.rename({'PRTAGE': 'AGE', 'HRYEAR4':'YEAR', 
                        'HRMONTH': 'MONTH', 'PEHRUSLT': 'HRSUSLT',
                        'PEHRUSL1': 'HRSUSL1', 'PEHRUSL2': 'HRSUSL2',
                        'PEHRACT1': 'HRSACT1', 'PEHRACT2': 'HRSACT2',
                        'PEHRACTT': 'HRSACTT'}, axis=1)
        # Merge in the 2000-revised weights here
        if 2000 <= year <= 2002:
            rev_wgts = (pd.read_feather(f'{data_path}clean/cps_wgt_rev.ft')
                          .query('YEAR == @year'))
            df = pd.merge(df, rev_wgts)
            df['BASICWGT'] = df['NWCMPWGT']
            df['PWORWGT'] = df['NWORWGT']
            df['PWSSWGT'] = df['NWSSWGT']
            # YEAR is converted to categorical again after the merge
            df = (df.drop(['NWCMPWGT', 'NWORWGT', 'NWSSWGT'], axis=1)
                    .assign(YEAR = lambda x: pd.Categorical(x['YEAR'])))
        # Merge in revised union data
        if year in [2001, 2002]:
            rev_df = (pd.read_feather(f'{data_path}clean/cps_union_rev.ft')
                        .query('YEAR == @year'))
            df = pd.merge(df, rev_df)
            df['PEERNLAB'] = df['NEERNLAB']
            df['PEERNCOV'] = df['NEERNCOV']
            df = (df.drop(['NEERNLAB', 'NEERNCOV'], axis=1)
                    .assign(YEAR = lambda x: pd.Categorical(x['YEAR'])))
        if year >= 1998:    
            df = df.drop(['QSTNUM', 'OCCURNUM'], axis=1)
        df.to_feather(f'clean/cps{year}.ft')
        print(f'{year} Done: ({len(df):,} records, {len(df.keys())} variables)')

In [3]:
# Build one extract per year for 1994 through 2018 (range end is exclusive)
cps_to_feather(range(1994, 2019))

1994 Done: (1,672,934 records, 64 variables)
1995 Done: (1,648,060 records, 64 variables)
1996 Done: (1,461,469 records, 64 variables)
1997 Done: (1,462,817 records, 64 variables)
1998 Done: (1,461,394 records, 65 variables)
1999 Done: (1,465,602 records, 67 variables)
2000 Done: (1,460,724 records, 67 variables)
2001 Done: (1,560,960 records, 67 variables)
2002 Done: (1,703,017 records, 67 variables)
2003 Done: (1,685,264 records, 68 variables)
2004 Done: (1,656,144 records, 70 variables)
2005 Done: (1,644,787 records, 70 variables)
2006 Done: (1,628,798 records, 70 variables)
2007 Done: (1,611,901 records, 70 variables)
2008 Done: (1,600,790 records, 70 variables)
2009 Done: (1,617,099 records, 71 variables)
2010 Done: (1,621,021 records, 71 variables)
2011 Done: (1,600,068 records, 71 variables)
2012 Done: (1,588,264 records, 71 variables)
2013 Done: (1,576,085 records, 71 variables)
2014 Done: (1,582,739 records, 71 variables)
2015 Done: (1,561,469 records, 71 variables)
2016 Done: