### bd econ CPS extract

bd_CPS_reader.ipynb

April 16, 2019

Contact: Brian Dew, @bd_econ

Requires: `cps_basic_dd.pkl` which is generated by bd_CPS_dd.ipynb

-----

See [readme](https://github.com/bdecon/econ_data/tree/master/bd_CPS) for documentation.

In [1]:
# Import python packages
import pandas as pd
print('pandas:', pd.__version__)
import numpy as np
print('numpy:', np.__version__)
import struct
import string
import os
import pickle
from scipy.stats import norm

#Statsmodels used to impute hours worked on first job
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Map codes for country of birth to country/area names
from bd_CPS_details import COB1994Map, COB2007Map, AsianMap, EducDTMap, DropVars, INDMMap

# Work from the data directory so that the relative paths used in later
# cells (e.g. 'cps_basic_dd.pkl', 'clean/cpi.csv') resolve against it
os.chdir('/home/brian/Documents/CPS/data')

pandas: 2.2.2
numpy: 1.26.4


In [2]:
# Settings
# Data dictionary generated by bd_CPS_dd.ipynb.
# NOTE: pickle can execute arbitrary code on load -- only read pickle files
# that were generated locally by the companion notebooks.
with open('cps_basic_dd.pkl', 'rb') as pkl_file:
    cpsdd = pickle.load(pkl_file)
data_path = '/home/brian/Documents/CPS/data/'
data_files = os.listdir(data_path)

# Consumer Price Index (retrieve using bd_CPS_cpi)
cpi = pd.read_csv('clean/cpi.csv', index_col=[0], parse_dates=True)

# Federal minimum wage from DOL, as (start, end, wage).
# clean_all treats start as inclusive and end as exclusive, so the end date
# of the final period must be later than the last month being processed.
min_wage = [('1981-01-01', '1990-04-01', 3.35),
            ('1990-04-01', '1991-04-01', 3.80),
            ('1991-04-01', '1996-10-01', 4.25),
            ('1996-10-01', '1997-09-01', 4.75),
            ('1997-09-01', '2007-07-24', 5.15),
            ('2007-07-24', '2008-07-24', 5.85),
            ('2008-07-24', '2009-07-24', 6.55),
            ('2009-07-24', '2025-12-01', 7.25)]

min_wage = [(pd.to_datetime(start), pd.to_datetime(end), wage)
            for start, end, wage in min_wage]

# Unique ID lists (optional: each is only loaded when its file exists)
ids_file = 'CPS_unique_ids.pkl'
if os.path.isfile(ids_file):
    with open(ids_file, 'rb') as pkl_file:
        cps_ids_full = pickle.load(pkl_file)

ids_file = 'CPSID_89-93.pkl'
if os.path.isfile(ids_file):
    with open(ids_file, 'rb') as pkl_file:
        cps_ids_full_early = pickle.load(pkl_file)

In [3]:
def id2_gen(np_mo):
    """Create HRHHID2 for pre May 2004 data.

    The identifier is built, row by row, by concatenating:
      * characters 2-3 of HRSAMPLE,
      * a two-digit code for the stripped HRSERSUF value (a letter maps to
        its position in the alphabet; '-1', '-1.0' and '0' map to '00'),
      * HUHHNUM converted to text.

    Negative HUHHNUM values are set to 0 in place on ``np_mo`` first.
    Returns a numpy array of dtype uint32.
    """
    letters = set(string.ascii_letters)
    sample_part = [value[1:3] for value in np_mo['HRSAMPLE']]
    suffixes = [value.strip() for value in np_mo['HRSERSUF']]

    # Lookup from serial suffix to its two-digit code
    suffix_codes = {'-1': '00', '-1.0': '00', '0': '00'}
    for suffix in set(suffixes):
        if suffix in letters:
            suffix_codes[suffix] = str(ord(suffix.lower()) - 96).zfill(2)
    suffix_part = [suffix_codes.get(suffix) for suffix in suffixes]

    # Negative household numbers are treated as zero
    negative = np_mo['HUHHNUM'] < 0
    np_mo.loc[negative, 'HUHHNUM'] = 0
    hhnum_part = np_mo['HUHHNUM'].astype('U1')

    pieces = zip(sample_part, suffix_part, hhnum_part)
    id2 = [''.join(parts) for parts in pieces]
    return np.array(id2, dtype='uint32')

In [4]:
def data_file_reader(file, unpacker, dtypes, ws, we):
    """Read one raw fixed-width monthly file into a DataFrame.

    Parameters
    ----------
    file : str
        Path of the raw data file. Year-specific repairs are selected from
        the file's base name (e.g. 'jan94pub.dat').
    unpacker : callable
        Called with each kept row (bytes); returns the tuple of field values.
    dtypes : list of (name, dtype)
        Structured dtype used to build the array behind the DataFrame.
    ws, we : int
        Slice bounds of the weight field; only rows whose stripped weight
        field compares greater than b'0' are kept.

    Returns
    -------
    pandas.DataFrame with one column per entry in ``dtypes``.
    """
    # Repairs are keyed on the file name, so ignore any directory component
    name = os.path.basename(file)
    skip_starred = (name == 'mar04pub.dat')
    if skip_starred:
        repair = None
    elif name[3:5] in ['94', '95']:
        repair = (b'\x00\x00', b'-1')
    elif name[3:5] in ['23']:
        repair = (b'--', b'0-')
    else:
        repair = None

    data = []
    # Context manager guarantees the raw file is closed after reading
    with open(file, 'rb') as raw:
        for row in raw:
            # March 2004 file: drop rows containing '**'
            if skip_starred and b'**' in row:
                continue
            # Keep only rows with weight > 0 (checked on the unrepaired row)
            if not row[ws:we].strip() > b'0':
                continue
            if repair is not None:
                row = row.replace(*repair)
            data.append(unpacker(row))

    # Convert to dataframe using the specified dtypes
    df = pd.DataFrame(np.array(data, dtype=dtypes))
    return df

In [5]:
def clean_all(df, maps, cpi_vals, date):
    """Add the recoded variables that are built the same way for every month.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw monthly data. The weight columns listed in ``maps['wgt']`` are
        rescaled in place on this frame.
    maps : dict
        Month-specific lookups; this function reads the keys 'state',
        'region', 'educ', 'hisp', 'race', 'ind' and 'wgt'.
    cpi_vals : dict
        Mapping applied to REGION to build PRICEADJ.
    date : pandas.Timestamp
        Month being processed; selects the federal minimum wage from the
        module-level ``min_wage`` table.

    Returns
    -------
    pandas.DataFrame with the new variables added, ``DropVars`` removed and
    several raw columns renamed.

    Raises
    ------
    ValueError
        If no period in ``min_wage`` covers ``date``.

    Notes
    -----
    The lambdas are evaluated by ``DataFrame.assign`` in keyword order, so a
    variable may only reference derived columns assigned before it (e.g.
    REGION needs STATE, NUMJOBS needs MJH and LFS, MINWAGE needs HRWAGE).
    """
    
    # New variables to add
    age = lambda x: np.where(x['PRTAGE'] > 80, 80, x['PRTAGE'])
    female = lambda x: pd.Categorical(np.where(x['PESEX'] == 2, 1, 0))
    state = lambda x: pd.Categorical(x['GESTFIPS'].map(maps['state']))
    region = lambda x: x['STATE'].map(maps['region'])
    educdt = lambda x: pd.Categorical(x['PEEDUCA'].map(EducDTMap))
    educ = lambda x: pd.Categorical(x['PEEDUCA'].map(maps['educ']))
    schenr = lambda x: pd.Categorical(
        np.where(x['PESCHENR'] == 1, 1, 
        np.where(x['PESCHENR'] == 2, 0, None)))
    married = lambda x: pd.Categorical(
        np.where(x['PRMARSTA'].isin([1, 2, 3]), 1, 0))
    wbhao = lambda x: pd.Categorical(
        np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                 x['PRDTRACE'].map(maps['race'])))
    wbao = lambda x: pd.Categorical(x['PRDTRACE'].map(maps['race']))
    hispanic = lambda x: pd.Categorical(
        np.where(x['PRDTHSP'].isin(maps['hisp']), 1, 0))
    veteran = lambda x: pd.Categorical(
        np.where(x['PEAFEVER'] == 1, 1, 
        np.where(x['PEAFEVER'] == 2, 0, None)))
    forborn = lambda x: pd.Categorical(
        np.where(x['PRCITSHP'].isin([4, 5]), 1, 
        np.where(x['PRCITSHP'].isin([1, 2, 3]), 0, None)))
    citizen = lambda x: pd.Categorical(
        np.where(x['PRCITSHP'].isin([1, 2, 3, 4]), 1, 
        np.where(x['PRCITSHP'].isin([5]), 0, None)))
    indgrp =  lambda x: pd.Categorical(x['PRMJIND1'].map(maps['ind']))
    manager = lambda x: np.where(x.PRDTOCC1 == 1, 1,
                        np.where(x.PRDTOCC1 > 0, 0, None))
    mjh = lambda x: pd.Categorical(
        np.where(x['PRSJMJ']==2, 1, np.where(x['PRSJMJ']==1, 0, None)))
    numjobs = lambda x: np.where(x.PEMJNUM > 1, x.PEMJNUM,
        np.where((x.MJH == 0) & (x.LFS == "Employed"), 1, 0))
    unemptype = lambda x: pd.Categorical(
        np.where(x['PRUNTYPE'].isin([1, 2, 3]), 'Job Loser',
        np.where(x['PRUNTYPE'] == 4, 'Job Leaver',
        np.where(x['PRUNTYPE'] == 5, 'Re-entrant',
        np.where(x['PRUNTYPE'] == 6, 'New Entrant', None)))))
    layoff = lambda x: pd.Categorical(
        np.where(x['PEMLR'] == 3, 'Layoff',
        np.where(x['PEMLR'] == 4, 'Looking', None)))
    ptecon = lambda x: pd.Categorical(
        np.where(x['PRWKSTAT'].isin([3, 6]), 1, 
        np.where(x['PRWKSTAT'].between(2, 10), 0, None)))
    uslft = lambda x: pd.Categorical(
        np.where((x['PEHRFTPT'] == 1) | (x['PEHRUSL1'] >= 35), 1,
        np.where(x['PEHRFTPT'] == 2, 0, 
        np.where(x['PEHRFTPT'] == 3, -4, None))))
    workft = lambda x: pd.Categorical(
        np.where(x['PRWKSTAT'].isin([2, 8, 9]), 1,
        np.where(x['PRWKSTAT'].between(2, 10), 0, None)))
    ftlf = lambda x: pd.Categorical(
        np.where(x.PRFTLF == 1, 'Full-time',
        np.where(x.PRFTLF == 2, 'Part-time', None)))
    sameemp = lambda x: pd.Categorical(
        np.where(x['PUIODP1'] == 1, 1,
        np.where(x['PUIODP1'] == 2, 0, None)))
    chduties = lambda x: pd.Categorical(
        np.where(x['PUIODP2'] == 1, 1,
        np.where(x['PUIODP2'] == 2, 0, None)))
    sameact = lambda x: pd.Categorical(
        np.where(x['PUIODP3'] == 1, 1,
        np.where(x['PUIODP3'] == 2, 0, None)))
    chjobact = lambda x: pd.Categorical(
        np.where((x.PUIODP1 == 2) | (x.PUIODP2 == 1) | 
                 (x.PUIODP3 == 2), 1,
        np.where((x.PUIODP1 == 1) | (x.PUIODP2 == 2) | 
                 (x.PUIODP3 == 1), 0, None)))
    notatwork = lambda x: pd.Categorical(
        np.where(x['PRWKSTAT'].isin([5,10]), 1,
        np.where(x['PRWKSTAT'].between(1, 12), 0, None)))
    # Fallback is None, not np.nan: mixing np.nan with string labels in
    # np.where produces the literal string 'nan' as a category
    lfs = lambda x: pd.Categorical(
        np.where(x['PEMLR'].isin([1, 2]), 'Employed',
        np.where(x['PEMLR'].isin([3, 4]), 'Unemployed',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'NILF', None))))
    cow1 = lambda x: pd.Categorical(
        np.where(x['PEIO1COW'] == 1, 'Federal Government',
        np.where(x['PEIO1COW'] == 2, 'State Government',
        np.where(x['PEIO1COW'] == 3, 'Local Government',
        np.where(x['PEIO1COW'].isin([4, 5]), 'Private',
        np.where(x['PEIO1COW'] == 6, 'Self-employed Incorporated',
        np.where(x['PEIO1COW'] == 7, 'Self-employed Unincorporated',
        np.where(x['PEIO1COW'] == 8, 'Without Pay', None))))))))
    cow2 = lambda x: pd.Categorical(
        np.where(x['PEIO2COW'] == 1, 'Federal Government',
        np.where(x['PEIO2COW'] == 2, 'State Government',
        np.where(x['PEIO2COW'] == 3, 'Local Government',
        np.where(x['PEIO2COW'].isin([4, 5]), 'Private',
        np.where(x['PEIO2COW'] == 6, 'Self-employed Incorporated',
        np.where(x['PEIO2COW'] == 7, 'Self-employed Unincorporated',
        np.where(x['PEIO2COW'] == 8, 'Without Pay', None))))))))
    # Fallback is None (see LFS above) so non-NILF rows are missing,
    # not a 'nan' category
    nilfreason = lambda x: pd.Categorical(
        np.where((x['PRWNTJOB']==2) & 
                 ((x['PEMLR']==6) | (x['PENLFACT'].isin([1, 2]))), 
                 'Disabled/Ill',
        np.where((x['PRWNTJOB']==2) & (x['PENLFACT']==4), 'Family',
        np.where((x['PRWNTJOB']==2) & ((x['PEMLR']==5) | (x['PENLFACT']==5)), 
                 'Retired',
        np.where((x['PRWNTJOB']==2) & ((x['PENLFACT']==3) | (x['SCHENR']==1)), 
                 'School',
        np.where((x['PRWNTJOB']==1) & (x['PEMLR'].isin([5,6,7])), 'Discouraged',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'Other', None)))))))
    nlffam = lambda x: pd.Categorical(
        np.where(x.PENLFACT == 4, 1, 
        np.where(x.LFS == 'NILF', 0, None)))
    paidhrly = lambda x: pd.Categorical(
        np.where(x['PEERNHRY'] == 1, 1,
        np.where(x['PEERNHRY'] == 2, 0, None)))
    proxy = lambda x: pd.Categorical(
        np.where(x['PUSLFPRX'] == 1, 'Self',
        np.where(x['PUSLFPRX'] == 2, 'Proxy',
        np.where(x['PUSLFPRX'] == 3, 'Both', None))))
    dwtype = lambda x: pd.Categorical(
        np.where(x.PRDISC == 1, 'Discouraged',
        np.where(x.PRDISC == 2, 'Marginally Attached',
        np.where(x.PRJOBSEA == 5, 'No Recent Search',
        np.where(x.PRDISC == 3, 'Unavailable', None)))))
    abstype = lambda x: pd.Categorical(
        np.where((x.PRABSREA.isin([1, 11, 21, 31])) | 
                 (x.PEHRRSN3 == 4), 'Vacation', 
        np.where((x.PRABSREA.isin([2, 12, 22, 32])) | 
                 (x.PEHRRSN3 == 5), 'Sick',  
        np.where((x.PRABSREA.isin([3, 13, 23, 33])) | 
                 (x.PEHRRSN3 == 7), 'Child Care', 
        np.where((x.PRABSREA.isin([4, 14, 24, 34])) | 
                 (x.PEHRRSN3 == 8), 'Other Family/Personal', 
        np.where(x.PRABSREA.isin([5, 15, 25, 35]), 'Maternity/Paternity',  
        np.where((x.PRABSREA.isin([6, 16, 26, 36])) | 
                 (x.PEHRRSN3 == 9), 'Labor Dispute',  
        np.where((x.PRABSREA.isin([7, 17, 27, 37])) |
                 (x.PEHRRSN3 == 10), 'Weather',  
        np.where((x.PRABSREA.isin([8, 18, 28, 38])) | 
                 (x.PEHRRSN3 == 11), 'School/Training',  
        np.where((x.PRABSREA.isin([9, 19, 29, 39])) | 
                 (x.PEHRRSN3 == 12), 'Civic/Military',  
        np.where((x.PRABSREA.isin([10, 20, 30, 40])) | 
                 (x.PEHRRSN3 == 13), 'Other',  
        np.where(x.PEHRRSN3 == 6, 'Holiday', None))))))))))))
    abspaid = lambda x: pd.Categorical(
        np.where(x.PRABSREA.isin(list(range(1, 11)) + 
                                 list(range(21, 31))), 'Paid',
        np.where(x.PRABSREA.isin(list(range(11, 21)) + 
                                 list(range(31, 41))), 'Unpaid', None)))
    ptreason = lambda x: pd.Categorical(
        np.where(x.ABSTYPE.notnull(), None, # Not defined for FT but absent
        np.where(x.PRPTREA == 3, 'Job Started/Ended During Week',
        np.where((x.PRPTREA.isin([1, 14])) | (x.PEHRRSN1 == 1), 
                 'Slack Work/Business Conditions',
        np.where((x.PRPTREA == 15) | (x.PEHRRSN1 == 2), 
                 'Could Only Find PT Work',
        np.where((x.PRPTREA.isin([2, 16])) | (x.PEHRRSN1 == 3), 
                 'Seasonal Work',
        np.where((x.PRPTREA == 17) | (x.PEHRRSN1 == 4) | (x.PEHRRSN2 == 1),
                 'Child Care Problems',
        np.where((x.PRPTREA == 18) | (x.PEHRRSN1 == 5) | (x.PEHRRSN2 == 2), 
                 'Other Family/Personal Obligations',
        np.where((x.PRPTREA == 19) | (x.PEHRRSN1 == 6) | (x.PEHRRSN2 == 3),
                 'Health/Medical Limitations',
        np.where((x.PRPTREA == 20) | (x.PEHRRSN1 == 7) | (x.PEHRRSN2 == 4), 
                 'School/Training',
        np.where((x.PRPTREA == 21) | (x.PEHRRSN1 == 8) | (x.PEHRRSN2 == 5), 
                 'Retired/Earnings Limit',
        np.where((x.PRPTREA == 22) | (x.PEHRRSN1 == 9) | (x.PEHRRSN2 == 6), 
                 'Workweek <35 Hours',
        np.where((x.PRPTREA.isin([13, 23])) | (x.PEHRRSN1 == 10) | (x.PEHRRSN2 == 7), 
                 'Other', None)))))))))))))
    wantft = lambda x: pd.Categorical(
        np.where(x.PEHRWANT == 1, 1, np.where(x.PEHRWANT == 2, 0, None)))
    jltype = lambda x: pd.Categorical(
        np.where(x.PRUNTYPE == 1, 'Temporary Layoff', 
        np.where(x.PRUNTYPE == 2, 'Permanent Job Loss', 
        np.where(x.PRUNTYPE == 3, 'Temporary Job Ended', None))))
    retired = lambda x: pd.Categorical(
        np.where((x.PENLFRET == 1) | (x.PENLFACT == 5) | 
                 (x.PEMLR == 5), 1, 0))
    school = lambda x: pd.Categorical(
        np.where(x.PESCHLVL == 1, 'High School', 
        np.where((x.PESCHLVL == 2) & (x.PESCHFT == 1), 'Full-time College', 
        np.where((x.PESCHLVL == 2) & (x.PESCHFT == 2), 'Part-time College', 
        np.where(x.SCHENR == 0, 'Not Enrolled', None)))))
    # Wage variables (raw values carry two implied decimals)
    wkearn = lambda x: np.where(x.PRERNWA >= 0, x.PRERNWA / 100, None)
    hrwage = lambda x: (
        np.where((x.PRERNHLY < 0) & (x.PEHRUSL1 > 0 ) & 
                 (x.PRERNWA > 0), (x.PRERNWA / x.PEHRUSL1) / 100,
        np.where(x.PRERNHLY >= 0, x.PRERNHLY / 100, None)))
    otcamt = lambda x: np.where(x.PEERN > 0, x.PEERN / 100, None)
    priceadj = lambda x: 1 * x.REGION.map(cpi_vals).astype('float')
    
    # Minimum wage in effect for this month (start inclusive, end exclusive)
    fed_min_wage = None
    for start, end, wage in min_wage:
        if (date >= start) and (date < end):
            fed_min_wage = wage
    # Fail early and clearly instead of a NameError inside the lambda below
    if fed_min_wage is None:
        raise ValueError(
            f'No federal minimum wage period in min_wage covers {date}; '
            'extend the min_wage table')
    minwage = lambda x: pd.Categorical( 
        np.where((x['HRWAGE'] > 0) & (x['HRWAGE'] <= fed_min_wage), 1, 
        np.where(x['HRWAGE'] > fed_min_wage, 0, None)))
    
    # Correct weights for implied decimals
    df[maps['wgt']] = (df[maps['wgt']] / 10000.0).astype('float32')
    
    # Assign new variables and drop old ones
    df = (df.assign(AGE = age, 
                    FEMALE = female,
                    STATE = state,
                    REGION = region,
                    EDUCDT = educdt,
                    EDUC = educ,
                    SCHENR = schenr,
                    SCHOOL = school,
                    RETIRED = retired,
                    MARRIED = married,
                    WBHAO = wbhao,
                    WBAO = wbao,
                    HISPANIC = hispanic,
                    VETERAN = veteran,
                    FORBORN = forborn,
                    CITIZEN = citizen,
                    UNEMPTYPE = unemptype,
                    JLTYPE = jltype,
                    LAYOFF = layoff,
                    PTECON = ptecon,
                    USLFT = uslft,
                    WORKFT = workft,
                    FTLF = ftlf,
                    SAMEEMP = sameemp,
                    CHDUTIES = chduties,
                    SAMEACT = sameact,
                    CHJOBACT = chjobact,
                    NOTATWORK = notatwork,
                    ABSTYPE = abstype, 
                    ABSPAID = abspaid,
                    PTREASON = ptreason,
                    WANTFT = wantft,
                    DWTYPE = dwtype,
                    PAIDHRLY = paidhrly,
                    PROXY = proxy,
                    LFS = lfs,
                    COW1 = cow1,
                    COW2 = cow2,
                    INDGRP = indgrp,
                    MANAGER = manager,
                    MJH = mjh,
                    NUMJOBS = numjobs,
                    NILFREASON = nilfreason,
                    NLFFAM = nlffam,
                    WKEARN = wkearn,
                    HRWAGE = hrwage,
                    OTCAMT = otcamt,
                    PRICEADJ = priceadj,
                    MINWAGE = minwage)
            .drop(DropVars, axis=1)
            .rename({'HRMONTH': 'MONTH', 'HRHHID': 'HHID', 
                     'PRWERNAL': 'WKEARNFLG',
                     'PEPARENT': 'PARENT', 'PRFAMNUM': 'FAMNUM',
                     'PRUNEDUR': 'UNEMPDUR', 'PESPOUSE': 'SPOUSE',
                     'HRMIS': 'MIS', 'GTCBSA': 'CBSA', 'GTCSA': 'CSA',
                     'PEHRACTT': 'HRSACTT', 'PEHRUSLT': 'HRSUSLT',
                     'PEHRUSL1': 'HRSUSL1', 'PEHRUSL2': 'HRSUSL2',
                     'PEHRACT1': 'HRSACT1', 'PEHRACT2': 'HRSACT2',
                     'HWHHWGT': 'HHWGT', 'HEFAMINC': 'FAMINC',
                     'PULINENO': 'LINENO'}, axis=1))
    
    # Wage variables to float32
    wage_vars = ['WKEARN', 'OTCAMT', 'HRWAGE']
    df[wage_vars] = df[wage_vars].astype('float32')
    return df

In [6]:
def square(x):
    """Return x squared (referenced by name in the impute_hours formula)."""
    return x**2


def cube(x):
    """Return x cubed (referenced by name in the impute_hours formula)."""
    return x**3


def impute_hours(df):
    """Impute usual hours on the first job into a new column HRSUSL1I.

    A separate OLS regression of HRSUSL1 is fit for each combination of
    FEMALE (0, 1) and FTLF ('Full-time', 'Part-time'), using the rows of that
    group with HRSUSL1 > 0. Predictions are made for every row of the group
    and floored at 35 hours for full-time groups and 1 hour otherwise.

    ``df`` is modified in place (HRSUSL1I is written) and also returned.
    """
    spec = '''HRSUSL1 ~ AGE + square(AGE) + cube(AGE) + C(WBHAO) + 
              C(EDUCDT) + MARRIED + CITIZEN + FORBORN + PRIVATE + 
              C(STATE) + NUMJOBS + C(INDM) + MANAGER + SELFEMP'''
    for s in [0, 1]:
        sex_mask = (df.FEMALE == s)
        for i in ['Full-time', 'Part-time']:
            group_mask = (df.FTLF == i) & sex_mask
            group = df.loc[group_mask]
            # Fit only on rows that report positive usual hours
            fit_sample = group.query('HRSUSL1 > 0')
            reg = smf.ols(formula=spec, data=fit_sample).fit()
            # clip() replaces writing through `.values`, which is read-only
            # under pandas Copy-on-Write; NaN predictions stay NaN
            floor = 35 if i == 'Full-time' else 1
            predicted = reg.predict(group).clip(lower=floor)
            df.loc[group_mask, 'HRSUSL1I'] = predicted

    return df

In [7]:
def mean_above_topcode(group, topcode):
    '''CEPR mean above topcode approach for specific columns

    Uses the rows of ``group`` with WKEARN > 0: their WKEARN values, their
    PWORWGT weights, and the weighted share of rows flagged PTWK == 1 are
    passed to ``mtc_cepr`` together with ``topcode``.

    Returns NaN when the group has no positive-earnings rows or their total
    weight is zero, since there is nothing to estimate from.
    '''
    wage_obs = group.query('WKEARN > 0')
    total_wgt = wage_obs.PWORWGT.sum()
    # Guard: an empty sample would otherwise fail inside np.average
    if len(wage_obs) == 0 or total_wgt == 0:
        return np.nan
    wages = wage_obs.WKEARN.values
    wgts = wage_obs.PWORWGT.values
    tc_share = (wage_obs.loc[wage_obs.PTWK == 1, 'PWORWGT'].sum() / 
                total_wgt)
    return mtc_cepr(wages, wgts, topcode, tc_share)


def mtc_cepr(data, weights, topcode, topcode_share):
    '''
    Estimate mean above topcode for lognormally distributed data

    data: values whose logs are averaged (must be positive)
    weights: weights for that average
    topcode: the topcode value
    topcode_share: share of observations at the topcode

    Returns NaN when topcode_share is 0, otherwise the estimated mean.
    '''
    # Nothing is topcoded, so there is no mean above the topcode
    if topcode_share == 0:
        return np.nan

    share_below = 1 - topcode_share
    log_topcode = np.log(topcode)
    mean_log = np.average(np.log(data), weights=weights)

    # Standard-normal quantile and density at the share below the topcode
    z = norm.ppf(share_below)
    lower_term = -norm.pdf(z) / share_below

    # Parameters of the log-scale normal implied by the truncated mean
    sigma = (log_topcode - mean_log) / (share_below * (z - lower_term))
    mu = log_topcode - (z * sigma)

    upper_term = norm.pdf(z) / (1 - norm.cdf(z))
    return np.exp(mu + sigma * upper_term)

In [8]:
def clean_special(dfm, maps, cpi_vals, date):
    """Apply the cleaning steps that depend on the month being processed.

    Takes the output of ``clean_all`` for one month and, depending on
    ``date``, merges revised data from feather files under ``clean/``,
    builds variables that only exist in some periods (CTYBIRTH, WBHAOM,
    HISPDT03, HISPDT, INDM, ASIANDT, CERT, DISABILITY, PARENT, COUNTY,
    METSTAT, MPCSTAT, CPSID, ...), and renames weight, year, industry and
    occupation columns.

    Parameters
    ----------
    dfm : pandas.DataFrame
        One month of data.
    maps : dict
        Month-specific lookups; this function reads the keys 'hisp',
        'racem', 'hispdt', 'hispdt03' and 'id2'.
    cpi_vals : dict
        Not used in this function's body.
    date : pandas.Timestamp
        Month being processed; controls which steps run.

    Returns
    -------
    pandas.DataFrame

    Notes
    -----
    Reads the module-level ``cps_ids_full`` / ``cps_ids_full_early`` lookups
    when the corresponding pickle file exists. The steps are order
    dependent: later steps use columns renamed or created by earlier ones.
    """
    
    # New variables in selected dates:
    wbhaom = lambda x: pd.Categorical(
        np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                 x['PRDTRACE'].map(maps['racem'])))
    hispdt = lambda x: pd.Categorical(x['PRDTHSP'].map(maps['hispdt']))
    hispdt03 = lambda x: pd.Categorical(x['PRDTHSP'].map(maps['hispdt03']))
    cert = lambda x: pd.Categorical(
        np.where(x['PECERT1'] == 1, 1, np.where(x['PECERT1']==2, 0, np.nan)))
    ctybirth94 = lambda x: pd.Categorical(x['PENATVTY'].map(COB1994Map))
    ctybirth07 = lambda x: pd.Categorical(x['PENATVTY'].map(COB2007Map))
    # Combined code is GTCO * 100 + GESTFIPS when GTCO > 0, otherwise 0
    county = lambda x: pd.Categorical(
        np.where(x['GTCO'] > 0, x['GTCO'] * 100 + x['GESTFIPS'], 0))
    mpcstat = lambda x: pd.Categorical(
        np.where(x.MSAST == 1, 'Principal City',
        np.where(x.MSAST == 2, 'Balance',
        np.where(x.MSAST == 3, 'Nonmetropolitan',
        np.where(x.MSAST == 4, 'Not Identified', None)))))
    metstat = lambda x: pd.Categorical(
        np.where(x.METSTA == 1, 'Metropolitan',
        np.where(x.METSTA == 2, 'Nonmetropolitan',
        np.where(x.METSTA == 3, 'Not Identified', None))))
    
    # Revised weights for December 2007
    # (merge has no explicit keys, so it joins on all shared column names)
    if date == pd.to_datetime('2007-12-01'):
        df_rev = pd.read_feather('clean/cps_dec07_rev.ft')
        dfm = pd.merge(dfm, df_rev)
        dfm['PWSSWGT'] = dfm['NWSSWGT']
        dfm['PWCMPWGT'] = dfm['NWCMPWGT']
        dfm = dfm.drop(['NWSSWGT', 'NWCMPWGT'], axis=1)

    # Country of origin revised in 2007    
    if date.year < 2007:
        dfm = dfm.assign(CTYBIRTH = ctybirth94)
    if date.year >= 2007:
        dfm = dfm.assign(CTYBIRTH = ctybirth07) 
    dfm = dfm.drop(['PENATVTY'], axis=1)   
    dfm['CTYBIRTH'] = dfm['CTYBIRTH'].astype('category')

    # Four digit year
    if date.year < 1998:
        dfm['HRYEAR4'] = dfm['HRYEAR'] + 1900
        dfm = dfm.drop(['HRYEAR'], axis=1)
        
        # Person weight is basic weight
        dfm['BASICWGT'] = dfm['PWSSWGT'] 
        
    dfm = dfm.rename({'HRYEAR4': 'YEAR'}, axis=1)

    # Person weight is composite weight
    if date.year >= 1998:
        dfm = dfm.rename({'PWCMPWGT': 'BASICWGT'}, axis=1)

    # Detailed race allows identifying more than one race
    if date.year > 2002:
        dfm = dfm.assign(WBHAOM = wbhaom)
        dfm = dfm.assign(HISPDT03 = hispdt03)
    if date.year > 2013:
        dfm = dfm.assign(HISPDT = hispdt)        
    dfm = dfm.drop(['PRDTHSP'], axis=1)
    
    # Atlanta Fed Wage Growth Tracker Flag (currently disabled)
    # atlflg = lambda x: np.where((x.PTHR == 0) & (x.PTWK == 0) & (x.PRNAGWS == 1)
    #                            & (x.PRHERNAL < 1) & (x.WKEARNFLG < 1), 1, 0)
    # drop_vars = ['PTHR', 'PRHERNAL', 'PRNAGWS']
    # drop_vars = [dv for dv in drop_vars if dv in dfm.keys()]
    # if date >= pd.to_datetime('1995-09-01'):
    #     dfm = dfm.assign(ATLFLG = atlflg)
    # dfm = dfm.drop(drop_vars, axis=1) 
    
    # Renaming major industry and occupation codes
    if date.year >= 2003:
        dfm = dfm.rename({'PRMJOCC1': 'OCC03M', 'PRMJOCC2': 'OCC203M', 
                          'PRDTOCC1': 'OCC03D', 'PRDTOCC2': 'OCC203D',
                          'PRMJIND1': 'IND03M', 'PRMJIND2': 'IND203M', 
                          'PRDTIND1': 'IND03D', 'PRDTIND2': 'IND203D'}, axis=1)
    if date.year < 2003:
        dfm = dfm.rename({'PRMJOCC1': 'OCC80M', 'PRMJOCC2': 'OCC280M', 
                          'PRDTOCC1': 'OCC80D', 'PRDTOCC2': 'OCC280D',
                          'PRMJIND1': 'IND80M', 'PRMJIND2': 'IND280M', 
                          'PRDTIND1': 'IND80D', 'PRDTIND2': 'IND280D',
                          'PEIO1OCD': 'OCC90', 'PEIO2OCD': 'OCC290', 
                          'PEIO1ICD': 'IND90', 'PEIO2ICD': 'IND290'}, axis=1)
      
    # Renaming industry and occupation codes
    # (the target name depends on the year range the month falls in)
    if 2003 <= date.year <= 2008:
        dfm = dfm.rename({'PEIO1ICD': 'IND02', 'PEIO2ICD': 'IND202'}, axis=1)
    if 2003 <= date.year <= 2010:
        dfm = dfm.rename({'PEIO1OCD': 'OCC00', 'PEIO2OCD': 'OCC200'}, axis=1)    
    if 2009 <= date.year <= 2013:
        dfm = dfm.rename({'PEIO1ICD': 'IND07', 'PEIO2ICD': 'IND207'}, axis=1) 
    if 2011 <= date.year <= 2019:
        dfm = dfm.rename({'PEIO1OCD': 'OCC10', 'PEIO2OCD': 'OCC210'}, axis=1)    
    if 2014 <= date.year <= 2019:
        dfm = dfm.rename({'PEIO1ICD': 'IND12', 'PEIO2ICD': 'IND212'}, axis=1)
    if 2020 <= date.year <= 2024:
        dfm = dfm.rename({'PEIO1OCD': 'OCC18', 'PEIO2OCD': 'OCC218',
                          'PEIO1ICD': 'IND17', 'PEIO2ICD': 'IND217'}, axis=1)
    if date.year >= 2025:
        dfm = dfm.rename({'PEIO1OCD': 'OCC18', 'PEIO2OCD': 'OCC218',
                          'PEIO1ICD': 'IND25', 'PEIO2ICD': 'IND225'}, axis=1)

    
    # Major industry recode consistent over years
    # (uses whichever of the renamed industry columns is present; the lambda
    # is applied inside the same iteration, so it sees the current indvar)
    for indvar in ['IND90', 'IND02', 'IND07', 'IND12', 'IND17', 'IND25']:
        if indvar in dfm.keys():
            indmap = {i: k for k, v in INDMMap.items() for i in v[indvar]}
            indm = lambda x: pd.Categorical(x[indvar].map(indmap))
            dfm = dfm.assign(INDM = indm)
        
    # Detailed Asian Race
    asiandt = lambda x: pd.Categorical(x['PRDASIAN'].map(AsianMap))
    if date.year >= 2013:
        dfm = dfm.assign(ASIANDT = asiandt).drop(['PRDASIAN'], axis=1) 

    # Professional certification questions
    # (2015 and 2016 values are merged in from a separate feather file)
    if date.year in [2015, 2016]:
        year = date.year
        month = date.month
        rev_df = (pd.read_feather(f'clean/cps_cert{year}.ft')
                    .query('MONTH == @month'))
        dfm = pd.merge(dfm, rev_df, how='outer')

    if date.year >= 2015:
        dfm = dfm.assign(CERT = cert).drop(['PECERT1'], axis=1)
        
    # Disability status
    # (June-December 2008 values are merged in from a separate feather file)
    if (date >= pd.to_datetime('2008-06-01') ) and (date.year < 2009):
        month = date.month
        feather_file = 'clean/cps_disability2008.ft'
        rev_df = (pd.read_feather(feather_file)
                    .query('MONTH == @month')
                    .drop(['MONTH'], axis=1))
        dfm = pd.merge(dfm, rev_df, on=['QSTNUM', 'OCCURNUM'], how='outer')
    if date >= pd.to_datetime('2008-06-01'):
        disability = lambda x: pd.Categorical(
            np.where(x['PRDISFLG'] == 1, 1, 0))
        dfm = dfm.assign(DISABILITY = disability).drop(['PRDISFLG'], axis=1)
   
    # Matching HRHHID2 in cases where it must be created manually
    if maps['id2'] == True:
        dfm['HRHHID2'] = id2_gen(dfm)
        dfm = dfm.drop(['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'], axis=1)
        
    dfm = dfm.rename({'HRHHID2': 'HHID2'}, axis=1)
    
    # Add QSTNUM and OCCURNUM where not available
    # (QSTNUM numbers each HHID/HHID2 group; OCCURNUM counts rows within it)
    if date.year < 1998:
        dfm['QSTNUM'] = dfm.groupby(['HHID','HHID2']).ngroup().astype('int32')
        dfm['OCCURNUM'] = (dfm.groupby('QSTNUM').cumcount() + 1).astype('int8')
    
    # Unique household IDS
    # (only added when the pickle file holding the lookup exists)
    if date > pd.to_datetime('1995-05-01'): 
        ids_file = 'CPS_unique_ids.pkl'
        if os.path.isfile(ids_file):
            dfm['CPSID'] = dfm['QSTNUM'].map(cps_ids_full[date])
    if date <= pd.to_datetime('1995-05-01'): 
        ids_file = 'CPSID_89-93.pkl'
        if os.path.isfile(ids_file):
            dfm['CPSID'] = dfm['QSTNUM'].map(cps_ids_full_early[date])
      
    # Merge in COVID data (currently disabled)
    #if date >= pd.to_datetime('2020-05-01'):
    #    rev_df = (pd.read_feather(f'clean/cps_covid_{date.year}.ft')
    #                .query('MONTH == @date.month'))
    #    dfm = pd.merge(dfm, rev_df)         
        
    # Parent 2020 onward
    # (PEPAR2 when positive, else PEPAR1 when positive, else -1)
    parent = lambda x: np.where(x.PEPAR2 > 0, x.PEPAR2, 
                       np.where(x.PEPAR1 > 0, x.PEPAR1, -1))
    if date >= pd.to_datetime('2020-01-01'):
        dfm = (dfm.assign(PARENT = parent)
                  .astype({'PARENT': 'int8'})
                  .drop(['PEPAR1', 'PEPAR2'], axis=1))
    
    # County code (state and county combined)
    if date >= pd.to_datetime('1995-09-01'): 
        dfm = dfm.assign(COUNTY = county)
        dfm = dfm.drop(['GESTFIPS', 'GTCO'], axis=1)
    elif date >= pd.to_datetime('1995-01-01'):
        dfm['COUNTY'] = -1
        dfm = dfm.drop(['GESTFIPS'], axis=1)
    else:
        dfm = dfm.drop(['GESTFIPS'], axis=1)
        
    # Metropolitan/Principal City Status
    # (placeholder values are filled in for the listed month ranges)
    if ((date >= pd.to_datetime('1995-06-01')) & 
        (date <= pd.to_datetime('1995-08-01'))):
        dfm['MSAST'] = 4
        dfm['METSTA'] = 3
        dfm['CMSA'] = -1
        dfm['MSA'] = -1
    if ((date >= pd.to_datetime('2004-05-01')) & 
        (date <= pd.to_datetime('2004-12-01'))):
        dfm['CMSA'] = -1
        dfm['MSA'] = -1
    if ((date >= pd.to_datetime('2004-01-01')) & 
        (date <= pd.to_datetime('2004-04-01'))):
        dfm['CBSA'] = -1
        dfm['CSA'] = -1
    dfm = (dfm.assign(METSTAT = metstat, MPCSTAT = mpcstat)
              .drop(['MSAST', 'METSTA'], axis=1))
    
    # Imputed mean above topcode weekly (currently disabled)
    # if date.year < 1998: 
    #     topcode = 1923.0
    # elif date.year >= 1998:
    #     topcode = 2884.61
    # rsd = (dfm.groupby(['REGION', 'FEMALE'], group_keys=True)
    #           .apply(lambda x: mean_above_topcode(x, topcode))
    #           .to_dict())
    # dfm['WKEARNADJ'] = pd.Series(dtype='float32')
    # for group, value in rsd.items():
    #     mask = (dfm.PTWK == 1) & (dfm.REGION == group[0]) & (dfm.FEMALE == group[1])
    #     dfm.loc[mask, 'WKEARNADJ'] = value
    
    # Imputed usual hours worked on first job (currently disabled)
    # private = lambda x: np.where(x['COW1'] == 'Private', 1, 0)
    # se = ['Self-employed Incorporated', 'Self-employed Unincorporated']
    # selfemp = lambda x: np.where(x.COW1.isin(se), 1, 0)
    # dfm = (impute_hours(dfm.assign(PRIVATE = private, SELFEMP = selfemp))
    #        .drop(['PRIVATE', 'SELFEMP'], axis=1)
    #        .astype({'HRSUSL1I': 'float16'}))
    
    # hrwageadj = (
    # lambda x: np.where((x.PRERNHLY >= 0) & (x.PEERN < 1), 
    #                    x.PRERNHLY / 100.0,
    #           np.where((x.WKEARNADJ >= 0) & (x.HRSUSL1 > 0), 
    #                    (x.WKEARNADJ / x.HRSUSL1),
    #           np.where((x.WKEARNADJ >= 0) & (x.HRSUSL1 == -4), 
    #                    (x.WKEARNADJ / x.HRSUSL1I), np.nan)))) 

    # drop_list = ['PRERNHLY', 'PEERN', 'PRERNWA']
    # dfm = (dfm.assign(HRWAGEADJ = hrwageadj)
    #           .astype({'HRWAGEADJ': 'float32',
    #                    'WKEARNADJ': 'float32'})
    #           .drop(drop_list, axis=1))
    
    return dfm

In [9]:
def revised_annual_data(df, year):
    """Finish a combined annual file: revised data, union flags and dtypes.

    For 2000-2002 the revised weights and io recodes are merged in from
    feather files, and for 2001-2002 the revised union variables as well.
    UNION and UNIONMEM are then built from PEERNLAB / PEERNCOV (which are
    dropped), a fixed list of columns is converted to category where
    present, SPOUSE / PARENT are stored as int8 with -1 for missing, and the
    weight columns are stored as float32.

    Returns the resulting DataFrame.
    """

    # Merge in the 2000-revised weights and io recodes here
    if year >= 2000 and year <= 2002:
        df = pd.merge(df, pd.read_feather(f'clean/cps_wgt_rev{year}.ft'))
        revised_weights = [('BASICWGT', 'NWCMPWGT'), ('PWORWGT', 'NWORWGT'),
                           ('PWSSWGT', 'NWSSWGT'), ('HHWGT', 'NWHHWGT')]
        for target, revised in revised_weights:
            df[target] = df[revised]
        df = df.drop(columns=['NWCMPWGT', 'NWORWGT', 'NWSSWGT', 'NWHHWGT'])
        # IO recodes
        df = pd.merge(df, pd.read_feather(f'clean/cps_io_rev{year}.ft'))

    # Merge in revised union data
    if year in (2001, 2002):
        df = pd.merge(df, pd.read_feather(f'clean/cps_union_rev{year}.ft'))
        for target, revised in [('PEERNLAB', 'NEERNLAB'),
                                ('PEERNCOV', 'NEERNCOV')]:
            df[target] = df[revised]
        df = df.drop(columns=['NEERNLAB', 'NEERNCOV'])

    # Create UNION and UNIONMEM
    def union(x):
        # 1 if either flag is 1, 0 only if both are 2, otherwise missing
        lab, cov = x['PEERNLAB'], x['PEERNCOV']
        either = (lab == 1) | (cov == 1)
        neither = (lab == 2) & (cov == 2)
        values = np.where(either, 1, np.where(neither, 0, None))
        return pd.Categorical(values, ordered=True)

    def unionmem(x):
        # 1 / 0 from PEERNLAB being 1 / 2, otherwise missing
        lab = x['PEERNLAB']
        values = np.where(lab == 1, 1, np.where(lab == 2, 0, None))
        return pd.Categorical(values, ordered=True)

    df = df.assign(UNION=union, UNIONMEM=unionmem)
    df = df.drop(columns=['PEERNLAB', 'PEERNCOV'])

    # General mess clean up area
    cat_vars = ['PRDTRACE', 'COW2', 'REGION', 'CSA', 'CBSA', 'CTYBIRTH', 'YEAR',
                'DISABILITY', 'INDGRP', 'IND80D', 'OCC80D',
                'IND80M', 'OCC80M', 'IND03D', 'OCC03D', 'OCC03M',
                'IND03M', 'IND203M', 'IND203D', 'OCC203D', 'OCC203M',
                'OCC280M', 'IND280M', 'OCC280D', 'IND280D',
                'OCC90', 'OCC290', 'OCC00', 'OCC200', 'OCC10', 'OCC210', 'OCC18',
                'OCC218', 'IND90', 'IND290', 'IND02', 'IND202', 'IND07', 'IND207',
                'IND12', 'IND212', 'IND17', 'IND217', 'IND25', 'IND225',
                'HRSUSL1',
                'CMSA', 'MSA', 'COUNTY', 'PTCOVID1', 'PTCOVID2', 'PTCOVID3',
                'PTCOVID4', 'HHID2', 'FAMNUM', 'METSTAT', 'MPCSTAT', 'USLFT',
                'ABSTYPE', 'FTLF', 'FAMINC', 'PTREASON', 'WANTFT', #'ATLFLG',
                'WKEARNFLG', 'PTWK', 'GTCBSASZ', 'SAMEEMP', 'CHDUTIES', 'SAMEACT',
                'CHJOBACT']
    # Only convert the columns that exist in this year's data
    present = [name for name in cat_vars if name in df.columns]
    df = df.astype({name: 'category' for name in present})

    # Clean up int vars
    intvars = ['SPOUSE', 'PARENT']
    df[intvars] = df[intvars].fillna(-1).astype('int8')

    # Fix prnmchld size 1999
    if year == 1999:
        chvars = ['PRCHLD', 'PRNMCHLD']
        df[chvars] = df[chvars].fillna(-1).astype('int8')

    wgt_vars = ['BASICWGT', 'PWSSWGT', 'PWORWGT', 'HHWGT']
    df[wgt_vars] = df[wgt_vars].astype('float32')
    return df

In [10]:
def cps_to_feather(year_list):
    """Convert raw monthly CPS files into one cleaned feather file per year.

    For each year in ``year_list`` the raw files in the module-level
    ``data_files`` list whose names end in '<yy>pub.dat' are read in
    chronological order, cleaned with ``clean_all`` and ``clean_special``,
    concatenated, passed through ``revised_annual_data`` and written to
    'clean/cps<year>.ft'. A one-line summary is printed per year.

    Years with no matching raw files are skipped with a message.
    Relies on the module-level ``data_files``, ``cpi`` and ``cpsdd`` objects
    and on the working directory containing the raw files.
    """
    # Name of the data dictionary currently loaded; the settings derived from
    # it below are reused until a file needs a different one
    data_dictionary = None
    for year in year_list:

        # Get the list of raw data files for the given year, paired with the
        # month each covers. Sorted chronologically because the directory
        # listing order is arbitrary.
        file_ending = f'{str(year)[2:]}pub.dat'
        raw_files = sorted(
            (pd.to_datetime(f'{year}-{file[:3]}-01'), file)
            for file in data_files
            if file.endswith(file_ending))

        # Nothing to combine: pd.concat would fail on an empty list
        if not raw_files:
            print(f'{year} Skipped: no raw files ending in {file_ending}')
            continue

        # Loop over individual raw monthly files
        combined_data = []
        for date, file in raw_files:
            # Month's CPI values (by region)
            cpi_vals = cpi.loc[date].to_dict()

            # Identify how to read the raw data file
            if data_dictionary != cpsdd['matcher'][file]:
                data_dictionary = cpsdd['matcher'][file]
                dd_info = cpsdd[data_dictionary]
                var_info = dd_info['dd']
                # Location of the person weight field, used to filter rows
                ws, we = var_info['PWSSWGT'][:2]
                dtypes = [(var_name, var_details[-1])
                          for var_name, var_details in var_info.items()]
                var_maps = dd_info['map']
                unpack_format = dd_info['unpack_fmt']
                unpacker = struct.Struct(unpack_format).unpack_from

            # Read raw monthly data and return pandas dataframe
            mo_data = data_file_reader(file, unpacker, dtypes, ws, we)

            # Clean up the data
            dfm = clean_all(mo_data, var_maps, cpi_vals, date)
            clean_mo_data = clean_special(dfm, var_maps, cpi_vals, date)

            combined_data.append(clean_mo_data)

        # Combine monthly files into one annual file
        df = (pd.concat(combined_data, sort=False)
                .reset_index(drop=True))

        # Census revised 2000-based weights and union data
        df = revised_annual_data(df, year)

        df.to_feather(f'clean/cps{year}.ft')
        obs = len(df)
        cols = len(df.keys())
        size = round(df.memory_usage().sum() / 1024**2, 1)
        print(f'{year} Done: ({obs:,} records, {cols} variables, {size}MB)')

In [11]:
# Build the annual extracts for 1994 through 2025 (range end is exclusive)
cps_to_feather(range(1994, 2026))

  df = (pd.concat(combined_data, sort=False)
  df = (pd.concat(combined_data, sort=False)
  df = (pd.concat(combined_data, sort=False)
  df = (pd.concat(combined_data, sort=False)


1994 Done: (1,672,934 records, 109 variables, 301.6MB)
1995 Done: (1,648,060 records, 112 variables, 314.4MB)
1996 Done: (1,461,469 records, 112 variables, 269.1MB)
1997 Done: (1,462,817 records, 112 variables, 269.3MB)
1998 Done: (1,461,394 records, 112 variables, 269.0MB)
1999 Done: (1,465,602 records, 114 variables, 272.6MB)
2000 Done: (1,460,714 records, 126 variables, 294.0MB)
2001 Done: (1,560,956 records, 126 variables, 314.2MB)
2002 Done: (1,703,004 records, 126 variables, 342.8MB)
2003 Done: (1,685,264 records, 116 variables, 316.7MB)
2004 Done: (1,656,144 records, 119 variables, 317.6MB)
2005 Done: (1,644,787 records, 117 variables, 312.2MB)
2006 Done: (1,628,798 records, 117 variables, 307.6MB)
2007 Done: (1,611,901 records, 117 variables, 306.0MB)
2008 Done: (1,600,790 records, 118 variables, 305.4MB)
2009 Done: (1,617,099 records, 118 variables, 308.5MB)
2010 Done: (1,621,021 records, 118 variables, 309.3MB)
2011 Done: (1,600,068 records, 118 variables, 305.3MB)
2012 Done: