### bd econ CPS extract

bd_CPS_reader.ipynb

March 9, 2019

Contact: Brian Dew, @bd_econ

Requires: `cps_basic_dd.pkl`, which is generated by bd_CPS_dd.ipynb; `clean/cpi.csv` (retrieved using bd_CPS_cpi); and the `bd_CPS_details` module.

-----

See [readme](https://github.com/bdecon/econ_data/tree/master/bd_CPS) for documentation.

In [1]:
# Import python packages
import pandas as pd
print('pandas:', pd.__version__)
import numpy as np
print('numpy:', np.__version__)
import struct
import string
import os
import pickle

# Map codes for country of birth to country/area names
from bd_CPS_details import COB1994Map, COB2007Map

os.chdir('/home/brian/Documents/CPS/data')

pandas: 0.24.2
numpy: 1.16.2


In [2]:
# Settings
# NOTE: pickle can execute arbitrary code while loading; only open pickle
# files that were produced by the companion notebooks.
with open('cps_basic_dd.pkl', 'rb') as dd_file:
    cpsdd = pickle.load(dd_file)
data_path = '/home/brian/Documents/CPS/data/'
data_files = os.listdir(data_path)

# Consumer Price Index (retrieve using bd_CPS_cpi)
cpi = pd.read_csv('clean/cpi.csv', index_col=[0], parse_dates=True)

# Federal minimum wage from DOL, as (start date, end date, hourly wage).
# clean_all treats each interval as start-inclusive and end-exclusive.
min_wage = [('1981-01-01', '1990-04-01', 3.35),
            ('1990-04-01', '1991-04-01', 3.80),
            ('1991-04-01', '1996-10-01', 4.25),
            ('1996-10-01', '1997-09-01', 4.75),
            ('1997-09-01', '2007-07-24', 5.15),
            ('2007-07-24', '2008-07-24', 5.85),
            ('2008-07-24', '2009-07-24', 6.55),
            ('2009-07-24', '2020-12-01', 7.25)]

# Convert the date strings to timestamps so they can be compared to file dates
min_wage = [(pd.to_datetime(i[0]), pd.to_datetime(i[1]), i[2])
            for i in min_wage]

# Unique ID list (cps_ids_full is only defined when the file exists)
ids_file = 'CPS_unique_ids.pkl'
if os.path.isfile(ids_file):
    with open(ids_file, 'rb') as ids_handle:
        cps_ids_full = pickle.load(ids_handle)

In [3]:
def id2_gen(np_mo):
    """Build the numeric HRHHID2 identifier for pre May 2004 data.

    Each identifier is the concatenation of three text pieces:
    characters 1-2 of HRSAMPLE, a two-digit code for HRSERSUF (a letter
    becomes its zero-padded alphabet position; '-1', '-1.0' and '0' become
    '00'), and HUHHNUM.

    Side effect: negative HUHHNUM values in `np_mo` are overwritten with 0.

    Returns a numpy array of dtype uint32, one value per row of `np_mo`.
    """
    sample_part = [code[1:3] for code in np_mo['HRSAMPLE']]
    suffixes = [value.strip() for value in np_mo['HRSERSUF']]

    # Lookup from serial suffix to its two-digit text code
    letters = set(string.ascii_letters)
    suffix_codes = {}
    for suffix in set(suffixes):
        if suffix in letters:
            suffix_codes[suffix] = str(ord(suffix.lower()) - 96).zfill(2)
    for blank in ('-1', '-1.0', '0'):
        suffix_codes[blank] = '00'
    suffix_part = [suffix_codes.get(suffix) for suffix in suffixes]

    # Negative household numbers are replaced by zero before conversion
    is_negative = np_mo['HUHHNUM'] < 0
    np_mo.loc[is_negative, 'HUHHNUM'] = 0
    number_part = np_mo['HUHHNUM'].astype('U1')

    combined = []
    for pieces in zip(sample_part, suffix_part, number_part):
        combined.append(''.join(pieces))

    return np.array(combined, dtype='uint32')

In [4]:
def data_file_reader(file, unpacker, dtypes, ws, we):
    """Read one raw fixed-width monthly file into a DataFrame.

    Parameters
    ----------
    file : str
        Path of the raw file. The name selects special handling:
        'mar04pub.dat' skips rows containing b'**', and names whose
        characters 3-4 are '94' or '95' have b'\\x00\\x00' replaced
        with b'-1' before unpacking.
    unpacker : callable
        Called with each kept row (bytes); returns that row's field values.
    dtypes : list of (name, dtype) tuples
        Structured dtype used to build the numpy array behind the DataFrame.
    ws, we : int
        Start and end byte positions of the weight field within a row.

    Returns
    -------
    pandas.DataFrame with one row per kept record.
    """

    def has_weight(row):
        # Byte-wise comparison of the stripped weight field against b'0'
        return row[ws:we].strip() > b'0'

    # The context manager closes the raw file once every row has been read
    with open(file, 'rb') as raw_file:
        if file == 'mar04pub.dat':
            data = [unpacker(row) for row in raw_file
                    if b'**' not in row and has_weight(row)]
        elif file[3:5] in ['94', '95']:
            data = [unpacker(row.replace(b'\x00\x00', b'-1'))
                    for row in raw_file
                    if has_weight(row)]
        else:
            data = [unpacker(row) for row in raw_file
                    if has_weight(row)]

    # Convert to dataframe using specified dtypes
    df = pd.DataFrame(np.array(data, dtype=dtypes))

    return df

In [None]:
def _class_of_worker(codes):
    """Translate a class-of-worker code column into a labelled Categorical."""
    return pd.Categorical(
        np.where(codes == 1, 'Federal Government',
        np.where(codes == 2, 'State Government',
        np.where(codes == 3, 'Local Government',
        np.where(codes.isin([4, 5]), 'Private',
        np.where(codes == 6, 'Self-employed Incorporated',
        np.where(codes == 7, 'Self-employed Unincorporated',
        np.where(codes == 8, 'Without Pay', np.nan))))))))


def clean_all(df, maps, cpi_vals, date):
    """Apply the recodes that are common to every monthly file.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw monthly data. The weight columns listed in maps['wgt'] are
        rescaled in place on this frame before the new frame is built.
    maps : dict
        Lookup tables; the keys read here are 'state', 'region', 'educ',
        'hisp', 'race', 'ind' and 'wgt'.
    cpi_vals : dict
        Multipliers keyed by REGION value, used to build RHRWAGE and RWKWAGE
        from HRWAGE and WKWAGE.
    date : pandas.Timestamp
        Date of the monthly file; selects the federal minimum wage from the
        module-level `min_wage` table.

    Returns
    -------
    pandas.DataFrame with the recoded variables added, the raw source
    variables in `drop_vars` removed and several columns renamed.

    Raises
    ------
    ValueError
        If `date` is not covered by any interval of the `min_wage` table.

    NOTE(review): the labelled recodes mix text labels with np.nan inside
    np.where; numpy appears to coerce that missing marker to the text 'nan',
    so confirm how downstream code expects missing categories to look.
    """

    # Federal minimum wage in effect on `date` (start inclusive, end exclusive).
    # Resolved up front so an uncovered date fails with a clear message rather
    # than an unbound-name error inside the assign() call below.
    fed_min_wage = None
    for start, end, wage in min_wage:
        if (date >= start) and (date < end):
            fed_min_wage = wage
    if fed_min_wage is None:
        raise ValueError(
            f'No federal minimum wage is defined for {date}; '
            'extend the min_wage table to cover this date.')

    # New variables to add
    female = lambda x: pd.Categorical(np.where(x['PESEX'] == 2, 1, 0))
    state = lambda x: pd.Categorical(x['GESTFIPS'].map(maps['state']))
    # REGION is derived from STATE, so STATE must be assigned first
    region = lambda x: pd.Categorical(x['STATE'].map(maps['region']))
    educ = lambda x: pd.Categorical(x['PEEDUCA'].map(maps['educ']))
    schenr = lambda x: pd.Categorical(
        np.where(x['PESCHENR'] == 1, 1, 
        np.where(x['PESCHENR'] == 2, 0, np.nan)))
    married = lambda x: pd.Categorical(
        np.where(x['PRMARSTA'].isin([1, 2, 3]), 1, 0))
    wbhao = lambda x: pd.Categorical(
        np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                 x['PRDTRACE'].map(maps['race'])))
    veteran = lambda x: pd.Categorical(
        np.where(x['PEAFEVER'] == 1, 1, 
        np.where(x['PEAFEVER'] == 2, 0, np.nan)))
    forborn = lambda x: pd.Categorical(
        np.where(x['PRCITSHP'].isin([4, 5]), 1, 
        np.where(x['PRCITSHP'].isin([1, 2, 3]), 0, np.nan)))
    indgrp =  lambda x: pd.Categorical(x['PRMJIND1'].map(maps['ind'])) 
    mjh = lambda x: pd.Categorical(
        np.where(x['PRSJMJ']==2, 1, np.where(x['PRSJMJ']==1, 0, np.nan)))
    unemptype = lambda x: pd.Categorical(
        np.where(x['PRUNTYPE'].isin([1, 2, 3]), 'Job Loser',
        np.where(x['PRUNTYPE'] == 4, 'Job Leaver',
        np.where(x['PRUNTYPE'] == 5, 'Re-entrant',
        np.where(x['PRUNTYPE'] == 6, 'New Entrant', np.nan)))))
    ptecon = lambda x: pd.Categorical(
        np.where(x['PRWKSTAT'].isin([3, 6]), 1, 
        np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan)))
    workft = lambda x: pd.Categorical(
        np.where(x['PRWKSTAT'].isin([2, 8, 9]), 1,
        np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan)))
    lfs = lambda x: pd.Categorical(
        np.where(x['PEMLR'].isin([1, 2]), 'Employed',
        np.where(x['PEMLR'].isin([3, 4]), 'Unemployed',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'NILF', np.nan))))
    # Class of worker for the first and second job share one label ladder
    cow1 = lambda x: _class_of_worker(x['PEIO1COW'])
    cow2 = lambda x: _class_of_worker(x['PEIO2COW'])
    nilfreason = lambda x: pd.Categorical(
        np.where((x['PRWNTJOB']==2) & 
                 ((x['PEMLR']==6) | (x['PENLFACT'].isin([1, 2]))), 
                 'Disabled/Ill',
        np.where((x['PRWNTJOB']==2) & (x['PENLFACT']==4), 'Family',
        np.where((x['PRWNTJOB']==2) & ((x['PEMLR']==5) | (x['PENLFACT']==5)), 
                 'Retired',
        np.where((x['PRWNTJOB']==2) & (x['PENLFACT']==3), 'School',
        np.where(x['PEDWWNTO']==1, 'Discouraged',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'Other', np.nan)))))))
    paidhrly = lambda x: pd.Categorical(
        np.where(x['PEERNHRY'] == 1, 1,
        np.where(x['PEERNHRY'] == 2, 0, np.nan)))
    proxy = lambda x: pd.Categorical(
        np.where(x['PUSLFPRX'] == 1, 'Self',
        np.where(x['PUSLFPRX'] == 2, 'Proxy',
        np.where(x['PUSLFPRX'] == 3, 'Both', np.nan))))
    
    # Wage variables (raw earnings values are divided by 100)
    wkwage = lambda x: np.where(x['PRERNWA'] > 0, x['PRERNWA'] / 100.0, np.nan)
    # Hourly wage: when PRERNHLY is negative but usual hours and weekly
    # earnings are positive, use weekly earnings / usual hours; otherwise
    # use PRERNHLY when it is positive
    hrwage = lambda x: (
        np.where((x['PRERNHLY'] < 0) & (x['PEHRUSL1'] > 0 ) & 
                 (x['PRERNWA'] > 0), x['PRERNWA'] / x['PEHRUSL1'], 
        np.where(x['PRERNHLY'] > 0, x['PRERNHLY'], np.nan)) / 100.0  )
    # Real wages: nominal wage times the CPI multiplier for the row's region
    rhrwage = lambda x: (np.where(x['HRWAGE'] > 0, 
                 x['HRWAGE'] * x['REGION'].map(cpi_vals), np.nan))
    rwkwage = lambda x: (np.where(x['WKWAGE'] > 0, 
                 x['WKWAGE'] * x['REGION'].map(cpi_vals), np.nan))
    
    # Minimum wage: 1 if positive hourly wage is at or below the federal
    # minimum, 0 if above it, missing otherwise
    minwage = lambda x: pd.Categorical(
        np.where((x['HRWAGE'] > 0) & (x['HRWAGE'] <= fed_min_wage), 1, 
        np.where(x['HRWAGE'] > fed_min_wage, 0, np.nan)))
    
    # Old variables to drop
    drop_vars = ['PESEX', 'PEAFEVER', 'PESCHENR', 'PRSJMJ',
                 'PRUNTYPE', 'PRWKSTAT', 'PRCITSHP', 'PRERNWA', 'PEERNHRY',
                 'PUERN2', 'PRMARSTA', 'PEIO1COW', 'PRWNTJOB', 'PEDWWNTO',
                 'PEMLR', 'PENLFACT', 'PEEDUCA', 'PENLFRET', 'PRERNHLY',
                 'PUSLFPRX', 'PEIO2COW']
    
    # Correct weights for implied decimals
    df[maps['wgt']] = (df[maps['wgt']] / 10000.0).astype('float32')
    
    # Assign new variables and drop old ones. Keyword order matters: later
    # entries (REGION, RHRWAGE, RWKWAGE, MINWAGE) read columns created by
    # earlier ones (STATE, HRWAGE, WKWAGE).
    df = (df.assign(FEMALE = female,
                    STATE = state,
                    REGION = region,
                    EDUC = educ,
                    SCHENR = schenr,
                    MARRIED = married,
                    WBHAO = wbhao,
                    VETERAN = veteran,
                    FORBORN = forborn,
                    INDGRP = indgrp,
                    MJH = mjh,
                    UNEMPTYPE = unemptype,
                    PTECON = ptecon,
                    WORKFT = workft,
                    PAIDHRLY = paidhrly,
                    PROXY = proxy,
                    LFS = lfs,
                    COW1 = cow1,
                    COW2 = cow2,
                    NILFREASON = nilfreason,
                    WKWAGE = wkwage,
                    HRWAGE = hrwage,
                    RHRWAGE = rhrwage,
                    RWKWAGE = rwkwage,
                    MINWAGE = minwage)
            .drop(drop_vars, axis=1)
            .rename({'PRTAGE': 'AGE', 'HRMONTH': 'MONTH', 'HRHHID': 'HHID',
                     'HRMIS': 'MIS', 'GTCBSA': 'CBSA', 'GTCSA': 'CSA',
                     'PRUNEDUR': 'UNEMPDUR',
                     'PEHRACTT': 'HRSACTT', 'PEHRUSLT': 'HRSUSLT',
                     'PEHRUSL1': 'HRSUSL1', 'PEHRUSL2': 'HRSUSL2',
                     'PEHRACT1': 'HRSACT1', 'PEHRACT2': 'HRSACT2'}, axis=1))
    
    # Wage variables to float32
    wage_vars = ['WKWAGE', 'RWKWAGE', 'HRWAGE', 'RHRWAGE']
    df[wage_vars] = df[wage_vars].astype('float32')
    
    # Variables to convert to categorical (only those present this month)
    cat_vars = ['MIS', 'CBSA', 'CSA', 'PEERNLAB', 'PEERNCOV', 'PRFTLF']
    for cat_var in cat_vars:
        if cat_var in df.keys():
            df[cat_var] = df[cat_var].astype('category')
    
    return df

In [None]:
def clean_special(dfm, maps, date):
    """Apply the recodes that depend on the date of the monthly file.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Monthly data, passed in after clean_all has been applied.
    maps : dict
        Lookup tables; the keys read here are 'hisp', 'racem' and 'id2'.
    date : pandas.Timestamp
        Date of the monthly file; every branch below is selected by it.

    Returns
    -------
    pandas.DataFrame with date-specific variables added, renamed or dropped.

    Besides its arguments, the function reads the feather files
    'clean/cps_dec07_rev.ft', 'clean/cps_cert{year}.ft' and
    'clean/cps_disability2008.ft' for the relevant dates, and uses the
    module-level names COB1994Map, COB2007Map, id2_gen and cps_ids_full.
    """
    
    # New variables in selected dates:
    wbhaom = lambda x: pd.Categorical(
        np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic', 
                 x['PRDTRACE'].map(maps['racem'])))
    cert = lambda x: pd.Categorical(
        np.where(x['PECERT1'] == 1, 1, np.where(x['PECERT1']==2, 0, np.nan)))
    ctybirth94 = lambda x: pd.Categorical(x['PENATVTY'].map(COB1994Map))
    ctybirth07 = lambda x: pd.Categorical(x['PENATVTY'].map(COB2007Map))
    # NOTE(review): this yields GTCO * 100 + GESTFIPS (county digits first);
    # the conventional FIPS layout puts the state first -- confirm intended.
    county = lambda x: pd.Categorical(
        np.where(x['GTCO'] > 0, x['GTCO'] * 100 + x['GESTFIPS'], 0))
    
    # Revised weights for December 2007
    if date == pd.to_datetime('2007-12-01'):
        df_rev = pd.read_feather('clean/cps_dec07_rev.ft')
        # No `on=` given: pandas joins on every column name the frames share
        dfm = pd.merge(dfm, df_rev)
        dfm['PWSSWGT'] = dfm['NWSSWGT']
        dfm['PWCMPWGT'] = dfm['NWCMPWGT']
        dfm = dfm.drop(['NWSSWGT', 'NWCMPWGT'], axis=1)

    # Country of origin revised in 2007    
    if date.year < 2007:
        dfm = dfm.assign(CTYBIRTH = ctybirth94)
    if date.year >= 2007:
        dfm = dfm.assign(CTYBIRTH = ctybirth07) 
    dfm = dfm.drop(['PENATVTY'], axis=1)   
    dfm['CTYBIRTH'] = dfm['CTYBIRTH'].astype('category')

    # Four digit year
    if date.year < 1998:
        # HRYEAR is offset by 1900 to get the four digit year
        dfm['HRYEAR4'] = dfm['HRYEAR'] + 1900
        dfm = dfm.drop(['HRYEAR'], axis=1)
        
        # Before 1998, BASICWGT is a copy of PWSSWGT
        dfm['BASICWGT'] = dfm['PWSSWGT'] 
        
    dfm = dfm.rename({'HRYEAR4': 'YEAR'}, axis=1)

    # From 1998 on, BASICWGT is the renamed PWCMPWGT
    if date.year >= 1998:
        dfm = dfm.rename({'PWCMPWGT': 'BASICWGT'}, axis=1)

    # Detailed race allows identifying more than one race
    if date.year > 2002:
        dfm = dfm.assign(WBHAOM = wbhaom)
    dfm = dfm.drop(['PRDTHSP'], axis=1)
    
    # Renaming industry and occupation codes
    # (names used through 2002 carry an '80' marker; later years do not)
    if date.year > 2002:
        dfm = dfm.rename({'PRMJOCC1': 'OCCM', 'PRMJOCC2': 'OCC2M', 
                          'PEIO1OCD': 'OCC', 'PEIO2OCD': 'OCC2', 
                          'PRDTOCC1': 'OCCD', 'PRDTOCC2': 'OCC2D',
                          'PRMJIND1': 'INDM', 'PRMJIND2': 'IND2M', 
                          'PEIO1ICD': 'IND', 'PEIO2ICD': 'IND2', 
                          'PRDTIND1': 'INDD', 'PRDTIND2': 'IND2D'}, axis=1)
    if date.year <= 2002:
        dfm = dfm.rename({'PRMJOCC1': 'OCC80M', 'PRMJOCC2': 'OCC280M', 
                          'PEIO1OCD': 'OCC80', 'PEIO2OCD': 'OCC280', 
                          'PRDTOCC1': 'OCC80D', 'PRDTOCC2': 'OCC280D',
                          'PRMJIND1': 'IND80M', 'PRMJIND2': 'IND280M', 
                          'PEIO1ICD': 'IND80', 'PEIO2ICD': 'IND280', 
                          'PRDTIND1': 'IND80D', 'PRDTIND2': 'IND280D'}, axis=1)
    
    # Number/children in 1999 before data available = -1
    if (date.year == 1999) and (date < pd.to_datetime('1999-11-01')):
        dfm['PRNMCHLD'] = -1
        dfm['PRCHLD'] = -1
    if date.year == 1999:
        dfm[['PRNMCHLD', 'PRCHLD']] = dfm[['PRNMCHLD', 'PRCHLD']].astype('int8')
 
    # Professional certification questions
    if date.year in [2015, 2016]:
        year = date.year
        month = date.month
        # Keep only this month's rows from the annual certification file
        rev_df = (pd.read_feather(f'clean/cps_cert{year}.ft')
                    .query('MONTH == @month'))
        # No `on=` given: pandas joins on every column name the frames share
        dfm = pd.merge(dfm, rev_df, how='outer')

    if date.year >= 2015:
        dfm = dfm.assign(CERT = cert).drop(['PECERT1'], axis=1)
        
    # Disability status
    # (June-December 2008 values are merged in from a separate file)
    if (date >= pd.to_datetime('2008-06-01') ) and (date.year < 2009):
        month = date.month
        feather_file = 'clean/cps_disability2008.ft'
        rev_df = (pd.read_feather(feather_file)
                    .query('MONTH == @month')
                    .drop(['MONTH'], axis=1))
        dfm = pd.merge(dfm, rev_df, on=['QSTNUM', 'OCCURNUM'], how='outer')
    if date >= pd.to_datetime('2008-06-01'):
        # Anything other than PRDISFLG == 1, including missing, becomes 0
        disability = lambda x: pd.Categorical(
            np.where(x['PRDISFLG'] == 1, 1, 0))
        dfm = dfm.assign(DISABILITY = disability).drop(['PRDISFLG'], axis=1)
   
    # Matching HRHHID2 in cases where it must be created manually
    if maps['id2'] == True:
        dfm['HRHHID2'] = id2_gen(dfm)
        dfm = dfm.drop(['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'], axis=1)
        
    dfm = dfm.rename({'HRHHID2': 'HHID2'}, axis=1)
    
    # Add QSTNUM and OCCURNUM where not available
    # (QSTNUM numbers each HHID/HHID2 pair; OCCURNUM counts rows within it)
    if date.year < 1998:
        dfm['QSTNUM'] = dfm.groupby(['HHID','HHID2']).ngroup().astype('int32')
        dfm['OCCURNUM'] = (dfm.groupby('QSTNUM').cumcount() + 1).astype('int8')
    
    # Unique household IDS
    if date >= pd.to_datetime('1995-05-01'): 
        ids_file = 'CPS_unique_ids.pkl'
        # cps_ids_full is only defined when this file exists
        if os.path.isfile(ids_file):
            dfm['CPSID'] = dfm['QSTNUM'].map(cps_ids_full[date])
            
    # County code (state and county combined)
    if date >= pd.to_datetime('1995-09-01'): 
        dfm = dfm.assign(COUNTY = county)
        dfm = dfm.drop(['GESTFIPS', 'GTCO'], axis=1)
    else:
        dfm = dfm.drop(['GESTFIPS'], axis=1)


    return dfm

In [None]:
def revised_annual_data(df, year):
    """Merge revised data into an annual frame and finalize column dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Annual data built by concatenating the cleaned monthly frames.
    year : int
        Year of the data; selects which revision files are merged in.

    Returns
    -------
    pandas.DataFrame with UNION and UNIONMEM added, PEERNLAB and PEERNCOV
    dropped, and code columns converted to category.

    For 2000-2002 the function reads 'clean/cps_wgt_rev{year}.ft' and
    'clean/cps_io_rev{year}.ft'; for 2001-2002 it also reads
    'clean/cps_union_rev{year}.ft'. These merges pass no `on=` argument, so
    pandas joins on every column name the two frames share.
    """
    
    # Merge in the 2000-revised weights and io recodes here
    if 2000 <= year <= 2002:
        rev_wgts = pd.read_feather(f'clean/cps_wgt_rev{year}.ft')
        df = pd.merge(df, rev_wgts)
        df['BASICWGT'] = df['NWCMPWGT']
        df['PWORWGT'] = df['NWORWGT']
        df['PWSSWGT'] = df['NWSSWGT']
        df = df.drop(['NWCMPWGT', 'NWORWGT', 'NWSSWGT'], axis=1)
        # IO recodes
        rev_io = pd.read_feather(f'clean/cps_io_rev{year}.ft')
        df = pd.merge(df, rev_io)       
        
    # Merge in revised union data
    if year in [2001, 2002]:
        rev_df = pd.read_feather(f'clean/cps_union_rev{year}.ft')
        df = pd.merge(df, rev_df)
        df['PEERNLAB'] = df['NEERNLAB']
        df['PEERNCOV'] = df['NEERNCOV']
        df = df.drop(['NEERNLAB', 'NEERNCOV'], axis=1)
        
    # Create UNION and UNIONMEM
    # UNION: 1 if either code is 1, 0 if both are 2, missing otherwise
    union = lambda x: pd.Categorical(
        np.where((x['PEERNLAB'] == 1) | (x['PEERNCOV'] == 1), 1, 
        np.where((x['PEERNLAB'] == 2) & (x['PEERNCOV'] == 2), 0, np.nan)))
    # UNIONMEM: 1 if PEERNLAB is 1, 0 if it is 2, missing otherwise
    unionmem = lambda x: pd.Categorical(
        np.where(x['PEERNLAB'] == 1, 1, 
        np.where(x['PEERNLAB'] == 2, 0, np.nan)))
    
    df = (df.assign(UNION = union, UNIONMEM = unionmem)
            .drop(['PEERNLAB', 'PEERNCOV'], axis=1))
    
    # General mess clean up area
    df = df.assign(YEAR = lambda x: pd.Categorical(x['YEAR']))
    df['CTYBIRTH'] = df['CTYBIRTH'].astype('category')
    # Every entry needs its own comma: adjacent string literals are silently
    # joined into one name, which would skip both columns.
    cat_vars = ['PRDTRACE', 'PULINENO', 'PRFTLF', 'MONTH', 'CBSA', 'CSA', 
                'DISABILITY', 'INDGRP', 'IND80', 'OCC80', 'IND80D', 'OCC80D',
                'IND80M', 'OCC80M', 'IND', 'OCC', 'IND2', 'OCC2', 'INDD',
                'OCCD', 'OCCM', 'INDM', 'IND2M', 'IND2D', 'OCC2D', 'OCC2M',
                'OCC280M', 'IND280M', 'OCC280D', 'IND280D', 'OCC280', 'IND280',
                'CMSA', 'MSA', 'COUNTY']
    # Only columns present in this year's frame are converted
    for cat_var in cat_vars:
        if cat_var in df.keys():
            df[cat_var] = df[cat_var].astype('category')
    # Clean up metro status (missing values become -1)
    df['METSTA'] = df['METSTA'].fillna(-1).astype('int8')
    wgt_vars = ['BASICWGT', 'PWSSWGT']
    df[wgt_vars] = df[wgt_vars].astype('float32')
    return df

In [None]:
def cps_to_feather(year_list):
    """Convert the raw monthly files of each year into one feather file.

    For every year in `year_list`, the raw files whose names end in
    '<yy>pub.dat' are read, cleaned and concatenated, and the result is
    written to 'clean/cps{year}.ft'. A one-line summary is printed per year.

    Parameters
    ----------
    year_list : iterable of int
        Four-digit years to process.

    Returns
    -------
    None. Years with no matching raw files are reported and skipped.

    Relies on the module-level `data_files`, `cpsdd` and `cpi` objects.
    """
    data_dictionary = None 
    for year in year_list:
        
        # Get the list of raw data files for the given year, in calendar
        # order: the directory listing order is arbitrary, so sorting keeps
        # the row order of the annual file reproducible.
        file_ending = f'{str(year)[2:]}pub.dat'
        raw_files = sorted(
            (file for file in data_files if file.endswith(file_ending)),
            key=lambda name: pd.to_datetime(f'{year}-{name[:3]}-01'))
        
        # Nothing to combine: skip the year instead of failing in pd.concat
        if not raw_files:
            print(f'{year} Skipped: no raw files ending in {file_ending}')
            continue
        
        # Loop over individual raw monthly files
        combined_data = []
        for file in raw_files:
            # Date of raw monthly file (first three characters name the month)
            date = pd.to_datetime(f'{year}-{file[:3]}-01')
            # Month's CPI values (by region)
            cpi_vals = cpi.loc[date].to_dict()
            
            # Identify how to read the raw data file; the layout is only
            # rebuilt when the file's data dictionary differs from the last
            if data_dictionary != cpsdd['matcher'][file]:
                data_dictionary = cpsdd['matcher'][file]
                dd_info = cpsdd[data_dictionary]
                var_info = dd_info['dd']
                # Start and end positions of the PWSSWGT field
                ws, we = var_info['PWSSWGT'][:2]
                dtypes = [(var_name, var_details[-1]) 
                          for var_name, var_details in var_info.items()]
                var_maps = dd_info['map']
                unpack_format = dd_info['unpack_fmt']
                unpacker = struct.Struct(unpack_format).unpack_from
                
            # Read raw monthly data and return pandas dataframe
            mo_data = data_file_reader(file, unpacker, dtypes, ws, we)
            
            # Clean up the data
            dfm = clean_all(mo_data, var_maps, cpi_vals, date)
            clean_mo_data = clean_special(dfm, var_maps, date)
            
            combined_data.append(clean_mo_data)
            
        # Combine monthly files into one annual file
        df = (pd.concat(combined_data, sort=False)
                .reset_index(drop=True)
                .assign(MONTH = lambda x: pd.Categorical(x['MONTH'])))
        
        # Census revised 2000-based weights and union data
        df = revised_annual_data(df, year)
        
        df.to_feather(f'clean/cps{year}.ft')
        obs = len(df)
        cols = len(df.keys())
        size = round(df.memory_usage().sum() / 1024**2, 1)
        print(f'{year} Done: ({obs:,} records, {cols} variables, {size}MB)')

In [None]:
# Write one annual feather file per year, 1994 through 2019
# (the end of the range is exclusive)
cps_to_feather(range(1994, 2020))

1994 Done: (1,672,934 records, 65 variables, 175.6MB)
1995 Done: (1,648,060 records, 67 variables, 199.7MB)
1996 Done: (1,461,469 records, 67 variables, 167.4MB)
1997 Done: (1,462,817 records, 67 variables, 167.5MB)
1998 Done: (1,461,394 records, 65 variables, 161.8MB)
1999 Done: (1,465,602 records, 67 variables, 165.0MB)
2000 Done: (1,460,724 records, 79 variables, 203.5MB)
2001 Done: (1,560,960 records, 79 variables, 217.5MB)
2002 Done: (1,703,017 records, 79 variables, 248.6MB)
2003 Done: (1,685,264 records, 70 variables, 197.8MB)
2004 Done: (1,656,144 records, 72 variables, 216.5MB)
2005 Done: (1,644,787 records, 70 variables, 204.0MB)
