### bd econ CPS extract

bd_CPS_reader.ipynb

February 3, 2019

Contact: Brian Dew, @bd_econ

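Requires: `cps_basic_dd.pkl`, which is generated by bd_CPS_dd.ipynb. The notebook also reads `country_of_birth.pkl`, `clean/cpi.csv`, and the revised-data files `clean/cps_dec07_rev.ft`, `clean/cps_wgt_rev.ft` and `clean/cps_union_rev.ft` from the data directory.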

-----

See [readme](https://github.com/bdecon/econ_data/tree/master/bd_CPS) for documentation.

In [1]:
# Import python packages
import pandas as pd
import numpy as np
import struct
import string
import os
import pickle

# Work from the data directory: the cells below open their input files and
# write their feather output using paths relative to the working directory.
os.chdir('/home/brian/Documents/CPS/data')

In [2]:
# Settings
# NOTE(review): pickle.load executes whatever the file contains; only load
# pickles produced by trusted code.
# Context managers make sure each pickle file handle is closed after loading.
with open('cps_basic_dd.pkl', 'rb') as dd_file:
    cpsdd = pickle.load(dd_file)
data_path = '/home/brian/Documents/CPS/data/'
data_files = os.listdir(data_path)

# Consumer Price Index
cpi = pd.read_csv('clean/cpi.csv', index_col=[0], parse_dates=True)

# Mapping for country of birth
with open('country_of_birth.pkl', 'rb') as cob_file:
    cob = pickle.load(cob_file)  # From CEPR program

In [3]:
def id2_gen(np_mo):
    """Create HRHHID2 for pre May 2004 data.

    The identifier is the concatenation of three pieces, read as an integer:
    characters 2-3 of HRSAMPLE, a two-digit code for HRSERSUF, and HUHHNUM
    with negative values replaced by 0.

    Parameters
    ----------
    np_mo : pandas.DataFrame
        Monthly data with the columns 'HRSAMPLE', 'HRSERSUF' and 'HUHHNUM'.
        The frame is not modified.

    Returns
    -------
    numpy.ndarray
        One uint32 identifier per row.
    """
    hrsample = [x[1:3] for x in np_mo['HRSAMPLE']]
    hrsersuf = [x.strip() for x in np_mo['HRSERSUF']]

    # Single letters become their two-digit alphabet position (A/a -> '01').
    # Membership is tested against a set of single characters so that an
    # empty or multi-character suffix is never treated as a letter.
    letters = set(string.ascii_letters)
    sersuf_d = {a: str(ord(a.lower()) - 96).zfill(2)
                for a in set(hrsersuf) if a in letters}
    # Codes that stand for "no suffix"
    sersuf_d.update({'-1': '00', '-1.0': '00', '0': '00'})
    sersuf = list(map(sersuf_d.get, hrsersuf))

    # Floor negative household numbers at 0 on a new series. The previous
    # chained assignment (np_mo['HUHHNUM'][mask] = 0) wrote into the caller's
    # frame and triggered pandas' SettingWithCopyWarning.
    huhhnum = np_mo['HUHHNUM'].clip(lower=0).astype(str)

    id2 = [''.join(i) for i in zip(hrsample, sersuf, huhhnum)]

    return np.array(id2, dtype='uint32')

In [4]:
def data_file_reader(file, unpacker, dtypes, ws, we):
    """Read one raw monthly file into a dataframe.

    Parameters
    ----------
    file : str
        Path of the raw fixed-width file; it is opened in binary mode.
    unpacker : callable
        Called with each kept row (bytes); returns the tuple of field values.
    dtypes : list of (name, dtype) tuples
        Structured dtype used to build the array behind the dataframe.
    ws, we : int
        Start and end offsets of the person-weight field within a row.

    Returns
    -------
    pandas.DataFrame
        One row per kept record, one column per entry in ``dtypes``.
    """
    # Only the file named 'mar04pub.dat' is additionally screened for rows
    # containing b'**'.
    skip_starred = (file == 'mar04pub.dat')

    data = []
    # The context manager closes the raw file once it has been read
    with open(file, 'rb') as raw:
        for row in raw:
            if skip_starred and b'**' in row:
                continue
            # If person weight > 0, unpack the row.
            # NOTE(review): this is a bytewise comparison of the stripped
            # field against b'0', not a numeric one.
            if row[ws:we].strip() > b'0':
                data.append(unpacker(row))

    # Convert to dataframe using the specified dtypes
    df = pd.DataFrame(np.array(data, dtype=dtypes))

    return df

In [5]:
def clean_all(df, maps, cpi_vals):
    """Recode the raw variables that are handled the same way in every month.

    Parameters
    ----------
    df : pandas.DataFrame
        One month of raw data. The weight columns named by ``maps['wgt']``
        are divided by 10000 on this frame in place before the cleaned frame
        is built.
    maps : dict
        Lookup tables; the keys read here are 'state', 'region', 'educ',
        'hisp', 'race', 'ind' and 'wgt'.
    cpi_vals : dict
        Multiplier looked up by REGION value and applied to the positive
        wage variables to produce RHRWAGE, RWKWAGE and ROTTCAMT.

    Returns
    -------
    pandas.DataFrame
        New frame with the recoded variables added, the raw source variables
        in ``drop_vars`` removed and several columns renamed.
    """
    # Fallback for recodes whose labels are strings. With np.nan as the
    # fallback, np.where promotes the whole result to a string array and the
    # unmatched cases turn into the literal text 'nan', which then shows up
    # as a category of its own. None keeps them as genuine missing values.
    missing = None

    def cpi_factor(x):
        # REGION is categorical, so mapping it directly can return a
        # categorical, which does not support arithmetic. Map the plain
        # values instead and return an ordinary float series.
        return x['REGION'].astype(object).map(cpi_vals).astype('float64')

    # New variables to add
    female = lambda x: pd.Categorical(np.where(x['PESEX'] == 2, 1, 0))
    state = lambda x: pd.Categorical(x['GESTFIPS'].map(maps['state']))
    # Uses STATE, which is created earlier in the same assign call below
    region = lambda x: pd.Categorical(x['STATE'].map(maps['region']))
    educ = lambda x: pd.Categorical(x['PEEDUCA'].map(maps['educ']))
    schenr = lambda x: pd.Categorical(
        np.where(x['PESCHENR'] == 1, 1,
        np.where(x['PESCHENR'] == 2, 0, np.nan)))
    married = lambda x: pd.Categorical(
        np.where(x['PRMARSTA'].isin([1, 2, 3]), 1, 0))
    wbhao = lambda x: pd.Categorical(
        np.where(x['PRDTHSP'].isin(maps['hisp']), 'Hispanic',
                 x['PRDTRACE'].map(maps['race'])))
    veteran = lambda x: pd.Categorical(
        np.where(x['PEAFEVER'] == 1, 1,
        np.where(x['PEAFEVER'] == 2, 0, np.nan)))
    forborn = lambda x: pd.Categorical(
        np.where(x['PRCITSHP'].isin([4, 5]), 1,
        np.where(x['PRCITSHP'].isin([1, 2, 3]), 0, np.nan)))
    indgrp = lambda x: pd.Categorical(x['PRMJIND1'].map(maps['ind']))
    mjh = lambda x: pd.Categorical(
        np.where(x['PRSJMJ'] == 2, 1, np.where(x['PRSJMJ'] == 1, 0, np.nan)))
    unemptype = lambda x: pd.Categorical(
        np.where(x['PRUNTYPE'].isin([1, 2, 3]), 'Job Loser',
        np.where(x['PRUNTYPE'] == 4, 'Job Leaver',
        np.where(x['PRUNTYPE'] == 5, 'Re-entrant',
        np.where(x['PRUNTYPE'] == 6, 'New Entrant', missing)))))
    ptecon = lambda x: pd.Categorical(
        np.where(x['PRWKSTAT'].isin([3, 6]), 1,
        np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan)))
    lfs = lambda x: pd.Categorical(
        np.where(x['PEMLR'].isin([1, 2]), 'Employed',
        np.where(x['PEMLR'].isin([3, 4]), 'Unemployed',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'NILF', missing))))
    cow1 = lambda x: pd.Categorical(
        np.where(x['PEIO1COW'] == 1, 'Federal Government',
        np.where(x['PEIO1COW'] == 2, 'State Government',
        np.where(x['PEIO1COW'] == 3, 'Local Government',
        np.where(x['PEIO1COW'].isin([4, 5]), 'Private',
        np.where(x['PEIO1COW'] == 6, 'Self-employed Incorporated',
        np.where(x['PEIO1COW'] == 7, 'Self-employed Unincorporated',
        np.where(x['PEIO1COW'] == 8, 'Without Pay', missing))))))))
    # Conditions are checked in order; the first match wins
    nilfreason = lambda x: pd.Categorical(
        np.where((x['PRWNTJOB'] == 2) &
                 ((x['PEMLR'] == 6) | (x['PENLFACT'].isin([1, 2]))),
                 'NILF - Disabled/ill',
        np.where((x['PRWNTJOB'] == 2) & (x['PENLFACT'] == 4), 'NILF - Family',
        np.where((x['PRWNTJOB'] == 2) &
                 ((x['PEMLR'] == 5) | (x['PENLFACT'] == 5)),
                 'NILF - Retired',
        np.where((x['PRWNTJOB'] == 2) & (x['PENLFACT'] == 3), 'NILF - School',
        np.where(x['PEDWWNTO'] == 1, 'NILF - Discouraged',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'NILF - Other', missing)))))))

    # Wage variables (raw values carry two implied decimals, hence / 100)
    wkwage = lambda x: np.where(x['PRERNWA'] > 0, x['PRERNWA'] / 100.0, np.nan)
    # Hourly wage: weekly earnings over usual hours when no hourly rate is
    # reported, otherwise the reported hourly rate
    hrwage = lambda x: (
        np.where((x['PRERNHLY'] < 0) & (x['PEHRUSL1'] > 0) &
                 (x['PRERNWA'] > 0), x['PRERNWA'] / x['PEHRUSL1'],
        np.where(x['PRERNHLY'] > 0, x['PRERNHLY'], np.nan)) / 100.0)
    ottcamt = lambda x: np.where(x['PUERN2'] > 0, x['PUERN2'] / 100.0, np.nan)
    # CPI-scaled versions of the wage variables created just above
    rhrwage = lambda x: (np.where(x['HRWAGE'] > 0,
                 x['HRWAGE'] * cpi_factor(x), np.nan))
    rwkwage = lambda x: (np.where(x['WKWAGE'] > 0,
                 x['WKWAGE'] * cpi_factor(x), np.nan))
    rottcamt = lambda x: (np.where(x['OTTCAMT'] > 0,
                 x['OTTCAMT'] * cpi_factor(x), np.nan))

    # Old variables to drop
    drop_vars = ['PESEX', 'GESTFIPS', 'PEAFEVER', 'PESCHENR', 'PRSJMJ',
                 'PRUNTYPE', 'PRWKSTAT', 'PRCITSHP', 'PRERNHLY', 'PRERNWA',
                 'PUERN2', 'PRMARSTA', 'PEIO1COW', 'PRWNTJOB', 'PEDWWNTO',
                 'PEMLR', 'PENLFACT', 'PEEDUCA', 'PENLFRET']

    # Correct weights for implied decimals
    df[maps['wgt']] = (df[maps['wgt']] / 10000.0).astype('float32')

    # Assign new variables and drop old ones. Keyword order matters: REGION
    # needs STATE, and the real wage variables need REGION and the nominal
    # wage variables.
    df = (df.assign(FEMALE = female,
                    STATE = state,
                    REGION = region,
                    EDUC = educ,
                    SCHENR = schenr,
                    MARRIED = married,
                    WBHAO = wbhao,
                    VETERAN = veteran,
                    FORBORN = forborn,
                    INDGRP = indgrp,
                    MJH = mjh,
                    UNEMPTYPE = unemptype,
                    PTECON = ptecon,
                    LFS = lfs,
                    COW1 = cow1,
                    NILFREASON = nilfreason,
                    WKWAGE = wkwage,
                    HRWAGE = hrwage,
                    OTTCAMT = ottcamt,
                    RHRWAGE = rhrwage,
                    RWKWAGE = rwkwage,
                    ROTTCAMT = rottcamt)
            .drop(drop_vars, axis=1)
            .rename({'PRTAGE': 'AGE', 'HRMONTH': 'MONTH', 'HRHHID': 'HHID',
                     'HRMIS': 'MIS', 'GTCBSA': 'CBSA', 'GTCSA': 'CSA',
                     'PRUNEDUR': 'UNEMPDUR',
                     'PEHRACTT': 'HRSACTT', 'PEHRUSLT': 'HRSUSLT',
                     'PEHRUSL1': 'HRSUSL1', 'PEHRUSL2': 'HRSUSL2',
                     'PEHRACT1': 'HRSACT1', 'PEHRACT2': 'HRSACT2'}, axis=1))

    # Wage variables to float32
    wage_vars = ['WKWAGE', 'RWKWAGE', 'OTTCAMT', 'ROTTCAMT',
                 'HRWAGE', 'RHRWAGE']
    df[wage_vars] = df[wage_vars].astype('float32')

    # Variables to convert to categorical (only those present this month)
    cat_vars = ['MIS', 'CBSA', 'CSA', 'GTMETSTA', 'PEERNLAB', 'PEERNCOV',
                'PRFTLF']
    for cat_var in cat_vars:
        if cat_var in df.keys():
            df[cat_var] = df[cat_var].astype('category')

    return df

In [6]:
def clean_special(dfm, maps, date):
    """Apply the cleaning steps that depend on the month being processed.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Monthly frame to adjust.
    maps : dict
        Lookup tables; reads 'hisp', 'racem' (after 2002) and the 'id2' flag.
    date : pandas.Timestamp
        First day of the month the frame belongs to.

    Returns
    -------
    pandas.DataFrame
        Frame with CTYBIRTH, YEAR, BASICWGT and HHID2 naming applied and,
        depending on the year, WBHAOM and CERT added.
    """
    year = date.year

    # Revised weights for December 2007 replace the published ones
    if date == pd.to_datetime('2007-12-01'):
        revised = pd.read_feather('clean/cps_dec07_rev.ft')
        dfm = pd.merge(dfm, revised)
        dfm['PWSSWGT'] = dfm['NWSSWGT']
        dfm['PWCMPWGT'] = dfm['NWCMPWGT']
        dfm = dfm.drop(columns=['NWSSWGT', 'NWCMPWGT'])

    # Country of origin revised in 2007: pick the matching lookup table
    cob_vintage = 1994 if year < 2007 else 2007
    birth_country = pd.Categorical(dfm['PENATVTY'].map(cob[cob_vintage]))
    dfm = dfm.assign(CTYBIRTH=birth_country).drop(columns=['PENATVTY'])
    dfm['CTYBIRTH'] = dfm['CTYBIRTH'].astype('category')

    if year < 1998:
        # Two-digit year becomes a four-digit year
        dfm = (dfm.assign(HRYEAR4=dfm['HRYEAR'] + 1900)
                  .drop(columns=['HRYEAR']))
        # Person weight is basic weight
        dfm = dfm.assign(BASICWGT=dfm['PWSSWGT'])
        renames = {'HRYEAR4': 'YEAR'}
    else:
        # Person weight is composite weight
        renames = {'HRYEAR4': 'YEAR', 'PWCMPWGT': 'BASICWGT'}
    dfm = dfm.rename(columns=renames)

    # Detailed race allows identifying more than one race
    if year > 2002:
        is_hispanic = dfm['PRDTHSP'].isin(maps['hisp'])
        race_multi = dfm['PRDTRACE'].map(maps['racem'])
        dfm = dfm.assign(
            WBHAOM=pd.Categorical(np.where(is_hispanic, 'Hispanic',
                                           race_multi)))
    dfm = dfm.drop(columns=['PRDTHSP'])

    # Professional certification questions asked
    if year > 2016:
        cert_flag = np.where(dfm['PECERT1'] == 1, 1,
                             np.where(dfm['PECERT1'] == 2, 0, np.nan))
        dfm = (dfm.assign(CERT=pd.Categorical(cert_flag))
                  .drop(columns=['PECERT1']))

    # Matching HRHHID2 in cases where it must be created manually.
    # The equality test against True is kept exactly as written.
    if maps['id2'] == True:
        dfm['HRHHID2'] = id2_gen(dfm)
        dfm = dfm.drop(columns=['HRSAMPLE', 'HRSERSUF', 'HUHHNUM'])

    return dfm.rename(columns={'HRHHID2': 'HHID2'})

In [7]:
def revised_annual_data(df, year):
    """Apply revised weights/union data and final dtype clean-up to a year.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined monthly data for one year. Must contain 'YEAR', 'CTYBIRTH',
        'BASICWGT' and 'PWSSWGT'.
    year : int
        Calendar year of ``df``.

    Returns
    -------
    pandas.DataFrame
        New frame; the outgoing-rotation weight column 'PWORWGT', when
        present, is returned under the name 'ORGWGT'.
    """
    # Merge in the 2000-revised weights here.
    # pd.merge without `on` joins on every column name the frames share.
    if 2000 <= year <= 2002:
        rev_wgts = (pd.read_feather('clean/cps_wgt_rev.ft')
                      .query('YEAR == @year'))
        df = pd.merge(df, rev_wgts)
        df['BASICWGT'] = df['NWCMPWGT']
        df['PWORWGT'] = df['NWORWGT']
        df['PWSSWGT'] = df['NWSSWGT']
        df = df.drop(['NWCMPWGT', 'NWORWGT', 'NWSSWGT'], axis=1)

    # Merge in revised union data
    if year in [2001, 2002]:
        rev_df = (pd.read_feather('clean/cps_union_rev.ft')
                    .query('YEAR == @year'))
        df = pd.merge(df, rev_df)
        df['PEERNLAB'] = df['NEERNLAB']
        df['PEERNCOV'] = df['NEERNCOV']
        df = df.drop(['NEERNLAB', 'NEERNCOV'], axis=1)

    if year >= 1998:
        df = df.drop(['QSTNUM', 'OCCURNUM'], axis=1)

    # Rename ORG Weight ORGWGT.
    # `columns=` is required: a bare mapping is applied to the row index, so
    # the column used to keep its raw name. Code that reads 'PWORWGT' from
    # the output must switch to 'ORGWGT'.
    df = df.rename(columns={'PWORWGT': 'ORGWGT'})

    # General mess clean up area
    df = df.assign(YEAR = lambda x: pd.Categorical(x['YEAR']))
    df['CTYBIRTH'] = df['CTYBIRTH'].astype('category')
    cat_vars = ['PRDTRACE', 'PULINENO', 'PRFTLF', 'MONTH', 'CBSA', 'CSA']
    for cat_var in cat_vars:
        if cat_var in df.keys():
            df[cat_var] = df[cat_var].astype('category')
    wgt_vars = ['BASICWGT', 'PWSSWGT']
    df[wgt_vars] = df[wgt_vars].astype('float32')

    return df

In [8]:
def cps_to_feather(year_list):
    """Convert raw monthly files to one cleaned feather file per year.

    For each year the matching raw files are read, cleaned, combined and
    written to ``clean/cps_{year}.ft``; a one-line summary is printed per
    year. Relies on the module-level ``data_files``, ``cpsdd`` and ``cpi``
    and on the sibling functions ``data_file_reader``, ``clean_all``,
    ``clean_special`` and ``revised_annual_data``.

    Parameters
    ----------
    year_list : iterable of int
        Four-digit years to process.

    Raises
    ------
    ValueError
        If no raw file for a requested year is listed in ``data_files``.
    """
    data_dictionary = None
    for year in year_list:

        # Get the raw data files for the given year, paired with the date of
        # each monthly file. The directory listing comes back in arbitrary
        # order, so sort chronologically to get a reproducible row order.
        file_ending = f'{str(year)[2:]}pub.dat'
        dated_files = sorted(
            (pd.to_datetime(f'{year}-{file[:3]}-01'), file)
            for file in data_files if file.endswith(file_ending))
        if not dated_files:
            raise ValueError(
                f'No raw data files ending in {file_ending} found '
                f'for {year}')

        # Loop over individual raw monthly files
        combined_data = []
        for date, file in dated_files:
            # Month's CPI values (by region)
            cpi_vals = cpi.loc[date].to_dict()

            # Identify how to read the raw data file; the reader settings
            # are rebuilt only when the data dictionary changes
            if data_dictionary != cpsdd['matcher'][file]:
                data_dictionary = cpsdd['matcher'][file]
                dd_info = cpsdd[data_dictionary]
                var_info = dd_info['dd']
                # Position of the person-weight field within a raw row
                ws, we = var_info['PWSSWGT'][:2]
                dtypes = [(var_name, var_details[-1])
                          for var_name, var_details in var_info.items()]
                var_maps = dd_info['map']
                unpack_format = dd_info['unpack_fmt']
                unpacker = struct.Struct(unpack_format).unpack_from

            # Read raw monthly data and return pandas dataframe
            mo_data = data_file_reader(file, unpacker, dtypes, ws, we)

            # Clean up the data
            dfm = clean_all(mo_data, var_maps, cpi_vals)
            clean_mo_data = clean_special(dfm, var_maps, date)

            combined_data.append(clean_mo_data)

        # Combine monthly files into one annual file
        df = (pd.concat(combined_data, sort=False)
                .reset_index(drop=True)
                .assign(MONTH = lambda x: pd.Categorical(x['MONTH'])))

        # Census revised 2000-based weights and union data
        df = revised_annual_data(df, year)

        df.to_feather(f'clean/cps_{year}.ft')
        obs = len(df)
        cols = len(df.keys())
        size = round(df.memory_usage().sum() / 1024**2, 1)
        print(f'{year} Done: ({obs:,} records, {cols} variables, {size}MB)')

In [9]:
# Build one annual extract per year for 1994 through 2018 (range end is exclusive)
cps_to_feather(range(1994, 2019))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


1994 Done: (1,672,934 records, 51 variables, 161.2MB)
1995 Done: (1,648,060 records, 51 variables, 169.8MB)
1996 Done: (1,461,469 records, 51 variables, 140.8MB)
1997 Done: (1,462,817 records, 51 variables, 140.9MB)
1998 Done: (1,461,394 records, 51 variables, 140.8MB)
1999 Done: (1,465,602 records, 53 variables, 163.5MB)
2000 Done: (1,460,724 records, 53 variables, 149.1MB)
2001 Done: (1,560,960 records, 53 variables, 159.3MB)
2002 Done: (1,703,017 records, 53 variables, 173.8MB)
2003 Done: (1,685,264 records, 54 variables, 167.2MB)
2004 Done: (1,656,144 records, 56 variables, 175.3MB)
2005 Done: (1,644,787 records, 56 variables, 167.9MB)
2006 Done: (1,628,798 records, 56 variables, 166.2MB)
2007 Done: (1,611,901 records, 56 variables, 166.0MB)
2008 Done: (1,600,790 records, 56 variables, 164.9MB)
2009 Done: (1,617,099 records, 57 variables, 168.1MB)
2010 Done: (1,621,021 records, 57 variables, 168.5MB)
2011 Done: (1,600,068 records, 57 variables, 166.4MB)
2012 Done: (1,588,264 record