### bd econ CPS extract

bd_CPS_revisions_reader.ipynb

January 28, 2019

Contact: Brian Dew, @bd_econ

-----

Initial goals:

1) Retrieve variable names and locations from data dictionary

2) Get the basic variables: YEAR, MONTH, HHID, PERSON LINE, AGE, SEX, RACE

3) Review basic variables and adjust to match 1994-onward

4) Check that the results make sense and match published benchmarks

5) Repeat with more variables: EMP, OCC, IND, LMSTAT, etc.

In [1]:
import os, re, struct, pickle
import numpy as np
import pandas as pd

# Work from the CPS data directory: the data dictionaries, the monthly
# .dat files and the clean/ output folder are all addressed by relative path.
os.chdir('/home/brian/Documents/CPS/data/')

# Load the pickled data-dictionary lookup. The context manager closes the
# file handle as soon as the object has been read.
# NOTE: unpickling can execute arbitrary code; only load a trusted,
# locally generated file here.
with open('cps_basic_dd.pkl', 'rb') as f:
    cpsdd = pickle.load(f)

In [2]:
# User-defined functions
def id_dtype(size):
    '''Return the integer dtype name to use for a fixed-width field

    `size` is the field width (anything `int()` accepts, e.g. '002').
    Widths of 1-2 map to 'int8', 3-4 to 'int16', 5-9 to 'int32', and
    anything wider to 'intp'.
    '''
    width = int(size)
    if width <= 2:
        return 'int8'
    if width <= 4:
        return 'int16'
    if width <= 9:
        return 'int32'
    return 'intp'

def data_dict_reader(dd_file, var_list):
    '''Read data dictionary and return variable locations

    dd_file:  path to the data dictionary text file (read as iso-8859-1).
    var_list: names of the variables to keep; other entries are ignored.

    Returns a dict, in the order the entries appear in the file, mapping
    each kept variable name to a list of four items:
    [start, end, struct format, dtype name], where start is the first
    number in the "(start:end)" location minus one, end is the second
    number, the struct format is the CHARACTER*NNN width followed by 's'
    (e.g. '2s'), and the dtype name comes from id_dtype() for that width.
    '''
    # Context manager so the dictionary file is closed after reading
    with open(dd_file, 'r', encoding='iso-8859-1') as f:
        data_dict = f.read()
    # Raw strings keep the regex escapes (\w, \s, \d, ...) literal without
    # relying on Python passing unknown string escapes through unchanged.
    # Groups: (name) (width) (start) (end)
    p = (r'(\w{1,2}[\$\-%]\w*|PADDING)\s'
         r'*CHARACTER\*(\d{3})\s*\.{0,1}\s*\((\d*):(\d*)\).*')
    d = {s[0]: [int(s[2])-1, int(s[3]), f'{int(s[1])}s', id_dtype(s[1])]
         for s in re.findall(p, data_dict) if s[0] in var_list}
    return d

def struct_unpacker(d):
    '''Build a struct-based row unpacker from variable locations

    `d` maps variable names to [start, end, struct format, dtype]. Fields
    are taken in the dict's order; before each field, pad bytes ('Nx') are
    inserted to cover the gap between the previous field's end (0 for the
    first field) and this field's start. Returns the bound `unpack_from`
    method of the compiled struct.
    '''
    starts, ends, widths, _dtypes = zip(*d.values())
    # End offset of the preceding field for each field, 0 for the first
    prev_ends = (0,) + ends[:-1]
    parts = []
    for begin, prev_end, fmt in zip(starts, prev_ends, widths):
        parts.append(f'{begin - prev_end}x')
        parts.append(fmt)
    return struct.Struct(''.join(parts)).unpack_from

def data_file_reader(file, unpacker, dtypes, wgt, min_len=405):
    '''Convert raw monthly file to dataframe

    file:     path to the fixed-width raw data file (read in binary mode).
    unpacker: callable that splits one raw row into a tuple of byte fields.
    dtypes:   list of (name, dtype) pairs, one per unpacked field, used to
              build the numpy structured array.
    wgt:      name of the weight field; only rows with wgt > 0 are kept.
    min_len:  rows shorter than this many bytes (line ending included) are
              skipped. Defaults to 405, the value previously hard-coded.

    Blank (all-whitespace) fields are stored as -1; every other field is
    converted with int(). Returns a DataFrame with one column per field.
    '''
    # Context manager so the raw file is closed once its rows are read
    with open(file, 'rb') as raw_data:
        data = [unpacker(row) for row in raw_data if len(row) >= min_len]
    data = [tuple(int(i) if i.strip() else -1 for i in row) for row in data]
    np_data = np.array(data, dtype=dtypes)
    df = pd.DataFrame(np_data[np_data[wgt] > 0])
    return df

In [3]:
# Create annual feather files
# Each data dictionary file describes the layout of the listed years
dd_files = {'cps89.ddf': [1989, 1990, 1991],
            'cps92.ddf': [1992, 1993]}

var_list = ['H-MONTH', 'H-YEAR', 'H-MIS', 'HG-FIPS', 'H-METSTA', 'A-VET',
            'H-ID', 'A-LINENO', 'A-AGE', 'A-MARITL', 'A-SEX', 'A-HGA',
            'A-RACE', 'A-MAJACT', 'A-IND', 'A-OCC', 'A-USLFT', 'A-CLSWKR',
            'A-USLHRS', 'A-UNMEM', 'A-FTPT', 'A-REORGN', 'A-LFSR', 'A-ENRLW',
            'A-UNTYPE', 'A-NLFREA', 'A-RCOW', 'A-FNLWGT', 'A-ERNLWT',
            'A-HERNTP', 'A-WERNTP', 'A-HRS1', 'A-WKSLK', 'A-WANTJB',
            'A-UNCOV', 'A-HGC']

# Remove the first two characters ('H-' or 'A-') from each variable name;
# names starting with 'HG' are skipped here and renamed explicitly below
rename_list = {v: v[2:] for v in var_list if v[0:2] != 'HG'}
rename_list['HG-FIPS'] = 'STATEFIPS'
rename_list['A-WKSLK'] = 'UNEMPDUR'
# Raw (pre-rename) name of the weight used to filter records on read
filter_wgt = 'A-FNLWGT'

# The recodes below are callables taking the renamed dataframe, so they
# can be passed directly to DataFrame.assign().

# Map state FIPS codes to two letter codes
state_codes = cpsdd['jan94_mar94_dd.txt']['map']['state']


def state(x):
    '''Return STATEFIPS mapped through state_codes, as a Categorical'''
    return pd.Categorical(x['STATEFIPS'].map(state_codes))


# 1992-onward educ codes
educ_codes = cpsdd['jan94_mar94_dd.txt']['map']['educ']


def educ(x):
    '''Return HGA mapped through educ_codes, as a Categorical'''
    return pd.Categorical(x['HGA'].map(educ_codes))


# Unemployment type recode
unemptype_codes = {1: 'Job Loser', 2: 'Job Loser', 3: 'Job Leaver',
                   4: 'Re-entrant', 5: 'New Entrant'}


def unemptype(x):
    '''Return UNTYPE recoded to a labelled Categorical

    Codes outside 1-5 become missing values. A dict lookup is used rather
    than np.where(..., 'label', np.nan): mixing strings with np.nan in
    np.where yields a string array in which NaN turns into the literal
    text 'nan', which would then show up as a real category.
    '''
    return pd.Categorical(x['UNTYPE'].map(unemptype_codes))


# Map WBHAO codes for race/ethnicity
hisp_map = cpsdd['jan94_mar94_dd.txt']['map']['hisp']
race_map = cpsdd['jan94_mar94_dd.txt']['map']['race']


def wbhao(x):
    '''Return 'Hispanic' where REORGN is in hisp_map, else mapped RACE'''
    # If not hispanic, map race to racial groups
    return pd.Categorical(np.where(x['REORGN'].isin(hisp_map), 'Hispanic',
                                   x['RACE'].map(race_map)))


# bd CPS consistent variables
def female(x):
    '''Return 1 where SEX == 2, else 0'''
    return np.where(x['SEX'] == 2, 1, 0)


def veteran(x):
    '''Return 1 for VET codes 1-5, 0 for code 6, NaN otherwise'''
    return np.where(x['VET'].isin([1, 2, 3, 4, 5]), 1,
                    np.where(x['VET'] == 6, 0, np.nan))


def married(x):
    '''Return 1 for MARITL codes 1-3, 0 for codes 4-7, NaN otherwise'''
    return np.where(x['MARITL'].isin([1, 2, 3]), 1,
                    np.where(x['MARITL'].isin([4, 5, 6, 7]), 0, np.nan))


def emp(x):
    '''Return 1 where LFSR is 1 or 2, else 0'''
    return np.where(x['LFSR'].isin([1, 2]), 1, 0)


for ddf, year_list in dd_files.items():

    # Field locations for this data dictionary, the matching numpy
    # (name, dtype) pairs, and the fixed-width row unpacker
    d = data_dict_reader(ddf, var_list)

    dtypes = [(k, v[-1]) for k, v in d.items()]

    unpacker = struct_unpacker(d)

    for year in year_list:
        # Monthly files for the year are matched on the two-digit year.
        # Sorted because os.listdir() returns names in arbitrary order,
        # which would make the row order of the output irreproducible.
        file_list = sorted(f for f in os.listdir()
                           if f.startswith(f'cpsb{str(year)[2:]}')
                           and f.endswith('.dat'))
        if not file_list:
            # Fail with a clear message rather than pandas'
            # "No objects to concatenate"
            raise FileNotFoundError(
                f'No cpsb{str(year)[2:]}*.dat files found in {os.getcwd()}')

        # ignore_index gives the combined frame a unique 0..n-1 index
        # instead of repeating each monthly file's own row numbers
        df = pd.concat([data_file_reader(file, unpacker, dtypes, filter_wgt)
                        for file in file_list], ignore_index=True)

        # These fields are read as integers and scaled down by 100
        # (presumably two implied decimal places - confirm against the
        # data dictionary).
        # NOTE(review): data_file_reader stores blank fields as -1, so
        # missing values in these columns become -0.01 after scaling.
        decimal_vars = ['A-FNLWGT', 'A-HERNTP', 'A-ERNLWT']
        df[decimal_vars] = df[decimal_vars] / 100.0
        df = df.rename(rename_list, axis=1)
        if year < 1992:
            # Pre-1992 education is built from HGA and HGC.
            # Presumably HGA is the highest grade attended and HGC == 1
            # means that grade was completed (2 = not) - TODO confirm.
            educ_map = {
                'LTHS': (df['HGA'].between(1, 11)) | ((df['HGA'] == 12) &
                                                      (df['HGC'] == 2)),
                'HS': (df['HGA'] == 12) & (df['HGC'] == 1),
                'SC': (df['HGA'].between(13, 15)) | ((df['HGA'] == 16) &
                                                     (df['HGC'] == 2)),
                'COLL': ((df['HGA'] == 16) & (df['HGC'] == 1)) |
                        (df['HGA'] == 17),
                'ADV': (df['HGA'] >= 18)}
            # np.select expects sequences, so materialise the dict views;
            # rows matching no condition get None
            df['EDUC'] = np.select(list(educ_map.values()),
                                   list(educ_map.keys()), default=None)
            df = df.drop(['HGA', 'HGC'], axis=1)
        else:
            # 1992 onward: HGA is mapped directly through the educ codes
            df = df.assign(EDUC=educ).drop(['HGA'], axis=1)
        # Add the recoded variables, then drop the raw inputs they replace
        df = (df.assign(STATE=state,
                        FEMALE=female,
                        WBHAO=wbhao,
                        VETERAN=veteran,
                        MARRIED=married,
                        EMP=emp,
                        UNEMPTYPE=unemptype)
                .drop(['STATEFIPS', 'SEX', 'VET', 'MARITL', 'UNTYPE',
                       'RACE', 'REORGN'], axis=1))
        # Overwrites the YEAR column read from the file (H-YEAR) with the
        # four-digit year being processed
        df['YEAR'] = year
        df['BASICWGT'] = df['FNLWGT']
        resize_vars = ['STATE', 'FEMALE', 'VETERAN', 'MARRIED', 'EMP', 'EDUC']
        df[resize_vars] = df[resize_vars].astype('category')

        df.reset_index(drop=True).to_feather(f'clean/cps{year}.ft')
        print(f'{year} Done: ({len(df):,} records, {len(df.keys())} variables)')

1989 Done: (1,713,347 records, 36 variables)
1990 Done: (1,791,585 records, 36 variables)
1991 Done: (1,774,232 records, 36 variables)
1992 Done: (1,746,184 records, 36 variables)
1993 Done: (1,722,398 records, 36 variables)
