### bd econ CPS data dictionaries

bd_CPS_dd.ipynb

October 13, 2018

@bd_econ

Requires: `bd_CPS_details.py` (provides `VarList`, `DataDict`, `text_repl`, `StatesMap`, and `RegionsMap`).

In [5]:
# Import libraries
import os
import re
import struct
import pickle
import pandas as pd

from bd_CPS_details import VarList, DataDict, text_repl, StatesMap, RegionsMap

# Work from the CPS data directory: the data dictionary files read below and
# the pickle file written at the end are opened with relative paths.
os.chdir('/home/brian/Documents/CPS/data')

In [6]:
# Match CPS microdata files with their data dictionary
#
# For each data dictionary file in DataDict this cell adds:
#   'dd'         - {variable: [start, end, width, dtype]}
#   'flist'      - names of the monthly microdata files in the date range
#   'unpack_fmt' - struct format string that keeps only the listed variables
# and records, in Matcher, which dictionary file each monthly file uses.
Matcher = {}

# Remove a 'matcher' entry if one is present (the final cell adds one), so
# the loop below only sees data dictionary entries.
DataDict.pop('matcher', None)
for dfile, dvals in DataDict.items():
    # Read inside a context manager so the file handle is always closed
    with open(dfile, 'r', encoding='iso-8859-1') as dd_file:
        ddf = dd_file.read()

    # Each entry is [start, end, width]; end - start should equal width
    if dfile == 'jan98dd.asc':
        # This dictionary's regex yields (name, width, start); only
        # variables named in VarList are kept
        d = {text_repl(s[0]): [int(s[2])-1, int(s[2])+int(s[1])-1, int(s[1])]
             for s in re.findall(dvals['re'], ddf) if s[0] in VarList}
    else:
        d = {text_repl(s[0]): [int(s[3])-1, int(s[4]), int(s[2])]
             for s in re.findall(dvals['re'], ddf)}

    # An empty dictionary means the regex matched nothing; report it rather
    # than failing later with an opaque unpacking error
    if not d:
        print(f'Error: {dfile}: no variables matched')

    # Suggest dtypes for numpy, chosen from the variable name and field width
    # NOTE(review): a 5-character field can hold values up to 99999, above
    # the int16 maximum of 32767, and 10-11 character fields can exceed the
    # int32 maximum; confirm no variable takes values that large.
    for name, spec in d.items():
        width = spec[2]
        spec.append('U4' if name == 'HRSAMPLE'
                    else 'U2' if name == 'HRSERSUF'
                    else 'f4' if 'WGT' in name
                    #else 'f4' if 'PRER' in name
                    else 'int8' if width < 3
                    else 'int16' if width < 6
                    else 'int32' if width < 12
                    else 'intp')

    # Make sure that end - start equals the stated width for every variable
    error_list = [name for name, spec in d.items()
                  if spec[1] - spec[0] != spec[2]]
    if error_list:
        print(f'Error: {dfile}: {", ".join(error_list)}')
    DataDict[dfile]['dd'] = d

    # Add list of related monthly CPS microdata files
    mos = pd.date_range(dvals['start'], dvals['end'], freq='MS')
    monthly_file_list = [f'{i:%b%y}pub.dat'.lower() for i in mos]
    DataDict[dfile]['flist'] = monthly_file_list

    # Add relevant monthly CPS filenames to matcher
    for monthly_file in monthly_file_list:
        Matcher[monthly_file] = dfile

    # Struct unpack format: pad over any gap between the end of the previous
    # variable and the start of this one ('Nx'), then keep the variable's
    # own characters ('Ns')
    fmt_parts = []
    prev_end = 0
    for start, end, width, _dtype in d.values():
        gap = start - prev_end
        if gap > 0:
            fmt_parts.append(f'{gap}x')
        fmt_parts.append(f'{width}s')
        prev_end = end
    unpack_fmt = ''.join(fmt_parts)
    DataDict[dfile]['unpack_fmt'] = unpack_fmt

In [7]:
# Create new/cleaned variables
#
# Adds a 'map' entry to every data dictionary in DataDict holding the lookup
# tables and variable lists that apply to that dictionary's date range.

# Education groups: group label -> codes, inverted into code -> group label
educ = {'LTHS': [31, 32, 33, 34, 35, 36, 37, 38],
        'HS': [39],
        'SC': [40, 41, 42],
        'COLL': [43],
        'ADV': [44, 45, 46]}
educ_map = {code: group for group, codes in educ.items() for code in codes}

# Cutoff dates compared against each data dictionary's start month
cut_2002_12 = pd.to_datetime('2002-12-01')
cut_2004_05 = pd.to_datetime('2004-05-01')
cut_2012_04 = pd.to_datetime('2012-04-01')
cut_2013_12 = pd.to_datetime('2013-12-01')

for dfile, dvals in DataDict.items():
    start_month = pd.to_datetime(dvals['start'])
    after_2002 = start_month > cut_2002_12

    # WBHAO race/ethnic groups from CEPR
    if start_month > cut_2012_04:
        race = {'White': [1],
                'Black': [2, 6, 10, 11, 12, 16, 17, 18, 22, 23],
                'Asian': [4, 5, 8, 9, 13, 14, 15, 19, 20, 21, 24],
                'Other': [3, 7, 25, 26]}
    elif after_2002:
        race = {'White': [1],
                'Black': [2, 6, 10, 11, 12, 15, 16, 19],
                'Asian': [4, 5, 8, 9, 13, 14, 17, 18],
                'Other': [3, 7, 20, 21]}
    else:
        race = {'White': [1],
                'Black': [2],
                'Asian': [4],
                'Other': [3, 5]}
    race_map = {code: group for group, codes in race.items()
                for code in codes}

    # Hispanic identification: list of codes, built fresh for each dictionary
    if start_month > cut_2013_12:
        hisp = list(range(1, 9))
    elif after_2002:
        hisp = list(range(1, 6))
    else:
        hisp = list(range(1, 8))

    # Major industry group
    if after_2002:
        ind = {'Construction and mining': [1, 2, 3],
               'Finance and business services': [7, 8, 9, 12],
               'Manufacturing': [4],
               'Trade, transportation, and utilities': [5, 6],
               'Education and health': [10],
               'Leisure and hospitality': [11],
               'Public administration': [13],
               'Armed forces': [14]}
    else:
        ind = {'Construction and mining': [1, 2, 3, 21],
               'Finance and business services': [7, 11, 12, 13, 14, 20],
               'Manufacturing': [4, 5],
               'Trade, transportation, and utilities': [6, 8, 9, 10],
               'Education and health': [16, 17, 18, 19],
               'Leisure and hospitality': [15],
               'Public administration': [22],
               'Armed forces': [23]}
    ind_map = {code: group for group, codes in ind.items() for code in codes}

    # Variable names in this data dictionary containing 'WGT' (weights)
    wgt_vars = [name for name in dvals['dd'] if 'WGT' in name]

    # Variable names in this data dictionary containing 'PRER'
    er_vars = [name for name in dvals['dd'] if 'PRER' in name]

    DataDict[dfile]['map'] = {
        # State id map to two letter codes
        'state': StatesMap,
        # Census regions map from state two letter codes
        'region': RegionsMap,
        # Education groups
        'educ': educ_map,
        'race': race_map,
        'hisp': hisp,
        'ind': ind_map,
        # Identify when to calculate ID2 manually
        'id2': bool(start_month < cut_2004_05),
        'wgt': wgt_vars,
        'er': er_vars,
    }

In [8]:
# Generate pickle file with data for reader
# The monthly-file -> dictionary-file lookup is stored in DataDict itself,
# under the 'matcher' key, so one pickle carries everything.
DataDict['matcher'] = Matcher

pickle_name = 'cps_basic_dd.pkl'
with open(pickle_name, mode='wb') as pickle_file:
    pickle.dump(DataDict, pickle_file)