### bd econ CPS data dictionaries

bd_CPS_dd.ipynb

April 14, 2019

@bd_econ

Requires: `bd_CPS_details.py` (provides `VarList`, `DataDict`, `text_repl`, `StatesMap`, `RegionsMap`).

In [1]:
# Import libraries
import os
import re
import struct
import pickle
import pandas as pd
print('pandas:', pd.__version__)

from bd_CPS_details import VarList, DataDict, text_repl, StatesMap, RegionsMap

# All dictionary files are read from, and patched copies written to, this directory
os.chdir('/home/brian/Documents/CPS/data')

# Some variables start in the middle of the jan98dd.asc dictionary.
# Write a patched copy (jan98dd2.asc) in which records for PRCHLD and
# PRNMCHLD are inserted right after the 'PRPERTYP = 2' text.
# The files are read as ISO-8859-1, so the copies are written with the same
# encoding; otherwise non-ASCII characters would be re-encoded with the
# platform default and corrupted when the copy is read back as ISO-8859-1.
with open('jan98dd.asc', 'r', encoding='iso-8859-1') as ddm:
    ddf = ddm.read()
chldvars = 'PRPERTYP \n     = 2 \n\nD PRCHLD    2    633\n\nD PRNMCHLD    2    635\n'
ddf = ddf.replace('PRPERTYP \n     = 2 ', chldvars)
with open('jan98dd2.asc', 'w', encoding='iso-8859-1') as ddm:
    ddm.write(ddf)

# Fix two typos in latest data dictionary: append a '2' after the tabs that
# follow PRSJMS and PRNAGWS (presumably the missing field length -- confirm
# against the record layout), then save under a new file name.
file = '2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt'
with open(file, 'r', encoding='iso-8859-1') as ddm:
    ddf = ddm.read()
ddf = (ddf.replace('PRSJMS\t\t\t', 'PRSJMS\t\t\t2')
          .replace('PRNAGWS\t\t\t', 'PRNAGWS\t\t\t2'))
with open('January_2020_Record_Layout.txt', 'w', encoding='iso-8859-1') as ddm:
    ddm.write(ddf)

pandas: 2.2.2


  're': f'({"|".join([x for x in VarList if x not in LostVars])})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join([x for x in VarList if x not in LostVars])})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'({"|".join(VarList)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'},
  're': f'\n(?:\x0c)?({"|".join(VarList)})\s+(\d+)\s+.*? \s+.*?(\d\d*).*?(\d\d+)'},
  're': f'\n(?:\x0c)?({"|".join(VarList)})\s+(\d+)\s+.*? \s+.*?(\d\d*).*?(\d\d+)'},
  're': f'\n(?:\x0c)?({"|".join(VarList)})\s

In [2]:
# Match CPS microdata files with their data dictionary
# Matcher maps each monthly microdata file name to its data dictionary file
Matcher = {}

# Remove a 'matcher' entry if one is present so that the loop below only
# visits data dictionary files
DataDict.pop('matcher', None)
for dfile, dvals in DataDict.items():
    #print(dfile)
    # Read inside a context manager so the file handle is always closed
    with open(f'{dfile}', 'r', encoding='iso-8859-1') as dd_in:
        ddf = dd_in.read()
    if dfile in ['jan03dd.txt', 'augnov05dd.txt', 'jan07dd.txt']:
        # Presumably supplies the field length missing after PRNMCHLD in
        # these three dictionaries -- confirm against the raw files
        ddf = ddf.replace('PRNMCHLD', 'PRNMCHLD  2  ')

    # Build {variable: [start, end, width]} with a zero-based start, so that
    # end - start == width (checked further down).
    # NOTE(review): the regex group positions differ between the branches;
    # this depends on the pattern stored in dvals['re'] -- confirm against
    # bd_CPS_details before changing any index.
    if dfile in ['jan98dd.asc', 'jan98dd2.asc']:
        # Only width and start are captured here; end is derived from them
        d = {text_repl(s[0]): [int(s[2])-1, int(s[2])+int(s[1])-1, int(s[1])]
             for s in re.findall(dvals['re'], ddf) if s[0] in VarList}
    elif dfile == 'may04dd.txt':
        # Escape the literal parentheses of '(partII)' in the pattern.
        # A raw string keeps the backslashes without relying on an invalid
        # escape sequence (SyntaxWarning on recent Python versions).
        d = {text_repl(s[0]): [int(s[2])-1, int(s[3]), int(s[1])]
             for s in re.findall(
                 dvals['re'].replace('(partII)', r'\(partII\)'), ddf)}
    else:
        d = {text_repl(s[0]): [int(s[3])-1, int(s[4]), int(s[2])]
             for s in re.findall(dvals['re'], ddf)}

    # Suggest dtypes for numpy; appended as the fourth element of each entry.
    # v[-1] is still the field width when the expression is evaluated.
    for k, v in d.items():
        v.append('U4' if k in ['HRSAMPLE']
                 else 'U2' if k in ['HRSERSUF']
                 else 'int32' if k in ['GTCO', 'GESTFIPS']
                 #else 'int64' if k in ['HRHHID2']
                 else 'f4' if 'WGT' in k
                 else 'int8' if v[-1] < 3
                 else 'int16' if v[-1] < 5
                 else 'int32' if v[-1] < 11
                 else 'intp')

    # Make sure that start and end = length
    error_list = [k for k, v in d.items() if v[1] - v[0] != v[2]]
    if error_list:
        print(f'Error: {dfile}: {", ".join(error_list)}')
    DataDict[dfile]['dd'] = d

    # Add list of related monthly CPS microdata files
    mos = pd.date_range(dvals['start'], dvals['end'], freq='MS')
    monthly_file_list = [f'{i:%b%y}pub.dat'.lower() for i in mos]
    DataDict[dfile]['flist'] = monthly_file_list

    # Add relevant monthly CPS filenames to matcher
    for file in monthly_file_list:
        Matcher[file] = dfile

    # Struct unpack format: for each variable, in dictionary order, emit
    # '<gap>x' pad bytes when it starts after the previous variable's end,
    # followed by '<width>s' for the field itself
    start, end, width, fmt = zip(*d.values())
    skip = ([f'{st - en}x' if (st - en) > 0 else ''
             for st, en in zip(start, [0] + list(end[:-1]))])
    keep = [f'{w}s' for w in width]
    unpack_fmt = ''.join([j for i in zip(skip, keep) for j in i])
    DataDict[dfile]['unpack_fmt'] = unpack_fmt

  dvals['re'].replace('(partII)', '\(partII\)'), ddf)}


In [3]:
# Create new/cleaned variables
# Education groups
educ = {'LTHS': [31, 32, 33, 34, 35, 36, 37, 38],
        'HS': [39],
        'SC': [40, 41, 42],
        'COLL': [43],
        'ADV': [44, 45, 46]}
# Invert to {code: group label}
educ_map = {i: k for k, v in educ.items() for i in v}

for dfile, dvals in DataDict.items():
    # All maps for this data dictionary are collected under the 'map' key
    dmap = {}
    DataDict[dfile]['map'] = dmap

    # Add state id map to two letter codes
    dmap['state'] = StatesMap

    # Add Census regions map from state two letter codes
    dmap['region'] = RegionsMap

    # Add education groups
    dmap['educ'] = educ_map

    # The code lists below depend on when this dictionary comes into effect
    start_month = pd.to_datetime(dvals['start'])

    # WBHAO and WBHAOM race/ethnic groups from CEPR
    if start_month > pd.to_datetime('2012-04-01'):
        race = {'White': [1],
                'Black': [2, 6, 10, 11, 12, 16, 17, 18, 22, 23],
                'Asian': [4, 5, 8, 9, 13, 14, 15, 19, 20, 21, 24],
                'Other': [3, 7, 25, 26]}
        racem = {'White': [1],
                 'Black': [2],
                 'Asian': [4, 5],
                 'Native American': [3],
                 'More than one': list(range(6, 27))}
    elif start_month > pd.to_datetime('2002-12-01'):
        race = {'White': [1],
                'Black': [2, 6, 10, 11, 12, 15, 16, 19],
                'Asian': [4, 5, 8, 9, 13, 14, 17, 18],
                'Other': [3, 7, 20, 21]}
        racem = {'White': [1],
                 'Black': [2],
                 'Asian': [4, 5],
                 'Native American': [3],
                 'More than one': list(range(6, 22))}
    else:  # Mixed not available before 2003
        race = {'White': [1],
                'Black': [2],
                'Asian': [4],
                'Other': [3, 5]}
        # Explicitly empty, so the mapping from another dictionary's
        # iteration is never reused (and no NameError on the first pass)
        racem = {}
    race_map = {i: k for k, v in race.items() for i in v}
    race_map2 = {i: k for k, v in racem.items() for i in v}
    dmap['race'] = race_map
    dmap['racem'] = race_map2

    # Hispanic identification
    # Every branch assigns hisp, hispdt and hispdt03; a detail map that is
    # not defined for the period is left empty rather than carried over
    # from a previous loop iteration.
    if start_month > pd.to_datetime('2013-12-01'):
        hisp = [1, 2, 3, 4, 5, 6, 7, 8]
        hispdt = {'Mexican': [1],
                  'Puerto Rican': [2],
                  'Cuban': [3],
                  'Dominican': [4],
                  'Salvadoran': [5],
                  'Central American, excluding Salvadoran': [6],
                  'South American': [7],
                  'Other Spanish': [8]}
        hispdt03 = {'Mexican': [1],
                    'Puerto Rican': [2],
                    'Cuban': [3],
                    'Central/South American': [4, 5, 6, 7],
                    'Other Spanish': [8]}
    elif start_month > pd.to_datetime('2002-12-01'):
        hisp = [1, 2, 3, 4, 5]
        hispdt = {}
        hispdt03 = {'Mexican': [1],
                    'Puerto Rican': [2],
                    'Cuban': [3],
                    'Central/South American': [4],
                    'Other Spanish': [5]}
    else:
        hisp = [1, 2, 3, 4, 5, 6, 7]
        hispdt = {}
        hispdt03 = {}

    hisp_map = {i: k for k, v in hispdt.items() for i in v}
    hisp_map2 = {i: k for k, v in hispdt03.items() for i in v}
    dmap['hisp'] = hisp
    dmap['hispdt'] = hisp_map
    dmap['hispdt03'] = hisp_map2

    # Major industry group
    if start_month > pd.to_datetime('2002-12-01'):
        ind = {'Construction and mining': [1, 2, 3],
               'Finance and business services': [7, 8, 9, 12],
               'Manufacturing': [4],
               'Trade, transportation, and utilities': [5, 6],
               'Education and health': [10],
               'Leisure and hospitality': [11],
               'Public administration': [13],
               'Armed forces': [14]}
    else:
        ind = {'Construction and mining': [1, 2, 3, 21],
               'Finance and business services': [7, 11, 12, 13, 14, 20],
               'Manufacturing': [4, 5],
               'Trade, transportation, and utilities': [6, 8, 9, 10],
               'Education and health': [16, 17, 18, 19],
               'Leisure and hospitality': [15],
               'Public administration': [22],
               'Armed forces': [23]}
    ind_map = {i: k for k, v in ind.items() for i in v}
    dmap['ind'] = ind_map

    # Broader EPI version of Major industry group
    # (placeholder only -- nothing is computed for it here)

    # Identify when to calculate ID2 manually: True for dictionaries that
    # start before May 2004
    dmap['id2'] = bool(start_month < pd.to_datetime('2004-05-01'))

    # Identify weight variables for each data dict
    wgt_vars = [i for i in dvals['dd'] if 'WGT' in i]
    dmap['wgt'] = wgt_vars

    # Identify earnings variables for each data dict
    er_vars = [i for i in dvals['dd'] if 'PRER' in i]
    dmap['er'] = er_vars

In [4]:
# Generate pickle file with data for reader
# Store the microdata-file -> data-dictionary lookup next to the per-dictionary
# entries, then serialize the whole structure in one binary dump.
DataDict.update({'matcher': Matcher})

with open('cps_basic_dd.pkl', mode='wb') as pkl_out:
    pickle.dump(DataDict, pkl_out)