### bd econ CPS extract

bd_CPS_1989-93.ipynb

April 16, 2019

Contact: Brian Dew, @bd_econ

----

### About

**Goal:** Use python to work with Current Population Survey data from 1989-93.

This notebook reads raw CPS data from Census, downloaded from [NBER](https://www.nber.org/data/cps_basic.html), to generate annual feather-format files that match with the bd CPS extracts for 1994-onward. A major revision to the CPS in 1994 makes it impossible to completely match the 1989-93 data to the 1994-onward data, but this notebook attempts to adjust the 1989-93 data to match with 1994-onward data as close as possible. 

See the [GitHub repo page](https://github.com/bdecon/econ_data/tree/master/bd_CPS) for details on how to use the bd CPS and on what variables are available and how they are defined. See also the [benchmark notebook](https://github.com/bdecon/econ_data/blob/master/micro/bd_CPS_benchmark.ipynb) for examples of using the 1989-93 data. 

-----

See [issues](https://github.com/bdecon/econ_data/issues?q=is%3Aopen+is%3Aissue+label%3A1989-93) and [project](https://github.com/bdecon/econ_data/projects/4) on Github. Please feel free to contact me if you have any questions or are interested in helping with the project. My email address is brianwdew@gmail.com.


----

See this [discussion of the CPS revamp](https://www.bls.gov/cps/revisions1994.pdf) for guidance on matching the 1989-93 data to the 1994-onward data.


In [1]:
# import libraries and settings
import os, re, struct, pickle, shutil
import pandas as pd
print('pandas:', pd.__version__)
import numpy as np
print('numpy:', np.__version__)

# Map codes for country of birth to country/area names
from bd_CPS_details import EducDTMap, INDMMap

data_dir = '/home/brian/Documents/CPS/data/'

os.chdir(data_dir)

# Data dictionary file generated by bd_CPS_dd.ipynb
cpsdd = pickle.load(open('cps_basic_dd.pkl', 'rb'))

# Dictionary of unique IDS
ids_file = 'CPSID_89-93.pkl'
if os.path.isfile(ids_file):
    cps_ids_full = pickle.load(open(ids_file, 'rb'))

pandas: 2.2.2
numpy: 1.26.4


In [2]:
# If running for first time, unzip the annual NBER zip files in raw data 
# folder and run this block of code. It will rename the files accordingly.

# Check if files exist:
date_range = [dt.strftime('%b%y').lower() for dt in 
              pd.date_range(start='1989-01-01', end='1993-12-01', freq='MS')]
correct_files = [f'{date}pub.dat' for date in date_range]
if correct_files not in os.listdir():
    # Rename unzipped NBER files
    raw_files = [f for f in os.listdir() if f.startswith('cpsb') 
                 and '.dat' not in f and '.zip' not in f]
    to_rename = list(set(raw_files) - set(correct_files))
    for file in to_rename:
        date = pd.to_datetime(f'19{file[4:6]}-{file[6:8]}-01')
        os.rename(file, date.strftime('%b%y').lower() + 'pub.dat')
        print('Renamed: ', file, date.strftime('%b%y').lower() + 'pub.dat')

In [3]:
# User-defined functions
def id_dtype(size):
    '''Return data type based on fixed-width size'''
    size = int(size)
    dtype = ('intp' if size > 9 
             else 'int32' if size > 4 
             else 'int16' if size > 2 
             else 'int8')
    return dtype

def data_dict_reader(dd_file, var_list):
    '''Read data dictionary and return variable locations'''
    data_dict = open(dd_file, 'r', encoding='iso-8859-1').read()
    p = ('(\w{1,2}[\$\-%]\w*|PADDING)\s'
         '*CHARACTER\*(\d{3})\s*\.{0,1}\s*\((\d*):(\d*)\).*')
    d = {s[0]: [int(s[2])-1, int(s[3]), f'{int(s[1])}s', id_dtype(s[1])]
         for s in re.findall(p, data_dict) if s[0] in var_list}
    return d

def struct_unpacker(d):
    '''Return struct unpacker from variable locations'''
    start, end, width, size = zip(*d.values())
    skip = ([f'{s - e}x' for s, e in zip(start, [0] + list(end[:-1]))])
    unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])
    return struct.Struct(unpack_fmt).unpack_from

def data_file_reader(file, unpacker, dtypes, wgt):
    '''Convert raw monthly file to pandas dataframe'''
    raw_data = open(file, 'rb')
    data = [unpacker(row) for row in raw_data if len(row) >= 405]
    data = [tuple(int(i) if i.strip() else -1 for i in row) for row in data]
    np_data = np.array(data, dtype=dtypes)
    df = pd.DataFrame(np_data[np_data[wgt] > 0])
    return df

  p = ('(\w{1,2}[\$\-%]\w*|PADDING)\s'
  '*CHARACTER\*(\d{3})\s*\.{0,1}\s*\((\d*):(\d*)\).*')


In [4]:
# Create annual feather files
dd_files = {'cps89.ddf': [1989, 1990, 1991],
            'cps92.ddf': [1992, 1993]}

var_list = ['H-MONTH', 'H-YEAR', 'H-MIS', 'HG-FIPS', 'H-METSTA', 'HG-MSAS',
            'H-ID', 'A-LINENO', 'A-AGE', 'A-MARITL', 'A-SEX', 'A-HGA',
            'A-RACE', 'A-IND', 'A-OCC', 'A-CLSWKR', 'HG-MSAC', 'HG-CMSA',
            'A-USLHRS', 'A-UNMEM', 'A-REORGN', 'A-LFSR', 'A-ENRLW',
            'A-UNTYPE', 'A-FNLWGT', 'A-ERNLWT', 'A-HRLYWK', 'H-HHWGT',
            'A-HERNTP', 'A-WERNTP', 'A-HRS1', 'A-WKSLK', 'H-HHNUM',
            'A-UNCOV', 'A-HGC', 'A-MJIND', 'A-MJOCC', 'A-WKSTAT', 
            'A-DTOCC', 'A-DTIND', 'A-SPOUSE', 'A-PARENT', 'A-EMPHRS',
            'H-HTYPE', 'A-FTPT', 'A-HSCOL', 'A-VET', 'H-FAMINC',
            'A-USLFT', 'A-RRP']

# Remove the first two characters from each variable name
rename_list = {v: v[2:] for v in var_list if v[0:2] != 'HG'}
rename_list['HG-FIPS'] = 'STATEFIPS'
rename_list['A-WKSLK'] = 'UNEMPDUR'
rename_list['HG-MSAC'] = 'MSA'
rename_list['HG-CMSA'] = 'CMSA'
rename_list['HG-MSAS'] = 'MSAS'
filter_wgt = 'A-FNLWGT'

# Map state FIPS codes to two letter codes
state_codes = cpsdd['jan94_mar94_dd.txt']['map']['state']
region_codes = cpsdd['jan94_mar94_dd.txt']['map']['region']
state = lambda x: pd.Categorical(x['STATEFIPS'].map(state_codes))
region = lambda x: x['STATE'].map(region_codes)

# Metro and Principal city status
mpcstat = lambda x: pd.Categorical(
    np.where(x.MSAS == 1, 'Principal City',
    np.where(x.MSAS == 2, 'Balance',
    np.where(x.MSAS == 3, 'Nonmetropolitan',
    np.where(x.MSAS == 4, 'Not Identified', None)))))
metstat = lambda x: pd.Categorical(
    np.where(x.METSTA == 1, 'Metropolitan',
    np.where(x.METSTA == 2, 'Nonmetropolitan',
    np.where(x.METSTA == 3, 'Not Identified', None))))

# 1992-onward educ codes
educ_codes = cpsdd['jan94_mar94_dd.txt']['map']['educ']
educ = lambda x: x['HGA'].map(educ_codes)
educdt = lambda x: pd.Categorical(x['HGA'].map(EducDTMap))

# School enrollment
schenr = lambda x: pd.Categorical(
    np.where(x['ENRLW'] == 1, 1, 
    np.where(x['ENRLW'] == 2, 0, None)))

school = lambda x: pd.Categorical(
    np.where(x.HSCOL == 1, 'High School', 
    np.where((x.HSCOL == 2) & (x.FTPT == 1), 'Full-time College', 
    np.where((x.HSCOL == 2) & (x.FTPT == 2), 'Part-time College', None))))

# major industry group
ind_codes = cpsdd['jan94_mar94_dd.txt']['map']['ind']
indgrp =  lambda x: pd.Categorical(x['MJIND'].map(ind_codes))

# Manager
manager = lambda x: np.where(x.DTOCC == 1, 1,
                    np.where(x.DTOCC > 0, 0, None))

# Unemployment type recode - FIX (MAP IS SLOW)
unemptype_map = {1: 'Job Loser', 2: 'Job Loser',
                 3: 'Job Leaver',
                 4: 'Re-entrant',
                 5: 'New Entrant'}

unemptype = lambda x: x['UNTYPE'].map(unemptype_map)

# Layoff vs looking
layoff = lambda x: pd.Categorical(
    np.where(x['LFSR'] == 4, 'Layoff',
    np.where(x['LFSR'] == 3, 'Looking', None)))

# Part-time for economic reasons
ptecon = lambda x: pd.Categorical(
    np.where(x['WKSTAT'].isin([3, 5]), 1, 
    np.where(x['WKSTAT'].between(2, 5), 0, None)))

# Worked full-time (usually FT or usually PT)
workft = lambda x: pd.Categorical(
    np.where(x['WKSTAT'] == 2, 1,
    np.where(x['WKSTAT'].between(2, 5), 0, None)))

# Usually work full-time (35+ hours)
uslft = lambda x: pd.Categorical(
    np.where((x['USLFT'] == 1) | (x['USLHRS'] >= 35), 1, 
    np.where(x['USLFT'] == 2, 0, None)))

# Not at work during reference week
notatwork = lambda x: pd.Categorical(
    np.where(x['EMPHRS'].isin([1, 2, 3, 4, 5]), 1,
    np.where(x['EMPHRS'].between(6, 16), 0, None)))

# Map WBHAO codes for race/ethnicity
hisp_map = cpsdd['jan94_mar94_dd.txt']['map']['hisp']
hispanic = lambda x: pd.Categorical(np.where(x['REORGN'].isin(hisp_map), 1, 0))
race_map = cpsdd['jan94_mar94_dd.txt']['map']['race']
wbhao = lambda x: (    # If not hispanic, map race to racial groups
    pd.Categorical(np.where(x['REORGN'].isin(hisp_map), 'Hispanic', 
                            x['RACE'].map(race_map))))
wbao = lambda x: pd.Categorical(x['RACE'].map(race_map))

# Wage variables
wkearn = lambda x: np.where(x.WERNTP >= 0, x.WERNTP, None)
hrwage = lambda x: np.where((x.HERNTP < 0) & (x.USLHRS > 0 ) & 
                            (x.WERNTP > 0), (x.WERNTP / x.USLHRS) / 100, 
                            np.where(x.HERNTP >= 0, x.HERNTP / 100, None)) 
priceadj = lambda x: 1 * x.REGION.map(cpi_vals)

# Union member and coverage
union = lambda x: pd.Categorical(
    np.where((x['UNMEM'] == 1) | (x['UNCOV'] == 1), 1, 
    np.where((x['UNMEM'] == 2) & (x['UNCOV'] == 2), 0, None)),
    ordered=True)
unionmem = lambda x: pd.Categorical(
    np.where(x['UNMEM'] == 1, 1, 
    np.where(x['UNMEM'] == 2, 0, None)),
    ordered=True)

# Paid hourly
paidhrly = lambda x: pd.Categorical(
    np.where(x['HRLYWK'] == 1, 1,
    np.where(x['HRLYWK'] == 2, 0, None)))

# bd CPS consistent variables
age = lambda x: np.where(x['AGE'] > 80, 80, x['AGE'])
female = lambda x: np.where(x['SEX'] == 2, 1, 0)
faminc = lambda x: pd.Categorical(np.where(x.FAMINC.between(0, 14), 
                                           x.FAMINC + 1, None))
veteran = lambda x: np.where(x['VET'].isin([1, 2, 3, 4, 5]), 1,
                             np.where(x['VET'] == 6, 0, None))
married = lambda x: np.where(x['MARITL'].isin([1, 2, 3]), 1, 
                             np.where(x['MARITL'].isin([4,5,6,7]), 0, None))
emp = lambda x: np.where(x['LFSR'].isin([1,2]), 1, 0)

# Labor force status
lfs = lambda x: pd.Categorical(
    np.where(x['LFSR'].isin([1, 2]), 'Employed',
    np.where(x['LFSR'].isin([3, 4]), 'Unemployed',
    np.where(x['LFSR'].isin([5, 6, 7]), 'NILF', np.nan))))

# Class of worker
cow1 = lambda x: pd.Categorical(
    np.where(x['CLSWKR'] == 2, 'Federal Government',
    np.where(x['CLSWKR'] == 3, 'State Government',
    np.where(x['CLSWKR'] == 4, 'Local Government',
    np.where(x['CLSWKR'] == 1, 'Private',
    np.where(x['CLSWKR'] == 5, 'Self-employed Incorporated',
    np.where(x['CLSWKR'] == 6, 'Self-employed Unincorporated',
    np.where(x['CLSWKR'] == 7, 'Without Pay', None))))))))


# Weight variables
basicwgt = lambda x: np.where(x['FNLWGT'] > 0, x['FNLWGT'] / 100.0, x['FNLWGT'])
pworwgt = lambda x: np.where(x['ERNLWT'] > 0, x['ERNLWT'] / 100.0, x['ERNLWT'])
hhwgt = lambda x: np.where(x['HHWGT'] > 0, x['HHWGT'] / 100.0, x['HHWGT'])

# Read in Consumer Price Index data created by bd_CPS_cpi.ipynb
cpi = pd.read_csv('clean/cpi.csv', index_col=[0], parse_dates=True)

# Read data dictionaries for information on processing raw data files
for ddf, year_list in dd_files.items():
    
    d = data_dict_reader(ddf, var_list)

    dtypes = [(k, v[-1]) for k, v in d.items()]

    unpacker = struct_unpacker(d)

    # Loop over and process each monthly file in each year
    for year in year_list:
        file_list = [file for file in os.listdir()
                     if file.endswith(f'{str(year)[2:]}pub.dat')]        
        combined_data = []
        
        for file in file_list:
            date = pd.to_datetime(f'{year}-{file[:3]}-01')
            cpi_vals = cpi.loc[date].to_dict()
            df = data_file_reader(file, unpacker, dtypes, filter_wgt)
            df = df.rename(rename_list, axis=1)
            # Education variable underlying data changes in 1992
            if year < 1992:
                educ_map = {
                    'LTHS': (df['HGA'].between(1, 11)) | ((df['HGA']==12) & 
                                                          (df['HGC']==2)),
                    'HS': (df['HGA']==12) & (df['HGC']==1),
                    'SC': (df['HGA'].between(13,15)) | ((df['HGA']==16) & 
                                                        (df['HGC']==2)),
                    'COLL': ((df['HGA']==16) & (df['HGC']==1)) | (df['HGA']==17),
                    'ADV': (df['HGA'] >= 18)}
                df['EDUC'] = (np.select(educ_map.values(), 
                                        educ_map.keys(), 
                                        default=None))
                df = df.drop(['HGA', 'HGC'], axis=1)
            # Add custom variables defined above
            if year >= 1992:
                df = df.assign(EDUC = educ, EDUCDT = educdt).drop(['HGA'], axis=1)
            df = (df.assign(STATE = state,
                            REGION = region,
                            METSTAT = metstat,
                            MPCSTAT = mpcstat,
                            FAMINC = faminc,
                            AGE = age,
                            FEMALE = female,
                            WBHAO = wbhao,
                            WBAO = wbao,
                            HISPANIC = hispanic,
                            VETERAN = veteran,
                            SCHENR = schenr,
                            SCHOOL = school,
                            MARRIED = married,
                            EMP = emp,
                            LFS = lfs,
                            COW1 = cow1,
                            UNEMPTYPE = unemptype,
                            LAYOFF = layoff,
                            PTECON = ptecon,
                            WORKFT = workft,
                            USLFT = uslft,
                            NOTATWORK = notatwork,
                            UNION = union,
                            UNIONMEM = unionmem,
                            PAIDHRLY = paidhrly,
                            INDGRP = indgrp,
                            MANAGER = manager,
                            WKEARN = wkearn,
                            HRWAGE = hrwage,
                            PRICEADJ = priceadj,
                            BASICWGT = basicwgt,
                            PWORWGT = pworwgt,
                            HHWGT = hhwgt)
                    .drop(['STATEFIPS', 'SEX', 'VET', 'MARITL', 'UNTYPE',
                           'REORGN', 'ENRLW', 'WKSTAT', 'UNMEM', 'RACE',
                           'UNCOV', 'CLSWKR', 'HRLYWK', 'HERNTP', 'EMP',
                           'WERNTP', 'LFSR', 'FNLWGT', 'ERNLWT', 'EMPHRS',
                           'FTPT', 'HSCOL', 'HHNUM', 'METSTA',
                           'MSAS'], axis=1))
            df['YEAR'] = year
            
            # Rename and resize selected variables
            df = df.rename({'USLHRS': 'HRSUSL1', 'HRS1': 'HRSACTT',
                            'ID': 'HHID',
                            'DTIND': 'IND80D', 'DTOCC': 'OCC80D',
                            'MJIND': 'IND80M', 'MJOCC': 'OCC80M'}, axis=1)
            if year < 1992:
                df = df.rename({'IND': 'IND80', 'OCC': 'OCC80'}, axis=1)
            if year >= 1992:
                df = df.rename({'IND': 'IND90', 'OCC': 'OCC90'}, axis=1)
            resize_vars = ['STATE', 'FEMALE', 'VETERAN', 'MARRIED', 
                           'EDUC', 'UNEMPTYPE', 'REGION', 'YEAR', 'PTECON',
                           'CMSA', 'MSA']
            df[resize_vars] = df[resize_vars].astype('category')
            flt_vars = ['WKEARN', 'HRWAGE', 
                        'BASICWGT', 'PWORWGT', 'HHWGT']
            df[flt_vars] = df[flt_vars].astype('float32')
            
            # Add QSTNUM and OCCURNUM
            df['QSTNUM'] = df.groupby('HHID').ngroup().astype('int32')
            df['OCCURNUM'] = ((df.groupby('QSTNUM').cumcount() + 1)
                                 .astype('int8'))
            
            # Major industry recode
            for indvar in ['IND80', 'IND90']:
                if indvar in df.keys():
                    indmap = {i: k for k, v in INDMMap.items() for i in v[indvar]}
                    indm = lambda x: pd.Categorical(x[indvar].map(indmap))
                    df = df.assign(INDM = indm)            
            
            # bd CPS household ID
            if os.path.isfile(ids_file):
                df['CPSID'] = df['QSTNUM'].map(cps_ids_full[date])

            combined_data.append(df)
            
        # Combine monthly files into annual file
        df = (pd.concat(combined_data)).reset_index(drop=True)
        
        ind_occ_cats = ['INDGRP', 'IND80', 'OCC80', 'IND80D', 'OCC80D',
                        'IND80M', 'OCC80M', 'IND90', 'OCC90']
        cat_vars = [cv for cv in ind_occ_cats if cv in df.keys()]
        convert_dict = {cat: 'category' for cat in cat_vars}
        df = df.astype(convert_dict)       
        
        # Store results as feather file
        df.to_feather(f'clean/cps{year}.ft')
        
        # Print outcome
        obs = len(df)
        cols = len(df.keys())
        size = round(df.memory_usage().sum() / 1024**2, 1)
        print(f'{year} Done: ({obs:,} records, {cols} variables, {size}MB)')

1989 Done: (1,713,347 records, 57 variables, 173.2MB)
1990 Done: (1,791,585 records, 57 variables, 181.1MB)
1991 Done: (1,774,232 records, 57 variables, 179.4MB)
1992 Done: (1,746,184 records, 58 variables, 178.2MB)
1993 Done: (1,722,398 records, 58 variables, 175.8MB)
