## Unemployment and COVID

Brian Dew, July 25, 2020

#### CPS Documentation:
- [BLS Handbook of Methods](https://www.bls.gov/opub/hom/cps/home.htm)
- [2020 Data Dictionary](https://www2.census.gov/programs-surveys/cps/datasets/2020/basic/2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt)
- [2017 Data Dictionary](https://www2.census.gov/programs-surveys/cps/datasets/2017/basic/January_2017_Record_Layout.txt) (covers 2019 data)


In [1]:
import pandas as pd
import numpy as np
import os
import re
import struct
import gzip
import shutil
from io import BytesIO
import requests

### Download Files

In [2]:
# Download monthly CPS files if not available in current directory.
# Characters 3-4 of each file name are the two-digit year, which selects
# the year directory on census.gov.
file_list = ['apr19pub.dat', 'may19pub.dat', 'jun19pub.dat', 
             'apr20pub.dat', 'may20pub.dat', 'jun20pub.dat']
base_url = 'https://www2.census.gov/programs-surveys/cps/datasets'
for file in file_list:
    if not os.path.exists(file):
        file_loc = f'{base_url}/20{file[3:5]}/basic/{file}.gz'
        print(f'Downloading: {file} from census.gov')
        r = requests.get(file_loc, timeout=60)
        # Stop on an HTTP error rather than trying to gunzip an error page
        r.raise_for_status()
        # The server copy is gzip-compressed; store it decompressed
        with gzip.open(BytesIO(r.content), 'rb') as f_in, open(file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# Download two data dictionaries if not available in current directory
# (file name -> year directory on census.gov)
dd_list = {'2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt': 2020, 
           'January_2017_Record_Layout.txt': 2017}
for file, year in dd_list.items():
    if not os.path.exists(file):
        file_loc = f'{base_url}/{year}/basic/{file}'
        print(f'Downloading: {file} from census.gov')
        r = requests.get(file_loc, timeout=60)
        # Without this check an error page would be saved as the data
        # dictionary and silently reused on every later run
        r.raise_for_status()
        with open(file, 'wb') as f:
            f.write(r.content)

### Read data

Search each data dictionary for the variables listed in `var_names` and pull those columns from the fixed-width-format microdata files.

In [3]:
# manually list out the IDs for series of interest 
var_names = ['HRMONTH', 'HRYEAR4', 'PRTAGE', 'PRFAMNUM', 'PRDISFLG', 'QSTNUM', 
             'PEMLR', 'PEHRACTT', 'PEIO1COW', 'PEDWRSN', 'PRWNTJOB', 'PENLFACT',
             'PEDWWNTO', 'PRWKSTAT', 'PESCHENR', 'PWSSWGT', 'PWFMWGT', 'HWHHWGT']

# two-digit file year -> data dictionary describing that year's layout
dd_list = {19: 'January_2017_Record_Layout.txt',
           20: '2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt'}

# two-digit file year -> function that unpacks one fixed-width row
unpackers = {}
for year, dd in dd_list.items():
    # read data dictionary text file, closing it as soon as it is read
    with open(dd, 'r', encoding='iso-8859-1') as dd_file:
        data_dict = dd_file.read()
    # regular expression matching series name and data dict pattern;
    # a raw f-string keeps \s and \d as regex escapes instead of invalid
    # string escapes (which newer Python versions warn about)
    p = rf'\n({"|".join(var_names)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'
    # dictionary of variable name: [start, end, and length + 's']
    d = {s[0]: [int(s[2])-1, int(s[3]), f'{s[1]}s']
         for s in re.findall(p, data_dict)}
    # fail here, with a clear message, if the pattern missed a variable
    missing = [name for name in var_names if name not in d]
    if missing:
        raise ValueError(f'{dd}: no layout entry found for {missing}')
    # lists of variable starts, ends, and lengths
    start, end, width = zip(*d.values())
    # create list of which characters to skip in each row: the gap between
    # the previous variable's end and this variable's start
    # (relies on matches being in increasing column order)
    skip = [f'{s - e}x' for s, e in zip(start, [0] + list(end[:-1]))]
    # create format string by joining skip and variable segments
    unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])
    # struct to read each row of the file using the format string
    unpackers[year] = struct.Struct(unpack_fmt).unpack_from

Open the six monthly files and combine them into a single pandas dataframe, keeping only rows with a person weight greater than zero.

In [4]:
# Column names come from the layout dictionary `d` left by the previous cell.
# NOTE(review): `d` is the last layout parsed there; this assumes every
# layout yields the variables in the same order - confirm.
columns = list(d.keys())
wgt_idx = columns.index('PWSSWGT')  # Person sample weight

frames = []
for file in file_list:
    # two-digit year in the file name selects the matching row unpacker
    unpack = unpackers[int(file[3:5])]
    records = []
    with open(file, 'rb') as f:
        for row in f:
            fields = unpack(row)
            # unpack and store data of interest if sample weight > 0;
            # the weight is read through this year's own unpacker rather
            # than through byte offsets taken from another year's layout
            if int(fields[wgt_idx]) > 0:
                records.append([int(value) for value in fields])
    frames.append(pd.DataFrame(records, columns=columns))

# DataFrame.append was removed in pandas 2.0; concatenate once instead
df = pd.concat(frames, ignore_index=True)

### Create Variables

Categorize person records into various labor market groups and categories that relate to possible work limitations. Summarize these groups and categories at the family level.

In [5]:
# Uniquely identify households and families using these variables
hh_grp = ['HRYEAR4', 'HRMONTH', 'QSTNUM']
fam_grp = ['HRYEAR4', 'HRMONTH', 'QSTNUM', 'PRFAMNUM']

# Person-level 0/1 flags created below and summed to the family level
groups = ['U18', 'DIS', 'STU', 'CARE', 'UNEM', 'UNDE', 'ELD', 'SE']


def nilfreason(x):
    """Reason for not being in labor force.

    Returns a Categorical with one label per row of `x`. Rules are checked
    in the order written and the first match wins; rows matching no rule
    are left missing.
    """
    wntjob_2 = x['PRWNTJOB'] == 2
    # The fallback is None rather than np.nan: mixed with string labels,
    # numpy turns np.nan into the literal string 'nan', which would become
    # a category of its own instead of a missing value.
    return pd.Categorical(
        np.where(wntjob_2 & ((x['PEMLR'] == 6) | (x['PENLFACT'].isin([1, 2]))),
                 'Disabled/Ill',
        np.where(wntjob_2 & (x['PENLFACT'] == 4), 'Family',
        np.where(wntjob_2 & ((x['PEMLR'] == 5) | (x['PENLFACT'] == 5)),
                 'Retired',
        np.where(wntjob_2 & (x['PENLFACT'] == 3), 'School',
        np.where(x['PEDWWNTO'] == 1, 'Discouraged',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'Other', None)))))))


def ptecon(x):
    """Part time for economic reasons: 1, 0, or missing by PRWKSTAT code."""
    return pd.Categorical(
        np.where(x['PRWKSTAT'].isin([3, 6]), 1,
        np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan)))


def mrgnatt(x):
    """Marginally attached to labor force: 1 if PEDWRSN is 1-11, else 0."""
    return pd.Categorical(
        np.where(x['PEDWRSN'].between(1, 11), 1, 0))


def lfs(x):
    """Labor force status from PEMLR; missing for any other code."""
    # None fallback for the same reason as in nilfreason
    return pd.Categorical(
        np.where(x['PEMLR'].isin([1, 2]), 'Employed',
        np.where(x['PEMLR'].isin([3, 4]), 'Unemployed',
        np.where(x['PEMLR'].isin([5, 6, 7]), 'NILF', None))))


df = df.assign(NILFREASON = nilfreason, PTECON = ptecon, 
               MRGNATT = mrgnatt, LFS = lfs)

# Weights have four implied decimal places.
# NOTE: this rescales `df` in place, so running this cell twice without
# re-reading the data divides the weights twice.
for weight in ['PWFMWGT', 'PWSSWGT', 'HWHHWGT']:
    df[weight] = df[weight] / 10000.0

# File identifies non-workers as having -1 hours, which affects average.
# Assign the result back: an in-place replace on the selected column is not
# guaranteed to reach `df` (it does not under pandas Copy-on-Write).
df['PEHRACTT'] = df['PEHRACTT'].replace(-1, 0)

# Identify categories of people less likely to receive labor income
# Children
df['U18'] = np.where(df['PRTAGE'] < 18, 1, 0)
# Disability or Illness
df['DIS'] = np.where((df['PRDISFLG'] == 1) | (df['NILFREASON'] == 'Disabled/Ill'), 1, 0)
# Students
df['STU'] = np.where((df['PESCHENR'] == 1) | (df['NILFREASON'] == 'School'), 1, 0)
# Unpaid Caregivers
df['CARE'] = np.where(df['NILFREASON'] == 'Family', 1, 0)
# Unemployed
df['UNEM'] = np.where(df['LFS'] == 'Unemployed', 1, 0)
# Underutilized
df['UNDE'] = np.where((df['LFS'] == 'Unemployed') | (df['MRGNATT'] == 1) | 
                      (df['PTECON'] == 1) | 
                      (df['NILFREASON'] == 'Discouraged'), 1, 0)
# Elderly
df['ELD'] = np.where(df['PRTAGE'] > 64, 1, 0)
# Self-employed
df['SE'] = np.where(df['PEIO1COW'].isin([6,7]), 1, 0)

# Family level summary, repeated on every member's row
# total hours across the family
df['FMHRST'] = df.groupby(fam_grp)['PEHRACTT'].transform('sum')
# number of person records in the family
df['FMNUM'] = df.groupby(fam_grp)['PWSSWGT'].transform('count')
# average hours per family member
df['FMHRSPP'] = df['FMHRST'] / df['FMNUM']
# FM<group>: how many family members carry each flag
for group in groups:
    df['FM' + group] = df.groupby(fam_grp)[group].transform('sum')
# FMANY sums the per-group counts; zero means nobody carries any flag
df['FMANY'] = df[['FM' + group for group in groups]].sum(axis=1)
df['FMTOT'] = 1
df['FMNONE'] = np.where(df['FMANY'] == 0, 1, 0)

#### Create Table 1

In [6]:
# Take first row from each family (family-level columns are identical on
# every member's row)
fam_data = df.groupby(fam_grp).nth(0).reset_index()

# Family-level column -> row label; 'Label' is a heading row with no data
names = {'FMTOT': 'Total, all families', 
         'Label': 'Family contains one or more: ',
         'FMU18': '  Children', 
         'FMDIS': '  Persons with disabilities', 
         'FMSTU': '  Students',
         'FMCARE': '  Unpaid caregivers', 
         'FMUNEM': '  Unemployed', 
         'FMUNDE': '  Underutilized',
         'FMELD': '  Elderly', 
         'FMSE': '  Self-employed',
         'FMANY': '  Family contains any from above groups',
         'FMNONE': '  Family contains none from above groups'}

idx = list(names.values())

share_col = 'Share of families'
hours_col = 'Average weekly hours worked per family member'
col = pd.MultiIndex.from_product([['April to June 2020', 'April to June 2019'],
                                  [share_col, hours_col]])

# Every cell starts as an empty string, so the heading row needs no filling
tbl = pd.DataFrame('', idx, col)

for year in [2020, 2019]:
    year_fams = fam_data.query('HRYEAR4 == @year')
    # Each year pools three monthly files, hence the division by 3
    total_wgt = year_fams.PWFMWGT.sum() / 3
    date_col = f'April to June {year}'
    for group, label in names.items():
        if group == 'Label':
            continue
        # Families with at least one member in the group. Deliberately not
        # named `d`: that would overwrite the layout dictionary the
        # read-data cell uses and break re-running it.
        in_group = year_fams.loc[year_fams[group] > 0]
        group_wgt = in_group.PWFMWGT.sum() / 3
        hrs = np.average(in_group['FMHRSPP'], weights=in_group['PWFMWGT'])
        tbl.loc[label, (date_col, hours_col)] = f'{hrs:.1f}'
        tbl.loc[label, (date_col, share_col)] = f'{(group_wgt/total_wgt)*100:.1f}%'
print('Table 1. Actual weekly hours per family member')       
tbl

Table 1. Actual weekly hours per family member


Unnamed: 0_level_0,April to June 2020,April to June 2020,April to June 2019,April to June 2019
Unnamed: 0_level_1,Share of families,Average weekly hours worked per family member,Share of families,Average weekly hours worked per family member
"Total, all families",100.0%,16.0,100.0%,19.5
Family contains one or more:,,,,
Children,27.8%,12.6,28.2%,14.9
Persons with disabilities,21.4%,7.1,22.3%,8.7
Students,15.2%,14.4,15.7%,17.6
Unpaid caregivers,9.0%,9.1,9.0%,10.5
Unemployed,12.4%,7.8,3.8%,9.1
Underutilized,22.5%,10.3,9.5%,12.6
Elderly,29.5%,6.5,28.3%,8.2
Self-employed,10.4%,20.8,10.1%,26.6


#### Create Table 2

In [7]:
# Person-level column -> row label; 'Label' is a heading row with no data.
# FMTOT is 1 on every row, so its row counts everyone in the selected families.
names = {
    'FMTOT': 'Unemployed group and family',
    'Label': 'Family members include: ',
    'U18': '  Children',
    'DIS': '  Persons with disabilities',
    'STU': '  Students',
    'CARE': '  Unpaid caregivers',
    'ELD': '  Elderly',
}

idx = ['Unemployed group alone', *names.values()]

col = pd.MultiIndex.from_product(
    [['April to June 2020', 'April to June 2019'],
     ['Unemployed', 'Underutilized']])

tbl = pd.DataFrame('', idx, col)

# Family-level flag -> column heading it fills
category_labels = {'FMUNEM': 'Unemployed', 'FMUNDE': 'Underutilized'}

for year in [2020, 2019]:
    period = f'April to June {year}'
    in_year = df['HRYEAR4'] == year
    for category, heading in category_labels.items():
        cell = (period, heading)
        # Everyone living in a family with at least one flagged member
        members = df.loc[in_year & (df[category] > 0)]
        # Dropping the 'FM' prefix gives the person-level flag
        person_flag = category[2:]
        # Weighted count divided by 3, rounded to the nearest thousand
        own = members.loc[members[person_flag] == 1].PWSSWGT.sum() / 3
        tbl.loc['Unemployed group alone', cell] = f'{own.round(-3):,.0f}'
        for key, label in names.items():
            if key == 'Label':
                tbl.loc[label, cell] = ''
                continue
            level = members.loc[members[key] == 1].PWSSWGT.sum() / 3
            tbl.loc[label, cell] = f'{level.round(-3):,.0f}'

print('Table 2. Composition of unemployed and underutilized families')
tbl

Table 2. Composition of unemployed and underutilized families


Unnamed: 0_level_0,April to June 2020,April to June 2020,April to June 2019,April to June 2019
Unnamed: 0_level_1,Unemployed,Underutilized,Unemployed,Underutilized
Unemployed group alone,20536000.0,39340000.0,5861000.0,15274000.0
Unemployed group and family,49206000.0,87764000.0,15563000.0,37915000.0
Family members include:,,,,
Children,11335000.0,20748000.0,3806000.0,9096000.0
Persons with disabilities,3231000.0,6066000.0,1347000.0,3480000.0
Students,5863000.0,10275000.0,1943000.0,4877000.0
Unpaid caregivers,1147000.0,2291000.0,267000.0,798000.0
Elderly,3264000.0,6234000.0,951000.0,2951000.0
