## Unemployment and COVID

Brian Dew, July 25, 2019

In [1]:
import pandas as pd
import numpy as np
import os
import re
import struct
import gzip
import shutil
from io import BytesIO
import requests

### Download Files

In [2]:
# Download monthly CPS files if not available in current directory
file_list = ['apr19pub.dat', 'may19pub.dat', 'jun19pub.dat', 
             'apr20pub.dat', 'may20pub.dat', 'jun20pub.dat']
for file in file_list:
    if file not in os.listdir():
        file_loc = f'https://www2.census.gov/programs-surveys/cps/datasets/20{file[3:5]}/basic/{file}.gz'
        print(f'Downloading: {file} from census.gov')
        r = requests.get(file_loc)
        with gzip.open(BytesIO(r.content), 'r') as f_in, open(file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
            
# Download two data dictionaries if not available in current directory
dd_list = {'2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt': 2020, 
             'January_2017_Record_Layout.txt': 2017}
for file, year in dd_list.items():
    if file not in os.listdir():
        file_loc = f'https://www2.census.gov/programs-surveys/cps/datasets/{year}/basic/{file}'
        print(f'Downloading: {file} from census.gov')
        r = requests.get(file_loc)
        with open(file, 'wb') as f:
            f.write(r.content)

### Read data

Search the data dictionary for `var_names` and pull those columns from microdata file

In [3]:
# manually list out the IDs for series of interest 
var_names = ['HRMONTH', 'HRYEAR4', 'PRTAGE', 'PRFAMNUM', 'PRDISFLG', 'QSTNUM', 
             'PEMLR', 'PEHRACTT', 'PEIO1COW', 'PEDWRSN', 'PRWNTJOB', 'PENLFACT',
             'PEDWWNTO', 'PRWKSTAT', 'PESCHENR', 'PWSSWGT', 'PWFMWGT', 'HWHHWGT']

dd_list = {19: 'January_2017_Record_Layout.txt',
           20: '2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt'}

unpackers = {}

for year, dd in dd_list.items():
    # read data dictionary text file 
    data_dict = open(dd, 'r', encoding='iso-8859-1').read()

    # regular expression matching series name and data dict pattern
    p = f'\n({"|".join(var_names)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'

    # dictionary of variable name: [start, end, and length + 's']
    d = {s[0]: [int(s[2])-1, int(s[3]), f'{s[1]}s']
         for s in re.findall(p, data_dict)}

    # lists of variable starts, ends, and lengths
    start, end, width = zip(*d.values())

    # create list of which characters to skip in each row
    skip = ([f'{s - e}x' for s, e in zip(start, [0] + list(end[:-1]))])

    # create format string by joining skip and variable segments
    unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])

    # struct can interpret row bytes with the format string
    unpackers[year] = struct.Struct(unpack_fmt).unpack_from    

Open the six monthly files as a pandas dataframe, keep only rows with a person weight greater than zero

In [4]:
df = pd.DataFrame()

for file in file_list:
    # open file (read as binary) and read lines into "raw_data"
    raw_data = open(file, 'rb').readlines()

    wgt = d['PWSSWGT']  # Location of sample weight variable

    # unpack and store data of interest if sample weight > 0
    data = [[*map(int, unpackers[int(file[3:5])](row))] for row in raw_data
            if int(row[wgt[0]:wgt[1]]) > 0]
    
    df = df.append(pd.DataFrame(data, columns=d.keys()))

### Create Variables

In [5]:
# Uniquely identify households and families using these variables
hh_grp = ['HRYEAR4', 'HRMONTH', 'QSTNUM']
fam_grp = ['HRYEAR4', 'HRMONTH', 'QSTNUM', 'PRFAMNUM']

groups = ['U18', 'DIS', 'STU', 'CARE', 'UNEM', 'UNDE', 'ELD', 'SE']

# Reason for not being in labor force
nilfreason = lambda x: pd.Categorical(
    np.where((x['PRWNTJOB']==2) & 
             ((x['PEMLR']==6) | (x['PENLFACT'].isin([1, 2]))), 
             'Disabled/Ill',
    np.where((x['PRWNTJOB']==2) & (x['PENLFACT']==4), 'Family',
    np.where((x['PRWNTJOB']==2) & ((x['PEMLR']==5) | (x['PENLFACT']==5)), 
             'Retired',
    np.where((x['PRWNTJOB']==2) & (x['PENLFACT']==3), 'School',
    np.where(x['PEDWWNTO']==1, 'Discouraged',
    np.where(x['PEMLR'].isin([5, 6, 7]), 'Other', np.nan)))))))
# Part time for economic reasons
ptecon = lambda x: pd.Categorical(
    np.where(x['PRWKSTAT'].isin([3, 6]), 1, 
    np.where(x['PRWKSTAT'].between(2, 10), 0, np.nan)))
# Marginally attached to labor force
mrgnatt = lambda x: pd.Categorical(
    np.where(x['PEDWRSN'].between(1, 11), 1, 0))
lfs = lambda x: pd.Categorical(
    np.where(x['PEMLR'].isin([1, 2]), 'Employed',
    np.where(x['PEMLR'].isin([3, 4]), 'Unemployed',
    np.where(x['PEMLR'].isin([5, 6, 7]), 'NILF', np.nan))))
# Class of worker on main job
cow1 = lambda x: pd.Categorical(
    np.where(x['PEIO1COW'] == 1, 'Federal Government',
    np.where(x['PEIO1COW'] == 2, 'State Government',
    np.where(x['PEIO1COW'] == 3, 'Local Government',
    np.where(x['PEIO1COW'].isin([4, 5]), 'Private',
    np.where(x['PEIO1COW'] == 6, 'Self-employed Incorporated',
    np.where(x['PEIO1COW'] == 7, 'Self-employed Unincorporated',
    np.where(x['PEIO1COW'] == 8, 'Without Pay', np.nan))))))))

df = df.assign(NILFREASON = nilfreason, PTECON = ptecon, 
               MRGNATT = mrgnatt, LFS = lfs, COW1 = cow1)

df['PWFMWGT'] = df['PWFMWGT'] / 10000.0
df['PWSSWGT'] = df['PWSSWGT'] / 10000.0
df['HWHHWGT'] = df['HWHHWGT'] / 10000.0

df['PEHRACTT'].replace(-1, 0, inplace=True)
df['FMHRST'] = df.groupby(fam_grp)['PEHRACTT'].transform('sum')
df['FMNUM'] = df.groupby(fam_grp)['PWSSWGT'].transform('count')
df['FMHRSPP'] = df['FMHRST'] / df['FMNUM']
df['HHHRST'] = df.groupby(hh_grp)['PEHRACTT'].transform('sum')
df['HHNUM'] = df.groupby(hh_grp)['PWSSWGT'].transform('count')
df['HHHRSPP'] = df['HHHRST'] / df['HHNUM']
df['U18'] = np.where(df['PRTAGE'] < 18, 1, 0)
df['DIS'] = np.where((df['PRDISFLG'] == 1) | (df['NILFREASON'] == 'Disabled/Ill'), 1, 0)
df['STU'] = np.where((df['PESCHENR'] == 1) | (df['NILFREASON'] == 'School'), 1, 0)
df['CARE'] = np.where(df['NILFREASON'] == 'Family', 1, 0)
df['UNEM'] = np.where(df['LFS'] == 'Unemployed', 1, 0)
df['UNDE'] = np.where((df['LFS'] == 'Unemployed') | (df['MRGNATT'] == 1) | 
                      (df['PTECON'] == 1) | 
                      (df['NILFREASON'] == 'Discouraged'), 1, 0)
df['ELD'] = np.where(df['PRTAGE'] > 64, 1, 0)
df['SE'] = np.where((df['COW1'] == 'Self-employed Incorporated') | 
                    (df['COW1'] == 'Self-employed Unincorporated'), 1, 0)

# Family level summary
for group in groups:
    df['FM' + group] = df.groupby(fam_grp)[group].transform('sum')
df['ANY'] = df[['FM' + group for group in groups]].sum(axis=1)

# Household level summary
for group in groups:
    df['HH' + group] = df.groupby(hh_grp)[group].transform('sum')
df['ANY'] = df[['HH' + group for group in groups]].sum(axis=1)

In [6]:
# Take first row from each family
fam_data = df.groupby(fam_grp).nth(0).reset_index()

names = {'U18': 'Children', 'DIS': 'Persons with Disabilities', 'STU': 'Students',
         'CARE': 'Unpaid caregivers', 'UNEM': 'Unemployed', 'UNDE': 'Underutilized',
         'ELD': 'Elderly', 'SE': 'Self-Employed'}

In [7]:
for year in [2019, 2020]:
    print(year)
    data = fam_data.query('HRYEAR4 == @year')
    print('Total, All Families: ')
    num = data.PWFMWGT.sum() / 3
    print(' Number of Families: ', f'{num:,.0f}')
    val = np.average(data['FMHRSPP'], weights=data['PWFMWGT'])
    print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('Family contains one or more:')
    for group in groups:
        fmgroup = 'FM' + group
        d = data.loc[data[fmgroup] > 0]
        n = d.PWFMWGT.sum () / 3
        print(names[group], ': ')
        print(' Number of Families: ', f'{n:,.0f}')
        print(' Share of Families: ', f'{(n/num)*100:.1f}%')
        val = np.average(d['FMHRSPP'], weights=d['PWFMWGT'])
        print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('Family contains any of above groups: ')
    d = data.loc[data['ANY'] > 0]
    n = d.PWFMWGT.sum () / 3
    print(' Number of Families: ', f'{n:,.0f}')
    print(' Share of Families: ', f'{(n/num)*100:.1f}%')
    val = np.average(d['FMHRSPP'], weights=d['PWFMWGT'])
    print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('Family contains none of above groups: ')
    d = data.loc[data['ANY'] == 0]
    n = d.PWFMWGT.sum () / 3
    print(' Number of Families: ', f'{n:,.0f}')
    print(' Share of Families: ', f'{(n/num)*100:.1f}%')
    val = np.average(d['FMHRSPP'], weights=d['PWFMWGT'])
    print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('')

2019
Total, All Families: 
 Number of Families:  139,422,336
 Total hours worked per person in reference week:  19.5
Family contains one or more:
Children : 
 Number of Families:  39,323,672
 Share of Families:  28.2%
 Total hours worked per person in reference week:  14.9
Persons with Disabilities : 
 Number of Families:  31,026,519
 Share of Families:  22.3%
 Total hours worked per person in reference week:  8.7
Students : 
 Number of Families:  21,893,624
 Share of Families:  15.7%
 Total hours worked per person in reference week:  17.6
Unpaid caregivers : 
 Number of Families:  12,519,184
 Share of Families:  9.0%
 Total hours worked per person in reference week:  10.5
Unemployed : 
 Number of Families:  5,281,584
 Share of Families:  3.8%
 Total hours worked per person in reference week:  9.1
Underutilized : 
 Number of Families:  13,198,567
 Share of Families:  9.5%
 Total hours worked per person in reference week:  12.6
Elderly : 
 Number of Families:  39,500,204
 Share of Famil

In [11]:
# Take first row from each family
hh_data = df.groupby(hh_grp).nth(0).reset_index()

for year in [2019, 2020]:
    print(year)
    data = hh_data.query('HRYEAR4 == @year')
    print('Total, All Households: ')
    num = data.HWHHWGT.sum() / 3
    print(' Number of Households: ', f'{num:,.0f}')
    val = np.average(data['HHHRSPP'], weights=data['HWHHWGT'])
    print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('Household contains one or more:')
    for group in groups:
        hhgroup = 'HH' + group
        d = data.loc[data[hhgroup] > 0]
        n = d.HWHHWGT.sum () / 3
        print(names[group], ': ')
        print(' Number of Households: ', f'{n:,.0f}')
        print(' Share of Households: ', f'{(n/num)*100:.1f}%')
        val = np.average(d['HHHRSPP'], weights=d['HWHHWGT'])
        print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('Family contains any of above groups: ')
    d = data.loc[data['ANY'] > 0]
    n = d.HWHHWGT.sum () / 3
    print(' Number of Households: ', f'{n:,.0f}')
    print(' Share of Households: ', f'{(n/num)*100:.1f}%')
    val = np.average(d['HHHRSPP'], weights=d['HWHHWGT'])
    print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('Family contains none of above groups: ')
    d = data.loc[data['ANY'] == 0]
    n = d.HWHHWGT.sum () / 3
    print(' Number of Households: ', f'{n:,.0f}')
    print(' Share of Households: ', f'{(n/num)*100:.1f}%')
    val = np.average(d['HHHRSPP'], weights=d['HWHHWGT'])
    print(' Total hours worked per person in reference week: ', f'{val:.1f}')
    print('')

2019
Total, All Households: 
 Number of Households:  129,569,014
 Total hours worked per person in reference week:  19.4
Household contains one or more:
Children : 
 Number of Households:  38,197,586
 Share of Households:  29.5%
 Total hours worked per person in reference week:  15.6
Persons with Disabilities : 
 Number of Households:  30,631,958
 Share of Households:  23.6%
 Total hours worked per person in reference week:  9.0
Students : 
 Number of Households:  21,425,706
 Share of Households:  16.5%
 Total hours worked per person in reference week:  18.1
Unpaid caregivers : 
 Number of Households:  12,372,922
 Share of Households:  9.5%
 Total hours worked per person in reference week:  11.5
Unemployed : 
 Number of Households:  5,216,888
 Share of Households:  4.0%
 Total hours worked per person in reference week:  10.1
Underutilized : 
 Number of Households:  12,930,018
 Share of Households:  10.0%
 Total hours worked per person in reference week:  13.2
Elderly : 
 Number of Hous

In [19]:
print('April to June 2020 Averages:')
df20 = df.query('HRYEAR4 == 2020')
unemp = df20.loc[df20['UNEM'] == 1].PWSSWGT.sum() / 3
print('Unemployed population: ', f'{unemp:,.0f}')
unempfam = df20.loc[(df20['FMUNEM'] > 0), 'PWSSWGT'].sum() / 3
print('Unemployed plus their families: ', f'{unempfam:,.0f}')
unempfamonly = unempfam - unemp
print('Unemployed family only: ', f'{unempfamonly:,.0f}')
unempkids = df20.loc[(df20['FMUNEM'] > 0) & (df20['U18'] == 1), 'PWSSWGT'].sum() / 3
print('Children living with unemployed: ', f'{unempkids:,.0f}')
unde = df20.loc[df20['UNDE'] == 1].PWSSWGT.sum() / 3
print('Underutilized population: ', f'{unde:,.0f}')
undefam = df20.loc[(df20['FMUNDE'] > 0), 'PWSSWGT'].sum() / 3
print('Underutilized plus their families: ', f'{undefam:,.0f}')
undefamonly = undefam - unde
print('Unemployed family only: ', f'{undefamonly:,.0f}')
undekids = df20.loc[(df20['FMUNDE'] > 0) & (df20['U18'] == 1), 'PWSSWGT'].sum() / 3
print('Children living with underutilized: ', f'{unde:,.0f}')
print('')
print('April to June 2019 Averages:')
df19 = df.query('HRYEAR4 == 2019')
unemp = df19.loc[df19['UNEM'] == 1].PWSSWGT.sum() / 3
print('Unemployed population: ', f'{unemp:,.0f}')
unempfam = df19.loc[(df19['FMUNEM'] > 0), 'PWSSWGT'].sum() / 3
print('Unemployed plus their families: ', f'{unempfam:,.0f}')
unempfamonly = unempfam - unemp
print('Unemployed family only: ', f'{unempfamonly:,.0f}')
unempkids = df19.loc[(df19['FMUNEM'] > 0) & (df19['U18'] == 1), 'PWSSWGT'].sum() / 3
print('Children living with unemployed: ', f'{unempkids:,.0f}')
unde = df19.loc[df19['UNDE'] == 1].PWSSWGT.sum() / 3
print('Underutilized population: ', f'{unde:,.0f}')
undefam = df19.loc[(df19['FMUNDE'] > 0), 'PWSSWGT'].sum() / 3
print('Underutilized plus their families: ', f'{undefam:,.0f}')
undefamonly = undefam - unde
print('Unemployed family only: ', f'{undefamonly:,.0f}')
undekids = df19.loc[(df19['FMUNDE'] > 0) & (df19['U18'] == 1), 'PWSSWGT'].sum() / 3
print('Children living with underutilized: ', f'{unde:,.0f}')

April to June 2020 Averages:
Unemployed population:  20,535,697
Unemployed plus their families:  49,206,196
Unemployed family only:  28,670,498
Children living with unemployed:  11,334,735
Underutilized population:  39,339,577
Underutilized plus their families:  87,764,203
Unemployed family only:  48,424,625
Children living with underutilized:  39,339,577

April to June 2019 Averages:
Unemployed population:  5,861,173
Unemployed plus their families:  15,562,923
Unemployed family only:  9,701,750
Children living with unemployed:  3,805,562
Underutilized population:  15,274,384
Underutilized plus their families:  37,914,958
Unemployed family only:  22,640,574
Children living with underutilized:  15,274,384
