# CPS - Contributions to labor force changes
##### February 14, 2018 -- @bd_econ

In [1]:
import sys # Check which version of python is being used
print(f'python {sys.version_info[0]}.{sys.version_info[1]}')
import pandas as pd    # Pandas to organize and make calcs
print(f'pandas {pd.__version__}')
import numpy as np     # Numpy for calculations
print(f'numpy {np.__version__}')
import os, re, wquantiles

python 3.6
pandas 0.22.0
numpy 1.14.0


The source data are the Census Bureau's monthly CPS public-use files, available [here](http://thedataweb.rm.census.gov/ftp/cps_ftp.html). Download and unzip all of the monthly files into a folder called `data`. We also need the data dictionary, found [here](http://thedataweb.rm.census.gov/pub/cps/basic/201701-/January_2017_Record_Layout.txt), which describes the variables and their ranges of possible values; the code below expects it in the same `data` folder.

### Get variables from data dictionary

In [2]:
# Read the dictionary file to get the column names and locations
dd_txt = 'data/January_2017_Record_Layout.txt'
# The context manager closes the file handle as soon as the text is read
with open(dd_txt, 'r', encoding='iso-8859-1') as dd_file:
    dd = dd_file.read()
# Regular expression for info of interest based on pattern p.
# Written as a raw string so \w, \s and \d reach the regex engine verbatim
# instead of being treated as (invalid) string escapes; \n and \t still
# match a newline and a tab. The five capture groups become the five
# columns named below.
p = re.compile(r'\n(\w+)\s+(\d+)\s+(.*?)\t+.*?(\d\d*).*?(\d\d+)')
# Naming the columns in the constructor keeps the frame well-formed
# (five empty columns) even if the pattern matches nothing
var_key = pd.DataFrame(p.findall(dd),
                       columns=['Var', 'Length', 'Name', 'Start', 'End'])

### Read data for 2017

In [3]:
# Series of interest
# PRERNWA, PRFTLF and PREMPNOT are listed because later cells filter on
# them; without them those cells fail with a KeyError on a fresh kernel.
s = ['HRMONTH', 'PRTAGE', 'PESEX', 'PWORWGT', 'PEMLR', 'PENLFRET', 'PENLFACT',
     'PRDISC', 'PWCMPWGT', 'HRMIS', 'PRERNWA', 'PRFTLF', 'PREMPNOT']
# Rows of the data dictionary for the selected variables
s_key = var_key[var_key['Var'].isin(s)]
# One (from, to) pair per selected variable: Start is shifted down by one,
# End is used as-is (read_fwf treats each pair as a half-open interval)
colspecs = [(int(start) - 1, int(end))
            for start, end in zip(s_key['Start'], s_key['End'])]

In [7]:
# Read every monthly file whose name ends in '17pub.dat' and combine them.
# sorted() gives a reproducible row order (os.listdir order is arbitrary).
files_2017 = sorted(f for f in os.listdir('data/') if f.endswith('17pub.dat'))

# Collect the monthly frames first and concatenate once; growing a frame
# with DataFrame.append inside the loop recopies all earlier rows on every
# iteration, and the method is no longer available in current pandas.
monthly_frames = [
    pd.read_fwf(f'data/{file}', colspecs=colspecs, header=None, dtype='int',
                memory_map=True, names=s_key['Var'].values,
                usecols=s_key['Var'].values)
    for file in files_2017
]

# pd.concat raises on an empty list, so fall back to the empty frame the
# original loop produced when no file matches
df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

In [6]:
# Rows with PRTAGE in 25..54 and PEMLR of 5 or more
# NOTE(review): PEMLR >= 5 presumably selects the not-in-labor-force codes --
# confirm against the record layout
age_25_to_54 = df['PRTAGE'].isin(range(25,55))
pemlr_5_or_more = df['PEMLR'] >= 5
# Sum of PWORWGT over those rows, scaled by a fixed divisor
# NOTE(review): the divisor looks like 12 months x implied weight decimals x
# thousands -- confirm
df[pemlr_5_or_more & age_25_to_54]['PWORWGT'].sum() / 120000000

23042.930484658333

In [None]:
# Second benchmark, read in full year of data
# Per-file pieces are collected in lists and joined once at the end; the
# empty float array seeds each list so the combined arrays are float64,
# exactly as repeated np.append onto np.array([]) produced.
val_parts = [np.array([])]   # PRERNWA values from each file
wgt_parts = [np.array([])]   # matching PWORWGT weights

for file in [f for f in os.listdir('data/') if f.endswith('17pub.dat')]:
    # A cell-local name is used so the shared `df` is not replaced by a
    # filtered single-month frame
    month_df = pd.read_fwf(f'data/{file}', colspecs=colspecs, header=None, dtype='int',
        memory_map=True, names=s_key['Var'].values, usecols=s_key['Var'].values)

    # Keep rows with PRERNWA above -1, PRTAGE of at least 16, PRFTLF equal
    # to 1, and HRMONTH in the first three months (the Q1 in the label)
    month_df = month_df[(month_df['PRERNWA'] > -1) &
                        (month_df['PRTAGE'] >= 16) &
                        (month_df['PRFTLF'] == 1) &
                        (month_df['HRMONTH'].isin([1, 2, 3]))]
    val_parts.append(month_df['PRERNWA'].values)
    wgt_parts.append(month_df['PWORWGT'].values)

# One concatenation instead of recopying the accumulated arrays per file
val = np.concatenate(val_parts)
wgt = np.concatenate(wgt_parts)

print('2017 Q1 Usual Weekly Earnings: ${0:,.2f}'.format(
    # Weighted median using wquantiles package; the result is divided by
    # 100.0 before it is formatted as dollars
    wquantiles.median(val, wgt) / 100.0))

Important: Instead of reading the data dictionary into a pandas dataframe, you can just make a list of tuples with start and end values and pass the list to the fixed-width-format reader as `colspecs` to read a few columns manually.

------

## Read text file

In [10]:
# Location of fixed-width-format text file
filename = 'data/jan17pub.dat'

# Series of interest
# PREMPNOT, PRERNWA and PRFTLF are listed because the benchmarking cells
# filter on them; without them those cells fail with a KeyError on a
# fresh kernel.
s = ['HRMONTH', 'PRTAGE', 'PESEX', 'PWORWGT', 'PEMLR', 'PENLFRET', 'PENLFACT',
     'PRDISC', 'PWCMPWGT', 'HRMIS', 'PREMPNOT', 'PRERNWA', 'PRFTLF']
# Rows of the data dictionary for the selected variables
s_key = var_key[var_key['Var'].isin(s)]
# One (from, to) pair per selected variable: Start is shifted down by one,
# End is used as-is (read_fwf treats each pair as a half-open interval)
colspecs = [(int(start) - 1, int(end))
            for start, end in zip(s_key['Start'], s_key['End'])]

In [11]:
# Load the fixed-width file: only the character ranges in colspecs are
# parsed, and each one is labelled with the matching name from s_key
df = pd.read_fwf(
    filename,
    header=None,
    names=s_key['Var'].values,
    colspecs=colspecs,
)

In [32]:
# Rows with PRTAGE in 25..54 and PEMLR of 5 or more
# NOTE(review): PEMLR >= 5 presumably selects the not-in-labor-force codes --
# confirm against the record layout
age_25_to_54 = df['PRTAGE'].isin(range(25,55))
pemlr_5_or_more = df['PEMLR'] >= 5
# Sum of PWORWGT over those rows, scaled by a fixed divisor
# NOTE(review): the divisor looks like implied weight decimals x thousands
# for a single month -- confirm
df[pemlr_5_or_more & age_25_to_54]['PWORWGT'].sum() / 10000000

23748.176961500001

## Benchmarking

In [4]:
# Prime age epop for women, Jan 2017 = 71.3 (LNU02300062)
# Start by identifying the criteria: PESEX equal to 2 and PRTAGE in 25..54
prime_age = df['PRTAGE'].isin(range(25,55))
dft = df[(df['PESEX'] == 2) & prime_age]
# Indicator array: 1 where PREMPNOT equals 1, 0 everywhere else
empl = (dft['PREMPNOT'] == 1).values.astype(int)
# PWCMPWGT-weighted mean of the indicator, expressed as a percentage
epop = 100 * np.average(empl.astype(float), weights=dft['PWCMPWGT'])
print(f'January 2017: {round(epop, 1)}')

January 2017: 71.3


In [5]:
%%time
# Second benchmark, read in full year of data
# Per-file pieces are collected in lists and joined once at the end; the
# empty float array seeds each list so the combined arrays are float64,
# exactly as repeated np.append onto np.array([]) produced.
val_parts = [np.array([])]   # PRERNWA values from each file
wgt_parts = [np.array([])]   # matching PWORWGT weights

# Only files ending in '17pub.dat' are read: the result is labelled 2017 Q1
# and the HRMONTH filter below cannot tell years apart, so a bare 'pub.dat'
# suffix would mix in other years if they are present in data/
for file in [f for f in os.listdir('data/') if f.endswith('17pub.dat')]:
    # A cell-local name is used so the shared `df` is not replaced by a
    # filtered single-month frame
    month_df = pd.read_fwf(f'data/{file}', colspecs=colspecs, header=None, dtype='int',
        memory_map=True, names=s_key['Var'].values, usecols=s_key['Var'].values)

    # Keep rows with PRERNWA above -1, PRTAGE of at least 16, PRFTLF equal
    # to 1, and HRMONTH in the first three months (the Q1 in the label)
    month_df = month_df[(month_df['PRERNWA'] > -1) &
                        (month_df['PRTAGE'] >= 16) &
                        (month_df['PRFTLF'] == 1) &
                        (month_df['HRMONTH'].isin([1, 2, 3]))]
    val_parts.append(month_df['PRERNWA'].values)
    wgt_parts.append(month_df['PWORWGT'].values)

# One concatenation instead of recopying the accumulated arrays per file
val = np.concatenate(val_parts)
wgt = np.concatenate(wgt_parts)

print('2017 Q1 Usual Weekly Earnings: ${0:,.2f}'.format(
    # Weighted median using wquantiles package; the result is divided by
    # 100.0 before it is formatted as dollars
    wquantiles.median(val, wgt) / 100.0))

2017 Q1 Usual Weekly Earnings: $865.00
Wall time: 18.7 s


In [6]:
%%time
# Names of all files in data/ that end in 'pub.dat'
files = [name for name in os.listdir('data/') if name.endswith('pub.dat')]

# Parse each file into its own integer-typed frame, restricted to the
# selected variables
mo_dfs = []
for file in files:
    mo_dfs.append(
        pd.read_fwf(f'data/{file}', colspecs=colspecs, header=None,
                    dtype='int', memory_map=True,
                    names=s_key['Var'].values,
                    usecols=s_key['Var'].values))

# Stack the per-file frames into one frame
df = pd.concat(mo_dfs)

Wall time: 23.7 s
