# Pickled_CPS

Convert the basic monthly CPS datafiles from 1998 to present into annual pickle files.


Updated: February 18, 2018 -- @bd_econ

In [1]:
# Preliminaries and libraries
import sys # Check which version of python is being used
print(f'python {sys.version_info[0]}.{sys.version_info[1]}')
import pandas as pd    # Pandas to organize and make calcs
print(f'pandas {pd.__version__}')
import numpy as np     # Numpy for calculations
print(f'numpy {np.__version__}')
import os
import re
import struct
from calendar import month_abbr

# Location of data
os.chdir('E:/08_Other/Archive/')

python 3.6
pandas 0.22.0
numpy 1.14.0


### Data Dictionaries

BD Note to self: This section needs some serious work! The data dictionaries are small and fast to read, so I just used brute force to read them and manually adjusted issues between years. The problem with that method is that extending it back to 1994 will be harder, and it makes the code practically unreadable. I should go with some type of dictionary instead.

In [7]:
# Convert data dictionaries into lists of variables and their locations.
#
# The result is the dict `d`, keyed by year (int) or by the name of a data
# dictionary file (str). Each value is a list of tuples
# (variable name, location, width); struct_constr later reads element 1 as the
# 1-based start position of the field and element 2 as its length.

# Series of interest
s = ['PWORWGT', 'PWCMPWGT', 'HRHHID', 'HURESPLI', 'HRLONGLK', 'HRHHID2',
     'PRERNWA', 'PRERNHLY', 'PRTAGE', 'PEAGE', 'PTERNWA', 'PTERNHLY',]

# These series can be stored as categorical later on
s2 = ['HRMONTH', 'PESEX', 'PEMLR', 'PENLFRET', 'PENLFACT', 'PRDISC', 'GESTFIPS',
      'HRMIS', 'PRCOW1', 'PRFTLF', 'PREMPNOT', 'PRCIVLF', 'PEJHRSN', 'PEMJOT',
      'PEEDUCA', 'PRWKSTAT', 'PRDTOCC1', 'GTMETSTA', 'GEMETSTA', 'PEDWWNTO']

s = s + s2
d = {}

# 2013 onward: record layout files where a run of tabs follows the description.
# Raw strings keep the regex escapes (\w, \s, \d) out of Python's own string
# escape processing; \n and \t are then interpreted by the regex engine and
# still match a newline and a tab respectively.
p = re.compile(r'\n(\w+)\s+(\d+)\s+(.*?)\t+.*?(\d\d*).*?(\d\d+)')
for y in [2013, 2014, 2015, 2017]:
    # Read the data dictionary file to get the column names and locations
    dd_txt = f'data/January_{y}_Record_Layout.txt'
    with open(dd_txt, 'r', encoding='iso-8859-1') as layout_file:
        dd = layout_file.read()
    # Keep (name, 4th captured group, 2nd captured group) for series in s
    d[y] = [(i[0], int(i[3]), int(i[1])) for i in p.findall(dd) if i[0] in s]

# Older dictionaries: same pattern, but any whitespace (not only tabs) may
# follow the description.
p = re.compile(r'\n(\w+)\s+(\d+)\s+(.*?)\s+.*?(\d\d*).*?(\d\d+)')
for v in ['augnov05', 'jan07', 'jan09', 'jan10', 'may12']:
    with open(f'data/{v}dd.txt', 'r', encoding='iso-8859-1') as layout_file:
        dd = layout_file.read()
    d[v] = [(i[0], int(i[3]), int(i[1])) for i in p.findall(dd) if i[0] in s]
    # Manual touch up of data dictionaries: overwrite, by list position, the
    # entries that the regex does not capture correctly
    d[v][5] = ('HRHHID2', 71, 5)
    d[v][8] = ('PRTAGE', 122, 2)
    d[v][21] = ('PRDTOCC1', 476, 2)

# May 2004 to July 2005 dictionary
with open('data/may04dd.txt', 'r', encoding='iso-8859-1') as layout_file:
    dd = layout_file.read()
d['may04'] = [(i[0], int(i[3]), int(i[1])) for i in p.findall(dd) if i[0] in s]
d['may04'][7] = ('PRTAGE', 122, 2)
d['may04'][20] = ('PRDTOCC1', 476, 2)

# 2003 to May 2004 dictionary
with open('data/jan03dd.txt', 'r', encoding='iso-8859-1') as layout_file:
    dd = layout_file.read()
d['jan03'] = [(i[0], int(i[3]), int(i[1])) for i in p.findall(dd) if i[0] in s]
d['jan03'][6] = ('GTMETSTA', 105, 1)
d['jan03'][7] = ('PRTAGE', 122, 2)
d['jan03'][20] = ('PRDTOCC1', 476, 2)

# 1998 to 2002 dictionary: different file format, where each entry of interest
# is matched as "D NAME <1-2 digit number> <number>"; the tuple keeps
# (name, 3rd captured group, 2nd captured group)
p = re.compile(r'D (\w+)\s+(\d{1,2})\s+(\d+)\s+')
with open('data/jan98dd.asc', 'r', encoding='iso-8859-1') as layout_file:
    dd = layout_file.read()
d['jan98'] = [(i[0], int(i[2]), int(i[1])) for i in p.findall(dd) if i[0] in s]
d['jan98'][21] = ('PRERNHLY', 520, 4)
d['jan98'][22] = ('PRERNWA', 527, 8)
d['jan98'][6] = ('GTMETSTA', 105, 1)

# Manually note years where data dictionary does not change.
# 2004, 2005 and 2012 get no entry here: monthly_to_annual routes those years
# to special_years, which takes two dictionaries and a split month.
d[2016] = d[2015]
d[2018] = d[2017]
d[2010] = d['jan10']
d[2011] = d['jan10']
d[2009] = d['jan09']
d[2008] = d['jan07']
d[2007] = d['jan07']
d[2006] = d['augnov05']
d[2003] = d['jan03']
d[2002] = d['jan98']
d[2001] = d['jan98']
d[2000] = d['jan98']
d[1999] = d['jan98']
d[1998] = d['jan98']

### Tools for manual use
Read a data dictionary, convert .cps extensions to .dat, or unzip files in a directory

In [3]:
# Tools for manual use, as needed

# Read a data dictionary in the notebook
#print(open(f'data/jan10dd.txt', 'r', encoding='iso-8859-1').read())

# Unzip files
#from zipfile import ZipFile
#for file in [f for f in os.listdir('data/') if f.endswith('pub.zip')]:
#    with ZipFile(f'data/{file}', 'r') as zip_ref:
#        zip_ref.extractall('data/')

# Convert .cps file extension into .dat (early data files end in .cps)
#for file in [f for f in os.listdir('data/') if f.endswith('cps')]:
#    os.rename(f'data/{file}', f'data/{file[:-4]}.dat')

### Functions for reading the data

In [8]:
# Set of functions for parsing raw data

# Use struct to read files faster 
def struct_constr(fieldspecs):
    """Build an unpacker that pulls only the requested fixed-width fields.

    Each field specification is read by index: element 1 is the 1-based
    start position and element 2 is the field width. Bytes between the end
    of one field and the start of the next become pad bytes ("x") in the
    struct format, and each field becomes a byte string ("s") of its width.

    Returns the ``unpack_from`` method of the compiled ``struct.Struct``.
    """
    pieces = []
    cursor = 0  # Number of bytes the format covers so far
    for spec in fieldspecs:
        offset, width = spec[1] - 1, spec[2]
        gap = offset - cursor
        if gap > 0:
            pieces.append(f'{gap}x')  # Skip the bytes that are not wanted
        pieces.append(f'{width}s')
        cursor = offset + width
    return struct.Struct(''.join(pieces)).unpack_from

# Convert valid lines to list
def fwf_to_list(filelist, unpacker):
    """Parse fixed-width files under ``data/`` into a list of integer tuples.

    Every line of every file is encoded to bytes, split by ``unpacker`` and
    each extracted field is converted with ``int``. Only records whose last
    field is greater than zero are kept (weightless rows are dropped).
    """
    kept = []
    for name in filelist:
        with open(f'data/{name}', 'r', encoding='utf-8') as handle:
            records = (tuple(int(field) for field in unpacker(text.encode()))
                       for text in handle)
            # Consume the generator while the file is still open
            kept.extend(record for record in records if record[-1] > 0)
    return kept

# Convert list of lists to pandas df
def list_to_df(row_list, fieldspecs, year):
    """Turn parsed rows into a DataFrame with downcast signed-integer columns.

    Column names come from the first element of each field specification.
    ``year`` is accepted for interface compatibility but is not used here.
    """
    names = [spec[0] for spec in fieldspecs]
    frame = pd.DataFrame(row_list, columns=names)
    # Shrink each column to the smallest signed integer type that fits
    return frame.apply(pd.to_numeric, downcast='signed')

# This is the source of the problem with 2004 and 2012
def special_years(year, fs1, fs2, sm, path):
    """Handle cases where dictionary is split in middle of year.

    Takes two sets of field specifications and the split month. Months
    1 .. sm-1 are read with ``fs1`` and months sm .. 12 with ``fs2``; the two
    resulting dataframes are stacked and pickled to ``{path}cps_{year}.pkl``.

    Parameters:
        year: four-digit year; its last two digits select the monthly files
            (e.g. ``jan04pub.dat``), which fwf_to_list opens under ``data/``.
        fs1:  field specifications for the months before the split month.
        fs2:  field specifications for the split month onward.
        sm:   first month (1-12) that uses ``fs2``.
        path: output location prefix for the pickle file.
    """
    yy = f'{year}'[2:]
    frames = []
    for months, fieldspecs in ((range(1, sm), fs1), (range(sm, 13), fs2)):
        files = [f'{month_abbr[m].lower()}{yy}pub.dat' for m in months]
        row_list = fwf_to_list(files, struct_constr(fieldspecs))
        frames.append(list_to_df(row_list, fieldspecs, year))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # As with append's default, each half keeps its own index values.
    pd.concat(frames).to_pickle(f'{path}cps_{year}.pkl')

# Manages the other functions
def monthly_to_annual(year, path):
    """Read monthly files and store as one annual file.

    Years whose data dictionary changes part-way through (2004, 2005, 2012)
    are delegated to special_years with the two dictionaries and the split
    month. Every other year is read with the single layout ``d[year]`` and
    pickled to ``{path}cps_{year}.pkl``.
    """
    # (year, layout key before the split, layout key after it, split month)
    split_layouts = (
        (2012, 2011, 'may12', 5),
        (2005, 'may04', 'augnov05', 8),
        (2004, 'jan03', 'may04', 5),
    )
    for split_year, first_key, second_key, split_month in split_layouts:
        if year == split_year:
            special_years(split_year, d[first_key], d[second_key],
                          split_month, path)
            return

    # Regular year: every monthly file for the year shares one layout
    suffix = f'{str(year)[-2:]}pub.dat'
    monthly_files = [name for name in os.listdir('data/')
                     if name.endswith(suffix)]
    parsed_rows = fwf_to_list(monthly_files, struct_constr(d[year]))
    annual = list_to_df(parsed_rows, d[year], year)
    annual.to_pickle(f'{path}cps_{year}.pkl')

### Applying the function to selected years

In [11]:
# Build every annual file in one pass, or run a single year by hand (below)
path = 'C:/Working/econ_data/micro/data/'

years = range(1998, 2019)
for year in years:
    monthly_to_annual(year, path)

# Manual alternative for a single year:
#monthly_to_annual(2000, path)

In [10]:
# Reload the 2000 annual file written to `path` and sum the per-column
# memory usage, counting the index as well. The bare expression on the last
# line is what the notebook cell displays.
df = pd.read_pickle(f'{path}cps_2000.pkl')
df.memory_usage(index=True).sum()

49123748