### bd econ CPS extract

bd_CPS_revisions_reader.ipynb

January 24, 2019

Contact: Brian Dew, @bd_econ

-----

Reads revised Census CPS data (revised weights and revised union variables) and stores each set as a feather file for merging with the bd CPS.

**2000-based revised weights:**

Requires: `2000-2extract.txt` data dictionary and unzipped 2000 Based Public Use Extracts from [Census](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract). 

**2001-2002 revised union data:**

Requires: `2000-2extract.txt` data dictionary and unzipped 2000 Based Public Use Extracts from [Census](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract). 

**December 2007 revised weights:**

Requires: `dec07revwgts_dd.txt` data dictionary and unzipped `dec07revwgts.dat` from [Census CPS FTP](https://thedataweb.rm.census.gov/ftp/cps_ftp.html). 

-----

See [related GitHub issue](https://github.com/bdecon/econ_data/issues/82) for explanation.

In [1]:
# Import relevant libraries (python 3.7)
import os, re, struct
import pandas as pd
import numpy as np

# Work from the CPS data directory: the data dictionaries, raw extracts and
# `clean/` output paths used in this notebook are all relative to it.
# NOTE: machine-specific absolute path -- change it to your local data folder.
os.chdir('/home/brian/Documents/CPS/data')

In [2]:
# User-defined functions
def id_dtype(size):
    '''Map a fixed-width field size to a numpy integer dtype name.

    `size` is the field width (anything `int()` accepts, e.g. '2' or 2).
    Widths of 2 or less give 'int8', 3-4 give 'int16', 5-9 give 'int32',
    and anything wider gives 'intp'.
    '''
    width = int(size)
    # Walk up from the narrowest type; the first bound that fits wins
    if width <= 2:
        return 'int8'
    if width <= 4:
        return 'int16'
    if width <= 9:
        return 'int32'
    return 'intp'

def data_dict_reader(dd_file, var_names):
    '''Read data dictionary and return variable locations

    Parameters:
        dd_file: path to the data dictionary text file (decoded as
            iso-8859-1).
        var_names: variable names to look up; they are joined as-is into a
            regex alternation.

    Returns a dict keyed by variable name, in order of first match in the
    file. Each value is a list of
        [first number after the description minus one (zero-based start),
         second number (end),
         struct format built from the size field, e.g. '2s',
         dtype name returned by id_dtype for that size]
    '''
    # Context manager closes the handle as soon as the text has been read
    with open(dd_file, 'r', encoding='iso-8859-1') as dd:
        data_dict = dd.read()
    # Raw f-string: \s and \d must reach the regex engine verbatim rather
    # than being parsed as (invalid) string escape sequences
    p = rf'({"|".join(var_names)})\s+(\d+)\s+.*? \s+.*?(\d\d*).*?(\d\d*)'
    d = {s[0]: [int(s[2])-1, int(s[3]), f'{s[1]}s', id_dtype(s[1])]
         for s in re.findall(p, data_dict)}
    return d

def struct_unpacker(d):
    '''Build a struct unpack function for the fields described in `d`.

    `d` maps variable names to [start, end, struct format, dtype] lists.
    The fields are laid out in dict order, so each start is expected to be
    at or after the previous field's end (a negative gap would produce an
    invalid format string).

    Returns the `unpack_from` method of the compiled struct.
    '''
    starts, ends, formats, _ = zip(*d.values())
    # Gap before each field = its start minus the previous field's end
    # (the first field is measured from offset 0)
    previous_ends = (0,) + ends[:-1]
    pieces = []
    for begin, prev_end, fmt in zip(starts, previous_ends, formats):
        pieces.append(f'{begin - prev_end}x')  # pad bytes to skip
        pieces.append(fmt)                     # the field itself
    return struct.Struct(''.join(pieces)).unpack_from

def data_file_reader(file, unpacker, dtypes, wgt):
    '''Convert raw monthly file to dataframe

    Parameters:
        file: path to the raw fixed-width data file (read in binary mode,
            one record per line).
        unpacker: callable applied to each raw line, returning a tuple of
            field values (e.g. the result of struct_unpacker).
        dtypes: list of (name, dtype) pairs used to build a structured
            numpy array from the unpacked rows.
        wgt: name of the field used as a filter; only rows where it is
            greater than zero are kept.

    Returns a pandas DataFrame with one column per entry in `dtypes`.
    '''
    # Context manager guarantees the handle is closed, even if a row fails
    # to unpack part-way through the file
    with open(file, 'rb') as raw_data:
        data = [unpacker(row) for row in raw_data]
    np_data = np.array(data, dtype=dtypes)
    df = pd.DataFrame(np_data[np_data[wgt] > 0])
    return df

def df_adjuster(df, wgt_vars=None):
    '''Adjust dataframe to match with bd CPS

    Parameters:
        df: dataframe with 'HRYEAR4' and 'HRMONTH' columns.
        wgt_vars: optional list-like of column names to divide by 10000
            (presumably weights stored with four implied decimal places --
            confirm against the data dictionary).

    Returns a new dataframe with 'HRYEAR4'/'HRMONTH' renamed to
    'YEAR'/'MONTH', 'YEAR' converted to a categorical, any `wgt_vars`
    rescaled, and the index reset.
    '''
    rev_df = (df.rename({'HRYEAR4': 'YEAR', 'HRMONTH': 'MONTH'}, axis=1)
                .assign(YEAR = lambda x: pd.Categorical(x['YEAR'])))
    # Identity test: `!= None` is evaluated elementwise for array-like
    # arguments (ndarray, Index) and then fails when used as a truth value
    if wgt_vars is not None:
        rev_df[wgt_vars] = rev_df[wgt_vars] / 10000
    return rev_df.reset_index(drop=True)

#### Revised 2000-based weights

In [3]:
# Build the 2000-based revised-weight extract and save it in feather format
dd_file = '2000-2extract.txt'
rev_data_path = 'pubuse2000_2002/'

# Weight columns are listed on their own because they are rescaled below;
# the weight named in filter_wgt is also used to drop non-positive rows
wgt_vars = ['NWCMPWGT', 'NWORWGT', 'NWSSWGT']
filter_wgt = 'NWSSWGT'

# Date and identifier columns, followed by the weights
var_names = ['HRMONTH', 'HRYEAR4', 'QSTNUM', 'OCCURNUM'] + wgt_vars

# Field locations from the data dictionary drive both the struct unpacker
# and the numpy dtypes (the dtype is the last item of each location list)
d = data_dict_reader(dd_file, var_names)
unpacker = struct_unpacker(d)
dtypes = [(name, loc[-1]) for name, loc in d.items()]

# Every file in the extract directory is read as a monthly data file
file_names = os.listdir(rev_data_path)
files = [f'{rev_data_path}{name}' for name in file_names]

frames = [data_file_reader(path, unpacker, dtypes, filter_wgt)
          for path in files]

df = df_adjuster(pd.concat(frames), wgt_vars=wgt_vars)

df.to_feather('clean/cps_wgt_rev.ft')

#### Revised union data (2001-2002)

In [4]:
# Build the revised union data extract and save it in feather format
dd_file = '2000-2extract.txt'
rev_data_path = 'pubuse2000_2002/'

# NWSSWGT is read only so rows can be filtered on it; it is dropped
# again before the result is saved
filter_wgt = 'NWSSWGT'

var_names = ['HRMONTH', 'HRYEAR4', 'QSTNUM', 'OCCURNUM',
             'NEERNLAB', 'NEERNCOV', filter_wgt]

# Field locations from the data dictionary drive both the struct unpacker
# and the numpy dtypes (the dtype is the last item of each location list)
d = data_dict_reader(dd_file, var_names)
unpacker = struct_unpacker(d)
dtypes = [(name, loc[-1]) for name, loc in d.items()]

# Skip files whose name has '00' at character positions 3-4
# (presumably the year-2000 months -- confirm against the file naming)
file_names = [name for name in os.listdir(rev_data_path)
              if not name[3:5] == '00']
files = [f'{rev_data_path}{name}' for name in file_names]

frames = [data_file_reader(path, unpacker, dtypes, filter_wgt)
          for path in files]

df = df_adjuster(pd.concat(frames)).drop('NWSSWGT', axis=1)

df.to_feather('clean/cps_union_rev.ft')

#### Revised December 2007 weights

In [5]:
# Build the December 2007 revised-weight extract and save it as feather
dd_file = 'dec07revwgts_dd.txt'
file = 'dec07revwgts.dat'

var_names = ['QSTNUM', 'OCCURNUM', 'PWSSWGT', 'PWCMPWGT']
filter_wgt = 'PWSSWGT'

# Field locations from the data dictionary drive both the struct unpacker
# and the numpy dtypes (the dtype is the last item of each location list)
d = data_dict_reader(dd_file, var_names)
unpacker = struct_unpacker(d)
dtypes = [(name, loc[-1]) for name, loc in d.items()]

# Clean the raw file IN PLACE: every line containing a '.' byte is dropped
# and the file is rewritten with the remaining lines, then cut to length
with open(file, 'rb+') as f:
    new_f = [line for line in f.readlines() if b'.' not in line]
    f.seek(0)
    f.writelines(new_f)
    f.truncate()

df = data_file_reader(file, unpacker, dtypes, filter_wgt)

# Rescale both weights by 1/10000, then rename them to the NW* names
scaled = df[['PWSSWGT', 'PWCMPWGT']] / 10000
df[['PWSSWGT', 'PWCMPWGT']] = scaled
df = df.rename({'PWSSWGT': 'NWSSWGT', 'PWCMPWGT': 'NWCMPWGT'}, axis=1)

df.reset_index(drop=True).to_feather('clean/cps_dec07_rev.ft')