### bd econ CPS extract

bd_CPS_2000-based_weights.ipynb

January 23, 2019

Contact: Brian Dew, @bd_econ

Requires: the `2000-2extract.txt` data dictionary and the unzipped 2000 Based Public Use Extracts from [Census](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract), with the extract files placed in a `pubuse2000_2002/` folder inside the CPS data folder.

-----

See [related GitHub issue](https://github.com/bdecon/econ_data/issues/82) for explanation.

In [1]:
# Import relevant libraries (python 3.7)
import os, re, struct
import pandas as pd
import numpy as np

# Input file names, given relative to the CPS data folder
rev_dd = '2000-2extract.txt'  # data dictionary (plain text)
rev_data = 'pubuse2000_2002/oct00pubuse_2000b.dat'  # one extract data file

# Work from the CPS data folder so the relative paths above resolve
os.chdir('/home/brian/Documents/CPS/data')

In [2]:
# Read data dictionary text file; the context manager closes the handle
with open(rev_dd, 'r', encoding='iso-8859-1') as dd_file:
    data_dict = dd_file.read()

# manually list out the IDs for series of interest 
var_names = ['HRMONTH', 'HRYEAR4', 'QSTNUM', 'OCCURNUM', 
             'NWCMPWGT', 'NWORWGT', 'NWSSWGT']

# regular expression matching series name and data dict pattern.
# Raw f-string: \s and \d must reach the regex engine as written, and
# in a non-raw literal they are invalid escape sequences (warning today,
# error in a future Python). The resulting pattern text is unchanged.
# Groups: 1 = variable name, 2 = field width, 3 = start, 4 = end.
p = rf'({"|".join(var_names)})\s+(\d+)\s+.*? \s+.*?(\d\d*).*?(\d\d*)'

# dictionary of variable name: [start, end, and length + 's']
# start is shifted by one so [start:end] works as a Python slice;
# the "<width>s" entry is the struct code for a fixed-width byte field
d = {s[0]: [int(s[2])-1, int(s[3]), f'{s[1]}s']
     for s in re.findall(p, data_dict)}

# lists of variable starts, ends, and lengths
start, end, width = zip(*d.values())

# create list of which characters to skip in each row: the gap between
# the previous field's end (0 for the first field) and this field's start
# NOTE(review): assumes the variables are matched in ascending column
# order; a negative gap would make the struct format invalid -- confirm
skip = ([f'{s - e}x' for s, e in zip(start, [0] + list(end[:-1]))])

# create format string by joining skip and variable segments
# (alternating "<gap>x" pad bytes and "<width>s" fields)
unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])

# struct can interpret row bytes with the format string
unpacker = struct.Struct(unpack_fmt).unpack_from

In [3]:
# Create feather file of revised weights and IDs
rev_data_path = '/home/brian/Documents/CPS/data/pubuse2000_2002/'

wgt = d['NWSSWGT']  # Location of sample weight variable (same for every file)

# Collect one dataframe per extract file and combine them once after the
# loop; appending to a growing dataframe copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.0+
frames = []
# sorted() makes the row order of the output reproducible, since
# os.listdir returns entries in arbitrary order
for file in sorted(os.listdir(rev_data_path)):
    # open file (read as binary) and read lines into "raw_data";
    # the context manager closes each file before the next one is opened
    with open(f'{rev_data_path}{file}', 'rb') as raw_file:
        raw_data = raw_file.readlines()

    # unpack and store data of interest if sample weight > 0
    data = [[*map(int, unpacker(row))] for row in raw_data
            if int(row[wgt[0]:wgt[1]]) > 0]

    # Pandas dataframe of the selected variables for this file
    df = pd.DataFrame(data, columns=list(d))
    frames.append(df)

# Fail with a clear message instead of an opaque error further down
if not frames:
    raise ValueError(f'No data files found in {rev_data_path}')

rev_df = pd.concat(frames)

# Shorter names for the date variables; store YEAR as a categorical
rev_df = (rev_df.rename({'HRYEAR4': 'YEAR', 'HRMONTH': 'MONTH'}, axis=1)
                .assign(YEAR = lambda x: pd.Categorical(x['YEAR'])))

# Rescale the integer weight fields
# NOTE(review): presumably the weights carry four implied decimal
# places -- confirm against the data dictionary
wgt_vars = ['NWCMPWGT', 'NWORWGT', 'NWSSWGT']
rev_df[wgt_vars] = rev_df[wgt_vars] / 10000

# Feather needs a default index, so drop the per-file row numbers on write
rev_df.reset_index(drop=True).to_feather('clean/cps_wgt_rev.ft')