### Unique CPS person ID 

February 9, 2019

Brian Dew, @bd_econ

In [1]:
# Standard-library, third-party, and project-local imports for the notebook
import os, re, struct
import pandas as pd
# Disables pandas' chained-assignment (SettingWithCopy) warning for the whole
# session, so later cells that assign columns on filtered frames run silently.
pd.options.mode.chained_assignment = None
import numpy as np
# StatesMap is passed to Series.map below to turn GESTFIPS into STATE
from bd_CPS_details import StatesMap

# Every relative path used below (monthly .dat file, record layout text file,
# clean/*.ft feather files) is resolved against this working directory.
# NOTE(review): hard-coded absolute path -- must be adjusted per machine.
os.chdir('/home/brian/Documents/CPS/data')

In [2]:
# Details for matching new file to previous data
# The first five characters of the file name give the month and year (e.g. dec15)
file = 'dec15pub.dat'
curr_mo = pd.to_datetime(file[:5], format='%b%y')

# Offsets, in months before the current file, of the earlier monthly files
# that are searched for the same household
mo_diffs = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15]
poss_mos = [curr_mo - pd.DateOffset(months=mo_diff) for mo_diff in mo_diffs]

# Calendar years covered by the candidate months. Sorted because the
# iteration order of a set is not guaranteed, and this order determines the
# order in which the annual files are listed and read.
years = sorted({pm.year for pm in poss_mos})
bd_CPS_files = [f'cps{year}.ft' for year in years]

# Candidate months as YYMM integers, in the same order as mo_diffs
# (most recent month first)
yymms = [int(pm.strftime('%y%m')) for pm in poss_mos]

# Create mapping of MIS to potential match months
# Reversed copy: oldest candidate month first, most recent last
yymms_rev = yymms[::-1]

# For each month-in-sample value of the current file, the YYMM months to
# search, in search order (oldest first)
search_list = {
    8: yymms_rev[:4] + yymms_rev[-3:],
    7: yymms_rev[1:5] + yymms_rev[-2:],
    6: yymms_rev[2:6] + yymms_rev[-1:],
    5: yymms_rev[3:7],
    4: yymms_rev[-3:],
    3: yymms_rev[-2:],
    2: yymms_rev[-1:]
}

In [3]:
# Background to read current monthly file
# Read the data dictionary text file; the context manager closes the file
# handle as soon as its text has been loaded
dd_file = 'January_2017_Record_Layout.txt'
with open(dd_file, 'r', encoding='iso-8859-1') as dd_fh:
    data_dict = dd_fh.read()

# Manually list out the names of the variables of interest
var_names = ['HRMONTH', 'HRYEAR4', 'HRMIS', 'QSTNUM', 'OCCURNUM', 
             'HRHHID', 'HRHHID2', 'GESTFIPS', 'HWHHWGT']

# Regular expression for one data dictionary entry. It captures four groups:
# the variable name, a number that the code below uses as the field size,
# and two further numbers used as the field's first and last column.
# Written as a raw f-string so that \s and \d reach the regex engine as
# written instead of being treated as (invalid) string escapes; \n and \t
# are then regex escapes that match the same newline and tab characters.
p = rf'\n({"|".join(var_names)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'

# Choose an integer dtype name from the character width of a variable
def id_dtype(size):
    """Return the integer dtype name used for a field ``size`` characters wide.

    ``size`` may be an int or a numeric string. Widths above 9 give 'intp',
    widths 5-9 give 'int32', widths 3-4 give 'int16', and every smaller
    width gives 'int8'.
    """
    n_chars = int(size)
    # Thresholds are checked from widest to narrowest; first hit wins
    for floor, name in ((9, 'intp'), (4, 'int32'), (2, 'int16')):
        if n_chars > floor:
            return name
    return 'int8'

# Layout table: variable name -> [zero-based start offset, end offset,
# struct field code (size + 's'), integer dtype name]
d = {
    var: [int(first_col) - 1, int(last_col), f'{size}s', id_dtype(size)]
    for var, size, first_col, last_col in re.findall(p, data_dict)
}

# (name, dtype) pairs, in layout-table order, for the structured array
dtypes = [(name, spec[3]) for name, spec in d.items()]

# Byte range of the household weight field, used to filter rows while reading
ws, we = d['HWHHWGT'][0:2]

# Byte range of the OCCURNUM field, used to filter rows while reading
hs, he = d['OCCURNUM'][0:2]

# Column-wise view of the layout table: starts, ends, field codes, dtypes
start, end, width, dtype = zip(*d.values())

# Number of characters to skip before each field: the distance from the end
# of the previous field (0 for the first field) to the start of this one
prev_ends = (0,) + end[:-1]
skip = [f'{s - e}x' for s, e in zip(start, prev_ends)]

# Format string alternating a skip segment and a field segment
unpack_fmt = ''.join(gap + field for gap, field in zip(skip, width))

# struct can interpret row bytes with the format string
unpacker = struct.Struct(unpack_fmt).unpack_from

In [4]:
# Read new monthly file
# Keep only rows whose household weight field compares greater than b'0'
# and whose OCCURNUM field equals b'1'. Note that the weight test is a
# lexicographic bytes comparison, not a numeric one.
# The context manager closes the file handle once all rows are read.
with open(file, 'rb') as raw_file:
    data = [unpacker(row) for row in raw_file
            if (row[ws:we].strip() > b'0')
            and (row[hs:he].strip() == b'1')]


# Assign new date variable
def date(x):
    """Return a YYMM integer Series built from HRYEAR4 and HRMONTH."""
    # HRYEAR4 is widened to int32 before multiplying by 100
    return ((x.HRYEAR4.astype(np.int32) * 100
             + x.HRMONTH.astype(np.int8)) % 10000)


# Convert to dataframe using the dtypes specified for each field, add the
# DATE and STATE columns, and drop the columns no longer needed
df = (pd.DataFrame(np.array(data, dtype=dtypes))
        .assign(DATE = date)
        .rename({'HRHHID': 'HHID', 'HRHHID2': 'HHID2'}, axis=1)
        # Need to map state to state id codes
        .assign(STATE = lambda x: x['GESTFIPS'].map(StatesMap))
        # Drop GESTFIPS and OCCURNUM
        .drop(['GESTFIPS', 'OCCURNUM'], axis=1))
print(len(df))

52187


In [5]:
# Read potential match data
keep_cols = ['YEAR', 'MONTH', 'MIS', 'HHID', 'HHID2', 'QSTNUM', 
             'OCCURNUM', 'STATE']


def date(x):
    """Return a YYMM integer Series built from the YEAR and MONTH columns."""
    yyyymm = x.YEAR.astype(np.int32) * 100 + x.MONTH.astype(np.int8)
    return yyyymm % 10000


# One frame per annual file: selected columns only, rows with OCCURNUM == 1,
# plus the DATE column
annual_frames = []
for year in years:
    annual = pd.read_feather(f'clean/cps{year}.ft', columns=keep_cols)
    annual = annual.query('OCCURNUM == 1').assign(DATE = date)
    annual_frames.append(annual)

mdf = pd.concat(annual_frames)

# Keep only the candidate months; YEAR and MONTH are replaced by DATE
in_window = mdf['DATE'].isin(yymms)
mdf = mdf[in_window].drop(['MONTH', 'YEAR'], axis=1)

In [6]:
# Merge data
# d maps each QSTNUM in the current file to the ID assigned to it
d = {}


def _make_id(yymm, qstnum):
    """Combine a YYMM date and a QSTNUM into one integer ID."""
    return yymm * 100000 + qstnum


# MIS = 1 households get current id
dfmis1 = (df.loc[df['HRMIS'] == 1, ['QSTNUM', 'DATE']]
            .assign(ID = lambda x: _make_id(x['DATE'], x['QSTNUM'])))
mis1id = dfmis1.set_index('QSTNUM')['ID'].to_dict()
d.update(mis1id)

# NOTE: df is narrowed to MIS > 1 here, so re-running this cell without
# first rebuilding df would find no MIS = 1 rows above.
df = df.loc[df['HRMIS'] > 1]
# dft holds the rows that have not been matched yet
dft = df

# Loop over MIS and potential matches to find matched id
for mis in [2, 3, 4, 5, 6, 7, 8]:
    for pm in search_list[mis]:
        # Unmatched rows with this MIS, joined to the earlier month pm on
        # the household identifiers
        results = (dft.loc[dft['HRMIS'] == mis]
                      .merge(mdf[mdf['DATE'] == pm], 
                             on=['HHID', 'HHID2', 'STATE']))

        # Merge suffixes: _x is the current file, _y is the earlier month.
        # The ID is built from the earlier month's DATE and QSTNUM.
        results['ID'] = _make_id(results['DATE_y'], results['QSTNUM_y'])

        matched_id = results.set_index('QSTNUM_x')['ID'].to_dict()
        d.update(matched_id)

        # Matched rows are removed so later months cannot overwrite their ID
        dft = dft.loc[~dft['QSTNUM'].isin(list(matched_id))]

    # Households with no match get current id, same as MIS=1
    new_hh = (dft[dft['HRMIS'] == mis]
              .assign(ID = lambda x: _make_id(x['DATE'], x['QSTNUM'])))
    new_hh_d = new_hh.set_index('QSTNUM')['ID'].to_dict()
    d.update(new_hh_d)
    print(len(d))
    

13034
19681
26323
32484
38904
45520
52187
