### CPS ASEC replicate Census Median HH Income Estimates

Brian Dew, brian.w.dew@gmail.com

December 28, 2019

----

Try to replicate the median household income statistics [published](https://www.census.gov/library/publications/2018/demo/p60-263.html) by Census, using a binned- and weighted-median.

The number I want to get (at least very close) is $61,372.


Also want to clean up the code a bit.

In [1]:
import os, re, struct
import pandas as pd
import numpy as np

os.chdir('/home/brian/Documents/ASEC/data/')

In [2]:
# read data dictionary text file 
pubuse_file = 'asec2018_pubuse.dat'
dd_file = '08ASEC2018_Data_Dict_Full.txt'
data_dict = open(dd_file, 'r', encoding='iso-8859-1').read()

In [3]:
# Retrieve column info from dictionary
variables = ['HRECORD', 'HSUP_WGT', 'HTOTVAL']
p = re.compile(f'D ({"|".join(variables)})\s+(\d{{1,2}})\s+(\d+)\s+')
cols = {name: (int(start) - 1, int(start) - 1 + int(length)) 
        for name, length, start in re.findall(p, data_dict)}

In [4]:
# Read raw fwf file
df = (pd.read_fwf(pubuse_file, 
                  colspecs=list(cols.values()), 
                  header=None, 
                  names=cols.keys())
        .query('HRECORD == 1 and HSUP_WGT > 0'))

In [5]:
def binned_weighted_percentile(group, percentile=0.5):
    """Return BLS-styled binned and weighted percentile"""
    weight = 'HSUP_WGT'
    wage_var = 'HTOTVAL'
    bin_size = 2500
    bins = list(np.arange(0, 250000, bin_size))
    # Cut wage series according to bins of bin_size
    bin_cut = lambda x: pd.cut(x[wage_var], bins, include_lowest=True)
    
    # Calculate cumulative sum for weight variable
    cum_sum = lambda x: x[weight].cumsum()
    
    # Sort wages then apply bin_cut and cum_sum
    df = (group.sort_values(wage_var)
               .assign(WAGE_BIN = bin_cut, CS = cum_sum))
    
    # Find the weight at the percentile of interest
    pct_wgt = df[weight].sum() * percentile

    # Find wage bin for person nearest to weighted percentile
    pct_bin = df.iloc[df['CS'].searchsorted(pct_wgt)].WAGE_BIN
    
    # Weight at bottom and top of bin
    wgt_btm, wgt_top = (df.loc[df.WAGE_BIN == pct_bin, 'CS']
                          .iloc[[0, -1]].values)
    
    # Find where in the bin the percentile is and return that value
    pct_value = ((((pct_wgt - wgt_btm) / 
                   (wgt_top - wgt_btm)) * bin_size) + pct_bin.left)
    return pct_value

binned_weighted_percentile(df)

61136.840550989225