[Rendered Version of this notebook](http://nbviewer.jupyter.org/github/andersonfrailey/Notebook-Uploads/blob/master/CPS%20Comparisons.ipynb)

In [1]:
from taxcalc import *
from taxcalc.utils import *
from bokeh.io import show, output_notebook
from bokeh.charts import Bar, Scatter
from bokeh.layouts import column
from bokeh.plotting import figure
from collections import OrderedDict
import copy
from notebookfunctions import distribution, index_list, percentile
output_notebook()
import json

### Missing Variables

In [2]:
cps = pd.read_csv('../taxdata/cps_data/cps_ben_full.csv')
usable = open('taxcalc/records_variables.json')
usable_vars = json.load(usable)
usable.close()
missing = 0
for item in usable_vars['read'].keys():
    if item not in cps.columns:
        print '{}: {}'.format(item, usable_vars['read'][item]['desc'])
        missing += 1
print '\nTotal Variables Missing: {}'.format(missing)

p23250: Sch D: Net long-term capital gains/losses
p25470: Sch E: Royalty depletion and/or rental depreciation
e09800: Unreported payroll taxes from Form 4137 or 8919
e02000: Sch E rental, royalty, S-corp, etc, income/loss
e62900: Alternative Minimum Tax foreign tax credit from Form 6251
p08000: Other tax credits (but not including Sch R credit)
e58990: Investment income elected amount from Form 4952
e00700: Taxable refunds of state and local income taxes
e03290: Health savings account deduction from Form 8889
e07240: Retirement savings contributions credit from Form 8880
e27200: Sch E: Farm rent net income or loss
e01200: Other net gain/loss from Form 4797
e03500: Alimony paid
e07260: Residential energy credit from Form 5695
p22250: Sch D: Net short-term capital gains/losses
e03220: Educator expenses
e07400: General business credit from Form 3800
f6251: 1 if Form 6251 (AMT) attached to return; otherwise 0
e03230: Tuition and fees from Form 8917
e03400: Penalty on early withdrawal of sa

In [3]:
# Data from IRS-SOI Tax Stats. Used for comparison
soi_stats = pd.read_csv('soi_stats.csv', index_col=0)  # Equivalent to tax-calc diagnostic table
soi_income = pd.read_csv('soi_income_stats.csv')  # Distribution of income items
soi_deductions = pd.read_csv('soi_deductions.csv', index_col=0)  # Itemized deductions
soi_deductions['index'] = soi_deductions.index

In [4]:
# Create calculator from the PUF
calc_puf = Calculator(records=Records(), policy=Policy())
calc_puf.advance_to_year(2015)

You loaded data for 2009.
Tax-Calculator startup automatically extrapolated your data to 2013.


In [5]:
# Calculator the CPS
wt = pd.read_csv('../Dropbox/cps_weights.csv')
# wt.drop('SEQUENCE', inplace=True, axis=1)
# wt = wt.loc[:,:'WT2026']
wt *= 100
recs_cps = Records(data=cps,
                   weights=wt,
                   adjust_ratios=None,
                   start_year=2015)
pol_cps = Policy(start_year=2015, num_years=11)
calc_cps = Calculator(records=recs_cps, policy=pol_cps)
calc_cps.advance_to_year(2015)

In [6]:
calc_puf.calc_all()
calc_cps.calc_all()

### CPS Distribution Table

In [7]:
create_distribution_table(calc_cps.records, groupby='weighted_deciles', result_type='weighted_avg')

Unnamed: 0,expanded_income,s006,c00100,num_returns_StandardDed,standard,num_returns_ItemDed,c04470,c04600,c04800,taxbc,c62100,num_returns_AMT,c09600,c05800,c07100,othertaxes,refund,iitax,payrolltax,combined
0,-182.0,16318429.0,-542.0,9212955.0,6107.0,14925.0,7.0,4916.0,26.0,1.0,-547.0,0.0,0.0,1.0,0.0,0.0,90.0,-90.0,160.0,70.0
1,8802.0,16320544.0,4791.0,11439177.0,7366.0,177336.0,112.0,5630.0,115.0,8.0,4742.0,14199.0,0.0,8.0,2.0,0.0,718.0,-711.0,626.0,-85.0
2,15277.0,16319424.0,8267.0,11894865.0,7536.0,957115.0,706.0,6004.0,1283.0,121.0,7915.0,34552.0,1.0,122.0,22.0,0.0,983.0,-884.0,1094.0,211.0
3,21865.0,16319273.0,14491.0,12587216.0,7423.0,2379352.0,1558.0,6657.0,3968.0,403.0,13596.0,31097.0,2.0,405.0,68.0,0.0,1174.0,-837.0,1969.0,1132.0
4,29317.0,16319617.0,21952.0,11735601.0,7127.0,4080354.0,2705.0,7199.0,8369.0,953.0,20325.0,31232.0,3.0,956.0,135.0,0.0,1025.0,-205.0,3025.0,2820.0
5,38837.0,16318693.0,30962.0,10446955.0,6627.0,5685388.0,4276.0,7626.0,14676.0,1744.0,28430.0,24728.0,4.0,1748.0,227.0,0.0,600.0,921.0,4277.0,5198.0
6,51862.0,16320896.0,44180.0,8784466.0,5739.0,7465511.0,6662.0,8015.0,24717.0,3089.0,40224.0,29255.0,3.0,3092.0,348.0,0.0,164.0,2579.0,6029.0,8608.0
7,71353.0,16318688.0,64672.0,6529178.0,4325.0,9784045.0,10238.0,8792.0,41638.0,5814.0,58780.0,25330.0,4.0,5818.0,495.0,0.0,24.0,5299.0,8843.0,14142.0
8,103024.0,16320295.0,94946.0,3773678.0,2560.0,12544498.0,15159.0,9903.0,67451.0,10092.0,86680.0,39119.0,5.0,10097.0,595.0,0.0,4.0,9498.0,13090.0,22588.0
9,245340.0,16319911.0,231860.0,1511055.0,1087.0,14808326.0,24993.0,9765.0,196078.0,44674.0,219890.0,2628898.0,618.0,45292.0,122.0,374.0,0.0,45544.0,21897.0,67442.0


### PUF Distribution Table

In [8]:
create_distribution_table(calc_puf.records, groupby='weighted_deciles', result_type='weighted_avg')

Unnamed: 0,expanded_income,s006,c00100,num_returns_StandardDed,standard,num_returns_ItemDed,c04470,c04600,c04800,taxbc,c62100,num_returns_AMT,c09600,c05800,c07100,othertaxes,refund,iitax,payrolltax,combined
0,-6953.0,16441745.0,-7255.0,11288964.0,6060.0,9916.0,22.0,4171.0,52.0,5.0,-7636.0,0.0,0.0,5.0,0.0,7.0,144.0,-133.0,349.0,216.0
1,8726.0,16442575.0,6124.0,13702321.0,7176.0,46911.0,25.0,4949.0,295.0,26.0,6103.0,12231.0,0.0,26.0,0.0,4.0,790.0,-760.0,844.0,84.0
2,15492.0,16442473.0,10161.0,13277515.0,7736.0,283102.0,186.0,6523.0,1539.0,151.0,10032.0,27238.0,1.0,152.0,20.0,7.0,1645.0,-1507.0,1438.0,-69.0
3,22631.0,16442367.0,15600.0,14440049.0,7933.0,705376.0,574.0,6900.0,4187.0,442.0,15239.0,31321.0,2.0,444.0,86.0,13.0,1463.0,-1092.0,2089.0,997.0
4,31366.0,16444016.0,23159.0,14356040.0,8060.0,1539134.0,1322.0,7252.0,9158.0,1058.0,22337.0,19623.0,2.0,1060.0,232.0,17.0,1044.0,-199.0,2967.0,2768.0
5,42263.0,16442576.0,33797.0,13357837.0,7800.0,2941897.0,2747.0,7407.0,17433.0,2134.0,32170.0,17946.0,3.0,2137.0,378.0,27.0,510.0,1277.0,4085.0,5362.0
6,56489.0,16442166.0,48182.0,11438896.0,7153.0,4969071.0,5255.0,7781.0,28571.0,3774.0,45059.0,15594.0,2.0,3776.0,512.0,43.0,150.0,3157.0,5464.0,8620.0
7,77132.0,16442982.0,70261.0,9094430.0,6146.0,7341053.0,8443.0,8656.0,47139.0,6849.0,65546.0,47432.0,5.0,6854.0,680.0,62.0,78.0,6157.0,7964.0,14121.0
8,114373.0,16442021.0,106430.0,5905115.0,4330.0,10534367.0,14229.0,10190.0,77752.0,12288.0,98863.0,194696.0,20.0,12308.0,856.0,101.0,96.0,11457.0,12912.0,24369.0
9,352442.0,16443547.0,334905.0,2023885.0,1504.0,14407177.0,37721.0,9612.0,286464.0,71832.0,319414.0,4710693.0,2449.0,74280.0,1725.0,2410.0,50.0,74915.0,22559.0,97474.0


### Diagnostic Table Comparison

In [9]:
cps_diag = create_diagnostic_table(calc_cps)

In [10]:
puf_diag = create_diagnostic_table(calc_puf)

In [11]:
diag_data = pd.DataFrame()
diag_data['SOI'] = soi_stats['Value']
diag_data['CPS'] = cps_diag[2015]
diag_data['PUF'] = puf_diag[2015]
diag_data['% Change'] = ((cps_diag[2015] / puf_diag[2015]) - 1) * 100

In [12]:
diag_data

Unnamed: 0,SOI,CPS,PUF,% Change
Returns (#m),148.6,163.2,164.4,-0.7
AGI ($b),9771.0,8414.2,10546.0,-20.2
Itemizers (#m),44.0,57.9,42.8,35.3
Itemized Deduction ($b),1206.7,1083.9,1159.7,-6.5
Standard Deduction Filers (#m),117.4,87.9,108.9,-19.3
Standard Deduction ($b),876.2,769.5,951.0,-19.1
Personal Exemption ($b),1121.6,1110.2,1137.0,-2.4
Taxable Income ($b),6997.9,5847.7,7770.9,-24.7
Regular Tax ($b),,1091.8,1620.7,-32.6
AMT Income ($b),,7834.1,9983.0,-21.5


In [13]:
(diag_data['% Change']['Itemizers (#m)'] + diag_data['% Change']['Itemized Deduction ($b)'] +
 diag_data['% Change']['Standard Deduction Filers (#m)'] + diag_data['% Change']['Standard Deduction ($b)'])

-9.5359939025916702

In [14]:
# Total value of missing itemized deductions
in_billions = 1e-9
state = (calc_puf.records.e18400 * calc_puf.records.s006).sum()
print 'State and Local: {} ($b)'.format(round(state * in_billions, 2))
int_paid = (calc_puf.records.e19200 * calc_puf.records.s006).sum()
print 'Interest Paid: {} ($b)'.format(round(int_paid * in_billions, 2))
net_cas = (calc_puf.records.g20500 * calc_puf.records.s006).sum()
print 'Net Casualty or Theft Loss: {} ($b)'.format(round(net_cas * in_billions, 2))
print '-----------------'
print 'Total: {} ($b)'.format((round((state + int_paid + net_cas) * in_billions, 2)))

State and Local: 335.95 ($b)
Interest Paid: 342.21 ($b)
Net Casualty or Theft Loss: 4.78 ($b)
-----------------
Total: 682.94 ($b)


### Income Levels

In [15]:
inc_dict = OrderedDict()
inc_dict['CPS'] = [] 
inc_dict['PUF'] = []
inc_dict['Diff'] = []
inc_dict['Pct Diff'] = []
inc_list = ['WAS', 'Taxable Interest', 'Ordinary Dividends', 'Qualified Dividends', 'Business Income']
was_cps = (calc_cps.records.e00200 * calc_cps.records.s006).sum()
inc_dict['CPS'].append(was_cps)
was_puf = (calc_puf.records.e00200 * calc_puf.records.s006).sum()
inc_dict['PUF'].append(was_puf)
was_diff = (was_cps - was_puf)
inc_dict['Diff'].append(was_diff)
inc_dict['Pct Diff'].append((was_diff / was_puf) * 100)

int_cps = (calc_cps.records.e00300 * calc_cps.records.s006).sum()
inc_dict['CPS'].append(int_cps)
int_puf = (calc_puf.records.e00300 * calc_puf.records.s006).sum()
inc_dict['PUF'].append(int_puf)
int_diff = (int_cps - int_puf)
inc_dict['Diff'].append(int_diff)
inc_dict['Pct Diff'].append((int_diff / int_puf) * 100)

odiv_cps = (calc_cps.records.e00600 * calc_cps.records.s006).sum()
inc_dict['CPS'].append(odiv_cps)
odiv_puf = (calc_puf.records.e00600 * calc_puf.records.s006).sum()
inc_dict['PUF'].append(odiv_puf)
odiv_diff = (odiv_cps - odiv_puf)
inc_dict['Diff'].append(odiv_diff)
inc_dict['Pct Diff'].append((odiv_diff / odiv_puf) * 100)

qdiv_cps = (calc_cps.records.e00650 * calc_cps.records.s006).sum()
inc_dict['CPS'].append(qdiv_cps)
qdiv_puf = (calc_puf.records.e00650 * calc_puf.records.s006).sum()
inc_dict['PUF'].append(qdiv_puf)
qdiv_diff = (qdiv_cps - qdiv_puf)
inc_dict['Diff'].append(qdiv_diff)
inc_dict['Pct Diff'].append((qdiv_diff / qdiv_puf) * 100)

biz_cps = (calc_cps.records.e00900 * calc_cps.records.s006).sum()
inc_dict['CPS'].append(biz_cps)
biz_puf = (calc_puf.records.e00900 * calc_puf.records.s006).sum()
inc_dict['PUF'].append(biz_puf)
biz_diff = (biz_cps - biz_puf)
inc_dict['Diff'].append(biz_diff)
inc_dict['Pct Diff'].append((biz_diff / biz_puf) * 100)

inc_df = pd.DataFrame.from_dict(inc_dict)
inc_df.index = inc_list
inc_df

Unnamed: 0,CPS,PUF,Diff,Pct Diff
WAS,6648838107939.5,7146623659349.3,-497785551409.8,-7.0
Taxable Interest,91960641825.4,98016622270.4,-6055980445.1,-6.2
Ordinary Dividends,245424347487.0,287599002040.1,-42174654553.1,-14.7
Qualified Dividends,185442636961.2,210639585110.9,-25196948149.7,-12.0
Business Income,309555316495.2,324061778926.4,-14506462431.3,-4.5


### Distribution of Income Variables

In [16]:
# Generate data for distribution plots
cps_dist = pd.DataFrame()
puf_dist = pd.DataFrame()
cps_was = distribution(calc_cps.records.e00200, calc_cps.records.s006, calc_cps.records.c00100)
puf_was = distribution(calc_puf.records.e00200, calc_puf.records.s006, calc_puf.records.c00100)
cps_int = distribution(calc_cps.records.e00300, calc_cps.records.s006, calc_cps.records.c00100)
puf_int = distribution(calc_puf.records.e00300, calc_puf.records.s006, calc_puf.records.c00100)
cps_odiv = distribution(calc_cps.records.e00600, calc_cps.records.s006, calc_cps.records.c00100)
puf_odiv = distribution(calc_puf.records.e00600, calc_puf.records.s006, calc_puf.records.c00100)
cps_qdiv = distribution(calc_cps.records.e00650, calc_cps.records.s006, calc_cps.records.c00100)
puf_qdiv = distribution(calc_puf.records.e00650, calc_puf.records.s006, calc_puf.records.c00100)
cps_biz = distribution(calc_cps.records.e00900, calc_cps.records.s006, calc_cps.records.c00100)
puf_biz = distribution(calc_puf.records.e00900, calc_puf.records.s006, calc_puf.records.c00100)
cps_dist['WAS'] = cps_was[1]
puf_dist['WAS'] = puf_was[1]
cps_dist['INT'] = cps_int[1]
puf_dist['INT'] = puf_int[1]
cps_dist['ODIV'] = cps_odiv[1]
puf_dist['ODIV'] = puf_odiv[1]
cps_dist['QDIV'] = cps_qdiv[1]
puf_dist['QDIV'] = puf_qdiv[1]
cps_dist['BIZ'] = cps_biz[1]
puf_dist['BIZ'] = puf_biz[1]
cps_dist['AGI Bin'] = index_list()
puf_dist['AGI Bin'] = index_list()
cps_dist['label'] = 'CPS'
puf_dist['label'] = 'PUF'
# Create scatter plot objects
items_tups = [('WAS', 'WAS'), ('INT', 'Interest Income'), ('ODIV', 'Ordinary Dividends'),
              ('QDIV', 'Qualified Dividends'), ('BIZ', 'Business Income')]
soi_dist = pd.DataFrame()
for item in items_tups:
    soi_dist[item[0]] = (soi_income[item[0]] / soi_income[item[0]].sum()) * 100
soi_dist['AGI Bin'] = index_list()
soi_dist['label'] = 'SOI'
scatter_data = pd.concat([cps_dist, puf_dist, soi_dist])
scatter_list = list()  # list for scatter plot objects
for item in items_tups:
    title = 'Percent of Total {} by AGI Bin'.format(item[1])
    scatter = Scatter(scatter_data, x='AGI Bin', y=item[0], color='label', ylabel='Percent',
                      title=title, tooltips=[('PCT', '@{}'.format(item[0]))])
    scatter_list.append(scatter)

In [17]:
show(column(scatter_list))

In [18]:
cps_tot = pd.DataFrame()
puf_tot = pd.DataFrame()
cps_tot['WAS'] = cps_was[0]
puf_tot['WAS'] = puf_was[0]
cps_tot['INT'] = cps_int[0]
puf_tot['INT'] = puf_int[0]
cps_tot['ODIV'] = cps_odiv[0]
puf_tot['ODIV'] = puf_odiv[0]
cps_tot['QDIV'] = cps_qdiv[0]
puf_tot['QDIV'] = puf_qdiv[0]
cps_tot['BIZ'] = cps_biz[0]
puf_tot['BIZ'] = puf_biz[0]
cps_tot['AGI Bin'] = index_list()
puf_tot['AGI Bin'] = index_list()
cps_tot['label'] = 'CPS'
puf_tot['label'] = 'PUF'
soi_income['AGI Bin'] = index_list()
soi_income['label'] = 'SOI'
total_data = pd.concat([cps_tot, puf_tot, soi_income])
# Create scatter plot objects
items_tups = [('WAS', 'WAS'), ('INT', 'Interest Income'), ('ODIV', 'Ordinary Dividends'),
              ('QDIV', 'Qualified Dividends'), ('BIZ', 'Business Income')]
total_list = list()  # list for scatter plot objects
for item in items_tups:
    title = 'Total {} by AGI Bin'.format(item[1])
    scatter = Scatter(total_data, x='AGI Bin', y=item[0], color='label', ylabel='Total',
                      title=title, tooltips=[('Total', '@{}'.format(item[0]))])
    total_list.append(scatter)

In [19]:
show(column(total_list))

### Itemized Deduction Amounts

In [20]:
deductions_cps = {'Medical Expenses': (calc_cps.records.e17500[calc_cps.records.c04470 > 0] *
                                       calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'State and Local Taxes':  (calc_cps.records.e18400[calc_cps.records.c04470 > 0] *
                                             calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'Real Estate Taxes':  (calc_cps.records.e18500[calc_cps.records.c04470 > 0] *
                                         calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'Interest Paid':  (calc_cps.records.e19200[calc_cps.records.c04470 > 0] *
                                     calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'Charitable Cash Contributions': (calc_cps.records.e19800[calc_cps.records.c04470 > 0] *
                                                    calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'Charitable Non-Cash Contributions': (calc_cps.records.e20100[calc_cps.records.c04470 > 0] *
                                                        calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'Total Misc. Expenses':  (calc_cps.records.e20400[calc_cps.records.c04470 > 0] *
                                            calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum(),
                  'Net Casualty or Loss': (calc_cps.records.g20500[calc_cps.records.c04470 > 0] *
                                           calc_cps.records.s006[calc_cps.records.c04470 > 0]).sum()}
ded_cps_df = pd.DataFrame.from_dict(deductions_cps, 'index')
ded_cps_df.columns = ['Total']
ded_cps_df['source'] = 'CPS'

deductions_puf = {'Medical Expenses': (calc_puf.records.e17500[calc_puf.records.c04470 > 0] *
                                       calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'State and Local Taxes':  (calc_puf.records.e18400[calc_puf.records.c04470 > 0] *
                                             calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'Real Estate Taxes':  (calc_puf.records.e18500[calc_puf.records.c04470 > 0] *
                                         calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'Interest Paid':  (calc_puf.records.e19200[calc_puf.records.c04470 > 0] *
                                     calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'Charitable Cash Contributions': (calc_puf.records.e19800[calc_puf.records.c04470 > 0] *
                                                    calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'Charitable Non-Cash Contributions': (calc_puf.records.e20100[calc_puf.records.c04470 > 0] *
                                                        calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'Total Misc. Expenses':  (calc_puf.records.e20400[calc_puf.records.c04470 > 0] *
                                            calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum(),
                  'Net Casualty or Loss': (calc_puf.records.g20500[calc_puf.records.c04470 > 0] *
                                           calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum()}
ded_puf_df = pd.DataFrame.from_dict(deductions_puf, 'index')
ded_puf_df.columns = ['Total']
ded_puf_df['source'] = 'PUF'
soi_deductions['source'] = 'SOI'

ded_full_df = pd.concat([ded_cps_df, ded_puf_df, soi_deductions])
ded_full_df['index'] = ded_full_df.index

In [21]:
(calc_puf.records.e19200[calc_puf.records.c04470 > 0] *
                                     calc_puf.records.s006[calc_puf.records.c04470 > 0]).sum()

318302533899.28314

In [22]:
ded_bar = Bar(ded_full_df, 'index', 'Total', group='source', title='Itemized Deduction Totals',
              xlabel='Deduction', ylabel='Total', tooltips=[('Deduction', '@index'), ('Total', '@height'),
                                                            ('Data', '@source')])
show(ded_bar)

In [23]:
ded_error_df = pd.DataFrame()
ded_error_df['Difference - CPS'] = ded_cps_df['Total'] - soi_deductions['Total']
ded_error_df['% Difference - CPS'] = 100 * ded_error_df['Difference - CPS'] / soi_deductions['Total']
ded_error_df['Difference - PUF'] = ded_puf_df['Total'] - soi_deductions['Total']
ded_error_df['% Difference - PUF'] = 100 * ded_error_df['Difference - PUF'] / soi_deductions['Total']
print 'Error in Itemized Deductions Relative to SOI Totals'
ded_error_df

Error in Itemized Deductions Relative to SOI Totals


Unnamed: 0,Difference - CPS,% Difference - CPS,Difference - PUF,% Difference - PUF
Charitable Cash Contributions,6382229509.0,4.1,2671633335.8,1.7
Charitable Non-Cash Contributions,-29836512087.3,-45.7,-29943383704.6,-45.8
Interest Paid,9796287376.5,3.2,10340148899.3,3.4
Medical Expenses,-16108893366.9,-12.5,-5985934555.6,-4.6
Net Casualty or Loss,-2204349000.0,-100.0,2411947393.6,109.4
Real Estate Taxes,-57509601924.6,-31.8,27226854392.2,15.0
State and Local Taxes,-19938190726.9,-6.1,-1491588810.8,-0.5
Total Misc. Expenses,-22006233592.0,-17.4,10319333533.9,8.2


### Refundable Credits

In [24]:
eitc_cps = (calc_cps.records.eitc * calc_cps.records.s006).sum()
c11070_cps = (calc_cps.records.c11070 * calc_cps.records.s006).sum()
c10960_cps = (calc_cps.records.c10960 * calc_cps.records.s006).sum()
personal_credit_cps = (calc_cps.records.personal_credit * calc_cps.records.s006).sum()
ctc_new_cps = (calc_cps.records.ctc_new * calc_cps.records.s006).sum()

eitc_puf = (calc_puf.records.eitc * calc_puf.records.s006).sum()
c11070_puf = (calc_puf.records.c11070 * calc_puf.records.s006).sum()
c10960_puf = (calc_puf.records.c10960 * calc_puf.records.s006).sum()
personal_credit_puf = (calc_puf.records.personal_credit * calc_puf.records.s006).sum()
ctc_new_puf = (calc_puf.records.ctc_new * calc_puf.records.s006).sum()

In [25]:
print 'Refundable Credits'
pd.DataFrame(OrderedDict({'PUF': [eitc_puf, c11070_puf, c10960_puf, personal_credit_puf, ctc_new_puf],
                          'CPS': [eitc_cps, c11070_cps, c10960_cps, personal_credit_cps, ctc_new_cps],
                          'Diff': [eitc_cps - eitc_puf, c11070_cps - c11070_puf, c10960_cps - c10960_puf,
                                   personal_credit_cps - personal_credit_puf,
                                   ctc_new_cps - ctc_new_puf]}),
                          index=['eitc', 'c11070', 'c10960', 'personal credit', 'ctc new'])

Refundable Credits


Unnamed: 0,Diff,PUF,CPS
eitc,-7363363471.2,65703321662.4,58339958191.2
c11070,-4143797601.1,23856185234.2,19712387633.1
c10960,-8627081579.1,8627081579.1,0.0
personal credit,0.0,0.0,0.0
ctc new,0.0,0.0,0.0


In [26]:
eic0cps = sum(calc_cps.records.s006[calc_cps.records.EIC == 0])
eic1cps = sum(calc_cps.records.s006[calc_cps.records.EIC == 1])
eic2cps = sum(calc_cps.records.s006[calc_cps.records.EIC == 2])
eic3cps = sum(calc_cps.records.s006[calc_cps.records.EIC == 3])
eic0puf = sum(calc_puf.records.s006[calc_puf.records.EIC == 0])
eic1puf = sum(calc_puf.records.s006[calc_puf.records.EIC == 1])
eic2puf = sum(calc_puf.records.s006[calc_puf.records.EIC == 2])
eic3puf = sum(calc_puf.records.s006[calc_puf.records.EIC == 3]) 
eic_tot_cps = calc_cps.records.s006.sum()
eic_tot_puf = calc_puf.records.s006.sum()
print 'Percent and Number of Tax Units with Specified Number of EIC Qualified Children'
pd.DataFrame({'CPS': [eic0cps, eic1cps, eic2cps, eic3cps],
              'PUF': [eic0puf, eic1puf, eic2puf, eic3puf],
              '% - CPS': [eic0cps / eic_tot_cps, eic1cps / eic_tot_cps,
                          eic2cps / eic_tot_cps, eic3cps / eic_tot_cps],
              '% - PUF': [eic0puf / eic_tot_puf, eic1puf / eic_tot_puf,
                          eic2puf / eic_tot_puf, eic3puf / eic_tot_puf]})

Percent and Number of Tax Units with Specified Number of EIC Qualified Children


Unnamed: 0,% - CPS,% - PUF,CPS,PUF
0,0.7,0.9,114745569.5,143002628.3
1,0.1,0.1,22983420.2,10220299.5
2,0.1,0.0,16819448.2,7952108.4
3,0.1,0.0,8647332.5,3251431.7


### Benefit Programs

In [27]:
# Columns used for participation rates
cps['ssi_part'] = np.where(cps.ssi > 0, 1, 0)
cps['snap_part'] = np.where(cps.snap > 0, 1, 0)
cps['mcare_part'] = np.where(cps.mcare > 0, 1, 0)
cps['mcaid_part'] = np.where(cps.mcaid > 0, 1, 0)
cps['ss_part'] = np.where(cps.ss > 0, 1, 0)
cps['vb_part'] = np.where(cps.vb > 0, 1, 0)

In [28]:
benefits = pd.DataFrame({'SSI': [(cps.ssi * cps.s006).sum()],
                         'SNAP': [(cps.snap * cps.s006).sum()],
                         'Medicare': [(cps.mcare * cps.s006).sum()],
                         'Medicaid': [(cps.mcaid * cps.s006).sum()],
                         'Social Security': [(cps.ss * cps.s006).sum()],
                         'VB': [(cps.vb * cps.s006).sum()]}).transpose()
benefits.columns = ['Total']
print 'Benefits Totals'
benefits

Benefits Totals


Unnamed: 0,Total
Medicaid,352985014221.6
Medicare,488236708639.1
SNAP,82336140621.7
SSI,54186915454.5
Social Security,630068059192.4
VB,146836134085.4


#### Benefit Program Participation Rates

In [29]:
benefit_participation_names = [('ssi_part', 'SSI'), ('snap_part', 'SNAP'), ('mcare_part', 'Medicare'),
                               ('mcaid_part', 'Medicaid'), ('ss_part', 'Social Security'),
                               ('vb_part', "Veteran's Benefits")]

In [30]:
# Create list of bokeh figure displaying participation rates in each program
fig_list = list()
for item in benefit_participation_names:
    wcps = percentile(cps, item[0], 100, 'e00200', 's006')
    f = figure(title='{} Participation Rate'.format(item[1]),
               x_axis_label='Wage Percentile')
    f.line(wcps.index, wcps)
    fig_list.append(f)

In [31]:
show(column(fig_list))

#### Average Benefits Received by Participants

In [32]:
benefit_program_names = [('ssi', 'SSI'), ('snap', 'SNAP'), ('mcare', 'Medicare'),
                         ('mcaid', 'Medicaid'), ('ss', 'Social Security'),
                         ('vb', "Veteran's Benefits")]

In [33]:
fig_list = list()
for item in benefit_program_names:
    # Only taking the average benefit of participants
    bcps = cps[cps[item[0]] > 0]
    wcps = percentile(bcps, item[0], 100, 'e00200', 's006')
    f = figure(title='Average {} Benefit - Participants'.format(item[1]),
               x_axis_label='Wage Percentile')
    f.line(wcps.index, wcps)
    fig_list.append(f)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  pdf.sort_values(by=income_measure, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  pdf['cumsum_temp'] = np.cumsum(pdf['s006'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  pdf['bins'] = pd.cut(pdf['cumsum_temp'], bins=bin_edges, labels=labels)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

In [34]:
show(column(fig_list))

#### Average Benefits Received by Entire Population

In [35]:
fig_list = list()
for item in benefit_program_names:
    # Only taking the average benefit of participants
    wcps = percentile(cps, item[0], 100, 'e00200', 's006')
    f = figure(title='Average {} Benefit - Entire Population'.format(item[1]),
               x_axis_label='Wage Percentile')
    f.line(wcps.index, wcps)
    fig_list.append(f)

In [36]:
show(column(fig_list))