This repository has been archived by the owner on Aug 1, 2020. It is now read-only.
/
createcps.py
199 lines (189 loc) · 8.44 KB
/
createcps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
"""
Script to run all files that create the CPS file
"""
import pandas as pd
from cpsrets import Returns
from adj_filst import adjfilst
from assemble import assemble
from topcoding import topcoding
from imputetobit import imputation
from targets import targets
from blankslate import blankslate
from merge_benefits import BenefitMerge
import cpsmar_2013
import cpsmar_2014
import cpsmar_2015
import os
from dask import compute, delayed
def createcps():
    """
    Create the CPS tax unit file.

    Pipeline: locate (or build) the three CPS March supplement files
    (2013-2015), merge C-TAM imputed benefits if they are not already
    merged, form tax units per year, adjust and assemble the files,
    correct for top coding, impute deductions, adjust to state targets,
    run blank slate imputations, rename benefit variables, and write the
    final compressed file 'cps_raw_new.csv.gz'.

    Intermediate CSV files are written at each stage so partial results
    survive a crash; large frames are deleted as soon as they are no
    longer needed to keep peak memory down.
    """
    # First check for the presence of the CPS files with benefits merged
    cps13_merged = os.path.isfile('data/cps_2013_aug.csv')
    cps14_merged = os.path.isfile('data/cps_2014_aug.csv')
    cps15_merged = os.path.isfile('data/cps_2015_aug.csv')
    # If the CSV CPS files are present, use those for the rest of the script
    if cps13_merged and cps14_merged and cps15_merged:
        print('Reading Data')
        # Stage the three reads lazily so dask can run them in parallel
        lazy_cps13 = delayed(pd.read_csv)('data/cps_2013_aug.csv')
        lazy_cps14 = delayed(pd.read_csv)('data/cps_2014_aug.csv')
        lazy_cps15 = delayed(pd.read_csv)('data/cps_2015_aug.csv')
        cps13, cps14, cps15 = compute(lazy_cps13, lazy_cps14, lazy_cps15)
    else:
        cps13_raw = os.path.isfile('data/cpsmar2013.csv')
        cps14_raw = os.path.isfile('data/cpsmar2014.csv')
        # BUG FIX: original tested 'data/cpamar2015.csv' (typo), so this
        # branch could never trigger even when 'data/cpsmar2015.csv' existed
        cps15_raw = os.path.isfile('data/cpsmar2015.csv')
        # If raw files are available, simply read them in
        if cps13_raw and cps14_raw and cps15_raw:
            # BUG FIX: original called pd.read_csv eagerly yet still passed
            # the frames through dask.compute; wrap in delayed so the reads
            # actually run in parallel, matching the branch above
            lazy_raw13 = delayed(pd.read_csv)('data/cpsmar2013.csv')
            lazy_raw14 = delayed(pd.read_csv)('data/cpsmar2014.csv')
            lazy_raw15 = delayed(pd.read_csv)('data/cpsmar2015.csv')
            raw_13, raw_14, raw_15 = compute(lazy_raw13, lazy_raw14,
                                             lazy_raw15)
        # Otherwise create the raw CPS files from the .DAT files
        else:
            cps13_dat = os.path.isfile('data/asec2013_pubuse.dat')
            dat_2014_path = 'data/asec2014_pubuse_tax_fix_5x8_2017.dat'
            cps14_dat = os.path.isfile(dat_2014_path)
            cps15_dat = os.path.isfile('data/asec2015_pubuse.dat')
            if cps13_dat and cps14_dat and cps15_dat:
                print('Converting CPS Files from .DAT to .CSV')
                path_2013 = 'data/asec2013_pubuse.dat'
                raw_13 = cpsmar_2013.create_cps(path_2013)
                raw_14 = cpsmar_2014.create_cps(dat_2014_path)
                path_2015 = 'data/asec2015_pubuse.dat'
                raw_15 = cpsmar_2015.create_cps(path_2015)
            else:
                # BUG FIX: original message lacked spaces at the
                # concatenation seams ('not found.You', 'availablefrom')
                msg = ('CPS file(s) not found. ' +
                       'You must have either CSV files for the 2013, 2014,' +
                       ' and 2015 CPS files augmented with C-TAM imputed' +
                       ' benefits or the raw DAT formatted versions of the' +
                       ' CPS, available from NBER')
                raise IOError(msg)
        # Merge C-TAM imputed benefits
        print('Merging Benefits')
        cps13, cps14, cps15 = BenefitMerge(raw_13, raw_14, raw_15)
    print('Creating Tax Units for 2013 CPS')
    rets2013 = Returns(cps13, 2013)
    taxunits13 = rets2013.computation()
    print('Creating Tax Units for 2014 CPS')
    rets2014 = Returns(cps14, 2014)
    taxunits14 = rets2014.computation()
    print('Creating Tax Units for 2015 CPS')
    rets2015 = Returns(cps15, 2015)
    taxunits15 = rets2015.computation()
    del cps13, cps14, cps15
    print('Adjusting Files')
    # Stage the per-year adjustments so dask can run them in parallel
    lazy_taxunits13_adj = delayed(adjfilst)(taxunits13)
    lazy_taxunits14_adj = delayed(adjfilst)(taxunits14)
    lazy_taxunits15_adj = delayed(adjfilst)(taxunits15)
    taxunits13_adj, taxunits14_adj, taxunits15_adj = compute(
        lazy_taxunits13_adj, lazy_taxunits14_adj, lazy_taxunits15_adj)
    taxunits13_adj.to_csv('taxunits13.csv', index=None)
    taxunits14_adj.to_csv('taxunits14.csv', index=None)
    taxunits15_adj.to_csv('taxunits15.csv', index=None)
    del taxunits13, taxunits14, taxunits15
    print('Assembling File')
    full_taxunits = assemble(taxunits13_adj, taxunits14_adj, taxunits15_adj)
    full_taxunits.to_csv('fullunits.csv', index=None)
    del taxunits13_adj, taxunits14_adj, taxunits15_adj
    print('Adjusting for Top Coding')
    cps_tc = topcoding(full_taxunits)
    cps_tc.to_csv('topcoding.csv', index=None)
    del full_taxunits
    print('Imputing Deductions')
    # Read in beta coefficients used in imputations
    logit_betas = pd.read_csv('data/logit_betas.csv', index_col=0)
    ols_betas = pd.read_csv('data/ols_betas.csv', index_col=0)
    tobit_betas = pd.read_csv('data/tobit_betas.csv', index_col=0)
    # Impute
    cps_v2 = imputation(cps_tc, logit_betas, ols_betas, tobit_betas)
    cps_v2.to_csv('postimputation.csv', index=None)
    del cps_tc
    print('Adjusting for State Targets')
    state_targets = pd.read_csv('data/agg_state_data.csv', index_col='STATE')
    cps_v3 = targets(cps_v2, state_targets)
    cps_v3.to_csv('posttargeting.csv', index=None)
    del cps_v2
    print('Blank Slate Imputations')
    # NOTE(review): return value unused — blankslate presumably operates
    # via side effects; confirm against its definition
    blankslate(cps_v3)
    print('Renaming Variables')
    # Expand the abbreviated C-TAM column names. The MCAID_/MCARE_
    # PROB and VAL columns run from 1 through 15; generating the map
    # avoids hand-maintaining 60 nearly identical entries (the result
    # is identical to the original literal dict).
    rename = {'SSI': 'ssi_ben',
              'VB': 'vb_ben',
              'MEDICARE': 'medicare_ben',
              'MEDICAID': 'medicaid_ben',
              'SS': 'ss_ben',
              'SNAP': 'snap_ben',
              'WT': 's006'}
    for i in range(1, 16):
        for kind in ('PROB', 'VAL'):
            suffix = '_' + kind + str(i)
            rename['MCAID' + suffix] = 'MEDICAID' + suffix
            rename['MCARE' + suffix] = 'MEDICARE' + suffix
    cps_final = cps_v3.rename(columns=rename)
    del cps_v3
    # Export and compress data
    print('Exporting and Compressing Data')
    cps_final.to_csv('cps_raw_new.csv.gz', index=False, compression='gzip')
# Run the full CPS creation pipeline when executed as a script
if __name__ == '__main__':
    createcps()