# -*- coding: utf-8 -*-
"""
@author: Neha
"""
import pandas as pd
import numpy as np
import math
from . import early_stops
def read_files(input_data, holdout_data):
# Read the input data
if type(input_data) == pd.core.frame.DataFrame:
df = input_data
elif input_data == False:
raise Exception("Need to specify either csv file name or pandas data "\
"frame in parameter 'input_data'")
else:
df = pd.read_csv(input_data)
# Now read the holdout data
if type(holdout_data) == pd.core.frame.DataFrame:
df_holdout = holdout_data
elif type(holdout_data) == float and holdout_data <= 1.0 and holdout_data > 0.0:
df_holdout = df.sample(frac=holdout_data)
elif holdout_data == False:
        df_holdout = df.sample(frac=0.1) # default when no holdout is provided: a 10% sample of df
else:
df_holdout = pd.read_csv(holdout_data)
return df, df_holdout
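# A minimal usage sketch for read_files (the file name 'input.csv' is
# hypothetical; passing 0.1 requests a random 10% holdout sample drawn from
# the input data):
#
#   df, df_holdout = read_files('input.csv', 0.1)
#   df, df_holdout = read_files(input_data=some_dataframe, holdout_data=False)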
def check_stops(stop_unmatched_c, early_stop_un_c_frac, stop_unmatched_t,
early_stop_un_t_frac, early_stop_pe, early_stop_pe_frac, early_stop_bf,
early_stop_bf_frac, early_stop_iterations):
'''
This function checks the parameters passed to DAME/FLAME relating to early
stopping
'''
early_stops_obj = early_stops.EarlyStops()
# Validate
if (stop_unmatched_c == False and stop_unmatched_t == False):
        raise Exception('Either stop_unmatched_c or stop_unmatched_t, or both, '\
                        'must be true, so the algorithm terminates if there are no '\
                        'units left to match')
if early_stop_un_t_frac > 1.0 or early_stop_un_t_frac < 0.0:
        raise Exception('The value provided for the early stopping criteria '\
'of proportion of unmatched treatment units needs to '\
'be between 0.0 and 1.0')
if early_stop_un_c_frac > 1.0 or early_stop_un_c_frac < 0.0:
        raise Exception('The value provided for the early stopping criteria '\
'of proportion of unmatched control units needs to '\
'be between 0.0 and 1.0')
if early_stop_pe == True:
early_stop_pe = early_stop_pe_frac
if early_stop_pe_frac > 1.0 or early_stop_pe_frac < 0.0:
        raise Exception('The value provided for the early stopping criteria of'\
' PE needs to be between 0.0 and 1.0')
if early_stop_bf == True:
early_stop_bf = early_stop_bf_frac
if early_stop_bf_frac > 1.0 or early_stop_bf_frac < 0.0:
        raise Exception('The value provided for the early stopping criteria of'\
                        ' BF needs to be between 0.0 and 1.0')
if (type(early_stop_iterations) != int and early_stop_iterations != False):
        raise Exception('The value provided for early_stop_iterations needs '\
'to be an integer number of iterations, or False if '\
'not stopping early based on the number of iterations')
# Put all of those parameters into the object to return
early_stops_obj.unmatched_c = stop_unmatched_c
early_stops_obj.unmatched_t = stop_unmatched_t
early_stops_obj.un_c_frac = early_stop_un_c_frac
early_stops_obj.un_t_frac = early_stop_un_t_frac
early_stops_obj.pe = early_stop_pe
early_stops_obj.bf = early_stop_bf
early_stops_obj.iterations = early_stop_iterations
return early_stops_obj
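# A hedged example of how check_stops might be called, using illustrative
# values rather than the package defaults. The returned object exposes the
# validated settings as attributes (unmatched_c, unmatched_t, un_c_frac,
# un_t_frac, pe, bf, iterations):
#
#   stops = check_stops(stop_unmatched_c=True, early_stop_un_c_frac=0.1,
#                       stop_unmatched_t=True, early_stop_un_t_frac=0.1,
#                       early_stop_pe=False, early_stop_pe_frac=0.05,
#                       early_stop_bf=False, early_stop_bf_frac=0.05,
#                       early_stop_iterations=False)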
def check_parameters(adaptive_weights, weight_array, df_holdout, df,
alpha):
'''
This function processes the parameters that were passed to DAME/FLAME
that aren't directly the input file or related to stop_criteria.
'''
    # Checks on the weight array, which must be supplied when adaptive_weights is False
if adaptive_weights == False:
# Confirm that weight array has the right number of values in it
# Subtracting 2 because one col is the treatment and one is outcome.
if len(weight_array) != (len(df.columns)-2):
raise Exception('Invalid input error. Weight array size not equal'\
' to number of columns in dataframe')
# Confirm that weights in weight vector add to 1.
if abs(sum(weight_array) - 1.0) >= 0.001:
            # Compare against a small tolerance instead of testing exact
            # equality, to avoid floating point addition errors.
raise Exception('Invalid input error. Weight array values must '\
'sum to 1.0')
else:
# make sure that the alpha is valid if it's a ridge regression.
if adaptive_weights == 'ridge' and (alpha < 0.0):
raise Exception('Invalid input error. The alpha needs to be '\
'positive for ridge regressions.')
# make sure that adaptive_weights is a valid value.
if (adaptive_weights != "ridge" and adaptive_weights != "decision tree"):
raise Exception("Invalid input error. The acceptable values for "\
"the adaptive_weights parameter are 'ridge', "\
"'decision tree', or 'False' along with a weight "\
"array")
# make sure the two dfs have the same number of columns first:
if (len(df.columns) != len(df_holdout.columns)):
raise Exception('Invalid input error. The holdout and main '\
'dataset must have the same number of columns')
# make sure that the holdout columns match the df columns.
if (set(df_holdout.columns) != set(df.columns)):
# they don't match
raise Exception('Invalid input error. The holdout and main '\
'dataset must have the same columns')
return
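# A minimal sketch of a fixed-weight call to check_parameters, assuming a
# hypothetical dataframe with two covariate columns plus the treatment and
# outcome columns, so the weight array needs exactly two entries summing to 1.0:
#
#   check_parameters(adaptive_weights=False, weight_array=[0.6, 0.4],
#                    df_holdout=df_holdout, df=df, alpha=0.1)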
def replace_unique_large(df, treatment_column_name, outcome_column_name,
missing_indicator):
    ''' (helper)
    This function replaces missing values in the df with unique large values.
    Could possibly be cleaned up later.
    '''
max_val = df.max().max()
# now we replace all of the missing_indicators with unique large vals
# that are larger than max_val.
for col in df.columns:
if col != treatment_column_name and col != outcome_column_name:
for item_num in df.index.values:
                if not math.isnan(missing_indicator):
if df[col][item_num] == missing_indicator:
df.loc[item_num, col] = max_val + 1
max_val += 1
else:
                    # Handle NaN separately because NaN == NaN is always False.
                    if math.isnan(df[col][item_num]):
df.loc[item_num, col] = max_val + 1
max_val += 1
return df
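# Illustrative sketch (hypothetical values): with missing_indicator = -1 and a
# current maximum value of 5 anywhere in the dataframe, each -1 found in a
# covariate column is rewritten to 6, 7, 8, ... so every filled-in value stays
# unique and larger than any observed value.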
def drop_missing(df, treatment_column_name, outcome_column_name, missing_indicator):
'''
helper, this function drops rows that have missing_indicator in any of the cols
'''
    if math.isnan(missing_indicator):
        # The missing indicator is already NaN, so just drop those rows.
df = df.dropna()
else:
        # If it's not NaN, replace missing_indicator with NaN and then drop.
df = df.replace(missing_indicator, np.nan)
df = df.dropna()
return df
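# For example (hypothetical column names and indicator),
# drop_missing(df, 'treated', 'outcome', -1) first replaces every -1 with NaN
# and then drops any row containing NaN, while
# drop_missing(df, 'treated', 'outcome', np.nan) drops the NaN rows directly.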
def check_missings(df, df_holdout, missing_indicator, missing_data_replace,
missing_holdout_replace, missing_holdout_imputations,
missing_data_imputations, treatment_column_name,
outcome_column_name):
'''
This function deals with all the missing data related stuff
'''
mice_on_matching = False
mice_on_holdout = False
if (missing_data_replace == 0 and df.isnull().values.any() == True):
print('There is missing data in this dataset. The default missing '\
'data handling is being done, so we are not matching on '\
'any missing values in the matching set')
missing_data_replace = 2
# TODO: iterate through all the columns and check for non-integer values
# and then replace them with nan if needed.
# df['hi'] = pd.to_numeric(df['hi'], errors='coerce')
if missing_data_replace == 1:
df = drop_missing(df, treatment_column_name, outcome_column_name,
missing_indicator)
if missing_data_replace == 2:
# so replacing with large unique values will only work if columns
# are in order!!
df = replace_unique_large(df, treatment_column_name, outcome_column_name,
missing_indicator)
# Reorder if they're not in order:
df = df.loc[:, df.max().sort_values(ascending=True).index]
if missing_data_replace == 3:
        # This means run MICE, but only if there's something actually missing.
df = df.replace(missing_indicator, np.nan)
if df.isnull().values.any() == True:
mice_on_matching = missing_data_imputations
if missing_holdout_replace == 0 and df_holdout.isnull().values.any() == True:
print('There is missing data in this dataset. The default missing '\
'data handling is being done, so we are running MICE on 10 '\
'imputed holdout datasets')
missing_holdout_replace = 2
if missing_holdout_replace == 1:
df_holdout = drop_missing(df_holdout, treatment_column_name,
outcome_column_name, missing_indicator)
if missing_holdout_replace == 2:
        # This means run MICE on the holdout data.
df_holdout = df_holdout.replace(missing_indicator, np.nan)
        # But if there is actually nothing missing in the dataset, we don't
        # need to do this.
if df_holdout.isnull().values.any() == True:
mice_on_holdout = missing_holdout_imputations
return df, df_holdout, mice_on_matching, mice_on_holdout
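# A rough summary of the options handled above:
#   missing_data_replace:    0 defaults to 2; 1 drops rows with missing values;
#                            2 replaces them with unique large values (so they
#                            are never matched on); 3 flags MICE imputations.
#   missing_holdout_replace: 0 defaults to 2; 1 drops rows with missing values;
#                            2 flags MICE imputations on the holdout set.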
def process_input_file(df, treatment_column_name, outcome_column_name, adaptive_weights):
'''
This function processes the parameters passed to DAME/FLAME that are
directly the input file.
'''
# Confirm that the treatment column name exists.
if treatment_column_name not in df.columns:
raise Exception('Invalid input error. Treatment column name does not'\
' exist')
# Confirm that the outcome column name exists.
if outcome_column_name not in df.columns:
raise Exception('Invalid input error. Outcome column name does not'\
' exist')
    # Confirm that the treatment column only has 0s and 1s.
if set(df[treatment_column_name].unique()) != {0,1}:
raise Exception('Invalid input error. All rows in the treatment '\
'column must have either a 0 or a 1 value.')
if adaptive_weights == False:
        # Ensure that the columns are sorted in order: binary, ternary, etc.
max_column_size = 1
for col_name in df.columns:
if (col_name != treatment_column_name) and (col_name != outcome_column_name):
                # TODO: this was previously df[col_name].unique().max(), which was
                # removed when it didn't work. This seems to work, but it may be a
                # happy accident; see
                # https://stackoverflow.com/questions/21319929/how-to-determine-whether-a-pandas-column-contains-a-particular-value
if df[col_name].max() >= max_column_size:
max_column_size = df[col_name].max()
else:
raise Exception('Invalid input error. Dataframe column '\
'size must be in increasing order from '\
'left to right.')
else:
# Reorder if they're not in order:
df = df.loc[:, df.max().sort_values(ascending=True).index]
return df
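# A minimal sketch of how these helpers could fit together (hypothetical
# column names 'treated' and 'outcome'; not the package's actual entry point):
#
#   df, df_holdout = read_files('input.csv', 0.1)
#   df = process_input_file(df, 'treated', 'outcome', adaptive_weights='ridge')
#   check_parameters('ridge', [], df_holdout, df, alpha=0.1)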