-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
341 lines (273 loc) · 15 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
def create_descriptive_stat(df, var_list):
    """
    Creates descriptive statistics on a pandas DataFrame.

    For every described variable the result contains min, max, mean, mode, median, the deciles
    (10% ... 90%), the number of missing values and the percentage of missing values.
    Statistics are the rows, variables are the columns; all values are rounded to 2 decimals.

    :param df: pandas DataFrame
    :param var_list: list of variables to include
    :return: pandas DataFrame with one row per statistic and one column per variable
    """
    df = df[var_list]
    desc_df = df.describe(percentiles=np.arange(0.1, 1, 0.1).round(2))
    # Only compute the extra statistics for the columns describe() actually reported
    # (for a mixed frame describe() keeps the numeric columns only).
    described = df[desc_df.columns]
    # DataFrame.mode() pads shorter columns with NaN, so row 0 is the first mode of every
    # column and NaN for a column without any non-missing value.
    modes = described.mode()
    if len(modes):
        mode_row = modes.iloc[0]
    else:
        mode_row = pd.Series(np.nan, index=described.columns)
    extra = pd.DataFrame({'mode': mode_row, 'median': described.median()}).transpose()
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    desc_df = pd.concat([desc_df, extra]).transpose()
    missing_nbr = df.isnull().sum()
    desc_df['missing nbr'] = missing_nbr
    desc_df['missing %'] = missing_nbr / df.shape[0] * 100
    desc_df = desc_df[['min', 'max', 'mean', 'mode', 'median', '10%', '20%',
                       '30%', '40%', '50%', '60%', '70%', '80%', '90%',
                       'missing nbr', 'missing %']].transpose()
    return desc_df.round(2)
def plot_full_stacked_density(df, target_var, var, kind):
    """
    Draws a 100%-stacked plot showing, for each value of a variable, the share of every target class.

    :param df: pandas DataFrame
    :param target_var: target variable (its values become the stacked series)
    :param var: variable whose values form the x axis
    :param kind: the kind of the plot, e.g. 'bar', 'area', 'density'
    :return: the result of ``plt.show()``
    """
    # Count the rows for every (target, variable value) combination; the count lands in column 0.
    counts = df.groupby([target_var, var]).size().reset_index()
    shares = counts.pivot(columns=target_var, index=var, values=0)
    # Normalise every row so the target classes of one variable value add up to 1.
    row_totals = shares.sum(1)
    shares = shares.div(row_totals, axis=0)
    shares.plot(kind=kind, stacked=True, rot=45, title='Density of churn')
    plt.ylabel('Density of churn (%)')
    return plt.show()
def calc_pearson_correlation(df, var_list):
    """
    Computes the absolute Pearson correlation matrix of the given variables.

    Every column is first replaced by its integer codes from ``pd.factorize``
    (codes are assigned in order of first appearance), so non-numeric columns can be included.

    :param df: pandas DataFrame
    :param var_list: list of variables
    :return: pandas DataFrame with the absolute pairwise correlations
    """
    encoded = df[var_list].apply(lambda column: pd.factorize(column)[0])
    corr_matrix = encoded.corr(method='pearson')
    return corr_matrix.abs()
def select_by_correlation_threshold(df, threshold):
    """
    Lists all variable pairs whose correlation is higher than or equal to the threshold.

    Every unordered pair of distinct variables is reported once (only the part of the matrix
    above the diagonal is read), so different pairs that happen to share the same correlation
    value are all kept, and perfectly correlated distinct variables are reported as well.

    :param df: pandas DataFrame holding a correlation matrix
        (assumes the rows and the columns list the same variables in the same order)
    :param threshold: threshold value
    :return: pandas DataFrame with the columns 'variable_1', 'variable_2' and 'correlation',
        sorted by descending correlation
    """
    matrix = df.to_numpy()
    # k=1 skips the diagonal (a variable paired with itself) and the mirrored lower triangle.
    row_pos, col_pos = np.triu_indices_from(matrix, k=1)
    corr_df = pd.DataFrame({'variable_1': df.index.to_numpy()[row_pos],
                            'variable_2': df.columns.to_numpy()[col_pos],
                            'correlation': matrix[row_pos, col_pos]})
    # NaN correlations never satisfy the comparison, so they are dropped here too.
    corr_df = corr_df[corr_df['correlation'] >= threshold]
    # A stable sort keeps ties in matrix order, which makes the output deterministic.
    corr_df = corr_df.sort_values(by='correlation', ascending=False, kind='mergesort')
    return corr_df.reset_index(drop=True)
def _glm_base_variable(coef_name, selected):
    """Maps a GLM coefficient name back to the name of the predictor it belongs to."""
    if coef_name in selected:
        return coef_name
    # NOTE(review): the original code cut the name at the first '.', which suggests that
    # coefficients of categorical predictors are named '<variable>.<level>' - confirm.
    # Matching against the known predictors also works when a predictor name contains a dot.
    candidates = [var for var in selected if coef_name.startswith(var + '.')]
    if candidates:
        return max(candidates, key=len)
    return coef_name.split('.')[0]


def _top_glm_variables(coefs, selected, n):
    """Returns up to n distinct predictors ranked by the absolute value of their coefficient."""
    ranked = sorted(((name, abs(value)) for name, value in coefs.items() if name != 'Intercept'),
                    key=lambda item: item[1], reverse=True)
    result = []
    for name, magnitude in ranked:
        # Coefficients that are exactly zero mark predictors the model did not use.
        if len(result) >= n or magnitude <= 0.0:
            break
        variable = _glm_base_variable(name, selected)
        if variable not in result:
            result.append(variable)
    return result


def variable_selection(selected, target, train, nfolds, n, model_type):
    """
    Performs variable selection based on the selected model type by the user.
    Returns the trained model and a list with the n most important variables.

    For 'GBM' and 'DRF' the ranking is the model's variable importance table; for 'GLM' the
    predictors are ranked by the absolute value of their normalized coefficients, so strong
    negative effects are kept as well.

    :param selected: list of selected variables
    :param target: target variable
    :param train: h2o frame with train data
    :param nfolds: the number of folds to use for cross-validation
    :param n: number of important variables to be selected at the end of the training process
    :param model_type: the name of the model the variable selection should be calculated on (can be 'GBM', 'GLM', 'DRF')
    :return: tuple (trained model, list of the selected variable names, most important first)
    :raises ValueError: if model_type is not one of 'GBM', 'GLM', 'DRF'
    """
    if model_type not in ('GBM', 'GLM', 'DRF'):
        # Fail before any training instead of ending in an UnboundLocalError at the return.
        raise ValueError("model_type must be one of 'GBM', 'GLM', 'DRF', got %r" % (model_type,))

    if model_type == 'GBM':
        model = H2OGradientBoostingEstimator(balance_classes=True,
                                             seed=1234,
                                             model_id='GBM_var_sel',
                                             nfolds=nfolds,
                                             stopping_rounds=3,
                                             stopping_tolerance=0.01,
                                             stopping_metric="lift_top_group")
        model.train(x=selected, y=target, training_frame=train)
        # Exactly n rows, consistent with the DRF branch.
        varimp = model.varimp(use_pandas=True)[0:n]
        result = list(varimp['variable'])
    elif model_type == 'GLM':
        model = H2OGeneralizedLinearEstimator(family='binomial',
                                              alpha=1.0,  # lasso regularization
                                              lambda_search=True,
                                              nfolds=nfolds,
                                              standardize=True,
                                              seed=1234,
                                              model_id='GLM_var_sel')
        model.train(x=selected, y=target, training_frame=train)
        result = _top_glm_variables(model.coef_norm(), selected, n)
    else:
        model = H2ORandomForestEstimator(balance_classes=True,
                                         seed=1234,
                                         ntrees=10,
                                         model_id="DRF_var_sel",
                                         nfolds=nfolds,
                                         stopping_tolerance=0.01,
                                         stopping_metric="lift_top_group")
        model.train(x=selected, y=target, training_frame=train)
        varimp = model.varimp(use_pandas=True)[0:n]
        result = list(varimp['variable'])
    return model, result
def modelling_for_testing_variables(target, var_list, train, nfolds):
    """
    Fits a GBM, a GLM and a Random Forest on the same predictors so a list of variables can be tested.

    Each returned model gets a ``name`` attribute ('GBM', 'GLM', 'DRF').

    :param target: target variable
    :param var_list: list of variables used as predictors
    :param train: h2o frame with train data
    :param nfolds: the number of folds to use for cross-validation
    :return: tuple (GBM model, GLM model, DRF model)
    """
    shared = dict(seed=1234, nfolds=nfolds)
    tree_settings = dict(balance_classes=True,
                         stopping_tolerance=0.01,
                         stopping_metric="lift_top_group")

    model_gbm = H2OGradientBoostingEstimator(**shared, **tree_settings)
    model_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                              alpha=0.7,
                                              lambda_search=True,
                                              **shared)
    model_drf = H2ORandomForestEstimator(ntrees=15, **shared, **tree_settings)

    trained = (model_gbm, model_glm, model_drf)
    for estimator in trained:
        estimator.train(x=var_list, y=target, training_frame=train)
    for estimator, label in zip(trained, ('GBM', 'GLM', 'DRF')):
        estimator.name = label
    return trained
def result_comparison(model_list, test, target):
    """
    Compare results of different models on the test set.

    For every model the AUC and the cumulative response / capture rates of two rows of the
    gains-lift table are collected, together with the number of targets those capture rates
    correspond to.

    :param model_list: list of trained h2o models (each must carry a ``name`` attribute)
    :param test: h2o frame with test data
    :param target: target variable
    :return: pandas DataFrame with one row per model, sorted by descending AUC
    """
    # These depend on the test frame only, so they are computed once instead of once per model.
    customer_nbr = test.shape[0]
    target_nbr = test[target].sum()

    rows = []
    for model in model_list:
        # Score the test frame a single time and reuse the performance object.
        performance = model.model_performance(test_data=test)
        auc = performance.auc()
        gains_lift = performance.gains_lift().as_data_frame()
        # NOTE(review): rows 5 and 7 are presumably the 10% and 20% cumulative data fraction
        # rows of the gains-lift table - confirm against the table layout.
        top_10 = gains_lift.iloc[5]
        top_20 = gains_lift.iloc[7]
        rows.append((model.name, customer_nbr, target_nbr, auc,
                     top_10['cumulative_response_rate'], top_20['cumulative_response_rate'],
                     top_10['cumulative_capture_rate'], top_20['cumulative_capture_rate']))

    res_df = pd.DataFrame(rows, columns=['Model', 'Customer Nbr', 'Target Nbr', 'AUC',
                                         'Cum Response Rate TOP 10%', 'Cum Response Rate TOP 20%',
                                         'Cum Capture Rate TOP 10%', 'Cum Capture Rate TOP 20%'])
    res_df['Target Nbr'] = res_df['Target Nbr'].round(2).astype(int)
    # Derive the found-target counts from the exact capture rates; rounding the rates to two
    # decimals first would distort the counts.
    for top in ('10%', '20%'):
        found = res_df['Target Nbr'] * res_df['Cum Capture Rate TOP ' + top]
        res_df['Target Found TOP ' + top] = found.astype(int)
    res_df = res_df.round(2)
    fin_df = res_df[['Model', 'AUC', 'Customer Nbr', 'Target Nbr',
                     'Cum Capture Rate TOP 10%', 'Cum Response Rate TOP 10%', 'Target Found TOP 10%',
                     'Cum Capture Rate TOP 20%', 'Cum Response Rate TOP 20%', 'Target Found TOP 20%']]\
        .sort_values(by='AUC', ascending=False)
    return fin_df
def gbm_grid_search_max_depth(selected, target, train, test, nfolds):
    """
    Performs a Cartesian grid search on a GBM model over max_depth values 3..8 and ranks the
    models by AUC. Returns the best model and the grid.

    :param selected: list of selected variables
    :param target: target variable
    :param train: h2o frame with train data
    :param test: h2o frame with test data (passed to the grid as validation frame)
    :param nfolds: the number of folds to use for cross-validation
    :return: tuple (model ranked first by AUC, sorted metric table without the 'model_ids' column)
    """
    hyperparameters = {'max_depth': [3, 4, 5, 6, 7, 8]}
    search_criteria = {'strategy': "Cartesian"}
    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator(seed=1234,
                                                          balance_classes=True,
                                                          nfolds=nfolds),
                             hyperparameters,
                             search_criteria=search_criteria)
    gbm_grid.train(x=selected, y=target, training_frame=train, validation_frame=test)
    sorted_grid = gbm_grid.get_grid(sort_by='auc', decreasing=True)
    # Pick the best model from the AUC-sorted grid; the grid the search was started on
    # is not re-ordered by get_grid().
    gbm_best_model = sorted_grid.models[0]
    gbm_best_model.name = 'best GBM - max_detph'
    gbm_grid_table = sorted_grid.sorted_metric_table().drop('model_ids', axis=1)
    return gbm_best_model, gbm_grid_table
def gbm_grid_search(selected, target, train, test, nfolds, max_depth, ntrees_list, min_rows_list, show_top):
    """
    Performs a random discrete grid search on a GBM model and ranks the models by AUC.
    Returns the best model and the top of the grid.

    :param selected: list of selected variables
    :param target: target variable
    :param train: h2o frame with train data
    :param test: h2o frame with test data (passed to the grid as validation frame)
    :param nfolds: the number of folds to use for cross-validation
    :param max_depth: specifies the maximum depth to which each tree will be built
    :param ntrees_list: list of number of trees to build in the model
    :param min_rows_list: list of the minimum number of observations for a leaf in order to split
    :param show_top: number of best-ranked grid rows to return in the metric table
    :return: tuple (model ranked first by AUC, first show_top rows of the sorted metric table
        without the 'model_ids' column)
    """
    hyperparameters = {'ntrees': ntrees_list,
                       'min_rows': min_rows_list,
                       'min_split_improvement': [1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8],
                       'learn_rate': [0.05, 0.08, 0.1, 0.25, 0.35, 0.5, 1],
                       'learn_rate_annealing': [0.9, 0.93, 0.95, 0.99, 0.1],
                       'col_sample_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                       'sample_rate': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                       'col_sample_rate_change_per_level': [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 1,
                                                            1.5, 1.8, 2],
                       'col_sample_rate_per_tree': [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1],
                       'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "Random"]
                       }
    # The random search stops after 50 models or 800 seconds, or earlier through the
    # lift_top_group stopping settings below.
    search_criteria = {'strategy': "RandomDiscrete",
                       'stopping_metric': "lift_top_group",
                       'stopping_tolerance': 0.01,
                       'stopping_rounds': 3,
                       'max_runtime_secs': 800,
                       'max_models': 50}
    gbm_grid = H2OGridSearch(H2OGradientBoostingEstimator(seed=1234,
                                                          balance_classes=True,
                                                          max_depth=max_depth,
                                                          nfolds=nfolds),
                             hyperparameters,
                             search_criteria=search_criteria)
    gbm_grid.train(x=selected, y=target, training_frame=train, validation_frame=test)
    sorted_grid = gbm_grid.get_grid(sort_by='auc', decreasing=True)
    # Pick the best model from the AUC-sorted grid; the grid the search was started on
    # is not re-ordered by get_grid().
    gbm_best_model = sorted_grid.models[0]
    gbm_best_model.name = 'best GBM - grid search'
    gbm_grid_table = sorted_grid.sorted_metric_table().drop('model_ids', axis=1)[0:show_top]
    return gbm_best_model, gbm_grid_table
def glm_grid_search(selected, target, train, test, nfolds, show_top):
    """
    Performs a Cartesian grid search over alpha and lambda on a binomial GLM and ranks the
    models by AUC. Returns the best model and the top of the grid.

    :param selected: list of selected variables
    :param target: target variable
    :param train: h2o frame with train data
    :param test: h2o frame with test data (passed to the grid as validation frame)
    :param nfolds: the number of folds to use for cross-validation
    :param show_top: number of best-ranked grid rows to return in the metric table
    :return: tuple (model ranked first by AUC, first show_top rows of the sorted metric table
        without the 'model_ids' column)
    """
    hyperparameters = {'alpha': [0.01, 0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1],
                       'lambda': [0, 1, 0.5, 0.1, 0.001, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8]}
    search_criteria = {'strategy': "Cartesian"}
    glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial',
                                                           seed=1234,
                                                           balance_classes=True,
                                                           nfolds=nfolds,
                                                           standardize=True),
                             hyperparameters,
                             search_criteria=search_criteria)
    glm_grid.train(x=selected, y=target, training_frame=train, validation_frame=test)
    sorted_grid = glm_grid.get_grid(sort_by='auc', decreasing=True)
    # Pick the best model from the AUC-sorted grid; the grid the search was started on
    # is not re-ordered by get_grid().
    glm_best_model = sorted_grid.models[0]
    glm_best_model.name = 'best GLM - grid search'
    glm_grid_table = sorted_grid.sorted_metric_table().drop('model_ids', axis=1)[0:show_top]
    return glm_best_model, glm_grid_table