#### AdaBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

##### Original Data with AdaBoost

In [None]:
# a) Tune an AdaBoost classifier with an exhaustive grid search
# Candidate values: 4 ensemble sizes x 5 learning rates = 20 combinations
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
)

# AdaBoost model with a fixed random_state
abc = AdaBoostClassifier(random_state=42)

# 5-fold cross-validated search scored by ROC AUC; refit=True retrains the
# best combination so best_estimator_ is available afterwards
abc_grid = GridSearchCV(
    estimator=abc,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    n_jobs=10,
    verbose=5,
)

# Run the search on the training split
abc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed: 112.7min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed: 210.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=42),
             iid='deprecated', n_jobs=10,
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
                         'n_estimators': [100, 200, 300, 400]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=5)

In [None]:
# b) Score the train and test splits with the model selected by the grid search
# best_estimator_ holds the winning configuration refit by GridSearchCV
abc_bt = abc_grid.best_estimator_
print(abc_bt)

# Training split: predicted class labels, then class probabilities
y_train_pred_abc = abc_bt.predict(X_train)
y_train_proba_abc = abc_bt.predict_proba(X_train)

# Test split: the same two outputs
y_test_pred_abc = abc_bt.predict(X_test)
y_test_proba_abc = abc_bt.predict_proba(X_test)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5,
                   n_estimators=400, random_state=42)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
abc_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train, y_train_pred_abc)),
    ("Accuracy Train:", skm.accuracy_score(y_train, y_train_pred_abc)),
    ("Precision Train:", skm.precision_score(y_train, y_train_pred_abc)),
    ("Recall Train:", skm.recall_score(y_train, y_train_pred_abc)),
    ("AUC score", skm.roc_auc_score(y_train, y_train_proba_abc[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_abc)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_abc)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_abc)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_abc)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_abc[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in abc_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[223843     53]
 [ 19783     77]]
Accuracy Train: 0.9186235415743612
Precision Train: 0.5923076923076923
Recall Train: 0.0038771399798590133
AUC score 0.7120462854669866

 Test Confusion Matrix: 
 [[55961    15]
 [ 4953    12]]
Accuracy Test: 0.918478528412727
Precision Test: 0.4444444444444444
Recall Test: 0.002416918429003021
AUC score 0.7104743710475256


##### Under-sampling with AdaBoost with different learning rate

In [None]:
# a) Tune an AdaBoost classifier with an exhaustive grid search
# Candidate values: 4 ensemble sizes x 5 learning rates = 20 combinations
param_grid1 = dict(
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
)

# AdaBoost model with a fixed random_state
abc1 = AdaBoostClassifier(random_state=42)

# 5-fold cross-validated search scored by ROC AUC, using all available cores;
# refit=True retrains the best combination so best_estimator_ is available
abc_grid1 = GridSearchCV(
    estimator=abc1,
    param_grid=param_grid1,
    scoring='roc_auc',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=5,
)

# Run the search on X_train_usm / y_train_usm
# (presumably the under-sampled training split, per the section title)
abc_grid1.fit(X_train_usm, y_train_usm)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 28.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=42),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
                         'n_estimators': [100, 200, 300, 400]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=5)

In [None]:
# b) Score the train and test splits with the model selected by the grid search
# best_estimator_ holds the winning configuration refit by GridSearchCV
abc_bt1 = abc_grid1.best_estimator_
print(abc_bt1)

# Training data the search was fitted on: class labels, then class probabilities
y_train_pred_abc1 = abc_bt1.predict(X_train_usm)
y_train_proba_abc1 = abc_bt1.predict_proba(X_train_usm)

# Test split: the same two outputs
y_test_pred_abc1 = abc_bt1.predict(X_test)
y_test_proba_abc1 = abc_bt1.predict_proba(X_test)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.2,
                   n_estimators=400, random_state=42)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
abc1_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train_usm, y_train_pred_abc1)),
    ("Accuracy Train:", skm.accuracy_score(y_train_usm, y_train_pred_abc1)),
    ("Precision Train:", skm.precision_score(y_train_usm, y_train_pred_abc1)),
    ("Recall Train:", skm.recall_score(y_train_usm, y_train_pred_abc1)),
    ("AUC score", skm.roc_auc_score(y_train_usm, y_train_proba_abc1[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_abc1)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_abc1)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_abc1)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_abc1)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_abc1[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in abc1_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[13070  6790]
 [ 6972 12888]]
Accuracy Train: 0.6535246727089628
Precision Train: 0.6549446081918894
Recall Train: 0.6489425981873111
AUC score 0.7098312735979652

 Test Confusion Matrix: 
 [[36689 19287]
 [ 1744  3221]]
Accuracy Test: 0.6548957188099965
Precision Test: 0.14310467389372666
Recall Test: 0.6487411883182276
AUC score 0.7073788277266289


In [None]:
# a) Tune an AdaBoost classifier with an exhaustive grid search
# Candidate values: larger ensembles (300-600) and learning rates 0.4-0.8,
# again 4 x 5 = 20 combinations
param_grid2 = dict(
    n_estimators=[300, 400, 500, 600],
    learning_rate=[0.4, 0.5, 0.6, 0.7, 0.8],
)

# AdaBoost model with a fixed random_state
abc2 = AdaBoostClassifier(random_state=42)

# 5-fold cross-validated search scored by ROC AUC, using all available cores;
# refit=True retrains the best combination so best_estimator_ is available
abc_grid2 = GridSearchCV(
    estimator=abc2,
    param_grid=param_grid2,
    scoring='roc_auc',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=5,
)

# Run the search on X_train_usm / y_train_usm
# (presumably the under-sampled training split, per the section title)
abc_grid2.fit(X_train_usm, y_train_usm)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 35.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 52.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=42),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.4, 0.5, 0.6, 0.7, 0.8],
                         'n_estimators': [300, 400, 500, 600]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=5)

In [None]:
# b) Score the train and test splits with the model selected by the grid search
# best_estimator_ holds the winning configuration refit by GridSearchCV
abc_bt2 = abc_grid2.best_estimator_
print(abc_bt2)

# Training data the search was fitted on: class labels, then class probabilities
y_train_pred_abc2 = abc_bt2.predict(X_train_usm)
y_train_proba_abc2 = abc_bt2.predict_proba(X_train_usm)

# Test split: the same two outputs
y_test_pred_abc2 = abc_bt2.predict(X_test)
y_test_proba_abc2 = abc_bt2.predict_proba(X_test)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.4,
                   n_estimators=500, random_state=42)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
abc2_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train_usm, y_train_pred_abc2)),
    ("Accuracy Train:", skm.accuracy_score(y_train_usm, y_train_pred_abc2)),
    ("Precision Train:", skm.precision_score(y_train_usm, y_train_pred_abc2)),
    ("Recall Train:", skm.recall_score(y_train_usm, y_train_pred_abc2)),
    ("AUC score", skm.roc_auc_score(y_train_usm, y_train_proba_abc2[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_abc2)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_abc2)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_abc2)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_abc2)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_abc2[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in abc2_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[13116  6744]
 [ 6902 12958]]
Accuracy Train: 0.6564451158106748
Precision Train: 0.6576997259161507
Recall Train: 0.6524672708962739
AUC score 0.7133951456773446

 Test Confusion Matrix: 
 [[36741 19235]
 [ 1735  3230]]
Accuracy Test: 0.6558966869595182
Precision Test: 0.14377921210772313
Recall Test: 0.6505538771399798
AUC score 0.7075325297663896


#### XGBoost

##### Original Data with XGBoost

In [None]:
# a) Use XGBoost Classifier along with RandomizedSearchCV
# Search space the randomized search samples candidates from
# NOTE(review): learning rates above 1.0 are outside XGBoost's documented
# eta range of [0, 1] -- confirm the 1.6 upper bound is intentional
param_grid3 = {'n_estimators': np.arange(100, 1050, 50),
              'learning_rate': np.arange(0.1, 1.7, 0.1),
               'max_depth':[1,2],
               'gamma':np.arange(0, 5.25, 0.25)}

# Create XGBoost Classifier model
clf_xgb = xgb.XGBClassifier(random_state = 42)

# Create the RandomizedSearchCV object: 5-fold CV scored by ROC AUC, with the
# number of sampled candidates left at its default.
# random_state fixes which candidates are drawn so the search (and therefore
# the reported best estimator) is reproducible across runs.
xgb_grid = RandomizedSearchCV(clf_xgb, param_grid3, cv = 5, scoring = 'roc_auc', refit = True,
                              n_jobs = -1, random_state = 42, verbose = 5)

# Fit the search object with train data
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 90.5min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=42, reg_alpha=0,
                                           reg_lambda=1, s...
       2.75, 3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ]),
                                        'learning_rate': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,

In [None]:
# Score the train and test splits with the model selected by the randomized search
# best_estimator_ holds the winning configuration refit by RandomizedSearchCV
xgb_bt = xgb_grid.best_estimator_
print("Best estimator:", xgb_bt)

# Training split: predicted class labels, then class probabilities
y_train_pred_xgb = xgb_bt.predict(X_train)
y_train_proba_xgb = xgb_bt.predict_proba(X_train)

# Test split: the same two outputs
y_test_pred_xgb = xgb_bt.predict(X_test)
y_test_proba_xgb = xgb_bt.predict_proba(X_test)

Best estimator: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=4.5,
              learning_rate=1.4000000000000001, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=None, n_estimators=550, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
xgb_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train, y_train_pred_xgb)),
    ("Accuracy Train:", skm.accuracy_score(y_train, y_train_pred_xgb)),
    ("Precision Train:", skm.precision_score(y_train, y_train_pred_xgb)),
    ("Recall Train:", skm.recall_score(y_train, y_train_pred_xgb)),
    ("AUC score", skm.roc_auc_score(y_train, y_train_proba_xgb[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_xgb)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_xgb)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_xgb)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_xgb)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_xgb[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in xgb_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[223807     89]
 [ 19760    100]]
Accuracy Train: 0.9185702095538161
Precision Train: 0.5291005291005291
Recall Train: 0.005035246727089627
AUC score 0.7121650501459262

 Test Confusion Matrix: 
 [[55956    20]
 [ 4951    14]]
Accuracy Test: 0.9184293004709473
Precision Test: 0.4117647058823529
Recall Test: 0.0028197381671701913
AUC score 0.7104054323526081


##### Under-sampling with XGBoost

In [None]:
# Use XGBoost Classifier along with RandomizedSearchCV
# Search space the randomized search samples candidates from
# NOTE(review): learning rates above 1.0 are outside XGBoost's documented
# eta range of [0, 1] -- confirm the 1.6 upper bound is intentional
param_grid4 = {'n_estimators': np.arange(100, 1050, 50),
              'learning_rate': np.arange(0.1, 1.7, 0.1),
               'max_depth':[1,2],
               'gamma':np.arange(0, 5.25, 0.25)}

# Create XGBoost Classifier model
clf_xgb1 = xgb.XGBClassifier(random_state = 42)

# Create the RandomizedSearchCV object: 5-fold CV scored by ROC AUC, with the
# number of sampled candidates left at its default.
# random_state fixes which candidates are drawn so the search (and therefore
# the reported best estimator) is reproducible across runs.
xgb_grid1 = RandomizedSearchCV(clf_xgb1, param_grid4, cv = 5, scoring = 'roc_auc', refit = True,
                               n_jobs = -1, random_state = 42, verbose = 5)

# Fit the search object on X_train_usm / y_train_usm
# (presumably the under-sampled training split, per the section title)
xgb_grid1.fit(X_train_usm, y_train_usm)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 12.7min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=42, reg_alpha=0,
                                           reg_lambda=1, s...
       2.75, 3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ]),
                                        'learning_rate': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,

In [None]:
# Score the train and test splits with the model selected by the randomized search
# best_estimator_ holds the winning configuration refit by RandomizedSearchCV
xgb_bt1 = xgb_grid1.best_estimator_
print("Best estimator:", xgb_bt1)

# Training data the search was fitted on: class labels, then class probabilities
y_train_pred_xgb1 = xgb_bt1.predict(X_train_usm)
y_train_proba_xgb1 = xgb_bt1.predict_proba(X_train_usm)

# Test split: the same two outputs
y_test_pred_xgb1 = xgb_bt1.predict(X_test)
y_test_proba_xgb1 = xgb_bt1.predict_proba(X_test)

Best estimator: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.5,
              learning_rate=0.30000000000000004, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
xgb1_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train_usm, y_train_pred_xgb1)),
    ("Accuracy Train:", skm.accuracy_score(y_train_usm, y_train_pred_xgb1)),
    ("Precision Train:", skm.precision_score(y_train_usm, y_train_pred_xgb1)),
    ("Recall Train:", skm.recall_score(y_train_usm, y_train_pred_xgb1)),
    ("AUC score", skm.roc_auc_score(y_train_usm, y_train_proba_xgb1[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_xgb1)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_xgb1)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_xgb1)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_xgb1)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_xgb1[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in xgb1_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[13152  6708]
 [ 6973 12887]]
Accuracy Train: 0.6555639476334341
Precision Train: 0.657667772390916
Recall Train: 0.6488922457200402
AUC score 0.7126920239257887

 Test Confusion Matrix: 
 [[36750 19226]
 [ 1746  3219]]
Accuracy Test: 0.6558638683316651
Precision Test: 0.1434172421474716
Recall Test: 0.6483383685800604
AUC score 0.7079848384165793


##### Cost-sensitive with XGBoost

In [None]:
# a) Use XGBoost Classifier along with RandomizedSearchCV
# Search space the randomized search samples candidates from
# NOTE(review): learning rates above 1.0 are outside XGBoost's documented
# eta range of [0, 1] -- confirm the 1.6 upper bound is intentional
param_grid5 = {'n_estimators': np.arange(100, 1050, 50),
              'learning_rate': np.arange(0.1, 1.7, 0.1),
               'max_depth':[1,2],
               'gamma':np.arange(0, 5.25, 0.25)}

# Create XGBoost Classifier model with a fixed scale_pos_weight;
# weight_1 is defined elsewhere in the notebook
clf_xgb2 = xgb.XGBClassifier(random_state = 42, scale_pos_weight = weight_1)

# Create the RandomizedSearchCV object: 5-fold CV scored by ROC AUC, with the
# number of sampled candidates left at its default.
# random_state fixes which candidates are drawn so the search (and therefore
# the reported best estimator) is reproducible across runs.
xgb_grid2 = RandomizedSearchCV(clf_xgb2, param_grid5, cv = 5, scoring = 'roc_auc', refit = True,
                               n_jobs = -1, random_state = 42, verbose = 5)

# Fit the search object with train data
xgb_grid2.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 67.4min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=42, reg_alpha=0,
                                           reg_lambda=1,
                                           s...
       2.75, 3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ]),
                                        'learning_rate': array([0

In [None]:
# Score the train and test splits with the model selected by the randomized search
# best_estimator_ holds the winning configuration refit by RandomizedSearchCV
xgb_bt2 = xgb_grid2.best_estimator_
print("Best estimator:", xgb_bt2)

# Training split: predicted class labels, then class probabilities
y_train_pred_xgb2 = xgb_bt2.predict(X_train)
y_train_proba_xgb2 = xgb_bt2.predict_proba(X_train)

# Test split: the same two outputs
y_test_pred_xgb2 = xgb_bt2.predict(X_test)
y_test_proba_xgb2 = xgb_bt2.predict_proba(X_test)

In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
xgb2_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train, y_train_pred_xgb2)),
    ("Accuracy Train:", skm.accuracy_score(y_train, y_train_pred_xgb2)),
    ("Precision Train:", skm.precision_score(y_train, y_train_pred_xgb2)),
    ("Recall Train:", skm.recall_score(y_train, y_train_pred_xgb2)),
    ("AUC score", skm.roc_auc_score(y_train, y_train_proba_xgb2[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_xgb2)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_xgb2)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_xgb2)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_xgb2)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_xgb2[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in xgb2_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[150956  72940]
 [  6762  13098]]
Accuracy Train: 0.6730254845008944
Precision Train: 0.15223505892745065
Recall Train: 0.6595166163141994
AUC score 0.727116094281797

 Test Confusion Matrix: 
 [[37807 18169]
 [ 1780  3185]]
Accuracy Test: 0.6726505964785612
Precision Test: 0.14915238362836003
Recall Test: 0.6414904330312186
AUC score 0.7146429951060885


Search over `scale_pos_weight` values (class weights)

In [None]:
# a) Use XGBoost Classifier along with RandomizedSearchCV
# Search space the randomized search samples candidates from; here
# scale_pos_weight is itself searched and the learning rates are small
# (0.01-0.09). The value 11.273716012084591 matches the scale_pos_weight
# printed for the models built with weight_1 in this notebook.
param_grid6 = {'n_estimators': np.arange(100, 1050, 50),
              'learning_rate': np.arange(0.01, 0.1, 0.01),
               'max_depth':[1,2],
               'gamma':np.arange(0, 5.25, 0.25),
               'scale_pos_weight': [4, 6, 8, 10, 11.273716012084591, 12, 14]}

# Create XGBoost Classifier model
clf_xgb3 = xgb.XGBClassifier(random_state = 42)

# Create the RandomizedSearchCV object: 5-fold CV scored by ROC AUC, with the
# number of sampled candidates left at its default.
# random_state fixes which candidates are drawn so the search (and therefore
# the reported best estimator) is reproducible across runs.
xgb_grid3 = RandomizedSearchCV(clf_xgb3, param_grid6, cv = 5, scoring = 'roc_auc', refit = True,
                               n_jobs = -1, random_state = 42, verbose = 5)

# Fit the search object with train data
xgb_grid3.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 83.3min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=42, reg_alpha=0,
                                           reg_lambda=1, s...
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09]),
                                        'max_depth': [1, 

In [None]:
# Score the train and test splits with the model selected by the randomized search
# best_estimator_ holds the winning configuration refit by RandomizedSearchCV
xgb_bt3 = xgb_grid3.best_estimator_
print("Best estimator:", xgb_bt3)

# Training split: predicted class labels, then class probabilities
y_train_pred_xgb3 = xgb_bt3.predict(X_train)
y_train_proba_xgb3 = xgb_bt3.predict_proba(X_train)

# Test split: the same two outputs
y_test_pred_xgb3 = xgb_bt3.predict(X_test)
y_test_proba_xgb3 = xgb_bt3.predict_proba(X_test)

Best estimator: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=4.25,
              learning_rate=0.06999999999999999, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=700, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=6, seed=None,
              silent=None, subsample=1, verbosity=1)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
xgb3_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train, y_train_pred_xgb3)),
    ("Accuracy Train:", skm.accuracy_score(y_train, y_train_pred_xgb3)),
    ("Precision Train:", skm.precision_score(y_train, y_train_pred_xgb3)),
    ("Recall Train:", skm.recall_score(y_train, y_train_pred_xgb3)),
    ("AUC score", skm.roc_auc_score(y_train, y_train_proba_xgb3[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_xgb3)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_xgb3)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_xgb3)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_xgb3)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_xgb3[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in xgb3_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[200190  23706]
 [ 13158   6702]]
Accuracy Train: 0.8487667995864717
Precision Train: 0.22040252565114443
Recall Train: 0.3374622356495468
AUC score 0.7218098361314782

 Test Confusion Matrix: 
 [[50064  5912]
 [ 3325  1640]]
Accuracy Test: 0.8484271672601369
Precision Test: 0.21716101694915255
Recall Test: 0.33031218529707956
AUC score 0.7142888672904126


Search learning rate with `scale_pos_weight` fixed at 11.273716012084591



In [None]:
# a) Use XGBoost Classifier along with RandomizedSearchCV
# Search space the randomized search samples candidates from: small learning
# rates (0.01-0.09) with scale_pos_weight held fixed on the estimator
param_grid7 = {'n_estimators': np.arange(100, 1050, 50),
              'learning_rate': np.arange(0.01, 0.1, 0.01),
               'max_depth':[1,2],
               'gamma':np.arange(0, 5.25, 0.25)}

# Create XGBoost Classifier model with a fixed scale_pos_weight;
# weight_1 is defined elsewhere in the notebook
clf_xgb4 = xgb.XGBClassifier(random_state = 42, scale_pos_weight = weight_1)

# Create the RandomizedSearchCV object: 5-fold CV scored by ROC AUC, with the
# number of sampled candidates left at its default.
# The search must use param_grid7 defined above; passing param_grid5 here
# would silently search the 0.1-1.6 learning-rate range instead.
# random_state fixes which candidates are drawn so the search is reproducible.
xgb_grid4 = RandomizedSearchCV(clf_xgb4, param_grid7, cv = 5, scoring = 'roc_auc', refit = True,
                               n_jobs = -1, random_state = 42, verbose = 5)

# Fit the search object with train data
xgb_grid4.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 80.7min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=42, reg_alpha=0,
                                           reg_lambda=1,
                                           s...
       2.75, 3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ]),
                                        'learning_rate': array([0

In [None]:
# Score the train and test splits with the model selected by the randomized search
# best_estimator_ holds the winning configuration refit by RandomizedSearchCV
xgb_bt4 = xgb_grid4.best_estimator_
print("Best estimator:", xgb_bt4)

# Training split: predicted class labels, then class probabilities
y_train_pred_xgb4 = xgb_bt4.predict(X_train)
y_train_proba_xgb4 = xgb_bt4.predict_proba(X_train)

# Test split: the same two outputs
y_test_pred_xgb4 = xgb_bt4.predict(X_test)
y_test_proba_xgb4 = xgb_bt4.predict_proba(X_test)

Best estimator: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=2.5,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=550, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=11.273716012084591,
              seed=None, silent=None, subsample=1, verbosity=1)


In [None]:
# Gather (label, value) pairs: train-split metrics first, then test-split metrics.
# ROC AUC is scored on column 1 of predict_proba (presumably the positive class).
xgb4_report = [
    ("Train Confusion Matrix: \n", skm.confusion_matrix(y_train, y_train_pred_xgb4)),
    ("Accuracy Train:", skm.accuracy_score(y_train, y_train_pred_xgb4)),
    ("Precision Train:", skm.precision_score(y_train, y_train_pred_xgb4)),
    ("Recall Train:", skm.recall_score(y_train, y_train_pred_xgb4)),
    ("AUC score", skm.roc_auc_score(y_train, y_train_proba_xgb4[:, 1])),
    ("\n Test Confusion Matrix: \n", skm.confusion_matrix(y_test, y_test_pred_xgb4)),
    ("Accuracy Test:", skm.accuracy_score(y_test, y_test_pred_xgb4)),
    ("Precision Test:", skm.precision_score(y_test, y_test_pred_xgb4)),
    ("Recall Test:", skm.recall_score(y_test, y_test_pred_xgb4)),
    ("AUC score", skm.roc_auc_score(y_test, y_test_proba_xgb4[:, 1])),
]

# Emit one "label value" line per metric
for metric_label, metric_value in xgb4_report:
    print(metric_label, metric_value)

Train Confusion Matrix: 
 [[150340  73556]
 [  6841  13019]]
Accuracy Train: 0.670174272633289
Precision Train: 0.15037828472422754
Recall Train: 0.6555387713997985
AUC score 0.7231947058591546

 Test Confusion Matrix: 
 [[37638 18338]
 [ 1766  3199]]
Accuracy Test: 0.6701071528199406
Precision Test: 0.14853507916608627
Recall Test: 0.6443101711983887
AUC score 0.7143377391202475
