In [2]:
import numpy as np
import pandas as pd
import xgboost
import lightgbm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

### 1. Data

In [3]:
train = pd.read_csv("./data/processed_train.csv")
test = pd.read_csv("./data/processed_test.csv")

In [4]:
y_train = train.pop('target')
train.drop('ID_code', inplace=True, axis=1)
X_train = train

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15)

In [6]:
eval_set = [(X_val, y_val)]

### 2. Model Training - XGBoost

In [10]:
xgboost_model = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      verbosiry=1)

In [11]:
xgboost_model.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,  # No new tree is built if model performance doesn't improve over 10 iterations
                  eval_set=eval_set,
                  verbose=True)



Parameters: { "verbosiry" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.61756
[1]	validation_0-auc:0.65707
[2]	validation_0-auc:0.67953
[3]	validation_0-auc:0.68938
[4]	validation_0-auc:0.69498
[5]	validation_0-auc:0.70060
[6]	validation_0-auc:0.70537
[7]	validation_0-auc:0.71301
[8]	validation_0-auc:0.71864
[9]	validation_0-auc:0.72294
[10]	validation_0-auc:0.73241
[11]	validation_0-auc:0.73389
[12]	validation_0-auc:0.73659
[13]	validation_0-auc:0.74013
[14]	validation_0-auc:0.74220
[15]	validation_0-auc:0.74465
[16]	validation_0-auc:0.74621
[17]	validation_0-auc:0.74742
[18]	validation_0-auc:0.75623
[19]	validation_0-auc:0.76043
[20]	validation_0-auc:0.76350
[21]	validation_0-auc:0.76632
[22]	validation_0-auc:0.77019
[23]	validation_0-auc:0.77194
[24

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=5000, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.5, tree_method='exact',
              validate_parameters=1, verbosiry=1, verbosity=None)

### 3. Evaluate model

In [15]:
y_train_pred = xgboost_model.predict_proba(X_train)[:,1]
y_val_pred = xgboost_model.predict_proba(X_val)[:,1] 

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                     roc_auc_score(y_val, y_val_pred)))



AUC Train: 0.9717
AUC Valid: 0.8911


### 4. Hyperparameter tuning
We will use GridSearchCV for hyperparameter tuning. First, let's define all the possible hyperparameter values which we want to check.

In [10]:
learning_rate = [0.02, 0.05, 0.1]
max_depth = [2, 3, 5]
n_estimators = [1000, 2000, 3000]

params_dict = {
    "learning_rate": learning_rate,
    "max_depth": max_depth,
    "n_estimators": n_estimators
}

num_combinations = 1

for val in params_dict.values(): 
    num_combinations *= len(val)
    
print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'max_depth': [2, 3, 5],
 'n_estimators': [1000, 2000, 3000]}

In [14]:
def my_roc_auc_score(model, X, y):
    return roc_auc_score(y, model.predict_proba(X)[:, 1])

In [15]:
model_xgboost_hp = GridSearchCV(estimator=xgboost.XGBClassifier(subsample=0.5,
                                                                colsample_bytree=0.25,
                                                                eval_metric="auc",
                                                                use_label_encoder=False),
                                param_grid=params_dict,
                                cv=2,
                                scoring=my_roc_auc_score,
                                return_train_score=True,
                                verbose=4)

model_xgboost_hp.fit(X_train, y_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits




[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.882, test=0.860) total time= 1.4min




[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.882, test=0.859) total time= 1.4min




[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.908, test=0.883) total time= 3.0min




[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.910, test=0.882) total time= 3.2min




[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.920, test=0.892) total time= 4.3min




[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.922, test=0.891) total time= 4.1min




[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.912, test=0.876) total time= 1.8min




[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.913, test=0.875) total time= 1.8min




[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=2000;, score=(train=0.937, test=0.893) total time= 3.5min




[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=2000;, score=(train=0.939, test=0.892) total time= 3.5min




[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=3000;, score=(train=0.949, test=0.899) total time= 5.6min




[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=3000;, score=(train=0.951, test=0.897) total time= 5.4min




[CV 1/2] END learning_rate=0.02, max_depth=5, n_estimators=1000;, score=(train=0.966, test=0.888) total time= 2.7min




[CV 2/2] END learning_rate=0.02, max_depth=5, n_estimators=1000;, score=(train=0.967, test=0.888) total time= 2.7min




[CV 1/2] END learning_rate=0.02, max_depth=5, n_estimators=2000;, score=(train=0.987, test=0.898) total time= 5.3min




[CV 2/2] END learning_rate=0.02, max_depth=5, n_estimators=2000;, score=(train=0.988, test=0.896) total time= 5.4min




[CV 1/2] END learning_rate=0.02, max_depth=5, n_estimators=3000;, score=(train=0.995, test=0.899) total time= 8.3min




[CV 2/2] END learning_rate=0.02, max_depth=5, n_estimators=3000;, score=(train=0.996, test=0.897) total time= 8.5min




[CV 1/2] END learning_rate=0.05, max_depth=2, n_estimators=1000;, score=(train=0.915, test=0.888) total time= 1.4min




[CV 2/2] END learning_rate=0.05, max_depth=2, n_estimators=1000;, score=(train=0.916, test=0.885) total time= 1.4min




[CV 1/2] END learning_rate=0.05, max_depth=2, n_estimators=2000;, score=(train=0.932, test=0.898) total time= 2.8min




[CV 2/2] END learning_rate=0.05, max_depth=2, n_estimators=2000;, score=(train=0.934, test=0.896) total time= 2.8min




[CV 1/2] END learning_rate=0.05, max_depth=2, n_estimators=3000;, score=(train=0.942, test=0.900) total time= 4.2min




[CV 2/2] END learning_rate=0.05, max_depth=2, n_estimators=3000;, score=(train=0.944, test=0.898) total time= 4.2min




[CV 1/2] END learning_rate=0.05, max_depth=3, n_estimators=1000;, score=(train=0.943, test=0.894) total time= 1.8min




[CV 2/2] END learning_rate=0.05, max_depth=3, n_estimators=1000;, score=(train=0.944, test=0.894) total time= 1.8min




[CV 1/2] END learning_rate=0.05, max_depth=3, n_estimators=2000;, score=(train=0.963, test=0.898) total time= 3.7min




[CV 2/2] END learning_rate=0.05, max_depth=3, n_estimators=2000;, score=(train=0.965, test=0.898) total time= 3.6min




[CV 1/2] END learning_rate=0.05, max_depth=3, n_estimators=3000;, score=(train=0.977, test=0.898) total time= 5.5min




[CV 2/2] END learning_rate=0.05, max_depth=3, n_estimators=3000;, score=(train=0.978, test=0.898) total time= 5.5min




[CV 1/2] END learning_rate=0.05, max_depth=5, n_estimators=1000;, score=(train=0.991, test=0.894) total time= 2.8min




[CV 2/2] END learning_rate=0.05, max_depth=5, n_estimators=1000;, score=(train=0.992, test=0.894) total time= 2.8min




[CV 1/2] END learning_rate=0.05, max_depth=5, n_estimators=2000;, score=(train=1.000, test=0.894) total time= 5.5min




[CV 2/2] END learning_rate=0.05, max_depth=5, n_estimators=2000;, score=(train=1.000, test=0.893) total time= 5.7min




[CV 1/2] END learning_rate=0.05, max_depth=5, n_estimators=3000;, score=(train=1.000, test=0.893) total time= 8.3min




[CV 2/2] END learning_rate=0.05, max_depth=5, n_estimators=3000;, score=(train=1.000, test=0.893) total time= 8.3min




[CV 1/2] END learning_rate=0.1, max_depth=2, n_estimators=1000;, score=(train=0.931, test=0.896) total time= 1.4min




[CV 2/2] END learning_rate=0.1, max_depth=2, n_estimators=1000;, score=(train=0.931, test=0.895) total time= 1.4min




[CV 1/2] END learning_rate=0.1, max_depth=2, n_estimators=2000;, score=(train=0.948, test=0.898) total time= 2.8min




[CV 2/2] END learning_rate=0.1, max_depth=2, n_estimators=2000;, score=(train=0.949, test=0.895) total time= 2.8min




[CV 1/2] END learning_rate=0.1, max_depth=2, n_estimators=3000;, score=(train=0.961, test=0.897) total time= 4.2min




[CV 2/2] END learning_rate=0.1, max_depth=2, n_estimators=3000;, score=(train=0.962, test=0.894) total time= 4.2min




[CV 1/2] END learning_rate=0.1, max_depth=3, n_estimators=1000;, score=(train=0.961, test=0.896) total time= 1.8min




[CV 2/2] END learning_rate=0.1, max_depth=3, n_estimators=1000;, score=(train=0.963, test=0.895) total time= 1.9min


KeyboardInterrupt: 

Now let's look at the output of grid search by ranking by test score i.e. performance on validation data.

### 5. Evaluate results

In [16]:
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results)
df_cv_results = df_cv_results[[
    'rank_test_score',
    'mean_test_score',
    'mean_train_score',
    'param_learning_rate',
    'param_max_depth',
    'param_n_estimators'
]]

df_cv_results.sort_values(by='rank_test_score', inplace=True, ignore_index=True)
df_cv_results

AttributeError: 'XGBClassifier' object has no attribute 'cv_results'

### 6. Build final model

In [7]:
xgboost_model = xgboost.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      verbosiry=1)

In [18]:
evaluation_results = xgboost_model.evals_result()

# Index into each key to find AUC values for training and validation data after each tree
train_auc_tree = evaluation_results['validation_0']['auc']
valid_auc_tree = evaluation_results['validation_1']['auc']


# Plotting Section
plt.figure(figsize=(15,5))

plt.plot(train_auc_tree, label='Train')
plt.plot(valid_auc_tree, label='valid')

plt.title("Train and validation AUC as number of trees increase")
plt.xlabel("Trees")
plt.ylabel("AUC")
plt.legend(loc='lower right')
plt.show()

KeyError: 'validation_1'

Let us also look at variable importance

In [None]:
df_var_imp = pd.DataFrame({"Variable": var_colums,
                           "Importance": xgboost_model.feature_importances_}) \
                        .sort_values(by='Importance', ascending=False)
df_var_imp[:10]

### 7. Score the test data

In [24]:
test.drop('ID_code', axis=1, inplace=True)

KeyError: "['ID_code'] not found in axis"

In [27]:
submission = xgboost_model.predict_proba(test)[:,1]

### A. Model Training - XGBoost

Specify parameters of LightGBM

In [28]:
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 63,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.01,
    'verbose': -1
}

Train the LightGBM model for maximum 5000 rounds. Early stopping criteria is 50 iterations.

In [32]:
train_data = lightgbm.Dataset(X_train, label=y_train)
valid_data = lightgbm.Dataset(X_val, label=y_val)

In [33]:
model_lgbm = lightgbm.train(parameters,
                            train_data,
                            valid_sets=valid_data,
                            num_boost_round=5000,
                            early_stopping_rounds=50)

[1]	valid_0's auc: 0.655092
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.678622
[3]	valid_0's auc: 0.704128
[4]	valid_0's auc: 0.728007
[5]	valid_0's auc: 0.737301
[6]	valid_0's auc: 0.74417
[7]	valid_0's auc: 0.75175
[8]	valid_0's auc: 0.754857
[9]	valid_0's auc: 0.756272
[10]	valid_0's auc: 0.760885
[11]	valid_0's auc: 0.762652
[12]	valid_0's auc: 0.764331
[13]	valid_0's auc: 0.767547
[14]	valid_0's auc: 0.769745
[15]	valid_0's auc: 0.770371
[16]	valid_0's auc: 0.769788
[17]	valid_0's auc: 0.769871
[18]	valid_0's auc: 0.77338
[19]	valid_0's auc: 0.774528
[20]	valid_0's auc: 0.778478
[21]	valid_0's auc: 0.778934
[22]	valid_0's auc: 0.78089
[23]	valid_0's auc: 0.783205
[24]	valid_0's auc: 0.785852
[25]	valid_0's auc: 0.786899
[26]	valid_0's auc: 0.788898
[27]	valid_0's auc: 0.790423
[28]	valid_0's auc: 0.792412
[29]	valid_0's auc: 0.794326
[30]	valid_0's auc: 0.795261
[31]	valid_0's auc: 0.796949
[32]	valid_0's auc: 0.797934
[33]	valid_0's auc: 0.79

In [35]:
y_train_pred = model_lgbm.predict(X_train)
y_val_pred = model_lgbm.predict(X_val)

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_val, y_val_pred)))

AUC Train: 0.9882
AUC Valid: 0.8981
