In [36]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# XGBoost

In [37]:
# Load the dataset from the CSV file at data/cleaned_data.csv (relative to the working directory).
df = pd.read_csv('data/cleaned_data.csv')

In [38]:
# Evaluation Function
def evaluate(model, dt_test, dt_train, target_train, target_test):
    """Print a confusion matrix and classification report for both data splits.

    The model predicts on the test features first and then on the training
    features. The test-set results are printed first, followed by a blank-line
    separator and the train-set results.

    Parameters
    ----------
    model : object exposing ``predict(features)``
    dt_test : features of the test split
    dt_train : features of the training split
    target_train : true labels matching ``dt_train``
    target_test : true labels matching ``dt_test``

    Returns
    -------
    None; everything is written to standard output.

    Note the argument order: features come as (test, train) while the targets
    come as (train, test).
    """
    # Predict on both splits up front, test split first.
    test_hat, train_hat = [model.predict(features) for features in (dt_test, dt_train)]

    sections = (
        ('Evaluations for test:\n', target_test, test_hat),
        ('Evaluations for train:\n', target_train, train_hat),
    )
    for position, (heading, truth, predicted) in enumerate(sections):
        if position > 0:
            # Separator between the test and train sections.
            print('\n')
        print(heading, confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))

In [39]:
# Separate the label column ('churn') from the remaining feature columns.
# `columns=` alone selects the column axis, so no `axis` argument is needed.
target = df['churn']
data = df.drop(columns=['churn'])

# Split into train and test sets using train_test_split's default test fraction;
# the fixed random_state makes the split repeatable across runs.
data_train, data_test, target_train, target_test = train_test_split(data, target, random_state=42)

In [40]:
# Baseline model: an XGBClassifier built with no arguments, i.e. library defaults
clf = XGBClassifier()

# Train on the training split; as the cell's last expression, the fitted estimator is echoed
clf.fit(X=data_train, y=target_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [41]:
# Report test and train metrics for the baseline classifier.
evaluate(
    model=clf,
    dt_test=data_test,
    dt_train=data_train,
    target_train=target_train,
    target_test=target_test,
)

Evaluations for test:
 [[704   5]
 [ 36  89]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       709
           1       0.95      0.71      0.81       125

    accuracy                           0.95       834
   macro avg       0.95      0.85      0.89       834
weighted avg       0.95      0.95      0.95       834



Evaluations for train:
 [[2140    1]
 [  73  285]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2141
           1       1.00      0.80      0.89       358

    accuracy                           0.97      2499
   macro avg       0.98      0.90      0.93      2499
weighted avg       0.97      0.97      0.97      2499



# GridSearch Tuning

In [42]:
# Candidate hyperparameter values for the grid search:
# 2 * 1 * 2 * 2 * 1 = 8 combinations in total.
param_grid = dict(
    learning_rate=[0.1, 0.2],
    max_depth=[6],
    min_child_weight=[1, 2],
    subsample=[0.5, 0.7],
    n_estimators=[100],
)

In [43]:
# Search every combination in param_grid, scoring candidates by accuracy.
# cv=None leaves the cross-validation strategy at scikit-learn's default.
grid_clf = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X=data_train, y=target_train)

best_parameters = grid_clf.best_params_

# Print the selected combination, one "name: value" line per parameter, sorted by name.
print('Grid Search found the following optimal parameters: ')
for param_name, param_value in sorted(best_parameters.items()):
    print('%s: %r' % (param_name, param_value))

Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 6
min_child_weight: 1
n_estimators: 100
subsample: 0.5


In [29]:
# Report test and train metrics for the fitted grid search object.
evaluate(
    model=grid_clf,
    dt_test=data_test,
    dt_train=data_train,
    target_train=target_train,
    target_test=target_test,
)

Evaluations for test:
 [[703   6]
 [ 31  94]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       709
           1       0.94      0.75      0.84       125

    accuracy                           0.96       834
   macro avg       0.95      0.87      0.90       834
weighted avg       0.96      0.96      0.95       834



Evaluations for train:
 [[2141    0]
 [  43  315]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2141
           1       1.00      0.88      0.94       358

    accuracy                           0.98      2499
   macro avg       0.99      0.94      0.96      2499
weighted avg       0.98      0.98      0.98      2499

