In [1]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import warnings
import imblearn
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')
%matplotlib inline



# XGBoost

In [8]:
# Load the prepared dataset from a path relative to the notebook's working directory.
# Later cells read the 'churn' column from this frame as the prediction target.
df = pd.read_csv('data/cleaned_data.csv')

In [9]:
# Evaluation Function
def evaluate(model,dt_test,dt_train,target_train,target_test):
    """Print a confusion matrix and classification report for both splits.

    The test split is reported first, then the train split, separated by
    blank lines. Nothing is returned; all results go to stdout.

    Note the argument order: the feature sets are passed test-then-train,
    while the targets are passed train-then-test.

    Parameters
    ----------
    model : object exposing ``predict(features)``.
    dt_test : features of the held-out split.
    dt_train : features of the training split.
    target_train : true labels matching ``dt_train``.
    target_test : true labels matching ``dt_test``.
    """
    # Run both predictions up front so nothing is printed if either one fails.
    test_predictions = model.predict(dt_test)
    train_predictions = model.predict(dt_train)

    sections = (
        ('Evaluations for test:\n', target_test, test_predictions),
        ('Evaluations for train:\n', target_train, train_predictions),
    )
    for position, (heading, actual, predicted) in enumerate(sections):
        if position > 0:
            # Spacer between the test section and the train section.
            print('\n')
        print(heading, confusion_matrix(actual, predicted))
        print(classification_report(actual, predicted))

In [10]:
# Separate the predictors from the 'churn' label column.
data = df.drop(columns='churn')
target = df['churn']

# Hold out a test partition; the fixed random_state keeps the split repeatable.
# No test_size is given, so scikit-learn's default split proportion applies.
(data_train, data_test,
 target_train, target_test) = train_test_split(data, target, random_state=42)

In [11]:
# Resample the training split only (data_test / target_test are left untouched)
# with SMOTE, seeded so the resampled set is repeatable.
resampler = SMOTE(random_state=42)
data_train_smoted, target_train_smoted = resampler.fit_resample(data_train, target_train)

# From here on the training names refer to the resampled data.
data_train, target_train = data_train_smoted, target_train_smoted

In [14]:
# Column labels for the resampled training features, in feature-matrix order.
feature_columns = ['account length', 'international plan', 'voice mail plan', 'total day minutes',
                   'total day calls', 'total eve minutes', 'total eve calls', 'total night minutes',
                   'total night calls', 'total intl minutes', 'total intl calls',
                   'customer service calls']

# Wrap the resampler output in a DataFrame with a fresh 0..n-1 index.
# NOTE(review): depending on the installed imblearn version, fit_resample may
# return a plain array or a DataFrame; the constructor accepts both, whereas
# DataFrame.from_records is deprecated for DataFrame input in recent pandas.
# reset_index returns a new object, so relabeling below cannot affect the
# object returned by the resampler.
data_train = pd.DataFrame(data_train).reset_index(drop=True)

# Assign the labels directly rather than via set_axis(..., inplace=False):
# the `inplace` keyword was removed in pandas 2.0 and raises TypeError there.
# Direct assignment works on old and new pandas alike, and still raises
# ValueError if the number of labels does not match the number of columns.
data_train.columns = feature_columns

In [15]:
# Baseline model: an XGBClassifier built with the library's default
# hyper-parameters (no arguments are overridden here).
clf = XGBClassifier()

# Train on the SMOTE-resampled, relabeled training data. As the last
# expression in the cell, the fitted estimator's repr is displayed as output.
clf.fit(data_train, target_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [16]:
# Print confusion matrices and classification reports for the baseline model on
# the held-out test split and on the (resampled) training split.
# Argument order follows evaluate's signature: features test-then-train,
# targets train-then-test.
evaluate(clf, data_test, data_train, target_train, target_test)

Evaluations for test:
 [[692  17]
 [ 29  96]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       709
           1       0.85      0.77      0.81       125

    accuracy                           0.94       834
   macro avg       0.90      0.87      0.89       834
weighted avg       0.94      0.94      0.94       834



Evaluations for train:
 [[2108   33]
 [ 121 2020]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2141
           1       0.98      0.94      0.96      2141

    accuracy                           0.96      4282
   macro avg       0.96      0.96      0.96      4282
weighted avg       0.96      0.96      0.96      4282



# GridSearch Tuning

In [17]:
# Candidate hyper-parameter values for the grid search.
# 2 learning rates x 1 depth x 2 child weights x 2 subsample ratios x 1 tree
# count = 8 combinations in total.
param_grid = dict(
    learning_rate=[0.1, 0.2],
    max_depth=[6],
    min_child_weight=[1, 2],
    subsample=[0.5, 0.7],
    n_estimators=[100],
)

In [18]:
# Exhaustive search over param_grid, scored by accuracy. cv=None leaves the
# fold strategy at scikit-learn's default; n_jobs=1 runs the fits serially.
# NOTE(review): data_train was resampled with SMOTE before this search, so the
# validation folds are drawn from already-resampled data. Consider resampling
# inside each fold if unbiased CV scores are needed.
grid_clf = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=None,
    n_jobs=1,
)
grid_clf.fit(data_train, target_train)

best_parameters = grid_clf.best_params_

# Report the winning combination, one parameter per line, in alphabetical order.
print('Grid Search found the following optimal parameters: ')
for param_name, param_value in sorted(best_parameters.items()):
    print('%s: %r' % (param_name, param_value))

Grid Search found the following optimal parameters: 
learning_rate: 0.2
max_depth: 6
min_child_weight: 1
n_estimators: 100
subsample: 0.5


In [19]:
# Same report as for the baseline, but predictions come from grid_clf.predict.
# NOTE(review): with GridSearchCV's default refit behaviour this should use the
# best parameter combination refit on the whole training set; confirm against
# the installed scikit-learn version.
evaluate(grid_clf, data_test, data_train, target_train, target_test)

Evaluations for test:
 [[702   7]
 [ 28  97]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       709
           1       0.93      0.78      0.85       125

    accuracy                           0.96       834
   macro avg       0.95      0.88      0.91       834
weighted avg       0.96      0.96      0.96       834



Evaluations for train:
 [[2141    0]
 [  11 2130]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2141
           1       1.00      0.99      1.00      2141

    accuracy                           1.00      4282
   macro avg       1.00      1.00      1.00      4282
weighted avg       1.00      1.00      1.00      4282

