In [None]:
# Hyperparameter Tuning - XGBoost

# Here is an outline of the process
# 1. Import the customer churn data (I have already cleaned it)
# 2. Split the data into test and train sets
# 3. Build data matrices - as XGBoost uses DMatrix
# 4. Find the logloss of the model with default parameters
# 5. Tune the parameters
# 6. Find the logloss of the model with tuned parameters

# For exploratory analysis and other models on this dataset, please use the following link
# https://github.com/Nickssingh/Churn-Prediction-Model-Telecommunication

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the customer churn dataset; the top rows are viewed in the next cell.
# The data has already been prepared for analysis:
#   - Missing values were removed
#   - Variables were converted into appropriate data types
#   - Categorical variables were encoded using one-hot encoding

churn_csv_path = "Data/telcom_customer_churn.csv"
df_churn = pd.read_csv(churn_csv_path)

In [None]:
# Preview the first five rows of the dataset
df_churn.head(n=5)

In [None]:
# Checking the dimensions of the data (displayed as the cell output)

df_churn.shape

In [None]:
# Split the data into train and test datasets (test:train = 3:7).
# random_state is fixed so the split is reproducible between runs.
from sklearn.model_selection import train_test_split

df_temp = df_churn  # alias of the same DataFrame, not a copy

# 'Churn' is the target; every remaining column is used as a feature
X = df_temp.drop(columns='Churn')
y = df_temp['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Baseline: scikit-learn style XGBoost classifier with default parameters,
# fitted on the training split.
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted classifier on the held-out test split
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Installing XGBoost (if the package is missing, run this once before the cells above that import xgboost)

#%pip install xgboost

In [None]:
# XGBoost uses an internal data structure, DMatrix, which optimizes both memory efficiency and speed.
# Hence, rather than passing pandas DataFrames directly, we wrap each split in a DMatrix.
import xgboost as xgb

dm_train = xgb.DMatrix(data=X_train, label=y_train)
dm_test = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Building Model

# The ideal case would include an exhaustive grid search over all the parameters.
# However, such an approach is computationally intensive.
# Hence, we will focus on a few important parameters and tune them sequentially.

# Following are the parameters that we will tune in this process
# max_depth
# min_child_weight
# subsample
# colsample_bytree
# eta
# num_boost_round
# early_stopping_rounds

# We will use the logistic loss (logloss) to assess the quality of the predictions, as this is a classification problem

In [None]:
# Baseline booster: num_boost_round is 100, early_stopping_rounds is 10 and the objective is binary:logistic.
# All the other values at this stage are default values.
# We will tune our model by changing the default values.
# NOTE(review): early stopping below is evaluated on the test DMatrix, so the reported
# logloss is measured on the same data that selected the number of rounds.

params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': 0.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    # logloss is the metric used to evaluate the model's performance
    'eval_metric': "logloss",
}

xgmodel = xgb.train(
    params,
    dtrain=dm_train,
    num_boost_round=100,
    evals=[(dm_test, "Test")],
    early_stopping_rounds=10,
)

# best_iteration is zero-based, so add one to report a round count
best_round = xgmodel.best_iteration + 1
print("Best Logloss: {:.3f} | Rounds: {}".format(xgmodel.best_score, best_round))

In [None]:
# Here, we found that the tenth round gave the best result and the results did not improve in the next 10 rounds
# Hence, the iteration stopped at round 19 and we did not reach the maximum number of boosting rounds (100).

# Finding suitable evidence to stop the iterations is important.
# Stopping the iterations when the results do not improve prevents overfitting and the inefficient utilization of resources.

In [None]:
# We will use cross validation to tune the parameters within the params dictionary

In [None]:
# Parameters: max_depth and min_child_weight
# The candidate ranges below were narrowed down through multiple earlier iterations

max_depth_candidates = range(1, 4)
min_child_weight_candidates = range(17, 21)

# Every (max_depth, min_child_weight) pair, with max_depth varying slowest
gridsearch_params = [
    (depth, weight)
    for depth in max_depth_candidates
    for weight in min_child_weight_candidates
]

In [None]:
# Display the list of candidate parameter pairs
gridsearch_params

In [None]:
# Grid search over (max_depth, min_child_weight) with 10-fold cross-validation.
# For each candidate pair we record the lowest mean test logloss reached across the
# boosting rounds and keep the pair with the smallest value.
# NOTE: `params` is mutated in place, so after the loop it holds the LAST candidate
# tried, not the best one; the chosen values must be set explicitly afterwards.
logloss_min = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params:

    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    xg_cvresults = xgb.cv(params, dtrain=dm_train, num_boost_round=100,
                          seed=0, nfold=10, metrics={'logloss'}, early_stopping_rounds=10)

    logloss_mean = xg_cvresults['test-logloss-mean'].min()

    # Fixed-point format with three decimals ({:.3f}) so that each candidate line uses
    # the same precision as the summary line below ({:.3} would mean 3 significant digits).
    print("max_depth: {} | min_child_weight: {} with Logloss: {:.3f}\n".format(
        max_depth, min_child_weight, logloss_mean))

    if logloss_mean < logloss_min:
        logloss_min = logloss_mean
        best_params = (max_depth, min_child_weight)


print("Best Parameters: max_depth: {} | min_child_weight: {} with Logloss: {:.3f}".format(
    best_params[0], best_params[1], logloss_min))

In [None]:
# Fix the tree-shape parameters at the best values found: max_depth = 2 and min_child_weight = 19

params.update(max_depth=2, min_child_weight=19)

In [None]:
# Parameters: subsample and colsample_bytree
# The candidate ranges below were narrowed down through multiple earlier iterations

subsample_candidates = [tenths / 10. for tenths in range(7, 11)]   # 0.7 .. 1.0
colsample_candidates = [tenths / 10. for tenths in range(1, 5)]    # 0.1 .. 0.4

# Every (subsample, colsample) pair, with subsample varying slowest
gridsearch_params = [
    (subsample, colsample)
    for subsample in subsample_candidates
    for colsample in colsample_candidates
]

In [None]:
# Grid search over (subsample, colsample_bytree) with 10-fold cross-validation.
# The pair with the lowest mean test logloss is kept in `best_params`.
# NOTE: `params` is updated in place, so after the loop it holds the last candidate tried.
logloss_min, best_params = float("Inf"), None

for subsample, colsample in gridsearch_params:

    params.update(subsample=subsample, colsample_bytree=colsample)

    xg_cvresults = xgb.cv(
        params,
        dtrain=dm_train,
        num_boost_round=100,
        nfold=10,
        metrics={'logloss'},
        early_stopping_rounds=10,
        seed=0,
    )

    # Lowest mean test logloss reached across the boosting rounds
    logloss_mean = xg_cvresults['test-logloss-mean'].min()

    print("subsample: {} | colsample: {} with Logloss: {:.3f}\n".format(subsample, colsample, logloss_mean))

    # Only a strictly lower logloss replaces the current best
    if not logloss_mean < logloss_min:
        continue
    logloss_min, best_params = logloss_mean, (subsample, colsample)

print("Best Parameters: subsample: {} | colsample: {} with Logloss: {:.3f}".format(*best_params, logloss_min))

In [None]:
# Fix the sampling parameters at the best values found: subsample = 0.9 and colsample = 0.4

params.update(subsample=0.9, colsample_bytree=0.4)

In [None]:
# Parameter: eta (learning rate), searched with 10-fold cross-validation.
# NOTE(review): every candidate is capped at num_boost_round = 100, so the smaller
# learning rates may not have converged before the cap - confirm before concluding
# that a large eta is best.
# NOTE: `params['eta']` is mutated in place and is left at the last candidate tried.

logloss_min = float("Inf")
best_params = None

for eta in [0.3, 0.2, 0.1, 0.05, 0.01, 0.005]:

    params['eta'] = eta

    xg_cvresults = xgb.cv(params, dtrain=dm_train, num_boost_round=100,
                          seed=0, nfold=10, metrics={'logloss'}, early_stopping_rounds=10)

    # Lowest mean test logloss reached across the boosting rounds
    logloss_mean = xg_cvresults['test-logloss-mean'].min()

    # Fixed-point format with three decimals ({:.3f}) so that each candidate line uses
    # the same precision as the summary line below ({:.3} would mean 3 significant digits).
    print("eta: {} with Logloss: {:.3f}\n".format(eta, logloss_mean))

    if logloss_mean < logloss_min:
        logloss_min = logloss_mean
        best_params = eta

print("Best Parameter: eta: {} with Logloss: {:.3f}".format(best_params, logloss_min))

In [None]:
# Fix the learning rate (eta) at the best value found

params.update(eta=0.3)

In [None]:
# Setting the optimum parameters (the values selected in the tuning steps above)

params = dict(
    colsample_bytree=0.4,
    eta=0.3,
    eval_metric='logloss',
    max_depth=2,
    min_child_weight=19,
    objective='binary:logistic',
    subsample=0.9,
)

In [None]:
# Refit the scikit-learn style classifier with the tuned parameters.
# `learning_rate` is the scikit-learn wrapper's documented name for the booster's `eta`;
# passing `eta` only works through **kwargs, which XGBoost documents as not supported
# by scikit-learn.
# NOTE: this rebinds `model`, replacing the default-parameter classifier fitted earlier.
model = XGBClassifier(
    learning_rate=0.3,
    max_depth=2,
    colsample_bytree=0.4,
    subsample=0.9,
    min_child_weight=19,
    objective='binary:logistic',
)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the most recently fitted `model` on the held-out test split
pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Finding the optimal number of rounds for the model with the new parameters.
# NOTE(review): early stopping below is evaluated on the test DMatrix, so the reported
# logloss is measured on the same data that selected the number of rounds.

xgmodel_tuned = xgb.train(
    params,
    dtrain=dm_train,
    num_boost_round=100,
    evals=[(dm_test, "Test")],
    early_stopping_rounds=10,
)

# best_iteration is zero-based, so add one to report a round count
tuned_best_round = xgmodel_tuned.best_iteration + 1
print("Best Logloss: {:.3f} in {} rounds".format(xgmodel_tuned.best_score, tuned_best_round))

In [None]:
# With the tuned parameters we would need 27 rounds to achieve the best result

# The improvement after parameter tuning is marginal in our case.
    # Logloss of our model decreased from 0.424 to 0.417
# However, we were able to see how parameters can be tuned.

# Here we have used only a few combinations of parameters.
# We can further improve the impact of tuning; however, doing so would be computationally more expensive.
# More combinations of parameters and wider ranges of values for each of those parameters would have to be tested.