# Modeling - XGBoost Classifier
In this notebook, I have created an XGBoost classifier for predicting churn for an Internet Service Provider and tuned its hyperparameters using Optuna, a hyperparameter optimization framework that uses the Tree-structured Parzen Estimator (TPE) to search for the best-performing parameters.

## Table of Contents:
1. Data Loading
2. Modeling
    - Finding Best Hyperparameters
    - Building Model with tuned parameters

In [2]:
# Importing required libraries and modules
import os
import sys
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import QuantileTransformer

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [3]:
# Fixing NumPy's global random seed
np.random.seed(42)

# Default figure size (width, height) applied to seaborn/matplotlib plots
figure_size = (10, 8)
sns.set(rc={'figure.figsize': figure_size})

## Data Loading

In [4]:
# Location of the prepared training set, relative to this notebook
train_prepared_path = '../data/processed/train-prepared.csv'
train_prepared = pd.read_csv(train_prepared_path)

In [5]:
# Reporting (rows, columns) and previewing the first five records
frame_shape = train_prepared.shape
print('Shape=>', frame_shape)
train_prepared.head(n=5)

Shape=> (62273, 11)


Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,remaining_contract,is_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,1,1,1.77,7,0.19,1,0,114.1,8.7,0,0
1,1,0,0.05,6,0.59,1,0,12.7,1.3,0,0
2,0,0,1.42,18,0.0,0,0,0.4,0.0,0,1
3,1,0,0.73,20,0.0,1,0,9.3,0.4,0,1
4,1,0,0.25,17,0.0,1,0,6.1,0.5,0,1


In [6]:
# Column-by-column summary: dtype and non-null count for every column
train_prepared.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62273 entries, 0 to 62272
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   is_tv_subscriber             62273 non-null  int64  
 1   is_movie_package_subscriber  62273 non-null  int64  
 2   subscription_age             62273 non-null  float64
 3   bill_avg                     62273 non-null  int64  
 4   remaining_contract           62273 non-null  float64
 5   is_contract                  62273 non-null  int64  
 6   service_failure_count        62273 non-null  int64  
 7   download_avg                 61948 non-null  float64
 8   upload_avg                   61948 non-null  float64
 9   download_over_limit          62273 non-null  int64  
 10  churn                        62273 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 5.2 MB


## Modeling

In [7]:
# Splitting the frame into the target (y) and the remaining predictor columns (X)
y = train_prepared['churn']
X = train_prepared.drop(columns='churn')

### Finding Best Hyperparameters

In [8]:
def create_xgb_pipeline(quantile_transform: str, n_estimators: int, max_depth: int,
                        learning_rate: float, gamma: float, reg_lambda: float,
                        scale_pos_weight: float, subsample: float,
                        colsample_bytree: float,
                        quantile_columns: tuple = (2, 3, 4, 6, 7, 8, 9)) -> Pipeline:
    """ Returns a pipeline object created around XGBoost algorithm
    
    Takes data preparation and XGBoost Classifier
        parameters as input, creates a Scikit-learn
        pipeline object and returns it. The pipeline is
        made of a median imputer, an optional quantile
        transformation of selected columns and the
        XGBoost classifier, in that order.
    
    Parameters
    ----------
    quantile_transform : str
        - "Yes": Quantile Transformation will be performed
        - "No": No Transformation
    
    n_estimators: int
        n_estimators argument of xgb.XGBClassifier
    
    max_depth: int
        max_depth argument of xgb.XGBClassifier

    learning_rate: float
        learning_rate argument of xgb.XGBClassifier
    
    gamma: float
        gamma argument of xgb.XGBClassifier
    
    reg_lambda: float
        reg_lambda argument of xgb.XGBClassifier
    
    scale_pos_weight: float
        scale_pos_weight argument of xgb.XGBClassifier
    
    subsample: float
        subsample argument of xgb.XGBClassifier
    
    colsample_bytree: float
        colsample_bytree argument of xgb.XGBClassifier
    
    quantile_columns: tuple, optional
        Positional indices of the columns that are quantile
        transformed when quantile_transform is "Yes". For the
        predictor matrix built in this notebook the default
        selects subscription_age, bill_avg, remaining_contract,
        service_failure_count, download_avg, upload_avg and
        download_over_limit; the remaining columns are passed
        through unchanged.
    
    Returns
    -------
    pipeline : Pipeline
        The pipeline object from Scikit-Learn
    
    Raises
    ------
    ValueError
        If quantile_transform is neither "Yes" nor "No".
    """
    # Rejecting unexpected flags early: previously any value other than
    # "Yes" (e.g. "yes" or True) silently disabled the transformation
    if quantile_transform not in ("Yes", "No"):
        raise ValueError(
            f'quantile_transform must be "Yes" or "No", got {quantile_transform!r}'
        )

    pipeline_steps = []
    
    # Adding SimpleImputer to pipeline
    imputer = SimpleImputer(strategy = 'median')
    pipeline_steps.append(('median_imputer', imputer))

    # Adding QuantileTransformer to pipeline (if required)
    if quantile_transform == "Yes":
        quantile_transformer = QuantileTransformer(n_quantiles=1000,
                                                   output_distribution='normal',
                                                   random_state=42)

        # Columns are addressed by position because this step runs after
        # the imputer rather than directly on the named DataFrame columns
        transformer = ColumnTransformer(transformers=[('quantile_transformer',
                                                       quantile_transformer,
                                                       list(quantile_columns))],
                                        n_jobs=-1,
                                        remainder='passthrough')
        pipeline_steps.append(('transformer', transformer))
    
    # Adding XGBoost Classifier to pipeline
    # (tree_method="gpu_hist" selects XGBoost's GPU histogram algorithm)
    model = xgb.XGBClassifier(objective = "binary:logistic",
                              use_label_encoder = False,
                              tree_method = "gpu_hist", 
                              n_estimators = n_estimators, 
                              max_depth = max_depth,
                              learning_rate = learning_rate,
                              gamma = gamma,
                              reg_lambda = reg_lambda,
                              scale_pos_weight = scale_pos_weight,
                              subsample = subsample,
                              colsample_bytree = colsample_bytree,
                              verbosity = 0,
                              n_jobs = -1,
                              random_state = 42)
    
    pipeline_steps.append(('xgb_model', model))
    
    # Building Pipeline Object
    pipeline = Pipeline(steps = pipeline_steps)
    
    return pipeline

In [9]:
def objective(trial: optuna.trial.Trial) -> float:
    """ Returns mean ROC-AUC score for XGBoost Classification
        algorithm
    
    Objective function for optimizing XGBoost algorithm
        using Optuna. Takes optuna Trial object as input,
        performs 10-fold cross-validation and returns
        mean ROC-AUC score for a set of hyperparameters
        of XGBoost modeling pipeline.
        
    Parameters
    ----------
    trial : optuna.trial.Trial
        A trial is a process of evaluating an objective function.
        This object is passed to an objective function and provides
        interfaces to get parameter suggestion, manage the trial's
        state, and set/get user-defined attributes of the trial.
    
    Returns
    -------
    roc_auc_score : float
        Mean ROC-AUC Score of 10-fold cross-validation
        for a XGBoost modeling pipeline with a set of
        hyperparameters.
    """
    # Data preparation parameters
    quantile_transform = trial.suggest_categorical("quantile_transform", ["Yes", "No"])
    
    # Modeling parameters
    n_estimators = trial.suggest_int("n_estimators", low=100, high=1000, step=10)
    max_depth = trial.suggest_int("max_depth", low=3, high=13, step=2)
    
    # Sampled on a log scale; suggest_float(log=True) replaces the deprecated
    # suggest_loguniform and matches how reg_lambda is sampled below
    learning_rate = trial.suggest_float("learning_rate", low=0.01, high=0.2, log=True)
    
    gamma = trial.suggest_float("gamma", low=0.05, high=1.0, step=0.05)
    reg_lambda = trial.suggest_float("reg_lambda", low=0.001, high=100.0, log=True)
    scale_pos_weight = trial.suggest_float("scale_pos_weight", low=1.0, high=2.0, step=0.05)
    subsample = trial.suggest_float("subsample", low=0.6, high=1.0, step=0.1)
    colsample_bytree = trial.suggest_float("colsample_bytree", low=0.6, high=1.0, step=0.1)
    
    # Building modeling pipeline
    pipeline = create_xgb_pipeline(quantile_transform, n_estimators, max_depth,
                                   learning_rate, gamma, reg_lambda,
                                   scale_pos_weight, subsample, colsample_bytree)
    
    # Defining Cross-Validation: one repeat of 10 stratified folds with a
    # fixed seed, so every trial is scored on the same splits
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    
    # Returning a plain Python float (the mean of the per-fold scores)
    return float(np.mean(scores))

In [10]:
# Seeding the TPE sampler explicitly: the study is otherwise created with an
# unseeded sampler, so the sequence of suggested hyperparameters changes from
# run to run. NOTE(review): identical scores additionally require the model
# training itself to be deterministic - confirm for the GPU tree method.
sampler = optuna.samplers.TPESampler(seed = 42)
study = optuna.create_study(direction = 'maximize', sampler = sampler)

# Running 50 trials of the objective, keeping the trial with the highest mean ROC-AUC
study.optimize(objective, show_progress_bar = True, n_trials = 50)

[32m[I 2021-10-13 09:08:36,451][0m A new study created in memory with name: no-name-4c7cd9f6-dde2-4827-a3c4-d8400179393b[0m
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2021-10-13 09:10:03,982][0m Trial 0 finished with value: 0.9830265407727726 and parameters: {'quantile_transform': 'Yes', 'n_estimators': 990, 'max_depth': 7, 'learning_rate': 0.02117304281067758, 'gamma': 0.3, 'reg_lambda': 2.3159038099097997, 'scale_pos_weight': 1.8, 'subsample': 0.6, 'colsample_bytree': 0.6}. Best is trial 0 with value: 0.9830265407727726.[0m
[32m[I 2021-10-13 09:11:01,205][0m Trial 1 finished with value: 0.9801597175697137 and parameters: {'quantile_transform': 'No', 'n_estimators': 710, 'max_depth': 9, 'learning_rate': 0.17653036620552098, 'gamma': 0.5, 'reg_lambda': 0.5879160719090933, 'scale_pos_weight': 1.45, 'subsample': 0.7, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.9830265407727726.[0m
[32m[I 2021-10-13 09:11:32,605][0m Trial 2 finished with value: 0.9821728534431386 and parameters: {'quantile_transform': 'No', 'n_estimators': 900, 'max_depth': 5, 'learning_rate': 0.02523845411952925, 'gamma': 0.7500000000000001, 'reg_lambda': 0.

In [11]:
# Summarising the optimisation outcome: best score, its parameters and the full trial record
study_summary = (
    ("Highest Score: ", study.best_value),
    ("Best Parameters: ", study.best_params),
    ("Best Trial: ", study.best_trial),
)
for label, value in study_summary:
    print(label, value)

Highest Score:  0.9840120040450003
Best Parameters:  {'quantile_transform': 'Yes', 'n_estimators': 570, 'max_depth': 13, 'learning_rate': 0.024950194312574222, 'gamma': 0.6000000000000001, 'reg_lambda': 1.1653190123688406, 'scale_pos_weight': 1.3, 'subsample': 0.9, 'colsample_bytree': 0.6}
Best Trial:  FrozenTrial(number=33, values=[0.9840120040450003], datetime_start=datetime.datetime(2021, 10, 13, 10, 7, 20, 250637), datetime_complete=datetime.datetime(2021, 10, 13, 10, 10, 24, 284671), params={'quantile_transform': 'Yes', 'n_estimators': 570, 'max_depth': 13, 'learning_rate': 0.024950194312574222, 'gamma': 0.6000000000000001, 'reg_lambda': 1.1653190123688406, 'scale_pos_weight': 1.3, 'subsample': 0.9, 'colsample_bytree': 0.6}, distributions={'quantile_transform': CategoricalDistribution(choices=('Yes', 'No')), 'n_estimators': IntUniformDistribution(high=1000, low=100, step=10), 'max_depth': IntUniformDistribution(high=13, low=3, step=2), 'learning_rate': LogUniformDistribution(high=

### Building Model with tuned parameters

In [12]:
# Hyperparameters of the best trial found by the Optuna study, keyed by the
# names used in the trial.suggest_* calls of the objective function
tuned_params = study.best_params

In [13]:
# Rebuilding the modeling pipeline from the best trial. The keys of
# `tuned_params` are the names registered through trial.suggest_* in
# `objective`, and they coincide with the parameter names of
# `create_xgb_pipeline`, so the dictionary is unpacked directly.
pipeline = create_xgb_pipeline(**tuned_params)

# Model evaluation scheme: 10 stratified folds repeated 10 times (100 fits)
cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42)

# Scoring the tuned pipeline with ROC-AUC on every fold
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)

In [14]:
# Showing the pipeline definition, then mean and standard deviation of the fold scores
auc_mean, auc_std = np.mean(scores), np.std(scores)
print("XGBoost Classifier Pipeline: ", pipeline)
print('Mean AUC-ROC Score of XGBoost Classifier: %.4f \u00B1 %.4f' % (auc_mean, auc_std))

XGBoost Classifier Pipeline:  Pipeline(steps=[('median_imputer', SimpleImputer(strategy='median')),
                ('transformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('quantile_transformer',
                                                  QuantileTransformer(output_distribution='normal',
                                                                      random_state=42),
                                                  [2, 3, 4, 6, 7, 8, 9])])),
                ('xgb_model',
                 XGBClassifier(base_score=None, booster=None,
                               colsample_bylevel=None, colsample_by...
                               learning_rate=0.024950194312574222,
                               max_delta_step=None, max_depth=13,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=570,
                   