In [91]:
from typing import Dict

import pandas as pd
import numpy as np

import xgboost
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK, space_eval
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, accuracy_score, roc_auc_score

In [18]:
churn_db = pd.read_csv("../../data/Bank Customer Churn Prediction.csv")
churn_db

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,15569892,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,15584532,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,15682355,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# XGBoost modeling

In [103]:

from typing import Any, Dict, List
import xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from abc import ABC, abstractmethod
from xgboost import XGBModel

class BaseChurnModel(ABC):
    """BaseChurnModel.

    This class is a wrapper for the propensity model. It is used to fit the data and
    predict the propensity.

    Attributes:
        model (Any): A estimator or pipeline to fit.
        seed (int): The seed to use.
        split_data (Dict[str, pd.DataFrame]): A dictionary with the split data.
        target_col (str): The target column to predict.


    """
    def __init__(self, data:pd.DataFrame, target_col: str, seed: int = 100, exclude_cols: List[str] | str = []) -> None:
        """__init__ method.

        Args:
            estimator (str): A estimator to fit.
            target_col (str): The target column to predict.
            seed (int): The seed to use.

        """
        self.seed = seed
        self.data = data
        self.target_col = target_col
        if exclude_cols:
            self.excluded_data = self.data[exclude_cols]
        self.split_data =self._split_data_randomly()
        


    def _split_data_randomly(
        self,
        
    ) -> None:
        """_split_data method.

        This method splits the data into train and test sets.

        Args:
            data (pd.DataFrame): A dataframe with the data to fit.

        """
        x_values = self.data.drop(list(self.excluded_data.columns) + [self.target_col], axis=1)
        y_values = self.data[self.target_col]
        x_train, x_test, y_train, y_test = train_test_split(
            x_values, y_values, test_size=0.2, random_state=self.seed
        )
        return  { 
            "x_train": x_train,
            "x_test": x_test,
            "y_train": y_train,
            "y_test": y_test,
        }
      
    def split_time_based(self, date_column: str, split_dates: Dict) -> Dict:
      pass


    @abstractmethod
    def generate_estimator(self) -> Any:
      ...
    
    @abstractmethod
    def train(self, data: pd.DataFrame) -> Any:
        """train method.

        This method trains the model.

        Args:
            data (pd.DataFrame): A dataframe with the data to fit.

        Returns:
            Any: A fitted model.

        """
        ...

class ChurnModel(BaseChurnModel):
    """ChurnModel.

    

    Attributes:
        model (Any): A estimator or pipeline to fit.
        seed (int): The seed to use.
        split_data (Dict[str, pd.DataFrame]): A dictionary with the split data.
        target_col (str): The target column to predict.


    """

    def __init__(self, data: pd.DataFrame, target_col: str,  params: Dict, seed=100, exclude_cols: List[str] = []) -> None:
        """__init__ method.

        Args:
            estimator (str): A estimator to fit.
            target_col (str): The target column to predict.
            seed (int): The seed to use.

        """
        super().__init__(data=data, target_col=target_col, seed=seed, exclude_cols=exclude_cols)
        self.model = self.generate_estimator(params=params)

    def generate_estimator(self, params: Dict) -> xgboost.XGBModel:
        """generate_estimator method.

        This method generates a XGBoost estimator.

        Args:
            params (Dict): A dictionary with the parameters.

        Returns:
            xgboost.XGBModel: A XGBoost estimator.

        """

        return xgboost.XGBClassifier(**params)
    
    def train(self) -> xgboost.XGBModel:
        """train method.

        This method trains the model.

        Args:
            data (pd.DataFrame): A dataframe with the data to fit.

        Returns:
            xgboost.XGBModel: A fitted model.

        """
        return self.model.fit(self.split_data["x_train"], self.split_data["y_train"])


def create_xgboost_churn_model(data: pd.DataFrame ,params: Dict, target_col: str, exclude_cols: List[str] | str = []) -> BaseChurnModel:

  model = ChurnModel(data=data, target_col=target_col, seed=100, params=params, exclude_cols=exclude_cols)

  return model

In [104]:
XGB_MAX_DEPTH = 25
xgb_space = {
  'objective' : hp.choice('objective', ['binary:logistic']),
  'max_depth' : hp.choice('max_depth', np.arange(1, XGB_MAX_DEPTH, dtype=int)),
  "min_child_weight": hp.uniform("min_child_weight", 0, 5),
  "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(0.2)),
  "gamma": hp.uniform("gamma", 0, 5),
  "colsample_bytree": hp.quniform("colsample_bytree", 0.1, 1, 0.01),
  "colsample_bynode": hp.quniform("colsample_bynode", 0.1, 1, 0.01),
  "colsample_bylevel": hp.quniform("colsample_bylevel", 0.1, 1, 0.01),
  "subsample": hp.quniform("subsample", 0.5, 1, 0.05),
  "reg_alpha": hp.uniform("reg_alpha", 0, 5),
  "reg_lambda": hp.uniform("reg_lambda", 0, 5),

}

In [105]:
def xgboost_objective_function(params: Dict) -> float:

  xgb_model = create_xgboost_churn_model( data=churn_db, params={}, target_col="churn", exclude_cols = ["customer_id", "country", "gender"])
  train = xgboost.DMatrix(xgb_model.split_data["x_train"], xgb_model.split_data["y_train"])
  res = xgboost.cv(params, train, num_boost_round=100, nfold=5,
             metrics={'auc'}, seed=0,
             callbacks=[
               xgboost.callback.EvaluationMonitor(show_stdv=True),
                xgboost.callback.EarlyStopping(15),
                ])
  best_loss = res['test-auc-mean'].iloc[-1]
  return {'loss':best_loss, 'status': STATUS_OK }

trials = Trials()
best = fmin(fn=xgboost_objective_function, space=xgb_space, algo=tpe.suggest, max_evals=3,trials=trials)

[0]	train-auc:0.71041+0.00344	test-auc:0.71039+0.01393                                                                                                                                                        
[1]	train-auc:0.71374+0.00319	test-auc:0.71235+0.01753                                                                                                                                                        
[2]	train-auc:0.72209+0.01555	test-auc:0.71727+0.02389                                                                                                                                                        
[3]	train-auc:0.75395+0.03549	test-auc:0.75022+0.04773                                                                                                                                                        
[4]	train-auc:0.77385+0.04358	test-auc:0.77142+0.04442                                                                                                                      

# MLflow

In [106]:
best_params = space_eval(xgb_space, best)
best_params

{'colsample_bylevel': 0.54,
 'colsample_bynode': 0.46,
 'colsample_bytree': 0.23,
 'gamma': 2.6091373473524233,
 'learning_rate': 0.035756543959883834,
 'max_depth': 20,
 'min_child_weight': 2.295787187553348,
 'objective': 'binary:logistic',
 'reg_alpha': 3.6167612292883367,
 'reg_lambda': 0.2788037090045353,
 'subsample': 0.8500000000000001}

In [109]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("churn optimized xgboost")

mlflow.xgboost.autolog()
with mlflow.start_run():

    churn_model = create_xgboost_churn_model( data=churn_db, params={}, target_col="churn", exclude_cols = ["customer_id", "country", "gender"])
    x_train, y_train = churn_model.split_data["x_train"], churn_model.split_data["y_train"]
    x_test, y_test = churn_model.split_data["x_test"], churn_model.split_data["y_test"]
    best_xgb = xgboost.XGBClassifier(**best_params)
    best_xgb.fit(x_train, y_train)

    predictions = best_xgb.predict(x_test)

    accuracy_metric = accuracy_score(y_test, predictions)
    precision_metric = precision_score(y_test, predictions)
    recall_metric = recall_score(y_test, predictions)
    f1_score_metric = f1_score(y_test, predictions)

    mlflow.log_metric("AUC score", roc_auc_score(y_test, best_xgb.predict_proba(x_test)[:, 1]))

    mlflow.end_run()
    

<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'
<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


In [108]:
auc_metric

0.832434093076716

In [102]:
churn_model.excluded_data

Unnamed: 0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,42,2,0.00,1,1,1,101348.88,1
1,608,41,1,83807.86,1,0,1,112542.58,0
2,502,42,8,159660.80,3,1,0,113931.57,1
3,699,39,1,0.00,2,0,0,93826.63,0
4,850,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0
9996,516,35,10,57369.61,1,1,1,101699.77,0
9997,709,36,7,0.00,1,0,1,42085.58,1
9998,772,42,3,75075.31,2,1,0,92888.52,1
