In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Apply the "ggplot" style sheet to the matplotlib figures drawn in this notebook
plt.style.use("ggplot")

### **0. Chargement des données**

In [2]:
import os
from pathlib import Path

# Folder holding the CSV files (relative path, resolved from the working directory)
data_folder = Path("../data")
# Cell output: names of the entries found in the data folder
os.listdir(data_folder)

['test_data.csv', 'submissions', 'train_data.csv']

In [3]:
# Load the training set
data = pd.read_csv(data_folder / "train_data.csv")
# NOTE(review): the preview shows an "Unnamed: 0" column — presumably the index
# written when the CSV was exported; confirm before relying on it.
data.head()

Unnamed: 0,ID,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,37765,15794860,Ch'eng,627,France,Male,28.0,7,131694.04,1,1.0,1.0,161205.61,0
1,130453,15728005,Hargreaves,597,France,Male,34.0,2,0.0,2,0.0,1.0,181419.29,0
2,77297,15686810,Ts'ui,724,France,Male,39.0,7,0.0,2,1.0,1.0,100862.54,0
3,40858,15760244,Trevisano,663,Germany,Female,56.0,5,118577.24,3,1.0,0.0,61164.45,1
4,19804,15810563,French,627,France,Female,33.0,5,0.0,2,1.0,1.0,103737.82,0


### **1. Traitement des variables**

In [4]:
# Order the rows by target and drop the columns assumed to be useless a priori
df = (
    data
    .sort_values("Exited")
    .drop(columns=["CustomerId", "Surname"])
)

In [5]:
# Engineered features that looked relevant after exploratory analysis.
# The bin edges were chosen by trial and error, combining inspection of the
# data with clustering (k-means).
def create_columns(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    The input frame is left untouched. Columns appended, in this order:

    - ``IsNewClient``: True where ``Tenure`` equals 0.
    - ``HasNullBalance``: True where ``Balance`` equals 0.
    - ``NumOfProducts_2``: ``NumOfProducts`` with the value 4 replaced by 3.
    - ``EstimatedSalary_2``, ``Balance_2``, ``CreditScore_2``: the source
      column cut into five bins labelled 0 to 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Tenure``, ``Balance``, ``NumOfProducts``,
        ``EstimatedSalary`` and ``CreditScore``.

    Returns
    -------
    pandas.DataFrame
        New frame holding the original columns followed by the six new ones.
    """

    def _five_bins(column, inner_edges):
        # Open-ended outer bins so that any finite value receives a label.
        return pd.cut(
            x=df[column],
            bins=[-np.inf, *inner_edges, np.inf],
            labels=[0, 1, 2, 3, 4],
        )

    return df.assign(
        IsNewClient=df["Tenure"].eq(0),
        HasNullBalance=df["Balance"].eq(0),
        NumOfProducts_2=df["NumOfProducts"].replace({4: 3}),
        EstimatedSalary_2=_five_bins("EstimatedSalary", (39500, 78260, 115400, 154430)),
        Balance_2=_five_bins("Balance", (50000, 100000, 150000, 200000)),
        CreditScore_2=_five_bins("CreditScore", (545, 612, 673, 744)),
    )

### **2. Modèle avec recherche d'hyperparamètres**

In [6]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

import mlflow
from mlflow.models import infer_signature

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

import warnings
warnings.filterwarnings("ignore")

import json

In [7]:
# Split the frame into the target vector and the feature matrix
y = df["Exited"]
X = df.drop(columns=["Exited"])

In [8]:
# Model signature, inferred from a seeded random sample of 10 rows
np.random.seed(42)
ids = np.random.choice(len(X), size=10)
X_sample, y_sample = X.iloc[ids], y.iloc[ids]
signature = infer_signature(X_sample, y_sample)

In [9]:
# Preprocessing: add the engineered columns, then encode / scale / forward them
preprocessor = Pipeline(
    steps=[
        ("CreateColumns", FunctionTransformer(create_columns)),
        (
            "Transformer",
            ColumnTransformer(
                transformers=[
                    # Nominal columns -> one-hot indicators, first level dropped
                    (
                        "OneHotEncoder",
                        OneHotEncoder(drop="first", handle_unknown="error"),
                        ["Gender", "Geography"],
                    ),
                    # Raw numeric columns and their binned versions -> MinMaxScaler
                    (
                        "MinMaxScaler",
                        MinMaxScaler(),
                        [
                            "Age",
                            "NumOfProducts_2",
                            "NumOfProducts",
                            "Balance_2",
                            "Balance",
                            "CreditScore",
                            "CreditScore_2",
                            "EstimatedSalary",
                            "EstimatedSalary_2",
                            "Tenure",
                        ],
                    ),
                    # Flag columns are forwarded unchanged
                    (
                        "Passthrough",
                        "passthrough",
                        ["IsActiveMember", "HasNullBalance", "HasCrCard", "IsNewClient"],
                    ),
                ],
            ),
        ),
    ],
)

In [10]:
# Configuration et entraînement du modèle
def train_model(params, X, y):
    """Evaluate one hyperparameter configuration and log it to MLflow.

    The pipeline (preprocessing + XGBoost) is scored with 5-fold
    cross-validation on the F1 metric inside a nested MLflow run. The
    hyperparameters, the mean F1 score and a fitted copy of the pipeline
    are logged to that run.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBClassifier``. ``n_estimators`` and
        ``max_depth`` may be floats; they are cast to ``int``. The dict
        passed by the caller is not modified.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target.

    Returns
    -------
    dict
        ``{"loss": 1 - mean F1, "status": STATUS_OK, "model": pipeline}``
        where ``pipeline`` has been fitted on the full ``X`` / ``y``.
    """
    # Local import: keeps this block self-contained (scikit-learn is already
    # a dependency of the notebook).
    from sklearn.base import clone

    # Cast the integer-valued hyperparameters on a copy so the caller's dict
    # is left as it was.
    params = {
        **params,
        "n_estimators": int(params["n_estimators"]),
        "max_depth": int(params["max_depth"]),
    }

    # Each trial gets its own preprocessor instance instead of sharing the
    # notebook-level object between every pipeline returned to hyperopt.
    model = Pipeline(steps=[
        ("Preprocessor", clone(preprocessor)),
        ("Classifier", XGBClassifier(**params)),
    ])

    with mlflow.start_run(nested=True):
        # Cross-validated F1; hyperopt minimises, so the loss is 1 - F1,
        # which stays within [0, 1].
        mean_f1 = cross_val_score(estimator=model, X=X, y=y, scoring="f1", cv=5).mean()
        loss = 1 - mean_f1

        # Hyperparameters and metric
        mlflow.log_params(params)
        mlflow.log_metric("f1_score", mean_f1)

        # cross_val_score only fits clones, so `model` is still unfitted at
        # this point. Fit it so the logged artifact can actually predict.
        model.fit(X, y)

        # Log the trial model as a run artifact only. It is deliberately not
        # added to the model registry: registering every trial would create
        # one registry version per evaluation.
        mlflow.sklearn.log_model(
            model,
            artifact_path="bank-churn-model-xgb-hyperopt",
            signature=signature,
            input_example=X_sample,
        )

        return {"loss": loss, "status": STATUS_OK, "model": model}

def objective(params):
    """Hyperopt objective: evaluate ``params`` on the notebook-level ``X`` and ``y``."""
    return train_model(params, X, y)

In [11]:
# Hyperparameter search space handed to hyperopt's fmin
space = {
    # Fixed values (not searched)
    "learning_rate": 0.22,
    "random_state": 42,
    "scale_pos_weight": 1.9,

    # Searched values.
    # max_depth and n_estimators are sampled with hp.quniform and are cast to
    # int in train_model before being passed to XGBClassifier.
    "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 1.0, 0.05),
    "gamma": hp.uniform("gamma", 0, 10),
    "max_depth": hp.quniform("max_depth", 5, 15, 1),
    "max_delta_step": hp.uniform("max_delta_step", 0, 10),
    "min_child_weight": hp.uniform("min_child_weight", 0, 5),
    "n_estimators": hp.quniform("n_estimators", 10, 1000, 10),
    "reg_alpha": hp.uniform("reg_alpha", 0, 5),
    "reg_lambda": hp.uniform("reg_lambda", 0, 5),
    "subsample": hp.quniform("subsample", 0.5, 1, 0.05),
}

### **3. Run et logging avec MLflow**

In [None]:
# Point MLflow at the local tracking server and select the experiment
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("mlpro-classification-bank-churn")

# Parent run; train_model opens its own runs with nested=True
with mlflow.start_run():
    # Hyperparameter search: TPE, 5 evaluations, seeded generator
    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=5,
        trials=trials,
        rstate=np.random.default_rng(42),
    )
    
    # Result dict ("loss", "status", "model") returned by the best trial
    best_run = trials.best_trial["result"]
    
    # Fit the selected pipeline on the whole training set before saving it
    model = best_run["model"]
    model.fit(X, y)
    
    # Log the selected hyperparameters, the score and the fitted model.
    # NOTE: best_params holds only the searched hyperparameters, with the raw
    # sampled values (the printed result shows e.g. max_depth as 6.0).
    mlflow.log_params(best_params)
    mlflow.log_metric("f1_score", 1 - best_run["loss"])
    model_info = mlflow.sklearn.log_model(
        model,
        signature=signature,
        input_example=X_sample,
        artifact_path="bank-churn-model-xgb-hyperopt",
        registered_model_name="bank-churn-classifier",
    )

In [13]:
# Best hyperparameters and the associated cross-validated score.
# Single quotes are used inside the f-string: reusing the enclosing double
# quotes is only valid syntax from Python 3.12 onwards.
print(f"Best parameters: {json.dumps(best_params, indent=4)}")
print(f"Best eval f1-score: {1 - best_run['loss']}")

Best parameters: {
    "colsample_bytree": 0.75,
    "gamma": 3.2399903571543387,
    "max_delta_step": 7.155584591130996,
    "max_depth": 6.0,
    "min_child_weight": 3.8392819888268797,
    "n_estimators": 660.0,
    "reg_alpha": 4.419633127363078,
    "reg_lambda": 3.100594988383205,
    "subsample": 0.9
}
Best eval f1-score: 0.6651497257749395


### **4. Prédictions sur le test set de Kaggle**

In [14]:
# Load the Kaggle test set
test_data = pd.read_csv(data_folder / "test_data.csv")

In [15]:
# Load the best model back from the MLflow server
model_uri = model_info.model_uri
loaded_model = mlflow.pyfunc.load_model(model_uri)

# Predict on the test set and build the submission frame (ID + prediction)
exited = loaded_model.predict(test_data)
submission = test_data[["ID"]].copy()
submission["Exited"] = exited

# Save. The folder is created first so that a fresh checkout without a
# "submissions" directory does not make to_csv fail.
submission_folder = data_folder / "submissions"
submission_folder.mkdir(parents=True, exist_ok=True)
filename = f"submission_{model_info.run_id}.csv"
submission.to_csv(submission_folder / filename, index=False)
print(f"{filename} sauvegardé avec succès !")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 64.97it/s]


submission_3590a436e72044b18f075351f0559a70.csv sauvegardé avec succès !


Kaggle results on **2025-02-14 3:10 PM**
- Leaderboard position : 2nd
- Score : 0.66391
- 1st score : 0.66483