In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use("ggplot")

### **0. Chargement des données**

In [2]:
import os
from pathlib import Path

data_folder = Path("../data")
os.listdir(data_folder)

['test_data.csv', 'submissions', 'train_data.csv']

In [3]:
data = pd.read_csv(data_folder / "train_data.csv")
data.head()

Unnamed: 0,ID,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,37765,15794860,Ch'eng,627,France,Male,28.0,7,131694.04,1,1.0,1.0,161205.61,0
1,130453,15728005,Hargreaves,597,France,Male,34.0,2,0.0,2,0.0,1.0,181419.29,0
2,77297,15686810,Ts'ui,724,France,Male,39.0,7,0.0,2,1.0,1.0,100862.54,0
3,40858,15760244,Trevisano,663,Germany,Female,56.0,5,118577.24,3,1.0,0.0,61164.45,1
4,19804,15810563,French,627,France,Female,33.0,5,0.0,2,1.0,1.0,103737.82,0


### **1. Traitement des variables**

In [4]:
# Suppression des variables a priori inutiles
df = data.copy().sort_values("Exited")
df.drop(columns=[
    "CustomerId",
    "Surname"
], inplace=True)

In [5]:
# Création de nouvelles variables potentiellement pertinenentes après analyse
def create_columns(df):
    X = df.copy()
    
    X["IsNewClient"] = (X["Tenure"] == 0).astype(int)

    X["HasNullBalance"] = (X["Balance"] == 0).astype(int)

    X["NumOfProducts_2"] = X["NumOfProducts"].replace({4: 3})

    X["EstimatedSalary_2"] = pd.cut(
        x=X["EstimatedSalary"],
        bins=[-np.inf, 39500, 78260, 115400, 154430, np.inf],
        labels=[0, 1, 2, 3, 4],
    )

    X["Balance_2"] = pd.cut(
        x=X["Balance"],
        bins=[-np.inf, 50000, 100000, 150000, 200000, np.inf],
        labels=[0, 1, 2, 3, 4],
    )

    X["CreditScore_2"] = pd.cut(
        x=X["CreditScore"],
        bins=[-np.inf, 545, 612, 673, 744, np.inf],
        labels=[0, 1, 2, 3, 4],
    )
    
    return X

### **2. Preprocessing et modèle**

In [6]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Exited"), df["Exited"], test_size=0.2, random_state=42)

In [8]:
preprocessor = Pipeline(steps=[
    ("CreateColumns", FunctionTransformer(create_columns)),
    ("Transformer", ColumnTransformer(
    transformers=[
        ("OneHotEncoder", OneHotEncoder(drop="first", handle_unknown="error"), ["Gender", "Geography"]),
        ("MinMaxScaler", MinMaxScaler(), ["Age", "NumOfProducts_2", "NumOfProducts", "Balance_2", "Balance", "CreditScore", "CreditScore_2", "EstimatedSalary", "EstimatedSalary_2", "Tenure"]),
        ("Passthrough", "passthrough", ["IsActiveMember", "HasNullBalance", "HasCrCard", "IsNewClient"])
    ])),
])

In [9]:
params = {
    "scale_pos_weight": 1.9,
    "seed": 42,
    "eta": 0.22,
}

model = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", XGBClassifier(**params)),
])

model.fit(X_train, y_train)

In [10]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_f1_score = f1_score(y_train, y_pred_train)
test_f1_score = f1_score(y_test, y_pred_test)

print(f"{train_f1_score = :.6f}")
print(f"{test_f1_score = :.6f}")

train_f1_score = 0.703623
test_f1_score = 0.665471


### **3. Sauvegarde avec MLflow**

In [11]:
import mlflow
from mlflow.models import infer_signature

In [12]:
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("mlpro-classification-bank-churn")

with mlflow.start_run():
    # Hyperparamètres et métriques
    mlflow.log_params(params)
    mlflow.log_metric("train_f1_score", train_f1_score)
    mlflow.log_metric("test_f1_score", test_f1_score)

    # Signature du modèle
    X_sample = X_train.sample(10, random_state=42)
    signature = infer_signature(X_sample, model.predict(X_sample))

    # Sauvegarde du modèle
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="bank-churn-model",
        signature=signature,
        input_example=X_sample,
        registered_model_name="bank-churn-classifier",
    )

Registered model 'bank-churn-classifier' already exists. Creating a new version of this model...
2025/02/11 12:09:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bank-churn-classifier, version 3


🏃 View run bold-dove-263 at: http://localhost:8080/#/experiments/766743011901140237/runs/72df59c87c1e49ff8bd0788ec4c5f72b
🧪 View experiment at: http://localhost:8080/#/experiments/766743011901140237


Created version '3' of model 'bank-churn-classifier'.


### **4. Prédictions sur le test set de Kaggle**

In [13]:
# Données
test_data = pd.read_csv(data_folder / "test_data.csv")

In [14]:
# Chargement depuis le serveur MLflow
model_uri = model_info.model_uri
loaded_model = mlflow.pyfunc.load_model(model_uri)

# Prédiction
exited = loaded_model.predict(test_data)
submission = test_data[["ID"]].copy()
submission["Exited"] = exited

# Sauvegarde
submission_folder = data_folder / "submissions"
filename = f"submission_{model_info.run_id}.csv"
submission.to_csv(submission_folder / filename, index=False)
print(f"{filename} sauvegardé avec succès !")



submission_72df59c87c1e49ff8bd0788ec4c5f72b.csv sauvegardé avec succès !


Kaggle results on **2025-02-10 12:40 PM**
- Leaderboard position : 2nd
- Score : 0.66205
- 1st score : 0.66350