In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Apply the "ggplot" matplotlib style to every figure drawn in this notebook.
plt.style.use("ggplot")

In [2]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

### **0. Chargement des données**

In [3]:
import os
from pathlib import Path

# Folder containing the CSV files, given relative to the notebook's working directory.
data_folder = Path("../data")
# List the folder's entries so the available files are visible before loading any.
os.listdir(data_folder)

['test_data.csv', 'example_submission.csv', 'train_data.csv']

In [4]:
# Load the labelled training set; the "Exited" column is the prediction target.
data = pd.read_csv(data_folder / "train_data.csv")
# Preview the first rows to check the columns and their types.
data.head()

Unnamed: 0,ID,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,37765,15794860,Ch'eng,627,France,Male,28.0,7,131694.04,1,1.0,1.0,161205.61,0
1,130453,15728005,Hargreaves,597,France,Male,34.0,2,0.0,2,0.0,1.0,181419.29,0
2,77297,15686810,Ts'ui,724,France,Male,39.0,7,0.0,2,1.0,1.0,100862.54,0
3,40858,15760244,Trevisano,663,Germany,Female,56.0,5,118577.24,3,1.0,0.0,61164.45,1
4,19804,15810563,French,627,France,Female,33.0,5,0.0,2,1.0,1.0,103737.82,0


### **1. Traitement des variables**

In [5]:
# Drop the columns assumed a priori to be useless for prediction.
# sort_values and drop each return a new frame, so `data` is left untouched.
df = data.sort_values("Exited").drop(columns=["CustomerId", "Surname"])

In [6]:
# New features judged potentially relevant after exploratory analysis.

# True for customers whose tenure is 0.
df["IsNewClient"] = df["Tenure"] == 0

# True for accounts whose balance is exactly 0.
df["HasNullBalance"] = df["Balance"] == 0

# Copy of NumOfProducts in which the value 4 is folded into 3.
df["NumOfProducts_2"] = df["NumOfProducts"].replace({4: 3})

# Salary bucket (0-4). pd.cut builds right-closed intervals by default, so a
# salary of exactly 0 or above 200000 falls in no bucket and becomes NaN.
# The result is left as a categorical column (no integer cast).
df["EstimatedSalary_2"] = pd.cut(
    x=df["EstimatedSalary"],
    bins=[0, 39500, 78260, 115400, 154430, 200000],
    labels=[0, 1, 2, 3, 4],
)

# Balance bucket (0-4). The lowest edge is -1 so that a balance of 0 lands in
# bucket 0. The last bucket is open-ended (np.inf) instead of stopping at the
# observed maximum: the buckets are the same for every value, but the edges no
# longer depend on the data, so pd.cut cannot fail with non-increasing or
# duplicated edges when no balance exceeds 200000.
df["Balance_2"] = pd.cut(
    x=df["Balance"],
    bins=[-1, 50000, 100000, 150000, 200000, np.inf],
    labels=[0, 1, 2, 3, 4],
).astype(int)

# Credit-score bucket (0-4). Scores of 300 or below, or above 900, become NaN
# for the same right-closed-interval reason as above. Left as categorical.
df["CreditScore_2"] = pd.cut(
    x=df["CreditScore"],
    bins=[300, 545, 612, 673, 744, 900],
    labels=[0, 1, 2, 3, 4],
)

### **2. Preprocessing et modèle**

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
features = df.drop(columns="Exited")
target = df["Exited"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# One column group per preprocessing treatment. Columns that appear in none of
# these lists (e.g. "ID") are not forwarded to the model, because the
# ColumnTransformer below keeps its default remainder setting.
one_hot_columns = ["Gender", "Geography"]
scaled_columns = [
    "Age",
    "NumOfProducts_2",
    "NumOfProducts",
    "Balance_2",
    "Balance",
    "CreditScore",
    "CreditScore_2",
    "EstimatedSalary",
    "EstimatedSalary_2",
    "Tenure",
]
passthrough_columns = ["IsActiveMember", "HasNullBalance", "HasCrCard", "IsNewClient"]

column_transformer = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", OneHotEncoder(drop="first", handle_unknown="error"), one_hot_columns),
        ("MinMaxScaler", MinMaxScaler(), scaled_columns),
        ("Passthrough", "passthrough", passthrough_columns),
    ]
)
# Wrapped in a single-step Pipeline; later cells call `preprocessor.transform`.
preprocessor = Pipeline(steps=[("Transformer", column_transformer)])

# Fit the preprocessing on the training split only, then apply it to the hold-out split.
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Gradient-boosted classifier; scale_pos_weight > 1 gives extra weight to the positive class.
xgb = XGBClassifier(scale_pos_weight=1.9, seed=42, eta=0.22)
xgb.fit(X_train_preprocessed, y_train)

y_pred_test = xgb.predict(X_test_preprocessed)
y_pred_train = xgb.predict(X_train_preprocessed)

# Report F1 on both splits so the train/hold-out gap is visible.
print(f"test_f1_score = {f1_score(y_test, y_pred_test)}")
print(f"train_f1_score = {f1_score(y_train, y_pred_train)}")

test_f1_score = 0.6654705594519654
train_f1_score = 0.7036231884057971


### **3. Prédictions sur le test set de Kaggle**

In [8]:
# Data: load the Kaggle test set and keep the raw frame intact;
# the engineered features are added to the copy only.
test_data = pd.read_csv(data_folder / "test_data.csv")
test_data_df = test_data.copy()

In [9]:
# Build the engineered features on the Kaggle test set.
# TODO: move this into a FunctionTransformer

# True for customers whose tenure is 0.
test_data_df["IsNewClient"] = test_data_df["Tenure"] == 0

# True for accounts whose balance is exactly 0.
test_data_df["HasNullBalance"] = test_data_df["Balance"] == 0

# Copy of NumOfProducts in which the value 4 is folded into 3.
test_data_df["NumOfProducts_2"] = test_data_df["NumOfProducts"].replace({4: 3})

# Salary bucket (0-4). pd.cut builds right-closed intervals by default, so a
# salary of exactly 0 or above 200000 falls in no bucket and becomes NaN.
# The result is left as a categorical column (no integer cast).
test_data_df["EstimatedSalary_2"] = pd.cut(
    x=test_data_df["EstimatedSalary"],
    bins=[0, 39500, 78260, 115400, 154430, 200000],
    labels=[0, 1, 2, 3, 4],
)

# Balance bucket (0-4). The lowest edge is -1 so that a balance of 0 lands in
# bucket 0. The last bucket is open-ended (np.inf) instead of stopping at this
# file's maximum balance: the buckets are the same for every value, but the
# edges no longer depend on the data being scored, so pd.cut cannot fail with
# non-increasing or duplicated edges when no balance exceeds 200000.
test_data_df["Balance_2"] = pd.cut(
    x=test_data_df["Balance"],
    bins=[-1, 50000, 100000, 150000, 200000, np.inf],
    labels=[0, 1, 2, 3, 4],
).astype(int)

# Credit-score bucket (0-4). Scores of 300 or below, or above 900, become NaN
# for the same right-closed-interval reason as above. Left as categorical.
test_data_df["CreditScore_2"] = pd.cut(
    x=test_data_df["CreditScore"],
    bins=[300, 545, 612, 673, 744, 900],
    labels=[0, 1, 2, 3, 4],
)

In [10]:
# Preprocessing and predictions
# TODO: use MLflow as far as possible to track the experiments
# Reuse the already-fitted preprocessor and model; nothing is re-fitted here.
X_test_data_preprocessed = preprocessor.transform(test_data_df)
exited = xgb.predict(X_test_data_preprocessed)

# Submission file: one row per test "ID" with its predicted "Exited" label.
submission = test_data[["ID"]].assign(Exited=exited)
submission.to_csv("submission_1.csv", index=False)

Kaggle results on **2025-02-10 12:40 PM**
- Leaderboard position : 2nd
- Score : 0.66205
- 1st score : 0.66350