# 03 - Modélisation

Ce notebook couvre l'entraînement de plusieurs modèles pour la prédiction du churn.

In [16]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: C:\Users\aless\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report

## Chargement des données prétraitées

In [18]:
# Chargement des données prétraitées et encodées
# On utilise le fichier nettoyé et encodé pour la modélisation
# Les modèles nécessitent des variables numériques

df = pd.read_csv('../data/processed/telco_churn_clean.csv')
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Entraînement des modèles

In [19]:
# Logistic Regression
logreg = LogisticRegression(max_iter=2000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
y_proba_logreg = logreg.predict_proba(X_test)[:,1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

In [21]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:,1]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [22]:
# Sauvegarde automatique des variables nécessaires pour l'évaluation
import joblib
joblib.dump({
    'y_test': y_test,
    'y_pred_logreg': y_pred_logreg,
    'y_proba_logreg': y_proba_logreg,
    'y_pred_rf': y_pred_rf,
    'y_proba_rf': y_proba_rf,
    'y_pred_xgb': y_pred_xgb,
    'y_proba_xgb': y_proba_xgb
}, '../outputs/models/eval_vars.joblib')

['../outputs/models/eval_vars.joblib']