
# 🔍 Amélioration d'un pipeline de classification supervisée

Ce notebook applique une série d'améliorations à un pipeline de classification en machine learning, selon les axes suivants :
- Feature engineering
- Validation croisée
- Comparaison de modèles
- Optimisation des hyperparamètres
- Évaluation approfondie


In [None]:

import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
import lightgbm as lgb

# Silence every warning for the rest of the session: no category, message or
# module filter is given, so nothing is let through.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')


In [None]:

# Load the preprocessed (resampled) train/test splits from CSV.
# pd.read_csv returns a DataFrame, so each target file is reduced to a Series
# by keeping its first column.
# NOTE(review): assumes the first column of the y files is the target, i.e. the
# CSVs were written without an index column; verify against the export step.
Xtrain = pd.read_csv('../../data/data_preprocessed/data_resampled_Xtrain.csv')
ytrain = pd.read_csv('../../data/data_preprocessed/data_resampled_ytrain.csv').iloc[:, 0]
Xtest = pd.read_csv('../../data/data_preprocessed/data_resampled_Xtest.csv')
ytest = pd.read_csv('../../data/data_preprocessed/data_resampled_ytest.csv').iloc[:, 0]


In [None]:

# Fit a reference random forest on the unscaled training data so that its
# feature importances can be inspected.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(Xtrain, ytrain)

# One importance value per input column, labelled with the column name.
importances = pd.Series(rf.feature_importances_, index=Xtrain.columns)
ranked_importances = importances.sort_values(ascending=False)

# Bar chart of the importances, most important feature first.
fig, ax = plt.subplots(figsize=(12, 5))
ranked_importances.plot.bar(ax=ax, title="Feature Importances")
fig.tight_layout()
plt.show()


In [None]:

# Learn the standardisation statistics on the training split only, then apply
# the same fitted transformation to both the training and the test split.
scaler = StandardScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)


In [None]:

# Candidate classifiers, all compared under the same cross-validation protocol.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier()
}

# Stratified, shuffled 5-fold splitter with a fixed seed so that every model is
# scored on exactly the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# results maps each model name to its array of per-fold weighted-F1 scores.
results = {}
for name, model in models.items():
    # Scale INSIDE the cross-validation loop: the pipeline re-fits the scaler on
    # the training part of each fold only. Cross-validating on data that was
    # scaled once on the full training set leaks validation-fold statistics
    # into preprocessing and biases the scores of scale-sensitive models
    # (logistic regression, KNN, SVM).
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, Xtrain, ytrain, cv=cv, scoring='f1_weighted', n_jobs=-1)
    results[name] = scores
    print(f"{name}: F1 score moyen = {scores.mean():.4f} (+/- {scores.std():.4f})")


In [None]:

# Search space for the XGBoost hyper-parameters: two candidate values for each
# of the four parameters.
params = dict(
    max_depth=[3, 6],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 300],
    subsample=[0.8, 1.0],
)

# Exhaustive grid search scored with weighted F1, reusing the `cv` splitter
# defined for the model comparison.
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
grid = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=params,
    scoring='f1_weighted',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid.fit(Xtrain_scaled, ytrain)

# Keep the estimator selected by the search for the final evaluation.
best_model = grid.best_estimator_
print("Meilleurs paramètres XGBoost :", grid.best_params_)


In [None]:

# Predict on the scaled test split with the tuned model and print the
# per-class report.
y_pred = best_model.predict(Xtest_scaled)
print(classification_report(ytest, y_pred))

# Confusion matrix normalised over the true labels (normalize='true'),
# displayed as percentages.
cm = confusion_matrix(ytest, y_pred, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues', values_format=".2%")
disp.ax_.set_title("Matrice de confusion normalisée - Meilleur modèle XGBoost")
plt.show()
