
# 🚀 Améliorations avancées du pipeline de classification

Ce notebook applique des stratégies avancées pour :
- Optimiser Random Forest
- Interpréter le modèle avec SHAP
- Réduire la dimension
- Combiner plusieurs modèles (stacking)
- Analyser les erreurs de classification


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier
import shap
import joblib
import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides convergence and deprecation warnings raised by
# the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')


In [None]:

# --- Load the pre-split, resampled train/test sets from disk ---------------
Xtrain = pd.read_csv('../data/data_preprocessed/data_resampled_Xtrain.csv')
ytrain = pd.read_csv('../data/data_preprocessed/data_resampled_ytrain.csv')
Xtest = pd.read_csv('../data/data_preprocessed/data_resampled_Xtest.csv')
ytest = pd.read_csv('../data/data_preprocessed/data_resampled_ytest.csv')

# The target files are read as DataFrames; keep only their first column so the
# estimators below receive a 1-D label vector.
ytrain = ytrain.iloc[:, 0] if isinstance(ytrain, pd.DataFrame) else ytrain
ytest = ytest.iloc[:, 0] if isinstance(ytest, pd.DataFrame) else ytest

# --- Standardise features ---------------------------------------------------
# The scaler statistics are learned on the training set only and then applied
# unchanged to the test set.
scaler = StandardScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)


In [None]:

# Base estimator and cross-validation scheme; both are seeded so the search is
# reproducible.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Hyper-parameter grid explored exhaustively: 2 * 3 * 2 * 2 = 24 candidates.
param_grid = {
    'n_estimators': [200, 500],
    'max_depth': [20, 40, None],
    'min_samples_split': [2, 10],
    'max_features': ['sqrt', 'log2'],
}

# Candidates are ranked by weighted F1 over the 3 stratified folds.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(Xtrain_scaled, ytrain)

# The search refits the best configuration on the full training set by default.
best_rf = grid_rf.best_estimator_
print("‚úÖ Meilleurs param√®tres Random Forest :", grid_rf.best_params_)


In [None]:

# Compute SHAP values of the tuned forest for every training sample.
explainer = shap.TreeExplainer(best_rf)
shap_values = explainer.shap_values(Xtrain_scaled)

# NOTE(review): recent shap releases (0.45+ per the release notes — confirm
# against the installed version) return a single array shaped
# (n_samples, n_features, n_classes) for classifiers, where older releases
# returned a list with one (n_samples, n_features) array per class.
# summary_plot interprets a 3-D array as interaction values, so convert it to
# the per-class list form, which is plotted as a multi-class summary by both
# old and new versions. Lists and 2-D arrays are passed through untouched.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

shap.summary_plot(shap_values, Xtrain_scaled, feature_names=Xtrain.columns.tolist())


In [None]:

# Keep the features whose importance in the tuned forest is at least the
# median importance.
# best_rf has already been fitted on (Xtrain_scaled, ytrain) by the grid
# search, so it is reused as-is (prefit=True). Calling fit_transform without
# prefit would clone the forest and train it a second time on the same data,
# only to obtain the same importances.
selector = SelectFromModel(best_rf, threshold='median', prefit=True)
Xtrain_reduced = selector.transform(Xtrain_scaled)
Xtest_reduced = selector.transform(Xtest_scaled)
print("‚úÖ Nombre de variables apr√®s r√©duction :", Xtrain_reduced.shape[1])


In [None]:

# Project the standardised training set onto its first two principal
# components, for visualisation only (the result is not fed to any model here).
pca = PCA(n_components=2)
X_vis = pca.fit_transform(Xtrain_scaled)
pc1, pc2 = X_vis[:, 0], X_vis[:, 1]

# One tiny marker per sample, coloured by its class label.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(pc1, pc2, c=ytrain, cmap='viridis', s=1)
ax.set_title("Projection PCA (2D)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.colorbar(points, ax=ax, label="Classe")
plt.show()


In [None]:

# Level-0 learners: a seeded random forest and a 5-nearest-neighbours model.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
knn_base = KNeighborsClassifier(n_neighbors=5)
estimators = [('rf', rf_base), ('knn', knn_base)]

# Level-1 learner: a logistic regression combines the base learners' outputs.
# The ensemble is trained on the reduced feature set.
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
).fit(Xtrain_reduced, ytrain)

# Evaluate on the held-out test set (same feature reduction applied).
y_pred_stack = stack_model.predict(Xtest_reduced)
print(classification_report(ytest, y_pred_stack))


In [None]:

# Row-normalised confusion matrix: each row sums to 1, so a cell reads as the
# share of a true class that was predicted as the column class.
cm = confusion_matrix(ytest, y_pred_stack, normalize='true')

# plot() returns the display itself, so construction and drawing can be chained.
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(
    cmap='Oranges',
    values_format='.2%',
)
disp.ax_.set_title("Matrice de confusion - Mod√®le empil√© (Stacking)")
plt.show()


In [None]:

# Persist the fitted stacking ensemble to the current working directory.
model_path = "stacked_model.joblib"
joblib.dump(stack_model, model_path)
print("‚úÖ Mod√®le sauvegard√© dans 'stacked_model.joblib'")
