## MODELE DE CLASSIFICATION

In [None]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score,
    roc_curve, auc, ConfusionMatrixDisplay
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides library warnings that may matter
# (e.g. solver convergence or deprecations) — consider narrowing the filter.
warnings.simplefilter("ignore")

# Show all columns when a DataFrame is displayed (None = no column limit).
pd.options.display.max_columns = None

### 📂 Chargement des données

In [18]:
# Read the comma-separated DPE dataset into a DataFrame.
csv_path = "../data/donnees_dpe_71_clean.csv"
data = pd.read_csv(csv_path, sep=",")

In [19]:
# Quick overview of the loaded dataset: size, column summary, first rows.
print("Nombre de lignes :", data.shape[0])
print("Nombre de colonnes :", data.shape[1])

# info() prints its summary itself, so it can run mid-cell.
data.info()

# Only the LAST expression of a cell is rendered by Jupyter: the preview must
# come last, otherwise it is evaluated and silently discarded.
data.head(3)

Nombre de lignes : 87615
Nombre de colonnes : 141
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87615 entries, 0 to 87614
Columns: 141 entries, numero_dpe to latitude
dtypes: float64(83), int64(5), object(53)
memory usage: 94.3+ MB


### 🧹 Nettoyage et préparation

In [None]:
# Name of the column to predict (the DPE energy label).
target = "etiquette_dpe"

# Explanatory variables: the only columns fed to the models.
features_user = [
    "annee_construction",
    "surface_habitable_logement",
    "type_batiment",
    "type_energie_principale_chauffage",
    "classe_inertie_batiment",
    "qualite_isolation_murs",
    "qualite_isolation_menuiseries",
    "classe_altitude",
    "logement_traversant"
]

# Report missing columns BEFORE indexing the frame: selecting first would raise
# a KeyError ahead of this diagnostic and the message would never be shown.
missing_in_data = [f for f in features_user if f not in data.columns]
print("Variables manquantes dans le dataset :", missing_in_data)

# Feature matrix and raw (string) target; raises KeyError if a column is absent.
X = data[features_user]
y = data[target]


Variables manquantes dans le dataset : []


### 🎯 Création des cibles

In [21]:
# Ordinal encoding of the DPE label: "A" maps to 7 down to "G" which maps to 1.
etiquette_mapping = {"A":7, "B":6, "C":5, "D":4, "E":3, "F":2, "G":1}
y_ordinal = y.map(etiquette_mapping)

# Series.map yields NaN for any label absent from the mapping; fail here with
# an explicit message instead of letting a NaN target reach the estimators.
unknown_labels = y[y_ordinal.isna()].unique().tolist()
if unknown_labels:
    raise ValueError(f"Etiquettes DPE inconnues dans la cible : {unknown_labels}")

# Binary target: 1 when the label is E, F or G (MaPrimeRenov eligibility), else 0.
# Vectorised membership test instead of a row-by-row Python lambda.
y_mpr = y.isin(["E", "F", "G"]).astype(int)

# Class shares for both targets, to expose class imbalance.
print("R√©partition multiclasses :")
print(y.value_counts(normalize=True))

print("\nR√©partition binaire (MPR) :")
print(y_mpr.value_counts(normalize=True))

R√©partition multiclasses :
etiquette_dpe
D    0.318827
E    0.229219
C    0.222690
F    0.106352
G    0.067603
A    0.028968
B    0.026343
Name: proportion, dtype: float64

R√©partition binaire (MPR) :
etiquette_dpe
0    0.596827
1    0.403173
Name: proportion, dtype: float64


### ✂️ Split des données

In [22]:
# Hold out 20 % of the rows as a test set for the multiclass task.
# The split is stratified on the ordinal label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_ordinal,
    test_size=0.2,
    stratify=y_ordinal,
    random_state=42,
)
print("Train :", X_train.shape, " Test :", X_test.shape)

Train : (70092, 9)  Test : (17523, 9)


### ⚙️ Préprocessing

In [23]:
# Column groups are inferred from the training frame's dtypes:
# numeric columns on one side, everything else treated as categorical.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(exclude="number").columns

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


### 🧠 Classification multiclasses

In [None]:
# Candidate classifiers, used as unfitted templates and compared on one split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=800),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15,min_samples_split=5, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
}

# Test accuracy per model name.
results = {}

for name, model in models.items():
    # Pipeline does not copy its steps: fitting it would fit the shared
    # `preprocessor` and the template in `models` in place. Clone both so each
    # pipeline owns fresh estimators and later cells start from unfitted ones.
    pipe = Pipeline([
        ("preprocess", clone(preprocessor)),
        ("model", clone(model))
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Same value as pipe.score(X_test, y_test), but reuses the predictions
    # computed above instead of predicting the whole test set a second time.
    score = accuracy_score(y_test, y_pred)
    results[name] = score
    print(f"\nüìä {name} ‚Äî Accuracy: {score:.3f}")
    print(classification_report(y_test, y_pred))

# Comparison table; the sorted view below is display-only, results_df keeps
# its insertion order.
results_df = pd.DataFrame.from_dict(results, orient="index", columns=["Accuracy"])
results_df.sort_values(by="Accuracy", ascending=False)


üìä Logistic Regression ‚Äî Accuracy: 0.482
              precision    recall  f1-score   support

           1       0.53      0.48      0.50      1184
           2       0.40      0.17      0.24      1863
           3       0.39      0.37      0.38      4017
           4       0.46      0.64      0.53      5587
           5       0.60      0.50      0.55      3902
           6       0.44      0.21      0.28       462
           7       0.81      0.83      0.82       508

    accuracy                           0.48     17523
   macro avg       0.52      0.46      0.47     17523
weighted avg       0.48      0.48      0.47     17523


üìä Random Forest ‚Äî Accuracy: 0.559
              precision    recall  f1-score   support

           1       0.54      0.44      0.48      1184
           2       0.41      0.23      0.30      1863
           3       0.45      0.53      0.49      4017
           4       0.55      0.65      0.60      5587
           5       0.73      0.62      0.67   

Unnamed: 0,Accuracy
Random Forest,0.559208
Gradient Boosting,0.538207
Logistic Regression,0.48171


### 🏡 Classification binaire (MPR)

In [27]:
# Dedicated split for the binary MPR target, stratified on that target.
X_train_mpr, X_test_mpr, y_train_mpr, y_test_mpr = train_test_split(
    X, y_mpr, test_size=0.2, random_state=42, stratify=y_mpr
)

# Test ROC AUC per model name.
results_mpr = {}

for name, model in models.items():
    # Clone the shared preprocessor and the model template: fitting them in
    # place would overwrite whatever state other cells fitted on the very same
    # objects (the multiclass task reuses `models` and `preprocessor`).
    pipe = Pipeline([
        ("preprocess", clone(preprocessor)),
        ("model", clone(model))
    ])
    pipe.fit(X_train_mpr, y_train_mpr)
    y_pred_mpr = pipe.predict(X_test_mpr)
    # Second probability column = positive class (label 1, the larger of {0, 1}).
    y_proba_mpr = pipe.predict_proba(X_test_mpr)[:, 1]
    
    auc_score = roc_auc_score(y_test_mpr, y_proba_mpr)
    results_mpr[name] = auc_score
    
    print(f"\nüè† {name} ‚Äî AUC: {auc_score:.3f}")
    print(classification_report(y_test_mpr, y_pred_mpr))

# Comparison table; the sorted view below is display-only.
results_mpr_df = pd.DataFrame.from_dict(results_mpr, orient="index", columns=["AUC"])
results_mpr_df.sort_values(by="AUC", ascending=False)


üè† Logistic Regression ‚Äî AUC: 0.864
              precision    recall  f1-score   support

           0       0.80      0.84      0.82     10458
           1       0.75      0.69      0.72      7065

    accuracy                           0.78     17523
   macro avg       0.77      0.77      0.77     17523
weighted avg       0.78      0.78      0.78     17523


üè† Random Forest ‚Äî AUC: 0.898
              precision    recall  f1-score   support

           0       0.85      0.84      0.84     10458
           1       0.77      0.78      0.77      7065

    accuracy                           0.81     17523
   macro avg       0.81      0.81      0.81     17523
weighted avg       0.82      0.81      0.81     17523


üè† Gradient Boosting ‚Äî AUC: 0.888
              precision    recall  f1-score   support

           0       0.83      0.84      0.84     10458
           1       0.76      0.75      0.75      7065

    accuracy                           0.80     17523
   macro avg 

Unnamed: 0,AUC
Random Forest,0.89796
Gradient Boosting,0.887778
Logistic Regression,0.864488


### 💾 Sauvegarde des meilleurs modèles (multiclasses et binaire)

In [29]:
# joblib.dump does not create missing directories: make sure the target exists.
Path("../models").mkdir(parents=True, exist_ok=True)

# Best multiclass model: highest test accuracy.
best_model_multi_name = results_df["Accuracy"].idxmax()
# Clone so this pipeline owns its estimator. Without it, when the same model
# wins both tasks, fitting the binary pipeline below would re-fit this very
# object and silently turn the in-memory multiclass pipeline into a binary one.
best_model_multi = clone(models[best_model_multi_name])

final_pipe_multi = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", best_model_multi)
])
# Refit on the multiclass training split, then persist the whole pipeline.
final_pipe_multi.fit(X_train, y_train)
joblib.dump(final_pipe_multi, f"../models/pipeline_DPE_{best_model_multi_name.replace(' ','_')}.pkl")

# Best binary model: highest test ROC AUC.
best_model_bin_name = results_mpr_df["AUC"].idxmax()
best_model_bin = clone(models[best_model_bin_name])

final_pipe_bin = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", best_model_bin)
])
# Refit on the binary training split, then persist the whole pipeline.
final_pipe_bin.fit(X_train_mpr, y_train_mpr)
joblib.dump(final_pipe_bin, f"../models/pipeline_MPR_{best_model_bin_name.replace(' ','_')}.pkl")

print("\n‚úÖ Sauvegardes termin√©es avec succ√®s !")
print("üìò Meilleur mod√®le multiclasses :", best_model_multi_name, f"(Accuracy={results_df.loc[best_model_multi_name,'Accuracy']:.3f})")
print("üìó Meilleur mod√®le binaire :", best_model_bin_name, f"(AUC={results_mpr_df.loc[best_model_bin_name,'AUC']:.3f})")


‚úÖ Sauvegardes termin√©es avec succ√®s !
üìò Meilleur mod√®le multiclasses : Random Forest (Accuracy=0.559)
üìó Meilleur mod√®le binaire : Random Forest (AUC=0.898)
