## MODELE DE CLASSIFICATION

In [11]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve, auc,
    ConfusionMatrixDisplay, accuracy_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# Silence every warning category for the rest of the session, including any
# warning raised by the libraries imported above.
warnings.simplefilter("ignore")

# Render all columns when a wide DataFrame is displayed.
pd.options.display.max_columns = None

### 📂 Chargement des données

In [2]:
# Load the DPE dataset from the cleaned CSV export; the path is relative to
# the notebook's working directory.
data = pd.read_csv("../data/donnees_dpe_73_clean.csv", sep=",")

In [3]:
# Report the dataset dimensions.
print("Nombre de lignes :", data.shape[0])
print("Nombre de colonnes :", data.shape[1])

# Column count, dtypes and memory footprint (info() prints to stdout and
# returns None, so it must not be the cell's last expression).
data.info()

# Preview of the first rows. A notebook cell only renders its LAST expression:
# placed mid-cell, `data.head(3)` was evaluated and silently discarded.
data.head(3)

Nombre de lignes : 117708
Nombre de colonnes : 143
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117708 entries, 0 to 117707
Columns: 143 entries, numero_dpe to latitude
dtypes: float64(84), int64(4), object(55)
memory usage: 128.4+ MB


### 🧹 Nettoyage et préparation

In [4]:
# Name of the column to predict.
target = "etiquette_dpe"

# Explanatory variables
features_user = [
    "annee_construction",
    "surface_habitable_logement",
    "type_batiment",
    "type_energie_principale_chauffage",
    "classe_inertie_batiment",
    "qualite_isolation_murs",
    "qualite_isolation_menuiseries",
    "classe_altitude",
    "logement_traversant"
]

# Report absent columns BEFORE selecting them. Selecting first (as this cell
# used to do) raises KeyError on any missing column, so this report could
# never display a non-empty list.
missing_in_data = [f for f in features_user if f not in data.columns]
print("Variables manquantes dans le dataset :", missing_in_data)

# Feature matrix and target vector. pandas raises KeyError here if a requested
# column is missing; the list printed above then tells which one(s).
X = data[features_user]
y = data[target]


Variables manquantes dans le dataset : []


### 🎯 Création des cibles

In [5]:
# Encodage ordinal du DPE
etiquette_mapping = {"A":7, "B":6, "C":5, "D":4, "E":3, "F":2, "G":1}
y_ordinal = y.map(etiquette_mapping)

# Cible binaire : √©ligible MaPrimeR√©nov (E, F, G)
y_mpr = y.apply(lambda val: 1 if val in ["E", "F", "G"] else 0)

print("R√©partition multiclasses :")
print(y.value_counts(normalize=True))

print("\nR√©partition binaire (MPR) :")
print(y_mpr.value_counts(normalize=True))

R√©partition multiclasses :
etiquette_dpe
C    0.270908
E    0.237681
D    0.235991
F    0.099968
G    0.075025
B    0.052316
A    0.028112
Name: proportion, dtype: float64

R√©partition binaire (MPR) :
etiquette_dpe
0    0.587326
1    0.412674
Name: proportion, dtype: float64


### ✂️ Split des données

In [6]:
# Hold out 20% of the rows as a test set. Stratifying on the ordinal label
# keeps the class proportions equal in both subsets; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_ordinal,
    stratify=y_ordinal,
    random_state=42,
    test_size=0.2,
)
print("Train :", X_train.shape, " Test :", X_test.shape)

Train : (94166, 9)  Test : (23542, 9)


### ⚙️ Préprocessing

In [7]:
# Partition the feature columns by dtype: numeric ones vs. all the others.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(exclude="number").columns

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode. Categories unseen at fit time are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


### 🧠 Classification multiclasses

In [8]:
# Candidate classifiers, keyed by the display name used in reports and, later,
# in the saved file names.
models = {
    "Logistic Regression": LogisticRegression(max_iter=800),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15,min_samples_split=5, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
}

# Test-set accuracy of each model, keyed by model name.
results = {}

for name, model in models.items():
    # Preprocessing and estimator are chained so that the transformers are
    # fitted on the training rows only.
    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Same value as pipe.score(X_test, y_test), but computed from the
    # predictions already made instead of running a second prediction pass.
    score = accuracy_score(y_test, y_pred)
    results[name] = score
    print(f"\nüìä {name} ‚Äî Accuracy: {score:.3f}")
    print(classification_report(y_test, y_pred))

# Comparison table, best accuracy first (displayed as the cell output;
# results_df itself keeps the insertion order).
results_df = pd.DataFrame.from_dict(results, orient="index", columns=["Accuracy"])
results_df.sort_values(by="Accuracy", ascending=False)


üìä Logistic Regression ‚Äî Accuracy: 0.519
              precision    recall  f1-score   support

           1       0.46      0.46      0.46      1766
           2       0.23      0.01      0.02      2353
           3       0.45      0.59      0.51      5595
           4       0.44      0.48      0.46      5556
           5       0.66      0.73      0.69      6378
           6       0.51      0.36      0.42      1232
           7       0.70      0.44      0.54       662

    accuracy                           0.52     23542
   macro avg       0.49      0.44      0.44     23542
weighted avg       0.50      0.52      0.49     23542


üìä Random Forest ‚Äî Accuracy: 0.641
              precision    recall  f1-score   support

           1       0.62      0.51      0.56      1766
           2       0.54      0.21      0.30      2353
           3       0.55      0.70      0.62      5595
           4       0.58      0.64      0.61      5556
           5       0.78      0.80      0.79   

Unnamed: 0,Accuracy
Random Forest,0.641322
Gradient Boosting,0.587716
Logistic Regression,0.518648


### 🏡 Classification binaire (MPR)

In [9]:
# Dedicated 80/20 split for the binary task, stratified on the binary target
# (so its rows are not the same partition as the multiclass split).
X_train_mpr, X_test_mpr, y_train_mpr, y_test_mpr = train_test_split(
    X,
    y_mpr,
    stratify=y_mpr,
    random_state=42,
    test_size=0.2,
)

# Test-set ROC AUC of each model, keyed by model name.
results_mpr = {}

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
    pipe.fit(X_train_mpr, y_train_mpr)

    # Hard labels feed the classification report; the second predict_proba
    # column (class label 1) feeds the AUC.
    y_pred_mpr = pipe.predict(X_test_mpr)
    y_proba_mpr = pipe.predict_proba(X_test_mpr)[:, 1]

    auc_score = results_mpr[name] = roc_auc_score(y_test_mpr, y_proba_mpr)

    print(f"\nüè† {name} ‚Äî AUC: {auc_score:.3f}")
    print(classification_report(y_test_mpr, y_pred_mpr))

# Comparison table, best AUC first (displayed as the cell output).
results_mpr_df = pd.DataFrame.from_dict(results_mpr, orient="index", columns=["AUC"])
results_mpr_df.sort_values(by="AUC", ascending=False)


üè† Logistic Regression ‚Äî AUC: 0.916
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     13827
           1       0.81      0.78      0.79      9715

    accuracy                           0.83     23542
   macro avg       0.83      0.82      0.83     23542
weighted avg       0.83      0.83      0.83     23542


üè† Random Forest ‚Äî AUC: 0.951
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     13827
           1       0.84      0.86      0.85      9715

    accuracy                           0.87     23542
   macro avg       0.87      0.87      0.87     23542
weighted avg       0.87      0.87      0.87     23542


üè† Gradient Boosting ‚Äî AUC: 0.935
              precision    recall  f1-score   support

           0       0.88      0.87      0.87     13827
           1       0.82      0.82      0.82      9715

    accuracy                           0.85     23542
   macro avg 

Unnamed: 0,AUC
Random Forest,0.951095
Gradient Boosting,0.934555
Logistic Regression,0.915633


### 💾 Sauvegarde des meilleurs modèles (multiclasses et binaire)

In [12]:
# üîé Meilleur mod√®le (multiclasses)
best_model_multi_name = results_df["Accuracy"].idxmax()
best_model_multi = models[best_model_multi_name]
best_model_multi.fit(preprocessor.transform(X_train), y_train)

# üîé Meilleur mod√®le (binaire)
best_model_bin_name = results_mpr_df["AUC"].idxmax()
best_model_bin = models[best_model_bin_name]
best_model_bin.fit(preprocessor.transform(X_train_mpr), y_train_mpr)

# üì¶ Sauvegarde all√©g√©e et compress√©e
os.makedirs("../models", exist_ok=True)
joblib.dump(preprocessor, "../models/preprocessor_classif.pkl", compress=3)
joblib.dump(best_model_multi, f"../models/model_DPE_{best_model_multi_name.replace(' ','_')}.pkl", compress=3)
joblib.dump(best_model_bin, f"../models/model_MPR_{best_model_bin_name.replace(' ','_')}.pkl", compress=3)

print("\n‚úÖ Sauvegarde termin√©e avec succ√®s (mod√®les compress√©s) !")
print(f"üìò Meilleur mod√®le multiclasses : {best_model_multi_name} (Accuracy={results_df.loc[best_model_multi_name,'Accuracy']:.3f})")
print(f"üìó Meilleur mod√®le binaire : {best_model_bin_name} (AUC={results_mpr_df.loc[best_model_bin_name,'AUC']:.3f})")
print("üíæ Fichiers enregistr√©s : preprocessor_classif.pkl + model_DPE.pkl + model_MPR.pkl")



‚úÖ Sauvegarde termin√©e avec succ√®s (mod√®les compress√©s) !
üìò Meilleur mod√®le multiclasses : Random Forest (Accuracy=0.641)
üìó Meilleur mod√®le binaire : Random Forest (AUC=0.951)
üíæ Fichiers enregistr√©s : preprocessor_classif.pkl + model_DPE.pkl + model_MPR.pkl
