## MODELE DE CLASSIFICATION

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import class_weight
import joblib
import glob
from sklearn.utils import class_weight
import copy


pd.set_option("display.max_columns", None)

### üìÇChargement des donn√©es 

In [2]:
DATA_PATH = "../data/donnees_dpe_73_clean.csv"
df = pd.read_csv(DATA_PATH)
print("üì¶ Donn√©es :", df.shape)

TARGET_DPE = "etiquette_dpe"
FEATURES = [
    "annee_construction",
    "surface_habitable_logement",
    "type_batiment",
    "type_energie_principale_chauffage",
    "classe_inertie_batiment",
    "qualite_isolation_murs",
    "qualite_isolation_menuiseries",
    "classe_altitude",
    "logement_traversant",
]

missing = [c for c in FEATURES if c not in df.columns]
if missing:
    raise KeyError(f"Colonnes manquantes dans le dataset: {missing}")

X = df[FEATURES].copy()
y = df[TARGET_DPE].astype(str).str.strip()

üì¶ Donn√©es : (117708, 143)


### üßπ Nettoyage et pr√©paration

In [3]:
# Harmonisation des valeurs de logement_traversant
if "logement_traversant" in X.columns:
    if X["logement_traversant"].dtype == object:
        X["logement_traversant"] = (
            X["logement_traversant"]
            .astype(str)
            .str.strip()
            .str.lower()
            .map({"oui": 1, "non": 0})
        )
    X["logement_traversant"] = pd.to_numeric(X["logement_traversant"], errors="coerce")

# Nettoyage l√©ger des cha√Ænes cat√©gorielles
cat_cols_text = [
    "type_batiment",
    "type_energie_principale_chauffage",
    "classe_inertie_batiment",
    "qualite_isolation_murs",
    "qualite_isolation_menuiseries",
    "classe_altitude"
]
for c in cat_cols_text:
    X[c] = X[c].astype(str).str.strip()

# Cible binaire MPR : E/F/G -> 1
y_mpr = y.isin(["E", "F", "G"]).astype(int)

print("\nR√©partition DPE (multiclasses) :")
print(y.value_counts(normalize=True).round(3).sort_index())
print("\nR√©partition binaire (MPR) :")
print(y_mpr.value_counts(normalize=True).round(3).sort_index())


R√©partition DPE (multiclasses) :
etiquette_dpe
A    0.028
B    0.052
C    0.271
D    0.236
E    0.238
F    0.100
G    0.075
Name: proportion, dtype: float64

R√©partition binaire (MPR) :
etiquette_dpe
0    0.587
1    0.413
Name: proportion, dtype: float64


### ‚úÇÔ∏è Split des donn√©es

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train_mpr, X_test_mpr, y_train_mpr, y_test_mpr = train_test_split(
    X, y_mpr, test_size=0.2, random_state=42, stratify=y_mpr
)

print("\nSplit DPE -> Train:", X_train.shape, " Test:", X_test.shape)
print("Split MPR -> Train:", X_train_mpr.shape, " Test:", X_test_mpr.shape)


Split DPE -> Train: (94166, 9)  Test: (23542, 9)
Split MPR -> Train: (94166, 9)  Test: (23542, 9)


###  ‚öôÔ∏è Pr√©processing

In [5]:
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in FEATURES if c not in num_cols]

numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])

### ü§ñ Mod√®les Candidats

In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, max_depth=15, min_samples_split=5, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42),
}

### üß† Evaluation DPE (multiclasses A..G)

In [7]:
print("\n================ DPE (A..G) ================")
results = []
for name, model in models.items():
    pipe = Pipeline([("preprocess", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average="macro")

    results.append((name, acc, f1m))
    print(f"\nüìä {name}")
    print(f"Accuracy: {acc:.3f} | Macro-F1: {f1m:.3f}")
    print(classification_report(y_test, y_pred, zero_division=0))

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "MacroF1"]).set_index("Model")
display(results_df.sort_values(by=["MacroF1","Accuracy"], ascending=False))

best_model_multi_name = results_df.sort_values(by=["MacroF1","Accuracy"], ascending=False).index[0]
best_model_multi = models[best_model_multi_name]
print(f"\nüèÜ Meilleur mod√®le DPE: {best_model_multi_name}")



üìä Logistic Regression
Accuracy: 0.521 | Macro-F1: 0.447
              precision    recall  f1-score   support

           A       0.73      0.49      0.59       662
           B       0.59      0.28      0.38      1232
           C       0.66      0.75      0.70      6378
           D       0.44      0.49      0.46      5556
           E       0.45      0.59      0.51      5595
           F       0.40      0.02      0.03      2353
           G       0.46      0.45      0.46      1766

    accuracy                           0.52     23542
   macro avg       0.53      0.44      0.45     23542
weighted avg       0.52      0.52      0.49     23542


üìä Random Forest
Accuracy: 0.642 | Macro-F1: 0.613
              precision    recall  f1-score   support

           A       0.82      0.70      0.75       662
           B       0.76      0.57      0.65      1232
           C       0.79      0.80      0.79      6378
           D       0.58      0.64      0.61      5556
           E     

Unnamed: 0_level_0,Accuracy,MacroF1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Random Forest,0.642129,0.612868
Gradient Boosting,0.582278,0.539953
Logistic Regression,0.521281,0.447051



üèÜ Meilleur mod√®le DPE: Random Forest


### üè° Evaluation MPR (binaire)

In [8]:
print("\n================ MPR (binaire) ================")
results_mpr = []
for name, model in models.items():
    pipe = Pipeline([("preprocess", preprocessor), ("model", model)])
    pipe.fit(X_train_mpr, y_train_mpr)
    y_pred = pipe.predict(X_test_mpr)
    proba = pipe.predict_proba(X_test_mpr)[:, 1] if hasattr(pipe, "predict_proba") else None

    f1m = f1_score(y_test_mpr, y_pred, average="macro")
    auc = roc_auc_score(y_test_mpr, proba) if proba is not None else np.nan

    results_mpr.append((name, auc, f1m))
    print(f"\nüè† {name}")
    print(f"AUC: {auc:.3f} | Macro-F1: {f1m:.3f}")
    print(classification_report(y_test_mpr, y_pred, zero_division=0))

results_mpr_df = pd.DataFrame(results_mpr, columns=["Model", "AUC", "MacroF1"]).set_index("Model")
display(results_mpr_df.sort_values(by=["AUC","MacroF1"], ascending=False))

best_model_bin_name = results_mpr_df.sort_values(by=["AUC","MacroF1"], ascending=False).index[0]
best_model_bin = models[best_model_bin_name]
print(f"\nüèÜ Meilleur mod√®le MPR: {best_model_bin_name}")



üè† Logistic Regression
AUC: 0.916 | Macro-F1: 0.826
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     13827
           1       0.81      0.78      0.79      9715

    accuracy                           0.83     23542
   macro avg       0.83      0.82      0.83     23542
weighted avg       0.83      0.83      0.83     23542


üè† Random Forest
AUC: 0.950 | Macro-F1: 0.868
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     13827
           1       0.84      0.85      0.85      9715

    accuracy                           0.87     23542
   macro avg       0.87      0.87      0.87     23542
weighted avg       0.87      0.87      0.87     23542


üè† Gradient Boosting
AUC: 0.932 | Macro-F1: 0.845
              precision    recall  f1-score   support

           0       0.87      0.87      0.87     13827
           1       0.82      0.82      0.82      9715

    accuracy            

Unnamed: 0_level_0,AUC,MacroF1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Random Forest,0.950354,0.868372
Gradient Boosting,0.931705,0.845405
Logistic Regression,0.915618,0.825611



üèÜ Meilleur mod√®le MPR: Random Forest


###  üèÅ Entra√Ænement final

In [9]:

print("\n============= Entra√Ænement final (s√©curis√©) =============")

# --- V√©rification des cibles ---
unique_y = sorted(pd.Series(y).unique())
unique_y_mpr = sorted(pd.Series(y_mpr).unique())
print("üß© V√©rification des cibles :")
print(" - DPE :", unique_y)
print(" - MPR :", unique_y_mpr)

if not all(c in list("ABCDEFG") for c in unique_y):
    raise ValueError(f"‚ö†Ô∏è ERREUR : la cible DPE contient des valeurs inattendues : {unique_y}")
if not all(c in [0, 1] for c in unique_y_mpr):
    raise ValueError(f"‚ö†Ô∏è ERREUR : la cible MPR contient des valeurs inattendues : {unique_y_mpr}")

# --- DPE multiclasses ---
weights = class_weight.compute_sample_weight("balanced", y)

# ‚ö†Ô∏è cr√©er une copie ind√©pendante du meilleur mod√®le
best_model_multi_copy = copy.deepcopy(best_model_multi)

pipe_dpe = Pipeline([
    ("preprocess", preprocessor),
    ("model", best_model_multi_copy)
])

print("\nüöÄ Entra√Ænement du mod√®le DPE multiclasses...")
pipe_dpe.fit(X, y, model__sample_weight=weights)
print("‚úÖ Mod√®le DPE entra√Æn√© avec succ√®s !")
print("   Classes :", pipe_dpe.named_steps["model"].classes_)

# --- MPR binaire ---
# ‚ö†Ô∏è idem, copie ind√©pendante
best_model_bin_copy = copy.deepcopy(best_model_bin)

pipe_mpr = Pipeline([
    ("preprocess", preprocessor),
    ("model", best_model_bin_copy)
])

print("\nüöÄ Entra√Ænement du mod√®le MPR binaire...")
pipe_mpr.fit(X, y_mpr)
print("‚úÖ Mod√®le MPR entra√Æn√© avec succ√®s !")
print("   Classes :", pipe_mpr.named_steps["model"].classes_)

print("\nüèÅ Entra√Ænement final termin√© avec succ√®s.")


üß© V√©rification des cibles :
 - DPE : ['A', 'B', 'C', 'D', 'E', 'F', 'G']
 - MPR : [np.int64(0), np.int64(1)]

üöÄ Entra√Ænement du mod√®le DPE multiclasses...
‚úÖ Mod√®le DPE entra√Æn√© avec succ√®s !
   Classes : ['A' 'B' 'C' 'D' 'E' 'F' 'G']

üöÄ Entra√Ænement du mod√®le MPR binaire...
‚úÖ Mod√®le MPR entra√Æn√© avec succ√®s !
   Classes : [0 1]

üèÅ Entra√Ænement final termin√© avec succ√®s.


### üßπ Nettoyage anciens mod√®les avant sauvegarde

In [10]:
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

for f in glob.glob(os.path.join(MODELS_DIR, "model_DPE_*.pkl")) + glob.glob(os.path.join(MODELS_DIR, "model_MPR_*.pkl")):
    os.remove(f)
    print("üóëÔ∏è Ancien mod√®le supprim√© :", os.path.basename(f))

### üíæ Sauvegarde du meilleur mod√®le binaire

In [11]:
dpe_name = f"model_DPE_{best_model_multi_name.replace(' ','_')}.pkl"
mpr_name = f"model_MPR_{best_model_bin_name.replace(' ','_')}.pkl"

joblib.dump(pipe_dpe, os.path.join(MODELS_DIR, dpe_name), compress=3)
joblib.dump(pipe_mpr, os.path.join(MODELS_DIR, mpr_name), compress=3)
joblib.dump(pipe_dpe, os.path.join(MODELS_DIR, "model_DPE_latest.pkl"), compress=3)
joblib.dump(pipe_mpr, os.path.join(MODELS_DIR, "model_MPR_latest.pkl"), compress=3)

print("\n‚úÖ Sauvegarde termin√©e avec succ√®s.")


‚úÖ Sauvegarde termin√©e avec succ√®s.


### ‚úÖ Fonction de contr√¥le post-sauvegarde


In [12]:
def inspect_models(folder="../models"):
    """Affiche un r√©sum√© clair des mod√®les enregistr√©s"""
    import datetime
    print(f"\nüîç V√©rification des mod√®les dans {folder}\n")
    for f in sorted(glob.glob(os.path.join(folder, "*.pkl"))):
        try:
            pipe = joblib.load(f)
            model = pipe.named_steps.get("model", None)
            classes = getattr(model, "classes_", None)
            n_classes = len(classes) if classes is not None else "-"
            mtime = datetime.datetime.fromtimestamp(os.path.getmtime(f)).strftime("%Y-%m-%d %H:%M")
            print(f"{os.path.basename(f):40}  {type(model).__name__:<25}  {n_classes} classes  {classes}  ({mtime})")
        except Exception as e:
            print(f"{os.path.basename(f):40}  [Erreur lecture] {e}")

# Lancer le contr√¥le
inspect_models(MODELS_DIR)

# V√©rification sp√©cifique DPE
classes_dpe = pipe_dpe.named_steps["model"].classes_
if not all(c in list("ABCDEFG") for c in classes_dpe):
    raise ValueError(f"‚ö†Ô∏è Le mod√®le DPE semble incorrect (classes trouv√©es: {classes_dpe})")
else:
    print("\n‚úÖ V√©rification OK : mod√®le DPE multiclasses A..G sauvegard√© correctement !")


üîç V√©rification des mod√®les dans ../models

model_CONSO_Random_Forest.pkl             [Erreur lecture] 'RandomForestRegressor' object has no attribute 'named_steps'
model_DPE_Random_Forest.pkl               RandomForestClassifier     7 classes  ['A' 'B' 'C' 'D' 'E' 'F' 'G']  (2025-11-02 16:18)
model_DPE_latest.pkl                      RandomForestClassifier     7 classes  ['A' 'B' 'C' 'D' 'E' 'F' 'G']  (2025-11-02 16:18)
model_MPR_Random_Forest.pkl               RandomForestClassifier     2 classes  [0 1]  (2025-11-02 16:18)
model_MPR_latest.pkl                      RandomForestClassifier     2 classes  [0 1]  (2025-11-02 16:18)
preprocessor_conso.pkl                    [Erreur lecture] 'ColumnTransformer' object has no attribute 'named_steps'

‚úÖ V√©rification OK : mod√®le DPE multiclasses A..G sauvegard√© correctement !


In [13]:
# ============================================================
# üîç TEST LOCAL DES MOD√àLES (DPE / MPR / CONSO)
# ============================================================

import joblib
import pandas as pd

# --- Chargement des mod√®les sauvegard√©s ---
MODELS_DIR = "../models"
model_dpe = joblib.load(f"{MODELS_DIR}/model_DPE_latest.pkl")
model_mpr = joblib.load(f"{MODELS_DIR}/model_MPR_latest.pkl")
model_conso = joblib.load(f"{MODELS_DIR}/model_CONSO_Random_Forest.pkl")
preproc_conso = joblib.load(f"{MODELS_DIR}/preprocessor_conso.pkl")

# --- D√©finition de sc√©narios de test ---
scenarios = pd.DataFrame([
    {
        "Nom": "Scenario 1 - Maison r√©cente √©lec",
        "annee_construction": 2015,
        "surface_habitable_logement": 110,
        "type_batiment": "maison",
        "type_energie_principale_chauffage": "√âlectricit√©",
        "classe_inertie_batiment": "Moyenne",
        "qualite_isolation_murs": "bonne",
        "qualite_isolation_menuiseries": "bonne",
        "classe_altitude": "inf√©rieur √† 400m",
        "logement_traversant": 1,
    },
    {
        "Nom": "Scenario 2 - Appartement ancien fioul",
        "annee_construction": 1960,
        "surface_habitable_logement": 70,
        "type_batiment": "appartement",
        "type_energie_principale_chauffage": "Fioul domestique",
        "classe_inertie_batiment": "L√©g√®re",
        "qualite_isolation_murs": "insuffisante",
        "qualite_isolation_menuiseries": "insuffisante",
        "classe_altitude": "400-800m",
        "logement_traversant": 0,
    },
    {
        "Nom": "Scenario 3 - Maison ancienne bois",
        "annee_construction": 1955,
        "surface_habitable_logement": 120,
        "type_batiment": "maison",
        "type_energie_principale_chauffage": "Bois ‚Äì B√ªches",
        "classe_inertie_batiment": "Lourde",
        "qualite_isolation_murs": "moyenne",
        "qualite_isolation_menuiseries": "moyenne",
        "classe_altitude": "inf√©rieur √† 400m",
        "logement_traversant": 1,
    },
])

# --- Pr√©dictions ---
scenarios["DPE"] = model_dpe.predict(scenarios.drop(columns=["Nom"]))
scenarios["MaPrimeR√©nov"] = model_mpr.predict(scenarios.drop(columns=["Nom"])).astype(int)
scenarios["MaPrimeR√©nov"] = scenarios["MaPrimeR√©nov"].map({1: "‚úÖ Oui", 0: "‚ùå Non"})

X_conso = preproc_conso.transform(scenarios.drop(columns=["Nom", "DPE", "MaPrimeR√©nov"]))
scenarios["Conso (kWh/m¬≤/an)"] = model_conso.predict(X_conso).round(1)

print("\n=== R√©sultats des sc√©narios ===")
display(scenarios[["Nom", "DPE", "MaPrimeR√©nov", "Conso (kWh/m¬≤/an)"]])



=== R√©sultats des sc√©narios ===


Unnamed: 0,Nom,DPE,MaPrimeR√©nov,Conso (kWh/m¬≤/an)
0,Scenario 1 - Maison r√©cente √©lec,B,‚ùå Non,119.7
1,Scenario 2 - Appartement ancien fioul,G,‚úÖ Oui,428.1
2,Scenario 3 - Maison ancienne bois,D,‚ùå Non,254.4
