## MODELE DE REGRESSION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Notebook-wide settings: silence every Python warning and make pandas show
# all columns whenever a DataFrame is displayed.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None

### 📂 Chargement des données

In [3]:
# Read the cleaned DPE extract from the data folder and report its dimensions.
data = pd.read_csv("../data/donnees_dpe_73_clean.csv", sep=",")
n_rows, n_cols = data.shape
print("Nombre de lignes :", n_rows)
print("Nombre de colonnes :", n_cols)
# Last expression of the cell: shows the first three rows.
data.head(3)

Nombre de lignes : 117708
Nombre de colonnes : 143


Unnamed: 0,numero_dpe,date_derniere_modification_dpe,date_visite_diagnostiqueur,date_etablissement_dpe,date_reception_dpe,date_fin_validite_dpe,modele_dpe,version_dpe,methode_application_dpe,etiquette_dpe,etiquette_ges,classe_altitude,zone_climatique,type_batiment,type_installation_chauffage,type_installation_ecs,hauteur_sous_plafond,nombre_appartement,nombre_niveau_logement,surface_habitable_immeuble,surface_habitable_logement,classe_inertie_batiment,adresse_ban,numero_voie_ban,nom_rue_ban,nom_commune_ban,code_postal_ban,code_insee_ban,code_departement_ban,code_region_ban,identifiant_ban,score_ban,statut_geocodage,adresse_brut,nom_commune_brut,code_postal_brut,numero_etage_appartement,complement_adresse_batiment,complement_adresse_logement,indicateur_confort_ete,protection_solaire_exterieure,logement_traversant,presence_brasseur_air,inertie_lourde,isolation_toiture,deperditions_enveloppe,deperditions_ponts_thermiques,deperditions_murs,deperditions_planchers_hauts,deperditions_planchers_bas,deperditions_portes,deperditions_baies_vitrees,deperditions_renouvellement_air,qualite_isolation_enveloppe,qualite_isolation_murs,qualite_isolation_plancher_bas,qualite_isolation_menuiseries,ubat_w_par_m2_k,besoin_chauffage,besoin_ecs,besoin_refroidissement,conso_5_usages_ep,conso_5_usages_par_m2_ep,conso_chauffage_ep,conso_ecs_ep,conso_refroidissement_ep,conso_eclairage_ep,conso_auxiliaires_ep,conso_5_usages_ef,conso_5_usages_par_m2_ef,conso_chauffage_ef,conso_ecs_ef,conso_refroidissement_ef,conso_eclairage_ef,conso_auxiliaires_ef,emission_ges_5_usages,emission_ges_5_usages_par_m2,emission_ges_chauffage,emission_ges_ecs,emission_ges_refroidissement,emission_ges_eclairage,emission_ges_auxiliaires,type_energie_n1,conso_5_usages_ef_energie_n1,conso_chauffage_ef_energie_n1,conso_ecs_ef_energie_n1,cout_total_5_usages_energie_n1,cout_chauffage_energie_n1,cout_ecs_energie_n1,emission_ges_5_usages_energie_n1,emission_ges_chauffage_energie_n1,emission_ges_ecs_energie_n1,type_energie_n2,c
onso_5_usages_ef_energie_n2,conso_chauffage_ef_energie_n2,conso_ecs_ef_energie_n2,cout_total_5_usages_energie_n2,cout_chauffage_energie_n2,cout_ecs_energie_n2,emission_ges_5_usages_energie_n2,emission_ges_chauffage_energie_n2,emission_ges_ecs_energie_n2,cout_total_5_usages,cout_chauffage,cout_ecs,cout_refroidissement,cout_eclairage,cout_auxiliaires,type_energie_principale_chauffage,type_generateur_chauffage_principal,type_installation_chauffage_n1,type_emetteur_installation_chauffage_n1,configuration_installation_chauffage_n1,description_installation_chauffage_n1,conso_chauffage_installation_chauffage_n1,surface_chauffee_installation_chauffage_n1,type_generateur_n1_installation_n1,type_energie_generateur_n1_installation_n1,usage_generateur_n1_installation_n1,conso_chauffage_generateur_n1_installation_n1,type_energie_principale_ecs,type_generateur_chauffage_principal_ecs,type_installation_ecs_n1,configuration_installation_ecs_n1,description_installation_ecs_n1,conso_ef_installation_ecs_n1,nombre_logements_desservis_par_installation_ecs_n1,surface_habitable_desservie_par_installation_ecs_n1,type_installation_solaire_n1,type_generateur_n1_ecs_n1,type_energie_generateur_n1_ecs_n1,usage_generateur_n1_ecs_n1,description_generateur_n1_ecs_n1,volume_stockage_generateur_n1_ecs_n1,conso_ef_generateur_n1_ecs_n1,ventilation_posterieure_2012,production_electricite_pv_kwhep_par_an,Logement,annee_construction,annee_reception_DPE,periode_construction,longitude,latitude
0,2173N0055368S,2023-11-08,2021-06-30,2021-06-30,2021-07-01,2031-06-29,DPE NEUF logement : RT2012,1.0,dpe issu d'une √©tude thermique r√©glementaire R...,A,A,,,maison,,,2.5,1.0,1.0,95.4,95.4,,Rue de la Gare 73100 Gr√©sy-sur-Aix,,271 RUE DE LA GARE,Gr√©sy-sur-Aix,73100,73128,73.0,84.0,73128_0144,0.55,adresse g√©ocod√©e ban √† l'adresse,271 RUE DE LA GARE,Gr√©sy-sur-Aix,73100,,,,,,,,,,,,,,,,,,tr√®s bonne,tr√®s bonne,tr√®s bonne,tr√®s bonne,0.39,,,,5956.5,62.3,3859.4,1773.9,0.0,447.5,139.8,2589.8,27.1,1678.0,547.0,0.0,194.6,60.8,192.4,2.0,132.6,50.1,0.0,13.4,3.9,√âlectricit√©,2225.0,1678.0,547.0,706.0,457.0,149.0,24.3,0.0,0.0,,,,,,,,,,,706.0,457.0,149.0,0.0,53.0,17.0,√âlectricit√©,,,,,,,,,,,,Non affect√©,,,,,,,,,,,,,,,0,0.0,Neuf,2025.0,2021,Apr√®s 2010,5.92425,45.724631
1,2173N0056245L,2023-11-08,2021-07-01,2021-07-01,2021-07-02,2031-06-30,DPE NEUF logement : RT2012,1.0,dpe issu d'une √©tude thermique r√©glementaire R...,B,A,,,maison,,,2.5,1.0,1.0,102.2,102.2,,Chemin de la montaz 73250 Saint-Pierre-d'Albigny,,Chemin de la montaz,Saint-Pierre-d'Albigny,73250,73270,73.0,84.0,73270_0449,0.6,adresse non g√©ocod√©e ban car aucune correspond...,Lieu Dit : La Montaz,SAINT-PIERRE-D'ALBIGNY,73250,,,,,,,,,,,,,,,,,,tr√®s bonne,tr√®s bonne,bonne,tr√®s bonne,0.32,,,,7417.5,72.5,5200.3,1968.5,0.0,547.4,82.1,3225.0,31.6,2261.0,607.0,0.0,238.0,35.7,242.1,2.3,178.6,55.6,0.0,16.4,2.3,√âlectricit√©,2868.0,2261.0,607.0,822.0,576.0,155.0,24.0,0.0,0.0,,,,,,,,,,,822.0,576.0,155.0,0.0,61.0,9.0,√âlectricit√©,,,,,,,,,,,,Non affect√©,,,,,,,,,,,,,,,0,0.0,Neuf,2025.0,2021,Apr√®s 2010,6.155453,45.57234
2,2173N0056170O,2023-11-08,2021-07-01,2021-07-01,2021-07-02,2031-06-30,DPE NEUF logement : RT2012,1.0,dpe issu d'une √©tude thermique r√©glementaire R...,A,A,,,maison,,,2.5,1.0,1.0,92.0,92.0,,Route du ravet 73520 Saint-B√©ron,,Route du ravet,Saint-B√©ron,73520,73226,73.0,84.0,73226_3570,0.46,adresse non g√©ocod√©e ban car aucune correspond...,Lieu Dit ''Le Sourd'',SAINT-BERON,73520,,,,,,,,,,,,,,,,,,tr√®s bonne,tr√®s bonne,bonne,tr√®s bonne,0.31,,,,4367.7,47.5,2343.7,1949.0,0.0,468.9,0.0,1899.0,20.6,1019.0,601.0,0.0,203.9,0.0,138.4,1.5,80.5,55.1,0.0,14.1,0.0,√âlectricit√©,1620.0,1019.0,601.0,559.0,300.0,177.0,18.9,0.0,0.0,,,,,,,,,,,559.0,300.0,177.0,0.0,60.0,0.0,√âlectricit√©,,,,,,,,,,,,Non affect√©,,,,,,,,,,,,,,,0,0.0,Neuf,2025.0,2021,Apr√®s 2010,5.72356,45.495255


### 🧹 Nettoyage et préparation

In [4]:
# Column the regressors will learn to predict.
target = "conso_5_usages_par_m2_ep"

# Explanatory variables used to build X.
features_user = [
    "annee_construction",
    "surface_habitable_logement",
    "type_batiment",
    "type_energie_principale_chauffage",
    "classe_inertie_batiment",
    "qualite_isolation_murs",
    "qualite_isolation_menuiseries",
    "classe_altitude",
    "logement_traversant",
]

# Report any requested feature that is absent from the dataset.
missing_in_data = [col for col in features_user if col not in data.columns]
print("Variables manquantes dans le dataset :", missing_in_data)

# Count the rows whose target is missing, then drop them if there are any.
target_is_missing = data[target].isna()
missing_target_count = target_is_missing.sum()
print(f"Valeurs manquantes dans la cible : {missing_target_count} / {len(data)}")

if missing_target_count > 0:
    data = data.loc[~target_is_missing]
    print(f"‚úÖ Lignes supprim√©es : {missing_target_count}")

# Feature matrix and target vector handed to the train/test split.
X, y = data[features_user], data[target]

print(f"‚úÖ Donn√©es pr√™tes pour le split : {X.shape[0]} lignes, {X.shape[1]} variables explicatives.")


Variables manquantes dans le dataset : []
Valeurs manquantes dans la cible : 1 / 117708
‚úÖ Lignes supprim√©es : 1
‚úÖ Donn√©es pr√™tes pour le split : 117707 lignes, 9 variables explicatives.


### ✂️ Split des données

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train :", X_train.shape, " Test :", X_test.shape)

Train : (94165, 9)  Test : (23542, 9)


### ⚙️ Préprocessing

In [6]:
# Split the feature columns by dtype: numeric ones on one side, everything
# else is handled as categorical.
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(exclude=np.number).columns

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### 🧠 Modèles de régression

In [7]:
# Candidate regressors, keyed by the display name used in the reports.
# Insertion order is the order in which they are trained and printed.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
    ),
}


### 🧪 Entraînement et évaluation

In [8]:
# Test-set metrics for every candidate model, keyed by model name.
results = {}

for name, model in models.items():
    # Chain the shared preprocessing with the current estimator, fit on the
    # training rows and predict the held-out rows.
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    scores = {
        "R2": r2_score(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
    results[name] = scores
    r2, rmse, mae = scores["R2"], scores["RMSE"], scores["MAE"]

    print(f"\nüìä {name}")
    print(f"R¬≤: {r2:.3f} | RMSE: {rmse:.2f} | MAE: {mae:.2f}")

# One row per model, best R2 first.
results_df = pd.DataFrame.from_dict(results, orient="index").sort_values(by="R2", ascending=False)
display(results_df)


üìä Linear Regression
R¬≤: 0.579 | RMSE: 95.98 | MAE: 65.75

üìä Random Forest
R¬≤: 0.720 | RMSE: 78.20 | MAE: 47.15

üìä Gradient Boosting
R¬≤: 0.677 | RMSE: 84.10 | MAE: 54.85


Unnamed: 0,R2,RMSE,MAE
Random Forest,0.720485,78.196698,47.154819
Gradient Boosting,0.676724,84.095574,54.846155
Linear Regression,0.578885,95.981321,65.754838


### 💾 Sauvegarde du meilleur modèle

In [9]:
# üîé S√©lection du meilleur mod√®le
best_model_name = results_df["R2"].idxmax()
best_model = models[best_model_name]

# üéì Entra√Ænement final
best_model.fit(preprocessor.transform(X_train), y_train)

# üì¶ Sauvegarde all√©g√©e (mod√®le et pr√©processeur s√©par√©s)
os.makedirs("../models", exist_ok=True)
joblib.dump(preprocessor, "../models/preprocessor_conso.pkl", compress=3)
joblib.dump(best_model, f"../models/model_CONSO_{best_model_name.replace(' ','_')}.pkl", compress=3)

print("\n‚úÖ Sauvegarde termin√©e avec succ√®s (mod√®le compress√©) !")
print(f"üìò Meilleur mod√®le : {best_model_name} (R¬≤={results_df.loc[best_model_name, 'R2']:.3f})")
print("üíæ Fichiers enregistr√©s : preprocessor_conso.pkl + model_CONSO.pkl")



‚úÖ Sauvegarde termin√©e avec succ√®s (mod√®le compress√©) !
üìò Meilleur mod√®le : Random Forest (R¬≤=0.720)
üíæ Fichiers enregistr√©s : preprocessor_conso.pkl + model_CONSO.pkl
