## MODÈLE DE RÉGRESSION

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Notebook-wide settings: silence every warning category and let pandas
# render all columns when a DataFrame is displayed.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

### 📂 Chargement des données

In [3]:
# Read the comma-separated dataset, report its dimensions, and preview the
# first three rows as the cell output.
data = pd.read_csv("../data/donnees_dpe_71_clean.csv", sep=",")
n_rows, n_cols = data.shape
print("Nombre de lignes :", n_rows)
print("Nombre de colonnes :", n_cols)
data.head(3)

Nombre de lignes : 87615
Nombre de colonnes : 141


Unnamed: 0,numero_dpe,date_derniere_modification_dpe,date_visite_diagnostiqueur,date_etablissement_dpe,date_reception_dpe,date_fin_validite_dpe,modele_dpe,version_dpe,methode_application_dpe,etiquette_dpe,etiquette_ges,classe_altitude,zone_climatique,type_batiment,type_installation_chauffage,type_installation_ecs,hauteur_sous_plafond,nombre_appartement,nombre_niveau_logement,surface_habitable_logement,classe_inertie_batiment,adresse_ban,numero_voie_ban,nom_rue_ban,nom_commune_ban,code_postal_ban,code_insee_ban,code_departement_ban,code_region_ban,identifiant_ban,score_ban,statut_geocodage,adresse_brut,nom_commune_brut,code_postal_brut,numero_etage_appartement,complement_adresse_batiment,indicateur_confort_ete,protection_solaire_exterieure,logement_traversant,presence_brasseur_air,inertie_lourde,isolation_toiture,deperditions_enveloppe,deperditions_ponts_thermiques,deperditions_murs,deperditions_planchers_hauts,deperditions_planchers_bas,deperditions_portes,deperditions_baies_vitrees,deperditions_renouvellement_air,qualite_isolation_enveloppe,qualite_isolation_murs,qualite_isolation_plancher_bas,qualite_isolation_menuiseries,ubat_w_par_m2_k,besoin_chauffage,besoin_ecs,besoin_refroidissement,conso_5_usages_ep,conso_5_usages_par_m2_ep,conso_chauffage_ep,conso_ecs_ep,conso_refroidissement_ep,conso_eclairage_ep,conso_auxiliaires_ep,conso_5_usages_ef,conso_5_usages_par_m2_ef,conso_chauffage_ef,conso_ecs_ef,conso_refroidissement_ef,conso_eclairage_ef,conso_auxiliaires_ef,emission_ges_5_usages,emission_ges_5_usages_par_m2,emission_ges_chauffage,emission_ges_ecs,emission_ges_refroidissement,emission_ges_eclairage,emission_ges_auxiliaires,type_energie_n1,conso_5_usages_ef_energie_n1,conso_chauffage_ef_energie_n1,conso_ecs_ef_energie_n1,cout_total_5_usages_energie_n1,cout_chauffage_energie_n1,cout_ecs_energie_n1,emission_ges_5_usages_energie_n1,emission_ges_chauffage_energie_n1,emission_ges_ecs_energie_n1,type_energie_n2,conso_5_usages_ef_energie_n2,conso_chauffage_ef_energie_
n2,conso_ecs_ef_energie_n2,cout_total_5_usages_energie_n2,cout_chauffage_energie_n2,cout_ecs_energie_n2,emission_ges_5_usages_energie_n2,emission_ges_chauffage_energie_n2,emission_ges_ecs_energie_n2,cout_total_5_usages,cout_chauffage,cout_ecs,cout_refroidissement,cout_eclairage,cout_auxiliaires,type_energie_principale_chauffage,type_generateur_chauffage_principal,type_installation_chauffage_n1,type_emetteur_installation_chauffage_n1,configuration_installation_chauffage_n1,description_installation_chauffage_n1,conso_chauffage_installation_chauffage_n1,surface_chauffee_installation_chauffage_n1,type_generateur_n1_installation_n1,type_energie_generateur_n1_installation_n1,usage_generateur_n1_installation_n1,conso_chauffage_generateur_n1_installation_n1,type_energie_principale_ecs,type_generateur_chauffage_principal_ecs,type_installation_ecs_n1,configuration_installation_ecs_n1,description_installation_ecs_n1,conso_ef_installation_ecs_n1,nombre_logements_desservis_par_installation_ecs_n1,surface_habitable_desservie_par_installation_ecs_n1,type_installation_solaire_n1,type_generateur_n1_ecs_n1,type_energie_generateur_n1_ecs_n1,usage_generateur_n1_ecs_n1,description_generateur_n1_ecs_n1,volume_stockage_generateur_n1_ecs_n1,conso_ef_generateur_n1_ecs_n1,ventilation_posterieure_2012,production_electricite_pv_kwhep_par_an,Logement,annee_construction,annee_reception_DPE,periode_construction,longitude,latitude
0,2171N0069527F,2023-11-08,2021-06-30,2021-06-30,2021-07-01,2031-06-29,DPE NEUF logement : RT2012,1.0,dpe issu d'une √©tude thermique r√©glementaire R...,A,A,,,maison,,,2.5,1.0,1.0,120.1,,Rue du Chateau Coulon 71380 Oslon,,,Oslon,71380,71333,71.0,27.0,71333_0040,0.78,adresse non g√©ocod√©e ban car aucune correspond...,Rue du Chateau Coulon,OSLON,71380,,,,,,,,,,,,,,,,,tr√®s bonne,tr√®s bonne,tr√®s bonne,tr√®s bonne,0.27,,,,5301.9,44.2,3213.1,1676.6,0.0,578.4,128.5,2305.2,19.2,1397.0,517.0,0.0,251.5,55.9,170.3,1.3,110.4,47.4,0.0,17.4,3.6,√âlectricit√©,1914.0,1397.0,517.0,646.0,392.0,145.0,26.3,0.0,0.0,,,,,,,,,,,646.0,392.0,145.0,0.0,70.0,16.0,√âlectricit√©,,,,,,,,,,,,Non affect√©,,,,,,,,,,,,,,,0,0.0,Neuf,2025.0,2021,Apr√®s 2010,4.925088,46.780243
1,2171N0090688C,2023-11-08,2021-06-30,2021-06-30,2021-07-01,2031-06-29,DPE NEUF logement : RT2012,1.0,dpe issu d'une √©tude thermique r√©glementaire R...,A,A,,,maison,,,2.5,1.0,1.0,129.6,,Rue du Bourg 71380 Lans,,,Lans,71380,71253,71.0,27.0,71253_0005,0.75,adresse non g√©ocod√©e ban car aucune correspond...,Rue du Bourg,LANS,71380,,,,,,,,,,,,,,,,,tr√®s bonne,tr√®s bonne,bonne,tr√®s bonne,0.28,,,,5463.7,42.2,3298.2,1842.0,0.0,652.9,68.7,2375.5,18.3,1434.0,568.0,0.0,283.9,29.9,175.5,1.3,113.3,52.1,0.0,19.6,1.9,√âlectricit√©,2002.0,1434.0,568.0,661.0,399.0,158.0,25.3,0.0,0.0,,,,,,,,,,,661.0,399.0,158.0,0.0,79.0,8.0,√âlectricit√©,,,,,,,,,,,,Non affect√©,,,,,,,,,,,,,,,0,0.0,Neuf,2025.0,2021,Apr√®s 2010,4.921976,46.770695
2,2171N0090699N,2023-11-08,2021-06-30,2021-06-30,2021-07-01,2031-06-29,DPE NEUF logement : RT2012,1.0,dpe issu d'une √©tude thermique r√©glementaire R...,A,A,,,maison,,,2.5,1.0,1.0,135.1,,Route de Saint-christophe 71380 Lans,,Route de Saint-christophe,Lans,71380,71253,71.0,27.0,71253_0088,0.42,adresse non g√©ocod√©e ban car aucune correspond...,Rue du Bourg - Route de st Christophe en Bres...,LANS,71380,,,,,,,,,,,,,,,,,tr√®s bonne,tr√®s bonne,tr√®s bonne,tr√®s bonne,0.28,,,,5588.4,41.3,3286.7,1855.0,0.0,694.0,73.0,2429.8,18.0,1429.0,572.0,0.0,301.7,31.8,179.0,1.3,112.9,52.4,0.0,20.8,2.0,√âlectricit√©,2001.0,1429.0,572.0,673.0,396.0,158.0,28.9,0.0,0.0,,,,,,,,,,,673.0,396.0,158.0,0.0,84.0,9.0,√âlectricit√©,,,,,,,,,,,,Non affect√©,,,,,,,,,,,,,,,0,0.0,Neuf,2025.0,2021,Apr√®s 2010,4.925513,46.767843


### 🧹 Nettoyage et préparation

In [4]:
# Column to predict and the explanatory columns fed to the models.
target = "conso_5_usages_par_m2_ep"

features_user = [
    "annee_construction",
    "surface_habitable_logement",
    "type_batiment",
    "type_energie_principale_chauffage",
    "classe_inertie_batiment",
    "qualite_isolation_murs",
    "qualite_isolation_menuiseries",
    "classe_altitude",
    "logement_traversant"
]

# Report which requested feature columns are absent from the dataset.
missing_in_data = [f for f in features_user if f not in data.columns]
print("Variables manquantes dans le dataset :", missing_in_data)

# Fail fast with an explicit message (same KeyError type the column lookups
# below would raise) instead of only printing the problem and carrying on.
# The target column is checked too, since the feature check does not cover it.
missing_required = missing_in_data + [c for c in [target] if c not in data.columns]
if missing_required:
    raise KeyError(f"Colonnes absentes du dataset : {missing_required}")

# Count rows whose target is missing and drop them: they cannot be used to
# fit or score a supervised model.
missing_target_count = data[target].isna().sum()
print(f"Valeurs manquantes dans la cible : {missing_target_count} / {len(data)}")

if missing_target_count > 0:
    data = data.dropna(subset=[target])
    print(f"‚úÖ Lignes supprim√©es : {missing_target_count}")

# Feature matrix and target vector used by the split below.
X = data[features_user]
y = data[target]

print(f"‚úÖ Donn√©es pr√™tes pour le split : {X.shape[0]} lignes, {X.shape[1]} variables explicatives.")


Variables manquantes dans le dataset : []
Valeurs manquantes dans la cible : 5 / 87615
‚úÖ Lignes supprim√©es : 5
‚úÖ Donn√©es pr√™tes pour le split : 87610 lignes, 9 variables explicatives.


### ✂️ Split des données

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train :", X_train.shape, " Test :", X_test.shape)

Train : (70088, 9)  Test : (17522, 9)


### ⚙️ Préprocessing

In [6]:
# Route columns by dtype: numeric columns go through the "num" branch, every
# other column through the "cat" branch.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(exclude="number").columns

# Numeric branch: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets transform accept
# categories that were not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combined preprocessing step reused by every model pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### 🧠 Modèles de régression

In [7]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained and reported. Seeds are fixed for reproducibility.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42,
)
models["Gradient Boosting"] = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)


### 🧪 Entraînement et évaluation

In [8]:
# Fit each candidate inside a preprocessing + model pipeline on the training
# split, score it on the test split, and collect R2 / RMSE / MAE per model.
# NOTE(review): the same test split is later used to pick the best model, so
# the reported score of the winner may be optimistic - consider
# cross-validating on the training split for model selection.
results = {}

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = dict(R2=r2, RMSE=rmse, MAE=mae)

    print(f"\nüìä {name}")
    print(f"R¬≤: {r2:.3f} | RMSE: {rmse:.2f} | MAE: {mae:.2f}")

# One row per model, best R2 first.
results_df = pd.DataFrame.from_dict(results, orient="index").sort_values(by="R2", ascending=False)
display(results_df)


üìä Linear Regression
R¬≤: 0.480 | RMSE: 83.77 | MAE: 59.74

üìä Random Forest
R¬≤: 0.606 | RMSE: 72.93 | MAE: 49.12

üìä Gradient Boosting
R¬≤: 0.575 | RMSE: 75.73 | MAE: 52.28


Unnamed: 0,R2,RMSE,MAE
Random Forest,0.606033,72.934722,49.117586
Gradient Boosting,0.575306,75.725518,52.280615
Linear Regression,0.480319,83.766963,59.740375


### 💾 Sauvegarde du meilleur modèle

In [10]:
# Pick the model with the highest test R2, rebuild its full pipeline and fit
# it on the training split so the saved artefact includes the preprocessing.
best_model_name = results_df["R2"].idxmax()
best_model = models[best_model_name]

final_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", best_model),
    ]
)
final_pipe.fit(X_train, y_train)

# Persist the fitted pipeline; spaces in the model name become underscores in
# the file name.
os.makedirs("../models", exist_ok=True)
model_path = f"../models/pipeline_CONSO_{best_model_name.replace(' ','_')}.pkl"
joblib.dump(final_pipe, model_path)

print("\n‚úÖ Sauvegarde termin√©e avec succ√®s !")
print(f"üìò Meilleur mod√®le : {best_model_name} (R¬≤={results_df.loc[best_model_name, 'R2']:.3f})")


‚úÖ Sauvegarde termin√©e avec succ√®s !
üìò Meilleur mod√®le : Random Forest (R¬≤=0.606)
