In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
# 1. Chargement des données 
df=pd.read_csv("../data/donnees_dpe_rhone_clean.csv",delimiter=",")

In [3]:
df.head(3)

Unnamed: 0,configuration_installation_chauffage_n2,configuration_installation_chauffage_n1,conso_chauffage_installation_chauffage_n1,type_generateur_n1_ecs_n1,numero_voie_ban,score_ban,conso_chauffage_generateur_n1_installation_n2,conso_auxiliaires_ep,conso_chauffage_installation_chauffage_n2,deperditions_murs,...,periode_installation_generateur_froid,description_generateur_froid,systeme_production_electricite_origine_renouvelable,presence_production_pv,electricite_pv_autoconsommee,surface_habitable_immeuble,surface_ventilee,nombre_niveau_immeuble,type_ventilation,appartement_non_visite
0,Installation de chauffage simple,Installation de chauffage simple,25718.3,Chaudière bois granulés après 2019,1147.0,0.37,5987.3,710.9,5987.3,155.8,...,,,,,,,,,,
1,Installation de chauffage simple,Installation de chauffage simple,25718.3,Chaudière bois granulés après 2019,1147.0,0.37,5987.3,710.9,5987.3,155.8,...,,,,,,,,,,
2,Installation de chauffage simple,Installation de chauffage simple,14399.6,Chaudière bois granulés après 2019,299.0,0.45,2528.6,538.4,2528.6,59.3,...,,,,,,,,,,


In [4]:
df['cout_total_5_usages']

0         2058.3
1         2058.3
2         1615.2
3         4494.0
4         1837.1
           ...  
256699    2617.7
256700    1302.1
256701    3334.4
256702    2918.5
256703    5404.0
Name: cout_total_5_usages, Length: 256704, dtype: float64

In [5]:
target = "cout_total_5_usages"
df = df.dropna(subset=[target])

In [6]:
# Calcul des quartiles et de l'IQR
q1 = df[target].quantile(0.25)
q3 = df[target].quantile(0.75)
iqr = q3 - q1

# Filtrer le DataFrame pour conserver uniquement les valeurs dans la plage des limites
df = df[(df[target] >= (q1 - 1.5 * iqr)) & (df[target] <= (q3 + 1.5 * iqr))]

In [7]:
ls_variables_explicatives = [
    'periode_construction',
    'surface_habitable_logement',
    'etiquette_dpe',
    'deperditions_enveloppe',
    'date_reception_dpe',
    'deperditions_renouvellement_air',
    'type_energie_n1',
    'deperditions_baies_vitrees',
    'qualite_isolation_murs',
    'deperditions_ponts_thermiques',
    'deperditions_murs',
    'deperditions_planchers_hauts'
]


In [8]:
# Définir les données d'entrée et la variable cible
X = df[ls_variables_explicatives]
y = df[target]

# Séparer les colonnes numériques et catégorielles
numeric_features = df[ls_variables_explicatives].select_dtypes(include='number').columns
categorical_features = df[ls_variables_explicatives].select_dtypes(include='object').columns

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Créer les pipelines de transformation pour les caractéristiques numériques et catégorielles
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),      # Impute les valeurs manquantes par la moyenne
    # ('scaler', StandardScaler())                      # Appliquer StandardScaler pour la mise à l'échelle
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute les valeurs manquantes par la valeur la plus fréquente
    ('ordinal', OrdinalEncoder())                          # Utilise OrdinalEncoder pour encoder les variables catégorielles
])

# Step 5: Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 6: Create the full pipeline with Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=5, random_state=42))  # Utilisation de Random Forest pour la régression
])

In [11]:
pipeline.fit(X_train, y_train)

In [12]:
y_pred = pipeline.predict(X_test)
y_pred

array([ 907.8, 1837.1, 4346.6, ..., 4062.5, 2552.6, 1703.6])

In [13]:
y_test

33879      907.8
75436     1837.1
167587    4346.6
206137    2058.3
123316    1837.1
           ...  
108561    1789.5
127057    2315.9
125180    4062.5
54468     2552.6
170285    1703.6
Name: cout_total_5_usages, Length: 47063, dtype: float64

In [15]:

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error (RMSE): {rmse:.3f}')

Root Mean Squared Error (RMSE): 0.000
