In [78]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

import pickle


In [79]:
spot_data_path = r"data/France.csv"
eco2mix_data_path = r"data/eCO2mix_RTE_En-cours-Consolide.csv"
models_path = r"models/"

spot_data = pd.read_csv(spot_data_path)
eco2mix_data = pd.read_csv(eco2mix_data_path, sep='\t', encoding='latin-1', index_col=False)

target = "Price (EUR/MWhe)"

features = list(eco2mix_data.columns)
features = features[4:]


In [80]:
spot_data['Datetime (Local)'] = pd.to_datetime(spot_data['Datetime (Local)'])
spot_data['Datetime (UTC)'] = pd.to_datetime(spot_data['Datetime (UTC)'])


In [81]:
eco2mix_data["Datetime (Local)"] = pd.to_datetime(eco2mix_data['Date'] + ' ' + eco2mix_data['Heures'])
eco2mix_data["Datetime (Local)"] = pd.to_datetime(eco2mix_data['Datetime (Local)'], format="%Y-%m-%d %H:%M:%S")
eco2mix_data=eco2mix_data.dropna()

In [82]:
merged_data = pd.merge(spot_data, eco2mix_data[features + ['Datetime (Local)']], on='Datetime (Local)', how='inner')

In [83]:
X = merged_data[features].copy()
y = merged_data[target].copy()

print('Taille de X, y : ', X.shape, y.shape)

Taille de X, y :  (8784, 36) (8784,)


In [84]:
merged_data.isna().sum()

Country                                 0
ISO3 Code                               0
Datetime (UTC)                          0
Datetime (Local)                        0
Price (EUR/MWhe)                        0
Consommation                            0
Prévision J-1                           0
Prévision J                             0
Fioul                                   0
Charbon                                 0
Gaz                                     0
Nucléaire                               0
Eolien                                  0
Solaire                                 0
Hydraulique                             0
Pompage                                 0
Bioénergies                             0
Ech. physiques                          0
Taux de Co2                             0
Ech. comm. Angleterre                   0
Ech. comm. Espagne                      0
Ech. comm. Italie                       0
Ech. comm. Suisse                       0
Ech. comm. Allemagne-Belgique     

In [85]:
split_point = int(len(X) * 0.8)

X_train_tune = X.iloc[:split_point]
y_train_tune = y.iloc[:split_point]

X_test_final = X.iloc[split_point:]
y_test_final = y.iloc[split_point:]

In [86]:
tscv = TimeSeriesSplit(n_splits=5)

pipeline_lr = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

param_grid_lr = {}
grid_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=1
)

grid_lr.fit(X_train_tune, y_train_tune)

model_lr_final = grid_lr.best_estimator_

lr_final_prediction = model_lr_final.predict(X_test_final)

final_mse = mean_squared_error(y_test_final, lr_final_prediction)
final_rmse = np.sqrt(final_mse)

print(f"Score du Gridsearch (plis internes): {grid_lr.best_score_:.2f} (neg MSE)")
print(f"---")
print(f"Score FiNAL (sur X_test_final): {final_mse:.2f} (MSE)")
print(f"Score FINAL (sur X_test_final): {final_rmse:.2f} (RMSE)")
print("---")


Score du Gridsearch (plis internes): -840.79 (neg MSE)
---
Score FiNAL (sur X_test_final): 1171.10 (MSE)
Score FINAL (sur X_test_final): 34.22 (RMSE)
---


In [63]:
pipeline_rf = Pipeline([
    ('model', RandomForestRegressor(random_state=42))
])

param_grid_rf = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [10, 20, None]
}

grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    verbose=1
)

grid_rf.fit(X, y)

model_rf_final = grid_rf.best_estimator_

rf_final_prediction = model_rf_final.predict(X_test_final)

final_mse = mean_squared_error(y_test_final, rf_final_prediction)
final_rmse = np.sqrt(final_mse)

print(f"Score du Gridsearch (plis internes): {grid_rf.best_score_:.2f} (neg MSE)")
print(f"---")
print(f"Score FiNAL (sur X_test_final): {final_mse:.2f} (MSE)")
print(f"Score FINAL (sur X_test_final): {final_rmse:.2f} (RMSE)")
print("---")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Score du Gridsearch (plis internes): -707.50 (neg MSE)
---
Score FiNAL (sur X_test_final): 19.74 (MSE)
Score FINAL (sur X_test_final): 4.44 (RMSE)
---


In [87]:
final_rf = grid_rf.best_estimator_
final_lr = grid_lr.best_estimator_

final_lr.fit(X,y)
final_rf.fit(X,y)


0,1,2
,steps,"[('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [89]:
try:
    with open('models/linear_regression_model.pkl', 'wb') as f:
        pickle.dump(final_lr, f)
    with open('models/random_forest_model.pkl', 'wb') as f:
        pickle.dump(final_rf, f)

except Exception as e:
    print(f"\nErreur lors de la sauvegarde des modèles : {e}")