# Entraînement Modèle ML avec MLflow

**Données multi-sources** :
- Source 1: API RTE (consommation)
- Source 2: Fichier CSV (jours fériés)
- Source 3: Web scraping (prix spot)

**Tracking** : MLflow pour versioning et comparaison

## 1. Imports et Configuration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

import mlflow
import mlflow.sklearn

# Global plotting defaults: seaborn "whitegrid" theme and a 12x6 default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Simple marker that the import/setup cell ran to completion
print("Imports OK")

In [None]:
# MLflow setup: local file-based tracking store and the experiment to log into
tracking_uri = "file:./mlruns"
experiment_name = "rte_consommation_3sources"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

# Echo the effective configuration back for a quick sanity check
print("MLflow tracking URI:", mlflow.get_tracking_uri())
print("Experiment:", mlflow.get_experiment_by_name(experiment_name))

## 2. Chargement Données Enrichies (3 sources)

In [None]:
# Load the merged dataset and parse its `datetime` column into timestamps
data_path = "../data/conso_enrichi_3sources.csv"

df = pd.read_csv(data_path)
df['datetime'] = pd.to_datetime(df['datetime'])

# Report size, covered time span and available columns
n_records = len(df)
period_start = df['datetime'].min()
period_end = df['datetime'].max()

print(f"Données chargées: {n_records} enregistrements")
print(f"Période: {period_start} -> {period_end}")
print(f"\nColonnes: {list(df.columns)}")

In [None]:
# Preview the first 10 rows of the loaded data
df.head(n=10)

In [None]:
# Descriptive statistics of the DataFrame (pandas `describe` with its defaults);
# being the last expression of the cell, the result is shown as the cell output.
df.describe()

In [None]:
# Correlation matrix between the target and a subset of candidate features
corr_cols = [
    'mw_conso',
    'prix_spot_eur_mwh',
    'heure',
    'jour_semaine',
    'est_ferie',
    'est_vacances',
    'est_weekend',
]
correlation = df.loc[:, corr_cols].corr()

# Annotated heatmap, colour scale centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation, annot=True, fmt='.3f', cmap='coolwarm', center=0)
plt.title('Matrice de corrélation')
plt.tight_layout()
plt.show()

# Correlations with the consumption column, strongest positive first
conso_corr = correlation['mw_conso'].sort_values(ascending=False)
print("\nCorrélations avec consommation:")
print(conso_corr)

## 3. Préparation Features et Target

In [None]:
# Model inputs, grouped by where they come from
features = [
    # Calendar / time features
    'heure',
    'jour_semaine',
    'mois',
    'jour_mois',
    'est_weekend',
    # Source 3: web scraping
    'prix_spot_eur_mwh',
    # Source 2: file
    'est_ferie',
    'est_vacances',
]

# Regression target (Source 1: RTE API)
target = 'mw_conso'

# List the selected features with a 1-based rank
print(f"Features utilisées ({len(features)}):")
for position, name in enumerate(features, start=1):
    print(f"  {position}. {name}")

print(f"\nTarget: {target}")

In [None]:
# Build the design matrix X and target vector y, discarding every row that
# has a missing value in the target or in any selected feature
required_cols = [target, *features]
df_clean = df.dropna(subset=required_cols)

X, y = df_clean[features], df_clean[target]

n_dropped = len(df) - len(df_clean)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"\nLignes supprimées (NaN): {n_dropped}")

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# NOTE(review): this is a random split of time-stamped rows; if the goal is
# forecasting, a chronological split may be more appropriate — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train, n_test, n_total = len(X_train), len(X_test), len(X)

print(f"Train set: {n_train} samples")
print(f"Test set: {n_test} samples")
print(f"Split ratio: {n_train/n_total*100:.1f}% / {n_test/n_total*100:.1f}%")

## 4. Entraînement avec MLflow Tracking

In [None]:
# Hyperparameters, later unpacked into RandomForestRegressor(**params)
params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

# Print one "name: value" line per hyperparameter
print("Hyperparamètres:")
for name in params:
    print(f"  {name}: {params[name]}")

In [None]:
# Train, evaluate and persist the model inside a single MLflow run so that
# parameters, metrics and the model artifact are recorded together.
# Names defined here (model, y_pred, mae, rmse, r2, mape, feature_importance,
# ...) are reused by the visualisation and summary cells further down.
with mlflow.start_run(run_name=f"training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):

    print("=" * 70)
    print("   MLflow Run Started")
    print("=" * 70)

    # Log hyperparameters plus basic dataset information
    mlflow.log_params(params)
    mlflow.log_param("train_size", len(X_train))
    mlflow.log_param("test_size", len(X_test))
    mlflow.log_param("nb_features", len(features))
    mlflow.log_param("features", features)

    # Descriptive tags for filtering runs in the MLflow UI
    mlflow.set_tag("model_type", "RandomForestRegressor")
    mlflow.set_tag("data_sources", "API_RTE + Fichier_CSV + Web_Scrapping")
    mlflow.set_tag("nb_sources", "3")
    mlflow.set_tag("training_date", datetime.now().isoformat())

    # Fit the model on the training split
    print("\nEntraînement du modèle...")
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    print("Entraînement terminé!")

    # 5-fold cross-validation on the training split only.
    # The scorer returns negated MAE, hence the sign flip on the mean.
    print("\nCross-validation (5-fold)...")
    cv_scores = cross_val_score(
        model, X_train, y_train,
        cv=5, scoring='neg_mean_absolute_error'
    )
    cv_mae = -cv_scores.mean()
    cv_std = cv_scores.std()

    mlflow.log_metric("cv_mae", cv_mae)
    mlflow.log_metric("cv_std", cv_std)

    print(f"CV MAE: {cv_mae:.2f} ± {cv_std:.2f} MW")

    # Predict on the held-out test split
    print("\nPrédictions sur test set...")
    y_pred = model.predict(X_test)

    # Evaluation metrics on the test split
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    # MAPE is undefined where the true value is 0 (division by zero gives
    # inf), so such rows are excluded. When no target equals 0 the result is
    # the same as the plain mean over all rows.
    abs_pct_error = np.abs((y_test - y_pred) / y_test)
    mape = abs_pct_error[(y_test != 0).to_numpy()].mean() * 100

    # Log the test metrics
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mape_percent", mape)

    print("\n" + "="*70)
    print("   Métriques")
    print("="*70)
    print(f"MAE:  {mae:.2f} MW")
    print(f"RMSE: {rmse:.2f} MW")
    print(f"R²:   {r2:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print("="*70)

    # Feature importances, most important first
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nImportance des features:")
    print(feature_importance.to_string(index=False))

    # One MLflow metric per feature importance
    for feat_name, feat_score in zip(
        feature_importance['feature'], feature_importance['importance']
    ):
        mlflow.log_metric(f"importance_{feat_name}", feat_score)

    # Local copy of the model; parents=True so that a nested output directory
    # is also created if the path is ever changed.
    model_path = Path("models/rte_conso_model_3sources.pkl")
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_path)

    # Log the model to MLflow and register it under a fixed name
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="RTEConsommation3Sources"
    )

    run_id = mlflow.active_run().info.run_id
    print(f"\nMLflow Run ID: {run_id}")
    print(f"Modèle sauvegardé: {model_path}")

## 5. Visualisations

In [None]:
# Horizontal bar chart of the feature importances computed during training
fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
ax_imp.barh(feature_importance['feature'], feature_importance['importance'])
ax_imp.set_xlabel('Importance')
ax_imp.set_title('Importance des Features')
fig_imp.tight_layout()
plt.show()

In [None]:
# Predicted vs. actual consumption (left) and residual diagnostics (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax_scatter, ax_resid = axes

# Left panel: predictions against true values, with the y = x reference line
value_range = [y_test.min(), y_test.max()]
ax_scatter.scatter(y_test, y_pred, alpha=0.5)
ax_scatter.plot(value_range, value_range, 'r--', lw=2)
ax_scatter.set_xlabel('Consommation Réelle (MW)')
ax_scatter.set_ylabel('Consommation Prédite (MW)')
ax_scatter.set_title(f'Prédictions vs Réel (R² = {r2:.3f})')
ax_scatter.grid(True)

# Right panel: residuals (true - predicted) against predictions, zero line
residuals = y_test - y_pred
ax_resid.scatter(y_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Consommation Prédite (MW)')
ax_resid.set_ylabel('Résidus (MW)')
ax_resid.set_title('Analyse des Résidus')
ax_resid.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of absolute prediction errors on the test split
errors = np.abs(y_test - y_pred)

plt.figure(figsize=(12, 5))

# Left: histogram with the MAE marked as a dashed vertical line
plt.subplot(121)
plt.hist(errors, bins=30, edgecolor='black')
plt.axvline(mae, color='r', linestyle='--', label=f'MAE = {mae:.2f} MW')
plt.xlabel('Erreur Absolue (MW)')
plt.ylabel('Fréquence')
plt.title('Distribution des Erreurs Absolues')
plt.legend()
plt.grid(True, alpha=0.3)

# Right: box plot of the same errors
plt.subplot(122)
plt.boxplot(errors)
plt.ylabel('Erreur Absolue (MW)')
plt.title('Boxplot des Erreurs')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary figures: median, maximum and 95th percentile of the absolute error
median_error = np.median(errors)
max_error = np.max(errors)
p95_error = np.percentile(errors, 95)

print(f"Erreur médiane: {median_error:.2f} MW")
print(f"Erreur max: {max_error:.2f} MW")
print(f"95% des erreurs < {p95_error:.2f} MW")

## 6. Test sur quelques exemples

In [None]:
# Draw up to 10 random rows from the test set for a side-by-side comparison.
# - The sample size is capped at the test-set size: sampling without
#   replacement raises ValueError when more items are requested than exist.
# - A seeded generator keeps the displayed examples stable between runs,
#   in line with the random_state=42 used for the split and the model.
rng = np.random.default_rng(42)
n_examples = min(10, len(X_test))
sample_idx = rng.choice(len(X_test), size=n_examples, replace=False)

# Positional selection so features and true values stay aligned
X_sample = X_test.iloc[sample_idx]
y_sample_true = y_test.iloc[sample_idx]
y_sample_pred = model.predict(X_sample)

# True value, prediction, absolute error and relative error per example
comparison = pd.DataFrame({
    'Réel (MW)': y_sample_true.values,
    'Prédit (MW)': y_sample_pred,
    'Erreur (MW)': np.abs(y_sample_true.values - y_sample_pred),
    'Erreur (%)': np.abs((y_sample_true.values - y_sample_pred) / y_sample_true.values * 100)
})

print("Exemples de prédictions:")
print(comparison.to_string())

## 7. Résumé

In [None]:
# Final recap of the data, model settings and test metrics
banner = "=" * 70

print(banner)
print("   RÉSUMÉ ENTRAÎNEMENT")
print(banner)

# Dataset
print("\nDonnées:")
print("  - Sources: 3 (API + Fichier + Scrapping)")
print(f"  - Total samples: {len(X)}")
print(f"  - Features: {len(features)}")

# Model configuration
print("\nModèle: RandomForestRegressor")
print(f"  - n_estimators: {params['n_estimators']}")
print(f"  - max_depth: {params['max_depth']}")

# Test-set metrics computed in the training cell
print("\nPerformances:")
print(f"  - MAE: {mae:.2f} MW")
print(f"  - RMSE: {rmse:.2f} MW")
print(f"  - R²: {r2:.4f}")
print(f"  - MAPE: {mape:.2f}%")

# Three most important features (feature_importance is already sorted)
print("\nTop 3 features importantes:")
top_features = feature_importance.head(3)
for feat_name, feat_score in zip(top_features['feature'], top_features['importance']):
    print(f"  {feat_name}: {feat_score:.4f}")

# How to inspect the run in the MLflow UI
print("\n" + banner)
print("Pour visualiser dans MLflow UI:")
print("  mlflow ui")
print("  puis ouvrir: http://localhost:5000")
print(banner)