# Benchmarking Algorithmes ML - Consommation Electrique

Comparaison de 3 algorithmes de regression afin de retenir le plus performant pour la prediction de la consommation electrique.

**Algorithmes compares:**
1. Linear Regression (baseline simple)
2. Gradient Boosting (ensemble avance)
3. Random Forest (ensemble robuste)

## 1. Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Global plotting defaults shared by every figure in this notebook:
# seaborn grid theme first, then a wide default figure size.
seaborn_style = 'whitegrid'
default_figsize = (14, 6)

sns.set_style(seaborn_style)
plt.rcParams['figure.figsize'] = default_figsize

# Marker line so the cell output shows that the imports above succeeded.
print("Imports OK")

## 2. Chargement donnees

In [None]:
# Load the enriched dataset (3 sources) from CSV.
data_path = "../data/conso_enrichi_3sources.csv"
df = pd.read_csv(data_path)

# Convert the 'datetime' column to real timestamps so min/max are chronological.
timestamps = pd.to_datetime(df['datetime'])
df['datetime'] = timestamps

# Quick sanity report: row count and covered period.
n_rows = len(df)
period_start, period_end = timestamps.min(), timestamps.max()
print(f"Donnees chargees: {n_rows} enregistrements")
print(f"Periode: {period_start} -> {period_end}")

# Last expression of the cell: preview of the first rows.
df.head()

## 3. Preparation features

In [None]:
# Column to predict.
target = 'mw_conso'

# Multi-source input columns fed to every model.
features = [
    'heure', 'jour_semaine', 'mois', 'jour_mois',
    'est_weekend',
    'prix_spot_eur_mwh',
    'est_ferie', 'est_vacances',
]

# Drop every row that is missing the target or any of the input columns.
required_columns = [target, *features]
df_clean = df.dropna(subset=required_columns)

X, y = df_clean[features], df_clean[target]

# Hold out 20% of the rows for testing; the seed makes the split reproducible.
# NOTE(review): the rows carry a 'datetime' column and this split is not
# chronological (train_test_split shuffles unless told otherwise) - confirm
# that a random split is what is wanted for this benchmark.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Features: {features}")
print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

## 4. Definition des algorithmes

In [None]:
# Hyper-parameters of the two ensemble models, kept apart from the
# constructors so they can be read (and tweaked) at a glance.
gradient_boosting_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'random_state': 42,
}
random_forest_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}

# The 3 algorithms to compare, keyed by display name.
# Insertion order is the order in which they are benchmarked.
models = {
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(**gradient_boosting_params),
    'Random Forest': RandomForestRegressor(**random_forest_params),
}

model_names = list(models.keys())
print(f"Algorithmes a benchmarker: {model_names}")

## 5. Entrainement et evaluation

In [None]:
# Benchmark every model in `models`: fit on the training set, cross-validate,
# predict on the test set, and collect one metrics record per model in `results`.
results = []

# The ground truth is the same for every model, so prepare the MAPE inputs once.
# Rows whose actual value is 0 are left out of MAPE: dividing by them would
# make the whole metric infinite.
y_true = np.asarray(y_test)
nonzero = y_true != 0

print("=" * 70)
print("   BENCHMARKING EN COURS")
print("=" * 70 + "\n")

for name, model in models.items():
    print(f"\nAlgorithme: {name}")
    print("-" * 50)

    # Training, timed with perf_counter: a monotonic, high-resolution clock
    # meant for measuring durations (time.time() can jump if the system
    # clock is adjusted).
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # 5-fold cross-validation on the training set. The scorer returns the
    # negated MAE, hence the sign flip to get a positive error.
    cv_scores = cross_val_score(
        model, X_train, y_train,
        cv=5, scoring='neg_mean_absolute_error'
    )
    cv_mae = -cv_scores.mean()

    # Predictions on the held-out test set
    y_pred = model.predict(X_test)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    abs_pct_error = np.abs(
        (y_true[nonzero] - np.asarray(y_pred)[nonzero]) / y_true[nonzero]
    )
    mape = np.mean(abs_pct_error) * 100

    # Store the results (predictions are kept for the later scatter plots)
    results.append({
        'algorithm': name,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'mape': mape,
        'cv_mae': cv_mae,
        'train_time': train_time,
        'predictions': y_pred
    })

    # Display the results
    print(f"MAE:        {mae:.2f} MW")
    print(f"RMSE:       {rmse:.2f} MW")
    print(f"R2:         {r2:.4f}")
    print(f"MAPE:       {mape:.2f}%")
    print(f"CV MAE:     {cv_mae:.2f} MW")
    print(f"Train time: {train_time:.2f}s")

print("\n" + "=" * 70)
print("   BENCHMARKING TERMINE")
print("=" * 70)

## 6. Comparaison des resultats

In [None]:
# Build the comparison table: one row per benchmarked algorithm.
# Each spec is (column label, key in the result record, decimals kept).
column_specs = [
    ('MAE (MW)', 'mae', 2),
    ('RMSE (MW)', 'rmse', 2),
    ('R2', 'r2', 4),
    ('MAPE (%)', 'mape', 2),
    ('CV MAE (MW)', 'cv_mae', 2),
    ('Temps (s)', 'train_time', 2),
]

table_rows = []
for record in results:
    row = {'Algorithme': record['algorithm']}
    for label, key, decimals in column_specs:
        row[label] = round(record[key], decimals)
    table_rows.append(row)

# Rank by (rounded) test MAE, lowest error first.
df_results = pd.DataFrame(table_rows).sort_values(by='MAE (MW)')

separator = "=" * 90
print("\n" + separator)
print("   COMPARAISON DES ALGORITHMES")
print(separator)
print(df_results.to_string(index=False))
print(separator)

# The best algorithm is the one on the first row after sorting.
best_algorithm = df_results['Algorithme'].iloc[0]
print(f"\nMeilleur algorithme: {best_algorithm}")

## 7. Visualisations comparatives

In [None]:
# 2x2 grid of bar charts comparing the algorithms on MAE, R2, MAPE and
# training time, saved to ml/benchmark_comparison.png.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Colours are assigned by bar position (the row order of df_results),
# and the same palette is reused in every panel.
bar_colors = ['#3498db', '#e74c3c', '#2ecc71']

# One entry per panel, in reading order: (df_results column, title, y-axis label).
panels = [
    ('MAE (MW)', 'MAE par algorithme (plus bas = meilleur)', 'MAE (MW)'),
    ('R2', 'R2 Score par algorithme (plus haut = meilleur)', 'R2 Score'),
    ('MAPE (%)', 'MAPE par algorithme (plus bas = meilleur)', 'MAPE (%)'),
    ('Temps (s)', 'Temps entrainement par algorithme', 'Temps (secondes)'),
]

# axes.flat walks the grid row by row, matching the order of `panels`.
for ax, (column, title, ylabel) in zip(axes.flat, panels):
    ax.bar(df_results['Algorithme'], df_results[column], color=bar_colors)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=15)
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# savefig does not create missing folders, so make sure the target exists.
# NOTE(review): the data is read from "../data/" while outputs go to "ml/"
# relative to the working directory - confirm this is the intended location.
Path('ml').mkdir(parents=True, exist_ok=True)
plt.savefig('ml/benchmark_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("Graphiques sauvegardes: ml/benchmark_comparison.png")

## 8. Predictions vs Reel pour chaque algorithme

In [None]:
# One predicted-vs-actual scatter plot per benchmarked algorithm.
# The grid is sized from `results` (6 inches of width per panel) so the cell
# keeps working if algorithms are added to or removed from `models`.
# squeeze=False always returns a 2-D array of axes, even for a single panel.
n_panels = len(results)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)

# End points of the "perfect prediction" diagonal; identical for every panel.
actual_min, actual_max = y_test.min(), y_test.max()

for ax, result in zip(axes[0], results):
    y_pred = result['predictions']

    ax.scatter(y_test, y_pred, alpha=0.5, s=20)
    ax.plot([actual_min, actual_max], [actual_min, actual_max],
            'r--', lw=2, label='Prediction parfaite')
    ax.set_xlabel('Consommation Reelle (MW)')
    ax.set_ylabel('Consommation Predite (MW)')
    ax.set_title(f"{result['algorithm']}\nR2 = {result['r2']:.3f}", fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Conclusion

In [None]:
# Full metrics record of the algorithm named by `best_algorithm`
# (first match in `results`).
matching_records = [
    record for record in results if record['algorithm'] == best_algorithm
]
best_result = matching_records[0]

# Print the conclusion banner followed by the headline metrics.
banner = "=" * 70

print(banner)
print("   CONCLUSION BENCHMARKING")
print(banner)
print(f"\nAlgorithme selectionne: {best_algorithm}")
print("\nJustification:")

print(f"- MAE:  {best_result['mae']:.2f} MW (erreur moyenne absolue)")
print(f"- R2:   {best_result['r2']:.4f} (qualite predictions)")
print(f"- MAPE: {best_result['mape']:.2f}% (erreur en pourcentage)")

# NOTE(review): the text below is fixed; in this notebook the selection
# itself is made on test MAE only (first row of the sorted table).
print(f"\nLe {best_algorithm} offre le meilleur compromis entre:")
print("  1. Precision des predictions (MAE et R2)")
print("  2. Robustesse (cross-validation)")
print("  3. Temps d'entrainement raisonnable")
print(banner)

## 10. Export resultats

In [None]:
# Export the benchmark: the comparison table as CSV and a JSON summary
# holding the winner, its headline metrics and every table row.
import json

# Neither to_csv nor open() creates missing folders, so make sure the
# output directory exists before writing.
Path('ml').mkdir(parents=True, exist_ok=True)

# Save the comparison table
df_results.to_csv('ml/benchmark_results.csv', index=False)
print("Resultats sauvegardes: ml/benchmark_results.csv")

# Summary; metrics are cast to plain floats so json can serialise them.
summary = {
    'best_algorithm': best_algorithm,
    'metrics': {
        'mae': float(best_result['mae']),
        'rmse': float(best_result['rmse']),
        'r2': float(best_result['r2']),
        'mape': float(best_result['mape'])
    },
    'all_results': df_results.to_dict('records')
}

# Explicit encoding so the file does not depend on the platform default.
with open('ml/benchmark_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print("Resume sauvegarde: ml/benchmark_summary.json")