In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# ======================================================
# 1. Load the data
# ======================================================
df = pd.read_csv("df.csv")

# Discard the "provider" column (flagged as constant by the original author).
df = df.drop("provider", axis=1)

# Regression target first, then the two feature columns fed to the model.
y = df["predicted_amount_of_income_in_kwh"]
X = df[["amount_of_investment", "type_of_energy"]]

# ======================================================
# 2. Train / test split
# ======================================================
# test_size=0.2 holds out 20% of the rows for evaluation; the fixed
# random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# ======================================================
# 3. Preprocessing
# ======================================================
categorical_features = ["type_of_energy"]
numeric_features = ["amount_of_investment"]

# Column-wise preprocessing:
#   - "num": the numeric column is passed through unchanged;
#   - "cat": the categorical column is one-hot encoded, and categories not
#     seen during fit are ignored instead of raising at transform time.
preprocessor = ColumnTransformer(
    [
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# ======================================================
# 4. Regression model
# ======================================================
# Random forest of 200 trees; random_state is pinned so that repeated
# fits on the same data produce the same model.
model = RandomForestRegressor(random_state=42, n_estimators=200)

# ======================================================
# 5. Full pipeline
# ======================================================
# Chain preprocessing and the regressor into a single estimator so both
# are fitted, applied and serialized together.
pipeline = Pipeline([("preprocessing", preprocessor), ("model", model)])

# ======================================================
# 6. Training
# ======================================================
# Fits the preprocessing step and the model on the training split only.
pipeline.fit(X_train, y_train)

# ======================================================
# 7. Evaluation
# ======================================================
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. an error in the target's own unit
r2 = r2_score(y_test, y_pred)

# One print call with a newline separator emits the same three report lines.
print(
    "===== PERFORMANCE MODELE =====",
    f"R²   : {r2:.4f}",
    f"RMSE : {rmse:.2f} kWh",
    sep="\n",
)

# ======================================================
# 8. Save the model
# ======================================================
# Serialize the whole fitted pipeline (preprocessing + regressor) so it can
# be reloaded later and applied directly to raw feature rows.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# executes arbitrary code.
with open("investment_energy_regression.pkl", mode="wb") as model_file:
    pickle.dump(pipeline, model_file)

print("✅ Modèle sauvegardé : investment_energy_regression.pkl")


===== PERFORMANCE MODELE =====
R²   : 0.9303
RMSE : 15.45 kWh
✅ Modèle sauvegardé : investment_energy_regression.pkl
