In [19]:
# 02_train_model.ipynb

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
import mlflow

In [2]:
# 📂 Config
DATA_PATH = Path("../data/features")
EXPERIMENT_NAME = "ETF_PEA_MLOpsZoomcamp"

In [3]:
# 🎯 Load dataset
all_files = list(DATA_PATH.glob("*_features.parquet"))
dfs = []

for file in all_files:
    df = pd.read_parquet(file)
    df["ticker"] = file.stem.split("_")[0]
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)
print(f"✅ Shape of df_all: {df_all.shape}")

✅ Shape of df_all: (3802, 32)


In [4]:
# 🩺 Vérifier les colonnes
print(df_all.columns.tolist())

["('Close', 'EWLD.PA')", "('High', 'EWLD.PA')", "('Low', 'EWLD.PA')", "('Open', 'EWLD.PA')", "('Volume', 'EWLD.PA')", "('daily_return', '')", "('weekly_return', '')", "('monthly_return', '')", "('rolling_volatility_21', '')", "('rolling_volatility_63', '')", "('momentum_21', '')", "('momentum_63', '')", "('ma_21', '')", "('ma_63', '')", "('drawdown', '')", "('target', '')", 'ticker', "('Close', 'PAEEM.PA')", "('High', 'PAEEM.PA')", "('Low', 'PAEEM.PA')", "('Open', 'PAEEM.PA')", "('Volume', 'PAEEM.PA')", "('Close', 'ESE.PA')", "('High', 'ESE.PA')", "('Low', 'ESE.PA')", "('Open', 'ESE.PA')", "('Volume', 'ESE.PA')", "('Close', 'CW8.PA')", "('High', 'CW8.PA')", "('Low', 'CW8.PA')", "('Open', 'CW8.PA')", "('Volume', 'CW8.PA')"]


In [5]:
# 🩺 Trouver la colonne target proprement
target_col = next(c for c in df_all.columns if "target" in c)
print(f"✅ Detected target column: {target_col}")

✅ Detected target column: ('target', '')


In [6]:
# 🔄 Renommer pour simplifier
df_all = df_all.rename(columns={target_col: "target"})

In [15]:
# ✅ Features et Target
df_all.columns = [col.replace("(", "")
                      .replace(")", "")
                      .replace("'", "")
                      .replace(",", "")
                      .replace(" ", "_")
                  for col in df_all.columns]

feature_cols = [col for col in df_all.columns if "target" not in col and "ticker" not in col]

X = df_all[feature_cols]
y = df_all["target"]

In [16]:
# ⚖️ Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
# 📊 Setup MLflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:G:/Mon Drive/DataTalksClub/MLOps-ETF-PEA/notebooks/mlruns/1', creation_time=1752591127892, experiment_id='1', last_update_time=1752591127892, lifecycle_stage='active', name='ETF_PEA_MLOpsZoomcamp', tags={}>

In [20]:
# 🎛️ Training
params = {
    "learning_rate": 0.05,
    "num_leaves": 31,
    "random_state": 42
}

with mlflow.start_run():
    mlflow.log_params(params)

    model = LGBMRegressor(**params, n_estimators=100)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        callbacks=[early_stopping(stopping_rounds=10)],
    )

    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    mlflow.sklearn.log_model(model, "model")

print(f"✅ Training completed with RMSE: {rmse:.5f}, R²: {r2:.5f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6689
[LightGBM] [Info] Number of data points in the train set: 3041, number of used features: 30
[LightGBM] [Info] Start training from score 0.012837
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l2: 0.000253814	valid_1's l2: 0.00047819




✅ Training completed with RMSE: 0.02187, R²: 0.81265
