In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.lightgbm


In [8]:

# 📥 Charger les données
df = pd.read_parquet("../data/df_all.parquet")
print(df.shape)
print(df.columns)


(3802, 32)
Index(['Close_EWLD.PA', 'High_EWLD.PA', 'Low_EWLD.PA', 'Open_EWLD.PA',
       'Volume_EWLD.PA', 'daily_return', 'weekly_return', 'monthly_return',
       'rolling_volatility_21', 'rolling_volatility_63', 'momentum_21',
       'momentum_63', 'ma_21', 'ma_63', 'drawdown', 'target', 'ticker',
       'Close_PAEEM.PA', 'High_PAEEM.PA', 'Low_PAEEM.PA', 'Open_PAEEM.PA',
       'Volume_PAEEM.PA', 'Close_ESE.PA', 'High_ESE.PA', 'Low_ESE.PA',
       'Open_ESE.PA', 'Volume_ESE.PA', 'Close_CW8.PA', 'High_CW8.PA',
       'Low_CW8.PA', 'Open_CW8.PA', 'Volume_CW8.PA'],
      dtype='object')


In [9]:

# ✅ Features + Target
TARGET = "target"
FEATURES = [col for col in df.columns if col not in ["target", "ticker"]]


In [10]:

# ✅ Drop NaN
df = df.dropna(subset=[TARGET])
print(f"Remaining rows after dropping NaN target: {df.shape[0]}")


Remaining rows after dropping NaN target: 3802


In [11]:

# ✅ Split temporel ou random (ici simple pour avancer)
X = df[FEATURES]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (3041, 30), Test shape: (761, 30)


In [12]:

# ✅ LightGBM Dataset
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


In [13]:

# ✅ Paramètres de base
params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "seed": 42,
}


In [14]:

# ✅ MLflow tracking local
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("ETF_PEA_MLOpsZoomcamp")

with mlflow.start_run():
    mlflow.log_params(params)

    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, valid_data],
        num_boost_round=100,
        early_stopping_rounds=10,
        verbose_eval=10
    )

    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print(f"RMSE: {rmse}")
    print(f"R²: {r2}")

    # MLflow log
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.lightgbm.log_model(model, "model")

print("✅ Training terminé et loggé dans MLflow.")


2025/07/15 12:14:37 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/15 12:14:38 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'