# Entraînement baseline (NYC Taxi Trips)

## Setup & Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Add src/ to the import path so the project-local modules below can be imported.
# NOTE(review): the relative path presumably assumes the notebook is run from
# its own directory — confirm.
sys.path.append("../src")
from data_loader import download_month, load_parquet
from preprocessing import preprocess, FEATS_KEEP

# Working directories; each one is created below if it does not exist yet.
DATA_RAW = Path("../data/raw")
DATA_PROC = Path("../data/processed")
MODELS_DIR = Path("../models/trained_models")
for p in [DATA_RAW, DATA_PROC, MODELS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Columns to load from the parquet files (passed to load_parquet as use_cols).
USE_COLS = [
    "tpep_pickup_datetime", "tpep_dropoff_datetime",
    "passenger_count", "trip_distance",
    "PULocationID", "DOLocationID",
    "RatecodeID", "payment_type",
    "fare_amount", "extra", "mta_tax",
    "tip_amount", "tolls_amount", "improvement_surcharge",
    "total_amount", "congestion_surcharge", "airport_fee"
]

# Time window: months are two-digit strings within YEAR, split chronologically
# into training, validation and test sets.
TRAIN_MONTHS = ["01", "02", "03", "04"]
VAL_MONTHS = ["05"]
TEST_MONTHS = ["06"]
YEAR = "2023"

## Ingestion des données

In [None]:
def load_months(months, year=YEAR):
    """Download, load and preprocess several months into one DataFrame.

    For each entry of ``months`` the result of ``download_month`` is handed
    to ``load_parquet`` (restricted to ``USE_COLS``) and then to
    ``preprocess``. Each preprocessed frame is tagged with a ``set_month``
    column holding ``"<year>-<month>"`` before all frames are concatenated
    row-wise with a fresh index.

    Args:
        months: Iterable of month identifiers (the notebook uses two-digit
            strings such as ``"01"``).
        year: Year forwarded to ``download_month``; defaults to ``YEAR``.

    Returns:
        The concatenation of all monthly frames.
    """
    monthly_frames = []
    for month in months:
        parquet_path = download_month(month, year=year)
        cleaned = preprocess(load_parquet(parquet_path, use_cols=USE_COLS))
        # Keep track of which month every row came from.
        cleaned["set_month"] = f"{year}-{month}"
        monthly_frames.append(cleaned)
    return pd.concat(monthly_frames, axis=0, ignore_index=True)

# Load the train / validation / test sets, in that order, one month window each.
train_df, val_df, test_df = (
    load_months(window) for window in (TRAIN_MONTHS, VAL_MONTHS, TEST_MONTHS)
)

# Display the shape of each set as the cell output.
train_df.shape, val_df.shape, test_df.shape

## Séparer Features / Target

In [None]:
# Column the models are trained to predict.
TARGET = "trip_duration_minutes"

# Baseline feature set: numeric columns first, categorical columns after.
NUM_FEATS = ["trip_distance", "passenger_count", "pickup_hour", "pickup_dow", "is_weekend"]
CAT_FEATS = ["RatecodeID", "payment_type"]
FEATS = [*NUM_FEATS, *CAT_FEATS]

# Take independent copies of the feature matrix and target vector of each set
# so later modifications cannot touch the source frames.
X_train, y_train = train_df[FEATS].copy(), train_df[TARGET].copy()
X_val, y_val = val_df[FEATS].copy(), val_df[TARGET].copy()
X_test, y_test = test_df[FEATS].copy(), test_df[TARGET].copy()

# Preview the first training rows as the cell output.
X_train.head()

## Préprocesseur sklearn

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Feature preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Categories unseen at fit time are encoded as all zeros
# (handle_unknown="ignore"); columns not listed are dropped.
#
# sparse_threshold=0 forces the combined output to be a dense array. With the
# default (0.3) the ColumnTransformer switches to a sparse matrix as soon as the
# overall density falls below the threshold, i.e. depending on how many
# categories the data happens to contain, and HistGradientBoostingRegressor
# does not accept sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), NUM_FEATS),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), CAT_FEATS),
    ],
    remainder="drop",
    sparse_threshold=0,
)
preprocessor

## Modèles Baseline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

def evaluate(model, X_tr, y_tr, X_v, y_v):
    """Compute MAE, RMSE and R² of a fitted model on two datasets.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_tr: Features of the training set.
        y_tr: Targets of the training set.
        X_v: Features of the validation set.
        y_v: Targets of the validation set.

    Returns:
        A dict with the keys ``train_mae``, ``train_rmse``, ``train_r2``,
        ``val_mae``, ``val_rmse`` and ``val_r2``.
    """

    def _rmse(y_true, y_hat):
        # RMSE is the square root of the mean squared error.
        return math.sqrt(mean_squared_error(y_true, y_hat))

    train_hat = model.predict(X_tr)
    val_hat = model.predict(X_v)
    return {
        "train_mae": mean_absolute_error(y_tr, train_hat),
        "train_rmse": _rmse(y_tr, train_hat),
        "train_r2": r2_score(y_tr, train_hat),
        "val_mae": mean_absolute_error(y_v, val_hat),
        "val_rmse": _rmse(y_v, val_hat),
        "val_r2": r2_score(y_v, val_hat),
    }

# Pipeline does not clone its steps, so passing the same `preprocessor` object
# to both pipelines would make them share one transformer: fitting one pipeline
# would silently refit the other's preprocessing. Each pipeline therefore gets
# its own unfitted copy.
from sklearn.base import clone

# Model 1 - Linear Regression
linreg = Pipeline([
    ("prep", clone(preprocessor)),
    ("model", LinearRegression()),
])

# Model 2 - HistGradientBoosting (often better)
hgb = Pipeline([
    ("prep", clone(preprocessor)),
    ("model", HistGradientBoostingRegressor(
        max_depth=None, learning_rate=0.1, max_iter=300, random_state=42
    )),
])

# Registry of the baseline pipelines, keyed by the name used for MLflow runs.
models = {"linreg": linreg, "hgb": hgb}
models

## Entraînement + MLflow

In [None]:
import mlflow
import mlflow.sklearn

# Track runs in a local file store, under the "baseline-training" experiment.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("baseline-training")

# Maps each model name to the metrics dict returned by evaluate().
results = {}

for name, pipe in models.items():
    # One MLflow run per model, named after the model.
    with mlflow.start_run(run_name=name):
        mlflow.log_param("year", YEAR)
        mlflow.log_param("train_months", ",".join(TRAIN_MONTHS))
        mlflow.log_param("val_months", ",".join(VAL_MONTHS))
        mlflow.log_param("features_num", NUM_FEATS)
        mlflow.log_param("features_cat", CAT_FEATS)
        mlflow.log_param("model_name", name)

        # Training
        pipe.fit(X_train, y_train)

        # Evaluation on the training and validation sets
        metrics = evaluate(pipe, X_train, y_train, X_val, y_val)
        # mlflow.log_metrics expects a single dict of metrics; calling it with
        # (key, value) — the signature of mlflow.log_metric — fails at runtime.
        # Values are cast to plain floats before logging.
        mlflow.log_metrics({key: float(value) for key, value in metrics.items()})
        results[name] = metrics

        # Save the fitted pipeline as an MLflow model artifact
        mlflow.sklearn.log_model(pipe, artifact_path="model")

results