# Entraînement baseline (NYC Taxi Trips)

## Setup & Imports

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Ajouter src/ au chemin pour importer nos modules
sys.path.append("../src")
from data_loader import download_month, load_parquet
from preprocessing import preprocess, FEATS_KEEP

# Working directories, created up front (including parents) if missing
DATA_RAW = Path("../data/raw")
DATA_PROC = Path("../data/processed")
MODELS_DIR = Path("../models/trained_models")
for p in (DATA_RAW, DATA_PROC, MODELS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Columns to read from each parquet file
USE_COLS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "PULocationID",
    "DOLocationID",
    "RatecodeID",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
    "airport_fee",
]

# Time window: months assigned to each split, all within YEAR
YEAR = "2023"
TRAIN_MONTHS = ["01", "02", "03", "04"]
VAL_MONTHS = ["05"]
TEST_MONTHS = ["06"]

## Ingestion des données

In [None]:
def load_months(months, year=YEAR):
    """Download, load and preprocess several monthly files into one DataFrame.

    For each month the file is fetched with ``download_month``, read with
    ``load_parquet`` (restricted to ``USE_COLS``), cleaned with ``preprocess``
    and tagged with a ``set_month`` column holding ``"<year>-<month>"``.

    Args:
        months: Iterable of month identifiers (this notebook uses zero-padded
            strings such as ``"01"``). A single string is treated as one
            month rather than being iterated character by character.
        year: Year forwarded to ``download_month``; defaults to ``YEAR``.

    Returns:
        The per-month frames concatenated row-wise with a reset index.

    Raises:
        ValueError: If ``months`` is empty.
    """
    # A bare string like "05" would otherwise be split into "0" and "5".
    if isinstance(months, str):
        months = [months]
    else:
        # Materialize so one-shot iterables survive the emptiness check.
        months = list(months)

    # pd.concat([]) raises a cryptic "No objects to concatenate"; fail early
    # with the same exception type and an explicit message instead.
    if not months:
        raise ValueError("load_months() requires at least one month")

    dfs = []
    for m in months:
        path = download_month(m, year=year)
        df_raw = load_parquet(path, use_cols=USE_COLS)
        df_clean = preprocess(df_raw)
        # Record which monthly file each row came from.
        df_clean["set_month"] = f"{year}-{m}"
        dfs.append(df_clean)
    return pd.concat(dfs, axis=0, ignore_index=True)

# Build the train / validation / test sets from their month lists
train_df = load_months(TRAIN_MONTHS)
val_df = load_months(VAL_MONTHS)
test_df = load_months(TEST_MONTHS)

# Cell output: (rows, columns) of each split, in train/val/test order
tuple(frame.shape for frame in (train_df, val_df, test_df))