# 01 — EDA & Préprocessing (NYC Taxi Trips)

> Notebook propre et réutilisable. Les traitements sont déportés dans `src/`.
> Ce notebook se contente d'orchestrer et de visualiser.

## Cellule A — Imports & constantes

In [5]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Display options: show every column and render floats with two decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")

# Data folders, relative to the notebook's working directory. They are
# created up front so later cells can read from / write into them.
RAW_DIR = "../data/raw"
PROC_DIR = "../data/processed"
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROC_DIR, exist_ok=True)

# Dataset parameters (reference month for the EDA). URL and RAW_PATH are
# both derived from DS_MONTH, so changing the month here is enough.
DS_MONTH = "2023-01"
URL = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{DS_MONTH}.parquet"
RAW_PATH = f"{RAW_DIR}/yellow_tripdata_{DS_MONTH}.parquet"

# Columns passed to the loader in a later cell.
USE_COLS = [
    "tpep_pickup_datetime", "tpep_dropoff_datetime",
    "passenger_count", "trip_distance",
    "PULocationID", "DOLocationID",
    "RatecodeID", "payment_type",
    "fare_amount", "extra", "mta_tax",
    "tip_amount", "tolls_amount", "improvement_surcharge",
    "total_amount", "congestion_surcharge", "airport_fee"
]

# Make `src/` importable. Guarded so that re-running this cell does not
# keep appending the same entry to sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

## Cellule B — Téléchargement sécurisé (idempotent)

In [6]:
import urllib.request

# Download the monthly file only when it is not already cached locally.
if not os.path.exists(RAW_PATH):
    print("Téléchargement du dataset…")
    # Download to a temporary sibling file and rename it on success, so an
    # interrupted transfer never leaves a truncated file at RAW_PATH that
    # the existence check above would later mistake for a complete one.
    tmp_path = f"{RAW_PATH}.part"
    try:
        urllib.request.urlretrieve(URL, tmp_path)
        os.replace(tmp_path, RAW_PATH)
    finally:
        # After a successful rename the temp file is gone; this only
        # cleans up the partial file left behind by a failed download.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
else:
    print("Déjà présent:", RAW_PATH)

Déjà présent: ../data/raw/yellow_tripdata_2023-01.parquet


## Cellule C — Chargement & préprocessing via `src/`

In [None]:
from data_loader import load_parquet
from preprocessing import preprocess

# Load the raw parquet file through the project loader.
# NOTE(review): presumably `use_cols` restricts the read to USE_COLS —
# confirm against src/data_loader.py.
df_raw = load_parquet(
    RAW_PATH,
    use_cols=USE_COLS,
)

# Run the project's preprocessing step on the raw frame.
df_clean = preprocess(df_raw)

print("Shape clean:", df_clean.shape)
# Last expression of the cell: the notebook displays the first rows.
df_clean.head()

## Cellule D — Visualisations EDA essentielles

In [None]:
# Univariate distributions: one histogram per (column, title) pair.
# The first title is "Durée (min)": the column holds trip durations, as the
# scatter plot below also labels it.
hist_specs = [
    ("trip_duration_minutes", "Durée (min)"),
    ("trip_distance", "Distance (miles)"),
    ("total_amount", "Total ($)"),
]
for column, title in hist_specs:
    plt.figure(figsize=(6, 4))
    sns.histplot(df_clean[column], bins=60)
    plt.title(title)
    plt.show()

# Distance vs. duration on a fixed-seed sample of at most 50,000 rows to
# keep the scatter plot readable and fast to render.
sample = df_clean.sample(min(50000, len(df_clean)), random_state=42)
plt.figure(figsize=(6, 6))
sns.scatterplot(data=sample, x="trip_distance", y="trip_duration_minutes", s=5, alpha=0.3)
plt.title("Distance vs Durée")
plt.show()

## Cellule E — Sauvegarde échantillon propre (200k lignes au maximum)

In [None]:
# Sample at most 200,000 rows (fewer if the cleaned frame is smaller).
N = min(200_000, len(df_clean))
# Fixed seed so the reference sample is reproducible across runs.
ref = df_clean.sample(N, random_state=42).copy()
# Build the file name from DS_MONTH so it always matches the month that
# was actually loaded, instead of a hard-coded "2023-01".
OUT_PATH = f"{PROC_DIR}/taxi_ref_{DS_MONTH}_clean_sample.parquet"
ref.to_parquet(OUT_PATH, index=False)
# Last expression of the cell: the notebook displays the path and shape.
OUT_PATH, ref.shape

## Cellule F — Log minimal dans MLflow

In [None]:
import mlflow

# Log to a local file store next to the notebook folder.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("eda-preprocessing")

# Derive the run name and month parameter from DS_MONTH so the logged
# metadata cannot drift from the data that was actually processed
# (for DS_MONTH = "2023-01" the run name is "cleaning_2023_01").
with mlflow.start_run(run_name=f"cleaning_{DS_MONTH.replace('-', '_')}"):
    mlflow.log_param("dataset_month", DS_MONTH)
    # NOTE(review): this is the size of the full cleaned frame, not of the
    # sampled file logged below — confirm that is the intended meaning.
    mlflow.log_param("n_samples", len(df_clean))
    # Attach the saved sample and the preprocessing source for traceability.
    mlflow.log_artifact(OUT_PATH)
    mlflow.log_artifact("../src/preprocessing.py")
