In [1]:
# --- Reproducible environment & lightweight utilities ---
from __future__ import annotations

import os
from pathlib import Path
from dataclasses import dataclass
from typing import Iterable, Optional

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# --- Training utilities ---
from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score, f1_score,
    roc_curve, precision_recall_curve, confusion_matrix, classification_report
)
from tqdm.auto import tqdm
import math
import os

# Matplotlib defaults applied in one shot: figure size (9, 5), grid on, 140 dpi
plt.rcParams.update(
    {
        "figure.figsize": (9, 5),
        "axes.grid": True,
        "figure.dpi": 140,
    }
)

# Keep the console clean from non-critical warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Reproducibility: a single seed feeds every random source set up in this cell.
RNG_SEED = 42
# NumPy: explicit Generator, to be passed to / used by code that samples via `rng`.
rng = np.random.default_rng(RNG_SEED)
# PyTorch: seed the global generator as well. Without this, anything drawing
# from torch's default RNG starts from a state unrelated to RNG_SEED.
torch.manual_seed(RNG_SEED)

# Relative locations of the train/test CSV files and of the models directory
TRAIN_DATASET = "../data/train.csv"
TEST_DATASET = "../data/test.csv"
MODEL_PATH = "../models"

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

Device: cuda


# Data Cleaning & Alignment

Before any feature engineering or modeling, we ensure both **train** and **test** datasets share the exact same structure.  
This involves removing columns that are not available in both sets (to avoid data leakage),  
isolating the **target variable** (`trip_duration`), and preserving **ID columns** separately for evaluation and submission.

Steps:
1. Drop `dropoff_datetime` from the training data — it's unavailable in test and leaks target information.  
2. Split the target column (`trip_duration`) from the training features.  
3. Store `id` columns separately to use later when generating the submission file.  
4. Remove `id` from features (since it has no predictive value).  
5. Validate that train and test feature sets are perfectly aligned.

This ensures both datasets are **schema-consistent**, **leak-free**, and ready for feature extraction.


In [2]:
# Read the train and test CSV files (train first, then test) into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATASET, TEST_DATASET)
)

In [3]:
# Display the column labels of the training frame as loaded from TRAIN_DATASET
df_train.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [4]:
# Display the column labels of the test frame as loaded from TEST_DATASET
df_test.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag'],
      dtype='object')

In [5]:
# ============================================================
# Clean alignment between train and test datasets
# ============================================================
# Produces: y_train (target), X_train / X_test (features without id),
# train_ids / test_ids (id columns kept aside), and rebinds df_train
# without dropoff_datetime.

# --- 1. Drop columns from train that do not exist in test ---
# (dropoff_datetime leaks the target information)
# errors="ignore" keeps the cell re-runnable: df_train is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns=["dropoff_datetime"], errors="ignore")

# --- 2. Separate the target variable (trip_duration) ---
target_col = "trip_duration"
y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

# --- 3. Keep ID columns separately for later submission/evaluation ---
train_ids = X_train["id"].copy()
test_ids = df_test["id"].copy()

# --- 4. Drop ID columns from the feature sets (not predictive) ---
X_train = X_train.drop(columns=["id"])
X_test = df_test.drop(columns=["id"])

# --- 5. Validate that both feature sets have the same columns in the same order ---
# Fail loudly here instead of printing "aligned" for frames that are not.
assert X_train.columns.equals(X_test.columns), (
    "Train/test feature columns differ: "
    f"train={X_train.columns.tolist()} vs test={X_test.columns.tolist()}"
)

print(f"Shapes aligned: X_train={X_train.shape}, X_test={X_test.shape}")
print(f"Target shape: y_train={y_train.shape}")
print(f"Feature columns: {X_train.columns.tolist()}")

Shapes aligned: X_train=(1458644, 8), X_test=(625134, 8)
Target shape: y_train=(1458644,)
Feature columns: ['vendor_id', 'pickup_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag']
