# Phase 3: Arrival Delay Classification (ARR_DEL15)

This notebook builds supervised ML models to predict `ARR_DEL15`, which indicates whether an arrival delay was 15 minutes or more, using the airline on-time performance dataset prepared in Phase 1.

We will:

1. Load the processed dataset from Phase 1.
2. Engineer features by selecting numerical columns and encoding categorical variables.
3. Split the data into train and test sets.
4. Train and evaluate classification models (Logistic Regression and Random Forest).

To keep runtime reasonable on a large dataset (~9.5M rows), we train and evaluate on a stratified sample of the data (100,000 rows by default; see `max_samples` below).


In [None]:
# Setup and imports
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Resolve the project root: when the notebook runs from a folder named
# `notebooks`, the root is its parent; otherwise the current working
# directory is taken as the root.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    project_root = _cwd.parent
else:
    project_root = _cwd

# Put the root first on sys.path so the project-local `src` package imported
# just below can be resolved.
sys.path.insert(0, str(project_root))

from src.utils.helpers import load_config, ensure_dir

print(f"Project root: {project_root}")


In [None]:
# Read the project configuration, then load the Phase 1 output it points to.
config = load_config(project_root / 'config' / 'config.yaml')

# The processed-data folder comes from the config and is joined onto the project root.
processed_dir = project_root / config['data']['processed_dir']
print(f"Processed data directory: {processed_dir}")

# Gzip-compressed CSV of processed events (one row per flight in this project setup).
events_path = processed_dir / 'events_sorted.csv.gz'
print(f"Loading data from: {events_path}")

df = pd.read_csv(events_path, compression='gzip')

# Report the shape and the column names of what was loaded.
row_count, col_count = df.shape
print(f"Loaded {row_count:,} rows and {col_count} columns")
print("Columns:", df.columns.tolist())


In [None]:
# Target preparation for ARR_DEL15
TARGET_COL = 'ARR_DEL15'

# Fail fast when the loaded data lacks the label column.
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column {TARGET_COL} not found in dataset")

# Keep only the rows that carry a label and report how many were removed.
initial_rows = len(df)
has_target = df[TARGET_COL].notna()
df = df.loc[has_target].copy()
print(f"Dropped {initial_rows - len(df):,} rows with missing {TARGET_COL}")

# Cast the label to int, then show the class balance as proportions.
df[TARGET_COL] = df[TARGET_COL].astype(int)
class_balance = df[TARGET_COL].value_counts(normalize=True)
print(class_balance.rename('proportion'))


In [None]:
# Build the feature matrix X and the label vector y.

# Columns derived directly from the arrival delay: keeping any of them would
# leak the target. The target itself is listed so that it is removed from X.
# NOTE(review): other columns recorded after arrival may leak as well --
# confirm against the Phase 1 schema.
leakage_cols = [TARGET_COL, 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DELAY_GROUP']

# Identifier column that is not useful for prediction (listed only when present).
id_cols = [name for name in ('flight_id',) if name in df.columns]

# Restrict the drop list to columns that actually exist in the frame.
cols_to_drop = [name for name in (*leakage_cols, *id_cols) if name in df.columns]
print("Dropping columns:", cols_to_drop)

y = df[TARGET_COL]
X = df.drop(columns=cols_to_drop)

# Partition the remaining columns by dtype; columns of any other dtype end up
# in neither list.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


In [None]:
# Cap the number of rows used for modeling.

max_samples = 100_000  # adjust if needed
n_rows = len(X)

if n_rows <= max_samples:
    # Small enough already: model on everything.
    print(f"Using all {n_rows:,} rows for modeling.")
    X_sample, y_sample = X, y
else:
    print(f"Sampling {max_samples:,} rows out of {n_rows:,} for model training (stratified by {TARGET_COL})...")
    # Only the "train" side of this split is kept; the complementary rows
    # returned alongside it are discarded.
    X_sample, _, y_sample, _ = train_test_split(
        X,
        y,
        train_size=max_samples,
        stratify=y,
        random_state=42,
    )

# Show the class balance of whatever will be modeled.
print("Sample class distribution:")
sample_balance = y_sample.value_counts(normalize=True)
print(sample_balance.rename('proportion'))


In [None]:
# Hold out 20% of the sample for evaluation, stratified on the label.
split_options = dict(test_size=0.2, stratify=y_sample, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, **split_options)

print(f"Train size: {X_train.shape[0]:,}, Test size: {X_test.shape[0]:,}")


In [None]:
# Preprocessing: impute + scale numeric features and one-hot encode categoricals

# Numeric branch: fill missing values with the column median, then standardise.
# StandardScaler passes NaN through unchanged and LogisticRegression rejects
# NaN input, so without imputation a single missing numeric value makes
# training fail. On columns without missing values the imputer is a no-op.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding; categories not seen during fit are
# ignored at transform time (encoded as all zeros) instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch. Columns in neither list are dropped
# (ColumnTransformer's default `remainder='drop'`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

print("Preprocessing pipeline defined.")


In [None]:
# Model 1: Logistic Regression

# Chain the preprocessor with an L-BFGS logistic regression capped at 200 iterations.
lr_estimator = LogisticRegression(solver='lbfgs', max_iter=200, n_jobs=None)
log_reg_clf = Pipeline(steps=[('preprocess', preprocessor), ('model', lr_estimator)])

print("Training Logistic Regression model...")
log_reg_clf.fit(X_train, y_train)

# Hard labels feed the label-based metrics; the probability in column 1 of
# predict_proba feeds ROC-AUC.
y_pred_lr = log_reg_clf.predict(X_test)
y_proba_lr = log_reg_clf.predict_proba(X_test)[:, 1]

print("\nLogistic Regression Performance:")
lr_label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
)
for caption, metric_fn in lr_label_metrics:
    print(caption, metric_fn(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_lr))

print("\nClassification report:")
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

print("Confusion matrix:")
lr_confusion = confusion_matrix(y_test, y_pred_lr)
print(lr_confusion)


In [None]:
# Model 2: Random Forest

# Chain the preprocessor with a 200-tree forest: unlimited depth, all
# available cores, fixed seed for reproducibility.
rf_estimator = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
rf_clf = Pipeline(steps=[('preprocess', preprocessor), ('model', rf_estimator)])

print("Training Random Forest model...")
rf_clf.fit(X_train, y_train)

# Hard labels feed the label-based metrics; the probability in column 1 of
# predict_proba feeds ROC-AUC.
y_pred_rf = rf_clf.predict(X_test)
y_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

print("\nRandom Forest Performance:")
rf_label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
)
for caption, metric_fn in rf_label_metrics:
    print(caption, metric_fn(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))

print("\nClassification report:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

print("Confusion matrix:")
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print(rf_confusion)
