In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, mean_absolute_error, mean_squared_error
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Notebook-wide plotting defaults: seaborn "whitegrid" style and a 10x6 figure size.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:

# Path of the processed flight-records CSV read by this notebook.
DATA_PATH = "/content/processed_flight_records.csv"

df = pd.read_csv(DATA_PATH)

# Identify columns by case-insensitive substring match on the column name.
date_cols = [c for c in df.columns if any(key in c.lower() for key in ('date', 'time'))]
status_cols = [c for c in df.columns if 'status' in c.lower()]
delay_cols = [c for c in df.columns if 'delay' in c.lower()]

# Every later cell (temporal features, chronological split) needs a datetime
# column, so fail here with a clear message instead of an IndexError.
if not date_cols:
    raise ValueError(
        "Date/time column required: no column name contains 'date' or 'time'"
    )

# The first match of each kind is used; status and delay columns are optional
# at this point and default to None when absent.
date_col = date_cols[0]
status_col = status_cols[0] if status_cols else None
delay_col = delay_cols[0] if delay_cols else None

# Parse datetime
df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# The binary classification target is derived from the status column, so it is mandatory.
if not status_col:
    raise ValueError("Status column required for classification modeling")

# 1 when the lower-cased status text contains 'delay', otherwise 0.
status_text = df[status_col].astype(str).str.lower()
df['is_delayed'] = status_text.str.contains('delay').astype(int)

In [None]:
# Temporal features extracted from the parsed datetime column.
df['hour'] = df[date_col].dt.hour
df['day_of_week'] = df[date_col].dt.dayofweek
df['month'] = df[date_col].dt.month

# Airline & route features (if available), found by substring match on the
# lower-cased column name.
airline_cols = [c for c in df.columns if 'airline' in c.lower() or 'icao' in c.lower()]
origin_cols = [c for c in df.columns if 'origin' in c.lower()]
dest_cols = [c for c in df.columns if 'dest' in c.lower()]  # 'dest' also covers 'destination'

origin_feature = origin_cols[0] if origin_cols else None
dest_feature = dest_cols[0] if dest_cols else None

# The 'icao' keyword can also match route columns (e.g. a name containing both
# 'origin' and 'icao'), so take the first airline candidate that is not already
# used as the origin or destination feature.
airline_feature = next(
    (c for c in airline_cols if c not in (origin_feature, dest_feature)),
    None,
)

# Keep the airline, origin, destination order and drop any repeated name:
# a duplicate here would put the same column into the feature frame twice.
categorical_features = list(dict.fromkeys(
    c for c in (airline_feature, origin_feature, dest_feature) if c is not None
))

numerical_features = ['hour', 'day_of_week', 'month']

In [None]:
# Order the rows chronologically before building the feature matrix and target.
df = df.sort_values(date_col)

feature_columns = categorical_features + numerical_features
X = df[feature_columns]
y = df['is_delayed']

# Rows dated at or before the 80th-percentile timestamp form the training set;
# rows dated after it form the test set.
split_date = df[date_col].quantile(0.8)
timestamps = df[date_col]
train_idx = timestamps.le(split_date)
test_idx = timestamps.gt(split_date)

X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_test, y_test = X.loc[test_idx], y.loc[test_idx]


In [None]:
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardise the numeric columns.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_features),
        ('num', numeric_scaler, numerical_features),
    ]
)

In [None]:
# Candidate classifiers, each evaluated on the chronological hold-out set.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42)
}

# ROC AUC is undefined (roc_auc_score raises ValueError) when the hold-out
# labels contain a single class; record NaN in that case instead of aborting
# the loop and losing the other metrics.
auc_defined = y_test.nunique() > 1

results = []

for name, model in models.items():
    clf = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model)
    ])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Second column of predict_proba: probability of the positive class (is_delayed == 1).
    y_prob = clf.predict_proba(X_test)[:, 1]

    results.append({
        'model': name,
        'accuracy': accuracy_score(y_test, y_pred),
        # zero_division=0 reports 0 (without a warning) when a metric's
        # denominator is empty, e.g. the model never predicts a delay.
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_prob) if auc_defined else float('nan')
    })

# One row of hold-out metrics per model; displayed as the cell output.
results_df = pd.DataFrame(results)
results_df

In [None]:
# ROC curves of every candidate model on the hold-out set, drawn on one axes.
fig, ax = plt.subplots()
for name, model in models.items():
    # Each pipeline is fitted inside this cell, so the plot does not rely on
    # fitted state left behind by another cell.
    clf = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model)
    ])
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    ax.plot(fpr, tpr, label=name)

# Diagonal reference line (TPR == FPR).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# The em dash is written as an escape so the title cannot be mangled by a
# wrong file encoding (it previously rendered as mojibake).
ax.set_title('ROC Curve \u2014 Delay Classification')
ax.legend()
plt.show()

In [None]:
# Optional regression on the delay column; skipped when no such column was found.
if delay_col:
    # Missing delay values are filled with 0 before fitting and scoring.
    y_reg = df.loc[train_idx, delay_col].fillna(0)
    y_reg_test = df.loc[test_idx, delay_col].fillna(0)

    reg = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', RandomForestRegressor(n_estimators=200, random_state=42))
    ])

    reg.fit(X_train, y_reg)
    y_reg_pred = reg.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
    # so passing it fails on current releases.
    rmse = np.sqrt(mean_squared_error(y_reg_test, y_reg_pred))

    print("MAE:", mean_absolute_error(y_reg_test, y_reg_pred))
    print("RMSE:", rmse)

In [None]:
# Persist the per-model metrics table as CSV, without the index column.
results_path = "/content/predictive_model_results.csv"
results_df.to_csv(results_path, index=False)
print("Predictive modeling outputs saved.")