In [1]:
# Install packages
!pip install catboost xgboost --quiet
# Core libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ML & preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression

# Models
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('default')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Load flight dataset
df = pd.read_csv('flights_sample_3m.csv', parse_dates=['FL_DATE'])

# Basic overview
print("Dataset Overview")
print(f"Shape: {df.shape}")
print(f"Date range: {df['FL_DATE'].min()} to {df['FL_DATE'].max()}")

# Quick inspection
print("\nFirst 5 rows:")
print(df.head())

print("\nColumn info:")
print(df.info())

# Check for potential data leakage
delay_cols = [col for col in df.columns if 'DELAY' in col.upper()]
print("\nPotential leakage columns:")
for col in delay_cols:
    print(f"- {col}")

print(f"\nDataset loaded successfully with {len(df):,} records")

Dataset Overview
Shape: (3000000, 32)
Date range: 2019-01-01 00:00:00 to 2023-08-31 00:00:00

First 5 rows:
     FL_DATE                AIRLINE                AIRLINE_DOT AIRLINE_CODE  \
0 2019-01-09  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
1 2022-11-19   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
2 2022-07-22  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
3 2023-03-06   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
4 2020-02-23       Spirit Air Lines       Spirit Air Lines: NK           NK   

   DOT_CODE  FL_NUMBER ORIGIN          ORIGIN_CITY DEST  \
0     19977       1562    FLL  Fort Lauderdale, FL  EWR   
1     19790       1149    MSP      Minneapolis, MN  SEA   
2     19977        459    DEN           Denver, CO  MSP   
3     19790       2295    MSP      Minneapolis, MN  SFO   
4     20416        407    MCO          Orlando, FL  DFW   

               DEST_CITY  ...  DIVERTED  CRS_ELAPSED_TIME  ELA

In [3]:
# Legitimate features for arrival delay prediction
legitimate_features = [
    'AIRLINE', 'AIRLINE_CODE', 'ORIGIN', 'DEST', 'FL_NUMBER', 'DISTANCE',
    'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'FL_DATE',
    'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF'
]

# Leaky features to exclude
truly_leaky_features = [
    'ARR_TIME', 'ARR_DELAY', 'WHEELS_ON', 'TAXI_IN', 'ELAPSED_TIME', 'AIR_TIME',
    'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS',
    'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT'
]

print("Legitimate features:")
for f in legitimate_features:
    print(f"- {f}")
print(f"Total: {len(legitimate_features)}")

print("\nTrue data leakage (to remove):")
for f in truly_leaky_features:
    print(f"- {f}")
print(f"Total: {len(truly_leaky_features)}")

# Clean dataset
initial_size = len(df)
df_clean = df[(df['CANCELLED'] != 1) & (df['DIVERTED'] != 1)].copy()

critical_features = ['DEP_TIME', 'DEP_DELAY', 'ARR_DELAY', 'DISTANCE']
df_clean.dropna(subset=critical_features, inplace=True)

features_to_use = [c for c in legitimate_features if c in df_clean.columns]
target_variable = 'ARR_DELAY'

print(f"\nClean dataset:")
print(f"Records: {len(df_clean):,}")
print(f"Features: {len(features_to_use)}")
print(f"Target: {target_variable}")

Legitimate features:
- AIRLINE
- AIRLINE_CODE
- ORIGIN
- DEST
- FL_NUMBER
- DISTANCE
- CRS_DEP_TIME
- CRS_ARR_TIME
- CRS_ELAPSED_TIME
- FL_DATE
- DEP_TIME
- DEP_DELAY
- TAXI_OUT
- WHEELS_OFF
Total: 14

True data leakage (to remove):
- ARR_TIME
- ARR_DELAY
- WHEELS_ON
- TAXI_IN
- ELAPSED_TIME
- AIR_TIME
- DELAY_DUE_CARRIER
- DELAY_DUE_WEATHER
- DELAY_DUE_NAS
- DELAY_DUE_SECURITY
- DELAY_DUE_LATE_AIRCRAFT
Total: 11

Clean dataset:
Records: 2,913,802
Features: 14
Target: ARR_DELAY


In [4]:
# Sort data by date to ensure temporal order
df_clean = df_clean.sort_values(['FL_DATE', 'AIRLINE', 'ORIGIN']).reset_index(drop=True)

# Create time features if not exist
if 'MONTH' not in df_clean.columns:
    df_clean['MONTH'] = df_clean['FL_DATE'].dt.month
    df_clean['DAY_OF_WEEK'] = df_clean['FL_DATE'].dt.dayofweek
    df_clean['DAY_OF_YEAR'] = df_clean['FL_DATE'].dt.dayofyear

# Keep only features that contributed in CatBoost top-15
# Historical performance features
df_clean['AIRLINE_HIST_DELAY'] = df_clean.groupby('AIRLINE')['DEP_DELAY'].expanding().mean().shift(1).reset_index(level=0, drop=True)
df_clean['ROUTE_HIST_DELAY'] = (df_clean.assign(ROUTE_KEY=df_clean['ORIGIN'] + '_' + df_clean['DEST'])
                                .groupby('ROUTE_KEY')['DEP_DELAY'].expanding().mean().shift(1)
                                .reset_index(level=0, drop=True))

# Fill NaN with overall mean
overall_mean = df_clean['DEP_DELAY'].mean()
for feature in ['AIRLINE_HIST_DELAY', 'ROUTE_HIST_DELAY']:
    df_clean[feature] = df_clean[feature].fillna(overall_mean)

# Keep only time features in top 15
# DAY_OF_YEAR, DAY_OF_WEEK, DEP_HOUR, HOUR_SIN, HOUR_COS, IS_SUMMER, IS_PEAK_TRAVEL_DAY, IS_HOLIDAY_WEEK, ROUTE_FREQUENCY
# Ensure these columns exist or create placeholders if needed
time_features = ['DAY_OF_YEAR', 'DAY_OF_WEEK']
for f in time_features:
    if f not in df_clean.columns:
        df_clean[f] = 0  # placeholder, compute properly later

# Verify dataset shape
print(f"Final dataset after feature selection: {df_clean.shape}")


Final dataset after feature selection: (2913802, 37)


In [5]:
# Airline-standard delay categories
def airline_delay_categories(delay):
    if pd.isna(delay):
        return 'Unknown'
    elif delay <= 15:
        return 'On_Time'
    elif delay <= 60:
        return 'Minor_Delay'
    elif delay <= 240:
        return 'Major_Delay'
    else:
        return 'Severe_Delay'

# Apply categorization
df_clean['DEP_DELAY_CATEGORY'] = df_clean['DEP_DELAY'].apply(airline_delay_categories)
df_clean['ARR_DELAY_CATEGORY'] = df_clean['ARR_DELAY'].apply(airline_delay_categories)

# Binary target: delayed if ARR_DELAY > 15 min
df_clean['IS_DELAYED'] = (df_clean['ARR_DELAY'] > 15).astype(int)

# Distribution summary
category_dist = df_clean['ARR_DELAY_CATEGORY'].value_counts()
for cat, cnt in category_dist.items():
    print(f"{cat}: {cnt:,} ({cnt/len(df_clean)*100:.1f}%)")

print(f"Binary delayed flights: {df_clean['IS_DELAYED'].sum():,} ({df_clean['IS_DELAYED'].mean()*100:.1f}%)")

On_Time: 2,398,513 (82.3%)
Minor_Delay: 340,154 (11.7%)
Major_Delay: 157,912 (5.4%)
Severe_Delay: 17,223 (0.6%)
Binary delayed flights: 515,289 (17.7%)


In [6]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Time-based features from scheduled departure
df_clean['DEP_HOUR'] = df_clean['CRS_DEP_TIME'] // 100
df_clean['DEP_MINUTE'] = df_clean['CRS_DEP_TIME'] % 100

# Temporal features
df_clean['MONTH'] = df_clean['FL_DATE'].dt.month
df_clean['DAY_OF_WEEK'] = df_clean['FL_DATE'].dt.dayofweek
df_clean['IS_WEEKEND'] = (df_clean['DAY_OF_WEEK'] >= 5).astype(int)

# Rush hour indicators
df_clean['IS_MORNING_RUSH'] = ((df_clean['DEP_HOUR'] >= 6) & (df_clean['DEP_HOUR'] <= 9)).astype(int)
df_clean['IS_EVENING_RUSH'] = ((df_clean['DEP_HOUR'] >= 16) & (df_clean['DEP_HOUR'] <= 19)).astype(int)

# Seasonal indicators
df_clean['IS_SUMMER'] = ((df_clean['MONTH'] >= 6) & (df_clean['MONTH'] <= 8)).astype(int)
df_clean['IS_WINTER'] = ((df_clean['MONTH'] >= 12) | (df_clean['MONTH'] <= 2)).astype(int)

# Distance categories
df_clean['IS_SHORT_HAUL'] = (df_clean['DISTANCE'] <= 500).astype(int)
df_clean['IS_LONG_HAUL'] = (df_clean['DISTANCE'] >= 1500).astype(int)

# Encode categorical variables
categorical_features = ['AIRLINE', 'ORIGIN', 'DEST']
le_dict = {}
for col in categorical_features:
    if col in df_clean.columns:
        le_dict[col] = LabelEncoder()
        df_clean[col] = le_dict[col].fit_transform(df_clean[col].astype(str))

# Normalize numerical features
numerical_features = ['DISTANCE', 'CRS_ELAPSED_TIME', 'DEP_DELAY', 'TAXI_OUT', 'DEP_HOUR', 'DEP_MINUTE']
features_to_scale = [col for col in numerical_features if col in df_clean.columns]

scaler = StandardScaler()
df_clean[features_to_scale] = scaler.fit_transform(df_clean[features_to_scale])

# Final feature set for modeling
modeling_features = [
    'AIRLINE', 'ORIGIN', 'DEST', 'DISTANCE',
    'DEP_HOUR', 'MONTH', 'DAY_OF_WEEK', 'IS_WEEKEND',
    'IS_MORNING_RUSH', 'IS_EVENING_RUSH', 'IS_SUMMER', 'IS_WINTER',
    'IS_SHORT_HAUL', 'IS_LONG_HAUL'
]
final_features = [col for col in modeling_features if col in df_clean.columns]

# Summary
print(f"Final feature set: {len(final_features)} features")
print(df_clean[final_features].head())
print(f"Dataset shape: {df_clean.shape}")

Final feature set: 14 features
   AIRLINE  ORIGIN  DEST  DISTANCE  DEP_HOUR  MONTH  DAY_OF_WEEK  IS_WEEKEND  \
0        0      19    32 -0.698576  1.036579      1            1           0   
1        0      19    10 -0.946298  0.415713      1            1           0   
2        0      19   323  1.081290 -1.446887      1            1           0   
3        0      19   201  2.603254  2.071357      1            1           0   
4        0     108   323  1.894022  0.622668      1            1           0   

   IS_MORNING_RUSH  IS_EVENING_RUSH  IS_SUMMER  IS_WINTER  IS_SHORT_HAUL  \
0                0                1          0          1              1   
1                0                0          0          1              1   
2                1                0          0          1              0   
3                0                0          0          1              0   
4                0                1          0          1              0   

   IS_LONG_HAUL  
0            

In [7]:
# Cyclical time encoding
df_clean['HOUR_SIN'] = np.sin(2 * np.pi * df_clean['DEP_HOUR'] / 24)
df_clean['HOUR_COS'] = np.cos(2 * np.pi * df_clean['DEP_HOUR'] / 24)
df_clean['DAY_OF_YEAR'] = df_clean['FL_DATE'].dt.dayofyear

# Enhanced time period
def enhanced_time_period(hour):
    if pd.isna(hour): return 2
    if 5 <= hour < 8: return 0
    elif 8 <= hour < 12: return 1
    elif 12 <= hour < 17: return 2
    elif 17 <= hour < 21: return 3
    else: return 4

df_clean['TIME_PERIOD_ENHANCED'] = df_clean['DEP_HOUR'].apply(enhanced_time_period)

# Travel patterns
df_clean['IS_PEAK_TRAVEL_DAY'] = df_clean['DAY_OF_WEEK'].isin([4, 6]).astype(int)
df_clean['IS_HOLIDAY_WEEK'] = df_clean['FL_DATE'].dt.isocalendar().week.isin([51, 52, 1, 25]).astype(int)

# Airport congestion
df_clean['DEP_HOUR_FLIGHTS'] = df_clean.groupby(['ORIGIN', 'DEP_HOUR'])['ORIGIN'].transform('count')

# Hub and route indicators
hubs = ['JFK', 'LAX', 'ORD', 'ATL', 'DFW', 'DEN']
df_clean['IS_INTERNATIONAL_HUB'] = df_clean['ORIGIN'].isin(hubs).astype(int)
df_clean['IS_TRANSCONTINENTAL'] = (df_clean['DISTANCE'] > 2500).astype(int)

# Distance category
def safe_distance_category(dist):
    if pd.isna(dist): return 1
    elif dist <= 500: return 0
    elif dist <= 1000: return 1
    elif dist <= 2000: return 2
    else: return 3

df_clean['DISTANCE_CAT'] = df_clean['DISTANCE'].apply(safe_distance_category)

# Historical airline delay
df_clean['AIRLINE_HISTORICAL_DELAY'] = df_clean.groupby('AIRLINE')['DEP_DELAY'].transform('mean')

# Route frequency
route_key = df_clean['ORIGIN'].astype(str) + '_' + df_clean['DEST'].astype(str)
df_clean['ROUTE_FREQUENCY'] = route_key.map(route_key.value_counts()).fillna(1)

# Hub flags for origin/destination
df_clean['ORIGIN_IS_HUB'] = df_clean['ORIGIN'].isin(hubs).astype(int)
df_clean['DEST_IS_HUB'] = df_clean['DEST'].isin(hubs).astype(int)

# Update final features
enhanced_features = final_features + [
    'HOUR_SIN', 'HOUR_COS', 'DAY_OF_YEAR', 'TIME_PERIOD_ENHANCED',
    'IS_PEAK_TRAVEL_DAY', 'IS_HOLIDAY_WEEK', 'DEP_HOUR_FLIGHTS',
    'IS_INTERNATIONAL_HUB', 'IS_TRANSCONTINENTAL', 'DISTANCE_CAT',
    'AIRLINE_HISTORICAL_DELAY', 'ROUTE_FREQUENCY', 'ORIGIN_IS_HUB', 'DEST_IS_HUB'
]

# Check for NaNs
for feat in enhanced_features[len(final_features):]:
    df_clean[feat].fillna(0, inplace=True)

final_features = enhanced_features
print(f"Final feature count: {len(final_features)}")

Final feature count: 28


In [8]:
# Interaction features
df_clean['AIRLINE_ORIGIN_COMBO'] = df_clean['AIRLINE'].astype(str) + '_' + df_clean['ORIGIN'].astype(str)
df_clean['DISTANCE_TIME_RATIO'] = df_clean['DISTANCE'] / (df_clean['DEP_HOUR'] + 1)
df_clean['WEEKEND_RUSH_COMBO'] = df_clean['IS_WEEKEND'] * df_clean['IS_MORNING_RUSH']

# Rolling window features (7-day airline delay average)
df_clean = df_clean.sort_values(['FL_DATE', 'AIRLINE'])
df_clean['AIRLINE_7DAY_AVG'] = df_clean.groupby('AIRLINE')['DEP_DELAY'].transform(
    lambda x: x.rolling(window=7, min_periods=1).mean().shift(1)
)

# Weather proxy features
df_clean['WINTER_MORNING'] = df_clean['IS_WINTER'] * df_clean['IS_MORNING_RUSH']
df_clean['SUMMER_EVENING'] = df_clean['IS_SUMMER'] * df_clean['IS_EVENING_RUSH']

# Update final feature list
additional_features = [
    'AIRLINE_ORIGIN_COMBO', 'DISTANCE_TIME_RATIO', 'WEEKEND_RUSH_COMBO',
    'AIRLINE_7DAY_AVG', 'WINTER_MORNING', 'SUMMER_EVENING'
]

final_features = final_features + additional_features

In [9]:
# Prepare features and target
X = df_clean[final_features].copy()
y = df_clean['ARR_DELAY'].copy()

# Identify categorical columns (including combo features)
categorical_cols = ['AIRLINE', 'ORIGIN', 'DEST']
# Ensure they exist in the dataset
categorical_cols = [col for col in categorical_cols if col in X.columns]
# Make sure categorical columns are strings
for col in categorical_cols:
    X[col] = X[col].astype(str)

# Drop any remaining object columns that are not categorical
for col in X.select_dtypes(include='object').columns:
    if col not in categorical_cols:
        X = X.drop(columns=col)

# Train-validation-test split (60-20-20)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Categorical feature indices for CatBoost
categorical_features_indices = [i for i, f in enumerate(X_train.columns) if f in categorical_cols]

# CatBoost Regressor
catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=categorical_features_indices,
    random_seed=42,
    verbose=100
)

# Train with early stopping
catboost_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=100,
    plot=False,
    verbose_eval=100
)

# Output best iteration and validation RMSE
best_iter = catboost_model.best_iteration_
best_val_rmse = catboost_model.best_score_['validation']['RMSE']

print(f"CatBoost trained successfully")
print(f"Best iteration: {best_iter}")
print(f"Validation RMSE: {best_val_rmse:.2f}")


0:	learn: 51.1778546	test: 50.8353748	best: 50.8353748 (0)	total: 2.08s	remaining: 34m 34s
100:	learn: 50.1170703	test: 49.8015020	best: 49.8015020 (100)	total: 1m 50s	remaining: 16m 25s
200:	learn: 50.0224871	test: 49.7353731	best: 49.7353731 (200)	total: 3m 31s	remaining: 14m
300:	learn: 49.9517417	test: 49.6949179	best: 49.6949179 (300)	total: 5m 19s	remaining: 12m 22s
400:	learn: 49.8834018	test: 49.6668621	best: 49.6668621 (400)	total: 7m 13s	remaining: 10m 46s
500:	learn: 49.8311620	test: 49.6472218	best: 49.6472218 (500)	total: 9m 8s	remaining: 9m 6s
600:	learn: 49.7848663	test: 49.6302879	best: 49.6302879 (600)	total: 11m 2s	remaining: 7m 19s
700:	learn: 49.7413756	test: 49.6155120	best: 49.6155120 (700)	total: 12m 56s	remaining: 5m 31s
800:	learn: 49.7019243	test: 49.6046328	best: 49.6043773 (799)	total: 14m 44s	remaining: 3m 39s
900:	learn: 49.6589718	test: 49.5967088	best: 49.5965843 (898)	total: 16m 39s	remaining: 1m 49s
999:	learn: 49.6230117	test: 49.5877387	best: 49.5877

In [10]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

def airline_delay_categories(delay):
    if pd.isna(delay):
        return 'Unknown'
    elif delay <= 15:
        return 'On_Time'
    elif delay <= 60:
        return 'Minor_Delay'
    elif delay <= 240:
        return 'Major_Delay'
    else:
        return 'Severe_Delay'

def evaluate_model(y_true, y_pred, dataset_name):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    y_true_cat = [airline_delay_categories(d) for d in y_true]
    y_pred_cat = [airline_delay_categories(d) for d in y_pred]
    category_acc = accuracy_score(y_true_cat, y_pred_cat)

    binary_acc = accuracy_score((y_true > 15), (y_pred > 15))

    print(f"{dataset_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.4f}, "
          f"4-cat Acc: {category_acc:.3f}, Binary Acc: {binary_acc:.3f}")

    return {"mae": mae, "rmse": rmse, "r2": r2,
            "category_accuracy": category_acc, "binary_accuracy": binary_acc}

# Predictions
train_pred = catboost_model.predict(X_train)
val_pred = catboost_model.predict(X_val)
test_pred = catboost_model.predict(X_test)

# Evaluate
train_metrics = evaluate_model(y_train, train_pred, "TRAIN")
val_metrics = evaluate_model(y_val, val_pred, "VALIDATION")
test_metrics = evaluate_model(y_test, test_pred, "TEST")

TRAIN - MAE: 22.66, RMSE: 49.63, R²: 0.0656, 4-cat Acc: 0.771, Binary Acc: 0.794
VALIDATION - MAE: 22.71, RMSE: 49.59, R²: 0.0545, 4-cat Acc: 0.769, Binary Acc: 0.792
TEST - MAE: 22.61, RMSE: 49.45, R²: 0.0546, 4-cat Acc: 0.771, Binary Acc: 0.794


In [12]:
# Get feature importances and align with trained features
feature_importances = catboost_model.get_feature_importance()
trained_features = catboost_model.feature_names_

importance_df = pd.DataFrame({
    'Feature': trained_features,
    'Importance': feature_importances
}).sort_values('Importance', ascending=False)

print("Top 15 features by importance:")
for i, row in importance_df.head(15).iterrows():
    print(f"{i+1:2d}. {row['Feature']:<25}: {row['Importance']:.4f}")

Top 15 features by importance:
31. AIRLINE_7DAY_AVG         : 23.3561
17. DAY_OF_YEAR              : 11.9758
15. HOUR_SIN                 : 10.7477
 5. DEP_HOUR                 : 8.4653
 3. DEST                     : 5.9568
 1. AIRLINE                  : 5.8716
25. AIRLINE_HISTORICAL_DELAY : 5.6835
 2. ORIGIN                   : 5.3478
 7. DAY_OF_WEEK              : 3.5127
21. DEP_HOUR_FLIGHTS         : 3.3524
 4. DISTANCE                 : 3.2533
11. IS_SUMMER                : 2.8170
26. ROUTE_FREQUENCY          : 2.1527
16. HOUR_COS                 : 2.0098
29. DISTANCE_TIME_RATIO      : 1.2707


In [16]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# Encode categorical features for LightGBM/XGBoost
from sklearn.preprocessing import LabelEncoder

X_encoded = X_train.copy()
X_val_encoded = X_val.copy()
X_test_encoded = X_test.copy()

for col in ['AIRLINE', 'ORIGIN', 'DEST']:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    X_val_encoded[col] = le.transform(X_val_encoded[col].astype(str))
    X_test_encoded[col] = le.transform(X_test_encoded[col].astype(str))

# Initialize models
lgbm_model = LGBMRegressor(
    n_estimators=300, max_depth=8, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1, verbose=-1
)
xgb_model = XGBRegressor(
    n_estimators=300, max_depth=8, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
)

# Train models
lgbm_model.fit(X_encoded, y_train)
xgb_model.fit(X_encoded, y_train)

# Predictions
lgbm_pred = lgbm_model.predict(X_test_encoded)
xgb_pred = xgb_model.predict(X_test_encoded)
catboost_pred = catboost_model.predict(X_test)

# Evaluate
lgbm_metrics = evaluate_model(y_test, lgbm_pred, "LIGHTGBM")
xgb_metrics = evaluate_model(y_test, xgb_pred, "XGBOOST")
catboost_metrics = evaluate_model(y_test, catboost_pred, "CATBOOST_BASELINE")


LIGHTGBM - MAE: 22.66, RMSE: 49.49, R²: 0.0534, 4-cat Acc: 0.772, Binary Acc: 0.794
XGBOOST - MAE: 22.67, RMSE: 49.55, R²: 0.0512, 4-cat Acc: 0.768, Binary Acc: 0.792
CATBOOST_BASELINE - MAE: 22.61, RMSE: 49.45, R²: 0.0546, 4-cat Acc: 0.771, Binary Acc: 0.794


In [17]:
# === OVERFITTING DETECTION ===
print("Checking for overfitting...")

# Compare training vs validation performance
print(f"Training MAE: {train_metrics['mae']:.2f} | Validation MAE: {val_metrics['mae']:.2f} | Test MAE: {test_metrics['mae']:.2f}")
print(f"Training 4-Category Accuracy: {train_metrics['category_accuracy']:.1%} | Validation: {val_metrics['category_accuracy']:.1%} | Test: {test_metrics['category_accuracy']:.1%}")

# Calculate gaps
mae_gap = val_metrics['mae'] - train_metrics['mae']
acc_gap = train_metrics['category_accuracy'] - val_metrics['category_accuracy']

print(f"MAE Gap (Validation - Training): {mae_gap:.2f}")
print(f"Accuracy Gap (Training - Validation): {acc_gap:.3f}")

# Risk assessment
if mae_gap < 2.0 and acc_gap < 0.05:
    print("LOW OVERFITTING RISK")
elif mae_gap < 5.0 and acc_gap < 0.10:
    print("MODERATE OVERFITTING")
else:
    print("HIGH OVERFITTING RISK")

Checking for overfitting...
Training MAE: 22.66 | Validation MAE: 22.71 | Test MAE: 22.61
Training 4-Category Accuracy: 77.1% | Validation: 76.9% | Test: 77.1%
MAE Gap (Validation - Training): 0.06
Accuracy Gap (Training - Validation): 0.002
LOW OVERFITTING RISK
