# Airline Delay Economics Analysis
## Which Airlines Deliver the Best Value for Money?

**Goal:** Translate flight delay data into an economic efficiency metric to determine which airlines and airports offer the best "value for money" in on-time performance.

**Dataset:** Bureau of Transportation Statistics — Airline On-Time Performance Data

**Note:** This version reads the column names used in BTS on-time performance files downloaded in 2024 (listed in section 2, "Data Loading").

## 1. Setup & Imports

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation notices.
warnings.filterwarnings('ignore')

# pandas display: show all columns, render floats with two decimals
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

# Plot defaults: seaborn "whitegrid" theme on a 12x6 inch figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Imports complete")

## 2. Data Loading

**BTS Column Names (2024 format):**
- `YEAR`, `MONTH`, `DAY_OF_MONTH`
- `OP_UNIQUE_CARRIER` (airline code)
- `ORIGIN`, `DEST` (airport codes)
- `ARR_DELAY`, `DEP_DELAY` (in minutes)
- `DISTANCE` (in miles)
- `CANCELLED`, `DIVERTED`

In [None]:
# Raw BTS header -> column name used by the rest of the notebook.
bts_to_analysis_names = {
    'YEAR': 'Year',
    'MONTH': 'Month',
    'DAY_OF_MONTH': 'DayofMonth',
    'OP_UNIQUE_CARRIER': 'Carrier',
    'ORIGIN': 'Origin',
    'DEST': 'Dest',
    'ARR_DELAY': 'ArrDelay',
    'DEP_DELAY': 'DepDelay',
    'DISTANCE': 'Distance',
    'CANCELLED': 'Cancelled',
    'DIVERTED': 'Diverted',
}

# Only the BTS columns named in the mapping are read from the CSV
columns_to_load = list(bts_to_analysis_names)

# Path of the downloaded BTS extract
data_file = '../data/airline_ontime.csv'

try:
    df = pd.read_csv(data_file, usecols=columns_to_load, low_memory=False)
except FileNotFoundError:
    print("❌ Data file not found. Please download BTS data and place it in ../data/")
    print("   See DATA_DOWNLOAD_GUIDE.md for instructions")
    # Downstream cells test `df is not None` and skip their work when loading failed
    df = None
else:
    # Switch to the analysis-friendly column names, then report what was loaded
    df = df.rename(columns=bts_to_analysis_names)

    print(f"✓ Data loaded: {len(df):,} rows")
    print(f"✓ Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    print(f"✓ Columns: {list(df.columns)}")

In [None]:
# Sanity-check the loaded frame: first rows plus dtype / non-null overview.
# Skipped entirely when the load cell left `df` as None.
if df is not None:
    first_rows = df.head()
    # `display` is not imported in this file; presumably the IPython/Jupyter built-in
    display(first_rows)
    print("\nDataset Info:")
    df.info()

## 3. Data Cleaning

In [None]:
# Reduce the loaded flights to a clean "MVP" subset, printing the row count after each step.
if df is not None:
    print(f"Starting rows: {len(df):,}\n")

    # Step 1: keep only flights that were neither cancelled nor diverted
    completed = df['Cancelled'].eq(0) & df['Diverted'].eq(0)
    flights = df.loc[completed].copy()
    print(f"After removing cancelled/diverted: {len(flights):,} rows")

    # Step 2: every remaining row must have both delays and a distance
    flights = flights.dropna(subset=['ArrDelay', 'DepDelay', 'Distance'])
    print(f"After removing missing values: {len(flights):,} rows")

    # Step 3: keep arrival delays within [-60, 500] minutes and strictly positive distances
    plausible = flights['ArrDelay'].between(-60, 500) & flights['Distance'].gt(0)
    flights = flights[plausible]
    print(f"After removing outliers: {len(flights):,} rows")

    # Step 4: restrict to the busiest carriers and origin airports
    # (value_counts sorts by frequency, so the leading labels are the most common)
    top_carriers = flights['Carrier'].value_counts().index[:5]
    top_airports = flights['Origin'].value_counts().index[:10]

    in_mvp = flights['Carrier'].isin(top_carriers) & flights['Origin'].isin(top_airports)
    mvp_flights = flights[in_mvp].copy()

    print(f"\nMVP subset (top 5 carriers, top 10 airports): {len(mvp_flights):,} rows")
    print(f"\nTop 5 Carriers: {list(top_carriers)}")
    print(f"Top 10 Airports: {list(top_airports)}")

    # From here on `df` is the MVP subset; drop the intermediates
    df = mvp_flights
    del completed, plausible, in_mvp, flights, mvp_flights

    print("\n✓ Data cleaning complete")

## 4. Feature Computation

In [None]:
# Derive per-flight economic features: delay cost, cost per mile, and a delayed flag.
if df is not None:
    # Dollar cost charged per minute of arrival delay.
    # NOTE(review): described as an FAA estimate — confirm the source and year of this figure.
    COST_PER_MINUTE = 74

    # Only late arrivals cost money: negative (early) delays are clipped to 0.
    # Vectorised clip replaces a per-row Python lambda; unlike max(0, NaN) it leaves a
    # missing delay as NaN instead of costing it as 0.
    df['delay_cost'] = df['ArrDelay'].clip(lower=0) * COST_PER_MINUTE

    # Normalise by flight length so short and long routes can be compared
    df['cost_per_mile'] = df['delay_cost'] / df['Distance']

    # 1 when the flight arrived more than 15 minutes late, else 0
    df['is_delayed'] = (df['ArrDelay'] > 15).astype(int)

    # Display sample of new features
    print("Sample of computed features:\n")
    feature_columns = [
        'Carrier', 'Origin', 'ArrDelay', 'Distance',
        'delay_cost', 'cost_per_mile', 'is_delayed',
    ]
    display(df[feature_columns].head(10))

    print("\n✓ Feature computation complete")

## 5. Airline-Level Aggregation

In [None]:
# Aggregate per-flight metrics to one row per carrier and export the table.
if df is not None:
    # Named aggregation ties each output column to its source column and statistic,
    # so labels cannot drift out of step with the aggregation order.
    airline_summary = (
        df.groupby('Carrier')
        .agg(
            avg_delay_min=('ArrDelay', 'mean'),
            median_delay_min=('ArrDelay', 'median'),
            std_delay_min=('ArrDelay', 'std'),
            avg_delay_cost=('delay_cost', 'mean'),
            total_delay_cost=('delay_cost', 'sum'),
            avg_cost_per_mile=('cost_per_mile', 'mean'),
            num_delayed_flights=('is_delayed', 'sum'),
            delay_rate=('is_delayed', 'mean'),
            avg_distance=('Distance', 'mean'),
            # Rows per carrier; avoids aggregating the grouping key itself
            num_flights=('ArrDelay', 'size'),
        )
        .reset_index()
    )

    # Sort by cost per mile (lowest average delay cost per mile first)
    airline_summary = airline_summary.sort_values('avg_cost_per_mile')

    print("Airline Performance Summary:\n")
    display(airline_summary)

    # Export for dashboard; to_csv does not create missing directories
    airline_summary_path = '../outputs/airline_summary.csv'
    os.makedirs(os.path.dirname(airline_summary_path), exist_ok=True)
    airline_summary.to_csv(airline_summary_path, index=False)
    print("\n✓ Airline summary exported to outputs/airline_summary.csv")

## 6. Airport-Level Aggregation

In [None]:
# Aggregate per-flight metrics to one row per origin airport and export the table.
if df is not None:
    # Named aggregation ties each output column to its source column and statistic,
    # so labels cannot drift out of step with the aggregation order.
    airport_summary = (
        df.groupby('Origin')
        .agg(
            avg_delay_min=('ArrDelay', 'mean'),
            median_delay_min=('ArrDelay', 'median'),
            avg_delay_cost=('delay_cost', 'mean'),
            total_delay_cost=('delay_cost', 'sum'),
            avg_cost_per_mile=('cost_per_mile', 'mean'),
            delay_rate=('is_delayed', 'mean'),
            # Rows per airport; avoids aggregating the grouping key itself
            num_flights=('ArrDelay', 'size'),
        )
        # The origin code is published under the column name 'Airport'
        .rename_axis('Airport')
        .reset_index()
    )

    # Sort by cost per mile (lowest average delay cost per mile first)
    airport_summary = airport_summary.sort_values('avg_cost_per_mile')

    print("Airport Performance Summary:\n")
    display(airport_summary)

    # Export for dashboard; to_csv does not create missing directories
    airport_summary_path = '../outputs/airport_summary.csv'
    os.makedirs(os.path.dirname(airport_summary_path), exist_ok=True)
    airport_summary.to_csv(airport_summary_path, index=False)
    print("\n✓ Airport summary exported to outputs/airport_summary.csv")

## 7. Linear Regression: Distance vs Delay

In [None]:
# Fit ArrDelay ~ Distance by ordinary least squares and report the in-sample fit.
if df is not None:
    # Single-feature design matrix (double brackets keep it 2-D) and the target vector
    X = df[['Distance']].to_numpy()
    y = df['ArrDelay'].to_numpy()

    # fit() returns the estimator itself, so construction and fitting can be chained
    model = LinearRegression().fit(X, y)

    # R² is computed on the same rows the model was fitted on
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)

    intercept = model.intercept_
    slope = model.coef_[0]

    # Print results
    print("Linear Regression Results: ArrDelay ~ Distance\n")
    print(f"Intercept: {intercept:.2f} minutes")
    print(f"Slope: {slope:.4f} minutes per mile")
    print(f"  → Extra delay per 100 miles: {slope * 100:.2f} minutes")
    print(f"R²: {r2:.4f}")
    print(f"  → Distance explains {r2*100:.2f}% of delay variability")

    del intercept, slope

    print("\n✓ Regression analysis complete")

## 8. Visualization

In [None]:
# Scatter a sample of flights with the fitted regression line, then save and show the figure.
# Relies on `model` and `r2` produced by the regression cell.
if df is not None:
    plt.figure(figsize=(12, 6))

    # Plot at most 5,000 flights; the fixed seed makes the sample repeatable
    sample_df = df.sample(min(5000, len(df)), random_state=42)

    plt.scatter(sample_df['Distance'], sample_df['ArrDelay'],
                alpha=0.3, s=10, label='Actual delays')

    # A straight line only needs its two endpoints: shortest and longest distance.
    # predict() expects a 2-D array, plot() gets the flattened x values.
    X_range = np.array([[df['Distance'].min()], [df['Distance'].max()]])
    y_range = model.predict(X_range)
    plt.plot(X_range.ravel(), y_range, 'r-', linewidth=2,
             label=f'Regression line (R²={r2:.3f})')

    plt.xlabel('Distance (miles)', fontsize=12)
    plt.ylabel('Arrival Delay (minutes)', fontsize=12)
    plt.title('Flight Distance vs Arrival Delay', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    regression_plot_path = '../outputs/regression_plot.png'
    os.makedirs(os.path.dirname(regression_plot_path), exist_ok=True)
    plt.savefig(regression_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("✓ Regression plot saved")

## 9. Export for Dashboard

In [None]:
# Write the flight-level table (raw fields plus computed features) for the dashboard.
if df is not None:
    # Columns published to the dashboard, in output order
    export_columns = [
        'Year', 'Month', 'DayofMonth',
        'Carrier', 'Origin', 'Dest',
        'Distance', 'ArrDelay', 'DepDelay',
        'delay_cost', 'cost_per_mile', 'is_delayed',
    ]
    tableau_export = df[export_columns].copy()

    # to_csv does not create missing directories, so make sure the target exists
    tableau_export_path = '../outputs/full_dataset_for_tableau.csv'
    os.makedirs(os.path.dirname(tableau_export_path), exist_ok=True)
    tableau_export.to_csv(tableau_export_path, index=False)

    print(f"✓ Full dataset exported: {len(tableau_export):,} rows")
    print("✓ Files ready in outputs/ directory")
    print("\n🚀 Next step: Run dashboard_app_premium.py")