In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plotting defaults: seaborn "whitegrid" style plus a 12x6-inch
# default figure size. The figure size is written after the seaborn call,
# preserving the original statement order.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
DATA_PATH = "/content/processed_flight_records.csv"

df = pd.read_csv(DATA_PATH)

# Identify key columns by case-insensitive substring match on the header names.
airline_cols = [c for c in df.columns if 'airline' in c.lower() or 'icao' in c.lower()]
status_cols = [c for c in df.columns if 'status' in c.lower()]
# The delay column is averaged per airline later on, so only numeric candidates
# qualify; a text column such as a delay reason would make that mean fail.
delay_cols = [
    c for c in df.columns
    if 'delay' in c.lower() and pd.api.types.is_numeric_dtype(df[c])
]
# Names containing 'delay' are excluded so that a column like "delay_time" is
# never picked as the date column and overwritten with datetimes below.
date_cols = [
    c for c in df.columns
    if ('date' in c.lower() or 'time' in c.lower()) and 'delay' not in c.lower()
]

# Every later cell groups by the airline column, so fail early with a message
# that names the problem instead of a bare IndexError.
if not airline_cols:
    raise ValueError(
        "No airline column found: expected a column whose name contains "
        "'airline' or 'icao'. Available columns: {}".format(list(df.columns))
    )

airline_col = airline_cols[0]
status_col = status_cols[0] if status_cols else None
delay_col = delay_cols[0] if delay_cols else None
# The date column is optional, like status and delay: a missing one no longer
# aborts the notebook.
date_col = date_cols[0] if date_cols else None

if date_col is not None:
    df[date_col] = pd.to_datetime(df[date_col])

print("Using columns:")
print("Airline:", airline_col)
print("Status:", status_col)
print("Delay:", delay_col)
print("Date:", date_col)

In [None]:
# Derive boolean per-flight flags from the status text. A flight is flagged when
# its lower-cased status contains the keyword; with no status column every
# flag is False.
status_keywords = {'is_delayed': 'delay', 'is_cancelled': 'cancel'}

if not status_col:
    for flag_name in status_keywords:
        df[flag_name] = False
else:
    status_text = df[status_col].astype(str).str.lower()
    for flag_name, keyword in status_keywords.items():
        df[flag_name] = status_text.str.contains(keyword)

In [None]:
# Per-airline KPI table: flight count, delay and cancellation rates, mean delay.
# NOTE: when no delay column was detected, avg_delay_minutes falls back to the
# mean of is_delayed, which is a fraction of flights rather than minutes.
avg_delay_source = delay_col if delay_col else 'is_delayed'

airline_kpis = df.groupby(airline_col).agg(
    total_flights=('is_delayed', 'count'),
    delay_rate=('is_delayed', 'mean'),
    cancellation_rate=('is_cancelled', 'mean'),
    avg_delay_minutes=(avg_delay_source, 'mean'),
)

# Express both rates as percentages.
for rate_col in ('delay_rate', 'cancellation_rate'):
    airline_kpis[rate_col] = airline_kpis[rate_col] * 100

# Highest delay rate first.
airline_kpis = airline_kpis.sort_values('delay_rate', ascending=False)

airline_kpis.head(10)

In [None]:
# Bar chart of each airline's delay rate (%), in airline_kpis row order.
fig, ax = plt.subplots(figsize=(12, 6))
airline_kpis['delay_rate'].plot(kind='bar', ax=ax)
ax.set_title("Airline Delay Rate Comparison")
ax.set_ylabel("Delay Rate (%)")
ax.set_xlabel("Airline")
plt.show()

In [None]:
# Bar chart of each airline's cancellation rate (%), in airline_kpis row order.
plt.figure(figsize=(12, 6))
cancel_ax = airline_kpis['cancellation_rate'].plot.bar(color='orange')
cancel_ax.set(
    title="Airline Cancellation Rate Comparison",
    ylabel="Cancellation Rate (%)",
    xlabel="Airline",
)
plt.show()

In [None]:
# Compare delay rates across airlines (binary delayed).
# One array of 0.0/1.0 delay indicators per airline; cast explicitly to float
# so the tests receive numeric samples rather than booleans.
groups = [
    group['is_delayed'].to_numpy(dtype=float)
    for _, group in df.groupby(airline_col)
]

if len(groups) > 1:
    # One-way ANOVA on the 0/1 indicator, kept so existing output is unchanged.
    f_stat, p_value = stats.f_oneway(*groups)
    print("ANOVA F-statistic:", f_stat)
    print("ANOVA p-value:", p_value)

    # The outcome is binary, so also run a chi-square test of independence on
    # the airline x delayed contingency table, the standard test for
    # comparing proportions across groups.
    contingency = pd.crosstab(df[airline_col], df['is_delayed'])
    chi2_stat, chi2_p_value, chi2_dof, _ = stats.chi2_contingency(contingency)
    print("Chi-square statistic:", chi2_stat)
    print("Chi-square p-value:", chi2_p_value)
    print("Chi-square degrees of freedom:", chi2_dof)
else:
    # A between-airline comparison needs at least two airlines.
    print("Fewer than two airlines in the data; skipping significance tests.")

In [None]:
# Normalize KPIs to [0, 1] with min-max scaling so they can be combined.
kpi_norm = airline_kpis.copy()

for col in ['delay_rate', 'cancellation_rate', 'avg_delay_minutes']:
    col_min = kpi_norm[col].min()
    col_range = kpi_norm[col].max() - col_min
    if col_range > 0:
        kpi_norm[col] = (kpi_norm[col] - col_min) / col_range
    else:
        # Every airline has the same value (or none has a value), so this KPI
        # cannot separate them. Score it 0 for all instead of dividing by zero,
        # which would turn every composite score into NaN.
        kpi_norm[col] = 0.0

# Composite score (lower is better): weighted sum of the normalized KPIs,
# 40% delay rate, 40% cancellation rate, 20% average delay.
kpi_norm['composite_score'] = (
    0.4 * kpi_norm['delay_rate'] +
    0.4 * kpi_norm['cancellation_rate'] +
    0.2 * kpi_norm['avg_delay_minutes']
)

# Best (lowest) composite score first.
kpi_norm = kpi_norm.sort_values('composite_score')

kpi_norm.head(10)

In [None]:
# Bar chart of the composite score per airline, in kpi_norm row order.
fig, ax = plt.subplots(figsize=(10, 6))
kpi_norm['composite_score'].plot(kind='bar', ax=ax)
ax.set(
    title="Composite Airline Performance Score (Lower is Better)",
    ylabel="Score",
    xlabel="Airline",
)
plt.show()

In [None]:
# Write both result tables to CSV (index included), then confirm completion.
for frame, out_path in (
    (airline_kpis, "/content/airline_kpi_comparison.csv"),
    (kpi_norm, "/content/airline_composite_scores.csv"),
):
    frame.to_csv(out_path)

print("Comparative airline analysis outputs saved.")
