In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults for every figure in this notebook:
# seaborn's "whitegrid" theme and a 12x6 inch default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Load the flight records and detect, by column-name keyword, which columns
# hold the timestamp, airline, status and delay values used by later cells.
DATA_PATH = "/content/processed_flight_records.csv"

df = pd.read_csv(DATA_PATH)

# Identify datetime column: the first column whose name mentions "date" or "time".
date_cols = [c for c in df.columns if 'date' in c.lower() or 'time' in c.lower()]
if not date_cols:
    # The hourly and monthly cells require a timestamp column, so fail here
    # with an actionable message instead of an opaque IndexError.
    raise ValueError(
        f"No date/time column found in {DATA_PATH}; "
        f"available columns: {list(df.columns)}"
    )
date_col = date_cols[0]
df[date_col] = pd.to_datetime(df[date_col])

# Identify airline, status, delay columns (each is optional; the first
# name match wins and None means "not found").
airline_cols = [c for c in df.columns if 'airline' in c.lower() or 'icao' in c.lower()]
status_cols = [c for c in df.columns if 'status' in c.lower()]
delay_cols = [c for c in df.columns if 'delay' in c.lower()]

airline_col = airline_cols[0] if airline_cols else None
status_col = status_cols[0] if status_cols else None
delay_col = delay_cols[0] if delay_cols else None

# Echo the detected columns so a wrong keyword match is easy to spot.
print("Using columns:")
print("Date:", date_col)
print("Airline:", airline_col)
print("Status:", status_col)
print("Delay:", delay_col)

In [None]:
# Build the boolean is_cancelled / is_delayed flags. They are always
# (re)computed here: from a case-insensitive keyword match on the status
# text when a status column was detected, otherwise as all-False.
indicator_keywords = {'is_cancelled': 'cancel', 'is_delayed': 'delay'}

if status_col:
    # Normalise once; missing values become the string "nan", which matches
    # neither keyword.
    status_text = df[status_col].astype(str).str.lower()
    for flag_name, keyword in indicator_keywords.items():
        df[flag_name] = status_text.str.contains(keyword)
else:
    for flag_name in indicator_keywords:
        df[flag_name] = False

In [None]:
# One-row overview table: raw counts plus the share of all flights
# that were delayed / cancelled, expressed in percent.
n_flights = len(df)
n_delayed = df['is_delayed'].sum()
n_cancelled = df['is_cancelled'].sum()

summary_rates = pd.DataFrame({
    'Total Flights': [n_flights],
    'Delayed Flights': [n_delayed],
    'Cancelled Flights': [n_cancelled],
}).assign(**{
    'Delay Rate (%)': lambda t: (t['Delayed Flights'] / t['Total Flights']) * 100,
    'Cancellation Rate (%)': lambda t: (t['Cancelled Flights'] / t['Total Flights']) * 100,
})

summary_rates

In [None]:
# Distribution of the detected delay column: a 50-bin histogram with a KDE
# overlay, followed by a boxplot. Skipped when no delay column was found.
if delay_col:
    delay_values = df[delay_col]

    hist_fig, hist_ax = plt.subplots()
    sns.histplot(delay_values.dropna(), bins=50, kde=True, ax=hist_ax)
    hist_ax.set_title("Distribution of Delay Duration")
    hist_ax.set_xlabel("Delay (minutes)")
    plt.show()

    box_fig, box_ax = plt.subplots()
    sns.boxplot(x=delay_values, ax=box_ax)
    box_ax.set_title("Delay Duration Boxplot")
    plt.show()

In [None]:
# Per-airline flight counts and delay / cancellation rates, a bar chart of
# the 15 airlines with the highest delay rate, and a preview of the table.
# Skipped entirely when no airline column was detected.
if airline_col:
    airline_stats = df.groupby(airline_col).agg(
        total_flights=('is_delayed', 'count'),
        delayed_flights=('is_delayed', 'sum'),
        cancelled_flights=('is_cancelled', 'sum')
    )

    # Rates are percentages of each airline's own flight count.
    airline_stats['delay_rate_%'] = airline_stats['delayed_flights'] / airline_stats['total_flights'] * 100
    airline_stats['cancellation_rate_%'] = airline_stats['cancelled_flights'] / airline_stats['total_flights'] * 100

    # Worst delay rate first, so head(n) yields the top-n offenders.
    airline_stats = airline_stats.sort_values('delay_rate_%', ascending=False)

    plt.figure(figsize=(12,6))
    airline_stats['delay_rate_%'].head(15).plot(kind='bar')
    plt.title("Top 15 Airlines by Delay Rate")
    plt.ylabel("Delay Rate (%)")
    plt.show()

    # A bare expression nested inside an `if` body is not auto-rendered by
    # Jupyter (only the cell's last top-level expression is), so the table
    # has to be displayed explicitly. `display` is provided by the IPython
    # kernel without an import.
    display(airline_stats.head(10))

In [None]:
# Share of delayed flights (in percent) for each hour of the day, taken
# from the detected datetime column, drawn as a line chart.
df['hour'] = df[date_col].dt.hour

hourly_groups = df.groupby('hour')
delay_by_hour = hourly_groups['is_delayed'].mean() * 100

hour_fig, hour_ax = plt.subplots()
delay_by_hour.plot(ax=hour_ax)
hour_ax.set_title("Delay Rate by Hour of Day")
hour_ax.set_xlabel("Hour")
hour_ax.set_ylabel("Delay Rate (%)")
plt.show()

In [None]:
# Monthly delay and cancellation rates (in percent), bucketed by the
# calendar month of the detected datetime column and drawn on shared axes.
df['month'] = df[date_col].dt.to_period('M')

by_month = df.groupby('month')
monthly_delay = by_month['is_delayed'].mean() * 100
monthly_cancel = by_month['is_cancelled'].mean() * 100

month_fig, month_ax = plt.subplots()
monthly_delay.plot(ax=month_ax, label='Delay Rate')
monthly_cancel.plot(ax=month_ax, label='Cancellation Rate')
month_ax.legend()
month_ax.set_title("Monthly Delay and Cancellation Rates")
month_ax.set_ylabel("Rate (%)")
plt.show()

In [None]:
# Persist the summary tables as CSV files.
# `airline_stats` is only built when an airline column was detected, so the
# per-airline export is guarded to avoid a NameError on data without one.
if airline_col:
    airline_stats.to_csv("/content/airline_delay_cancellation_summary.csv")
else:
    print("No airline column detected; skipping per-airline summary export.")

summary_rates.to_csv("/content/overall_delay_cancellation_rates.csv")

print("Delay and cancellation analysis outputs saved.")