In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults. The seaborn theme is applied first and the
# default figure size is set afterwards, in the same order as before.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Location of the processed flight records CSV.
DATA_PATH = "/content/processed_flight_records.csv"

# Load the table as-is. `parse_dates=True` was dropped on purpose: without an
# explicit `index_col` that flag only targets the index, so it never converted
# any column. The date column is detected by name and converted explicitly
# with `pd.to_datetime` further down.
df = pd.read_csv(DATA_PATH)

# Quick sanity check: dimensions plus the first rows.
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Attempt to identify primary date column: every column whose name contains
# "date" or "time" (case-insensitive) is a candidate.
date_cols = [c for c in df.columns if 'date' in c.lower() or 'time' in c.lower()]
print("Detected date/time columns:", date_cols)

# Fail with an actionable message instead of a bare IndexError when the
# name-based heuristic finds nothing.
if not date_cols:
    raise ValueError(
        "No column name contains 'date' or 'time'; set date_col manually. "
        "Available columns: {}".format(list(df.columns))
    )

# Choose first datetime column as reference (adjust if needed)
date_col = date_cols[0]
df[date_col] = pd.to_datetime(df[date_col])

# Derived calendar fields taken from the reference column.
df['year'] = df[date_col].dt.year
df['month'] = df[date_col].dt.to_period('M')  # monthly Period values
df['day'] = df[date_col].dt.date

In [None]:
# Number of rows (flights) in each calendar month.
flights_over_time = df.groupby('month').size()

# Line chart of the monthly counts, drawn on an explicitly created axes.
fig, ax = plt.subplots()
flights_over_time.plot(ax=ax)
ax.set_title("Total Flights per Month")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Flights")
plt.show()


In [None]:
# Attempt to identify airline column: every column whose name contains
# "airline" or "icao" (case-insensitive) is a candidate.
airline_cols = [c for c in df.columns if 'airline' in c.lower() or 'icao' in c.lower()]
print("Detected airline columns:", airline_cols)

# Fail with an actionable message instead of a bare IndexError when the
# name-based heuristic finds nothing.
if not airline_cols:
    raise ValueError(
        "No column name contains 'airline' or 'icao'; set airline_col manually. "
        "Available columns: {}".format(list(df.columns))
    )

# First match wins.
# NOTE(review): "icao" may also match airport-code columns; check the printed
# list above and override airline_col if the wrong column was picked.
airline_col = airline_cols[0]

# Flights per airline; value_counts returns the most frequent value first.
flights_by_airline = df[airline_col].value_counts()

plt.figure(figsize=(12,6))
flights_by_airline.head(15).plot(kind='bar')
plt.title("Top 15 Airlines by Number of Flights")
plt.xlabel("Airline")
plt.ylabel("Flights")
plt.show()

# Last top-level expression of the cell, so Jupyter renders it as output.
flights_by_airline.head(15)

In [None]:
# Candidate airport columns: names containing "origin", "destination" or
# "airport" (case-insensitive).
airport_cols = [c for c in df.columns if 'origin' in c.lower() or 'destination' in c.lower() or 'airport' in c.lower()]
print("Detected airport columns:", airport_cols)

# The whole section is skipped when no airport-like column exists.
if len(airport_cols) >= 1:
    # Only the first detected column is analysed.
    airport_col = airport_cols[0]
    flights_by_airport = df[airport_col].value_counts()

    plt.figure(figsize=(12,6))
    flights_by_airport.head(15).plot(kind='bar')
    plt.title("Top 15 Airports by Flight Count")
    plt.xlabel("Airport")
    plt.ylabel("Flights")
    plt.show()

    # A bare expression nested inside an `if` body is never rendered as cell
    # output, so the table is shown explicitly. `display` is provided by the
    # IPython/Jupyter kernel without an import.
    display(flights_by_airport.head(15))

In [None]:
# Candidate status columns: names containing "status", "cancel" or "delay"
# (case-insensitive).
status_cols = [c for c in df.columns if 'status' in c.lower() or 'cancel' in c.lower() or 'delay' in c.lower()]
print("Detected status-related columns:", status_cols)

# The whole section is skipped when no status-like column exists.
if len(status_cols) > 0:
    # Only the first detected column is analysed.
    # NOTE(review): a "delay" match could be a numeric column, for which a
    # value distribution bar chart is of limited use; confirm the choice.
    status_col = status_cols[0]

    # Share of each distinct value, expressed as a percentage of non-null rows.
    status_counts = df[status_col].value_counts(normalize=True) * 100

    plt.figure()
    status_counts.plot(kind='bar')
    plt.title("Flight Status Distribution (%)")
    plt.xlabel("Status")
    plt.ylabel("Percentage")
    plt.show()

    # A bare expression nested inside an `if` body is never rendered as cell
    # output, so the table is shown explicitly. `display` is provided by the
    # IPython/Jupyter kernel without an import.
    display(status_counts)

In [None]:
# Airline-by-status breakdown. The guard mirrors the one under which
# status_col is assigned, so status_col is only read when it exists.
if len(status_cols) > 0:
    # normalize='index' scales each airline's row to sum to 1; * 100 turns
    # the shares into percentages.
    crosstab_airline_status = pd.crosstab(df[airline_col], df[status_col], normalize='index') * 100
    # A bare expression nested inside an `if` body is never rendered as cell
    # output, so the table is shown explicitly. `display` is provided by the
    # IPython/Jupyter kernel without an import.
    display(crosstab_airline_status.head(10))

In [None]:
# One row per airline with its flight count, ordered from the airline with
# the most flights to the one with the fewest.
summary_table = (
    df.groupby(airline_col)
    .size()
    .to_frame('Total Flights')
    .sort_values(by='Total Flights', ascending=False)
)

# Preview the ten busiest airlines.
summary_table.head(10)

In [None]:
# Write the per-airline summary (index included) to CSV, then confirm on stdout.
summary_table.to_csv(path_or_buf="/content/airline_flight_summary.csv")
print("Descriptive analytics outputs saved.")
