In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# Notebook-wide plotting defaults: seaborn white-grid theme and a wide default canvas.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})


In [None]:
DATA_PATH = "/content/processed_flight_records.csv"

df = pd.read_csv(DATA_PATH)

# Identify columns by case-insensitive name fragments.
date_cols = [c for c in df.columns if 'date' in c.lower() or 'time' in c.lower()]
status_cols = [c for c in df.columns if 'status' in c.lower()]
delay_cols = [c for c in df.columns if 'delay' in c.lower()]

# A timestamp column is required by the cells below (temporal features are
# derived from it), so fail here with an actionable message instead of a bare
# IndexError from `date_cols[0]`.
if not date_cols:
    raise ValueError(
        f"No column whose name contains 'date' or 'time' was found in {DATA_PATH}; "
        f"available columns: {list(df.columns)}"
    )

# The first match of each kind is used; status and delay columns are optional.
date_col = date_cols[0]
status_col = status_cols[0] if status_cols else None
delay_col = delay_cols[0] if delay_cols else None

# Parse datetime
df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Calendar features taken from the parsed timestamp column.
timestamps = df[date_col].dt
df['hour'] = timestamps.hour
df['day_of_week'] = timestamps.dayofweek
df['month'] = timestamps.month

# Binary targets: 1 when the lower-cased status text contains the keyword,
# 0 otherwise. Without a status column both targets are constant 0.
target_keywords = {'is_delayed': 'delay', 'is_cancelled': 'cancel'}
if not status_col:
    for target_name in target_keywords:
        df[target_name] = 0
else:
    status_text = df[status_col].astype(str).str.lower()
    for target_name, keyword in target_keywords.items():
        df[target_name] = status_text.str.contains(keyword).astype(int)

In [None]:
# Encode on a copy so `df` keeps its original text values.
encoded_df = df.copy()
label_encoders = {}

# Every object/category column except the status column is replaced by integer
# codes; the fitted encoder is kept per column so codes can be mapped back.
text_columns = encoded_df.select_dtypes(include=['object', 'category']).columns
for name in text_columns:
    if name == status_col:
        continue
    encoder = LabelEncoder()
    # Values are cast to str first, so missing values become their own label.
    encoded_df[name] = encoder.fit_transform(encoded_df[name].astype(str))
    label_encoders[name] = encoder


In [None]:
# Pairwise correlation (pandas default method) over all numeric columns.
num_cols = encoded_df.select_dtypes(include=[np.number]).columns
corr_matrix = encoded_df.loc[:, num_cols].corr()

# Diverging colormap centered on zero so the sign of a correlation is visible.
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()


In [None]:
# Correlation of each numeric column with the delay flag, largest value first.
corr_with_delay = corr_matrix.loc[:, 'is_delayed'].sort_values(ascending=False)
corr_with_delay

In [None]:
# Feature matrix: every numeric column except the targets and the raw delay
# columns. Columns whose name contains "delay" record the outcome itself, so
# keeping them would rank a leaked copy of the target at the top.
leakage_cols = ['is_delayed', 'is_cancelled', *delay_cols]
X = encoded_df[num_cols].drop(columns=leakage_cols, errors='ignore')

# sklearn's mutual-information estimators reject NaN/inf inputs. Treat inf as
# missing, drop columns with no usable values, then fill the remaining gaps
# with the column median.
X = X.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how='all')
X = X.fillna(X.median())

y_class = encoded_df['is_delayed']

# Fixed random_state keeps the estimate reproducible across runs.
mi_class = mutual_info_classif(X, y_class, random_state=42)
mi_class_df = pd.DataFrame({
    'feature': X.columns,
    'mutual_information': mi_class
}).sort_values('mutual_information', ascending=False)

mi_class_df

In [None]:
# Defined up front so the closing expression is valid even without a delay column.
mi_reg_df = None

if delay_col:
    # Missing delay values are treated as "no delay".
    y_reg = encoded_df[delay_col].fillna(0)

    # The regression target must not appear among its own predictors, and
    # sklearn rejects NaN inputs: drop the target column and all-empty columns,
    # then fill remaining gaps with the column median.
    X_reg = X.drop(columns=[delay_col], errors='ignore').dropna(axis=1, how='all')
    X_reg = X_reg.fillna(X_reg.median())

    mi_reg = mutual_info_regression(X_reg, y_reg, random_state=42)

    mi_reg_df = pd.DataFrame({
        'feature': X_reg.columns,
        'mutual_information': mi_reg
    }).sort_values('mutual_information', ascending=False)

# A bare expression is rendered only when it is the cell's last top-level
# statement, so it sits outside the `if` (None renders nothing).
mi_reg_df

In [None]:
# Draw on an explicit Axes: DataFrame.plot opens a new figure of its own unless
# `ax` is passed, which would leave a preceding plt.figure() behind as an empty
# canvas and ignore the requested figure size.
fig, ax = plt.subplots(figsize=(10, 6))
mi_class_df.head(15).plot(
    kind='barh', x='feature', y='mutual_information', legend=False, ax=ax
)
ax.set_title("Top Features Associated with Delays")
ax.set_xlabel("Mutual Information")
# barh draws the first row at the bottom; invert so the top-ranked feature is on top.
ax.invert_yaxis()
plt.show()


In [None]:
# Tables that are always produced: the correlation ranking keeps its index
# (the feature names), the mutual-information table is written without one.
for table, destination, csv_options in (
    (corr_with_delay, "/content/correlation_with_delay.csv", {}),
    (mi_class_df, "/content/mutual_information_delay_classification.csv", {"index": False}),
):
    table.to_csv(destination, **csv_options)

# The regression ranking is written only when a delay column was detected.
if delay_col:
    mi_reg_df.to_csv("/content/mutual_information_delay_regression.csv", index=False)

print("Correlation and feature association outputs saved.")