****
# Import modules
****

In [1]:
import os
import pandas as pd
import numpy as np

import statsmodels.formula.api as smf

****
# Load Data
****

In [2]:
# Directory that contains the parquet shards read below.
file_path = "C:\\Users\\kruu\\store\\data_EDDM\\"

# The shards are numbered 0..9 (file names end in "_of_9").
# Read every shard first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration.
data_EDDM = pd.concat(
    [
        pd.read_parquet(
            os.path.join(file_path, f"landing_df_EDDM_with_meteo_{i}_of_9.parquet")
        )
        for i in range(10)
    ]
)

In [None]:
# For each weather column, report whether it contains at least one missing value.
meteo_columns = ["avg_wind_dir", "avg_wind_speed", "avg_vis", "avg_temp", "avg_press"]
data_EDDM[meteo_columns].isna().any()

In [4]:
# Redefine Rush hours as they were in UTC

def is_rush_hour(date):
    """Return True when the time of day of *date* lies inside a rush-hour window.

    The windows (both ends inclusive, compared at minute resolution) are
    05:00-06:00, 07:00-09:00, 11:00-14:00 and 16:00-20:00. They are based on
    the hourly count bar plot:
    data_EDDM_reduced.groupby("hour").id.count().plot(kind="bar")

    Args:
        date: Object exposing ``hour`` and ``minute`` attributes.

    Returns:
        bool: True inside any of the windows, False otherwise.
    """
    minutes_since_midnight = date.hour * 60 + date.minute

    # (first hour, last hour) of each rush-hour window.
    rush_windows = ((5, 6), (7, 9), (11, 14), (16, 20))

    return any(
        first_hour * 60 <= minutes_since_midnight <= last_hour * 60
        for first_hour, last_hour in rush_windows
    )

# Flag every row whose "start" value falls inside a rush-hour window.
start_times = data_EDDM["start"]
data_EDDM["rush_hour"] = start_times.apply(is_rush_hour)

In [5]:
# Ratio of `distance` to `nominal_distance` (1.0 means the two are equal).
data_EDDM["nominal_distance_prop"] = data_EDDM["distance"].div(
    data_EDDM["nominal_distance"]
)

****
# Linear Regression (LR) Model
****

In [None]:
# Horizontal boxplot of the distance ratio, one box per `star` group.
# Whiskers span the 2.5th to the 97.5th percentile.
boxplot = data_EDDM.boxplot(
    column=["nominal_distance_prop"],
    by=["star"],
    figsize=(16, 9),
    vert=False,
    showmeans=False,
    notch=False,
    whis=(2.5, 97.5),
)

# Reference line at a ratio of 1.
boxplot.axvline(x=1, color='darkorange', linestyle=':', linewidth=2)

# NOTE(review): these labels are applied by position, not matched against the
# `star` values, and the original author flagged that the order may be wrong.
# Confirm against the group order of the plot.
custom_labels = ['BETO1A', 'LAND1B', 'NAPS1B', 'ROKI1A']  # MAYBE IN WRONG ORDER
boxplot.set_yticklabels(custom_labels, fontsize=22, rotation=45)

# Blank out the figure title, the axes title and the y-axis label.
boxplot.get_figure().suptitle('')
boxplot.set_title('')
boxplot.set_ylabel("")

# Dashed grid drawn behind the boxes, no frame, no x tick marks.
boxplot.set_axisbelow(True)
boxplot.grid(True, linestyle='--', alpha=0.7)
for side in ('top', 'right', 'left', 'bottom'):
    boxplot.spines[side].set_visible(False)
boxplot.tick_params(axis='x', which='both', length=0, labelsize=20)

In [None]:
# Ordinary least squares regression of the distance ratio on the categorical
# factors (wrapped in C(...)) and the numeric weather columns.
# `avg_vis` was listed twice in the original formula; the redundant second
# occurrence is dropped here and the remaining terms keep their original order.
# NOTE(review): `avg_wind_dir` is not part of the model. If it was meant to
# take the place of the duplicated `avg_vis`, add it explicitly.
model_lm = smf.ols(
    "nominal_distance_prop ~ C(star) + C(weekday) + avg_vis + avg_wind_speed"
    " + C(season) + avg_press"
    " + C(body_type, Treatment(reference='Narrowbody')) + C(rush_hour) + avg_temp",
    data_EDDM,
)

res_lm = model_lm.fit()
res_lm.summary()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Fitted coefficients and their p-values, taken from the regression result.
coefficients = res_lm.params
p_values = res_lm.pvalues

# Human-readable labels for the plot. They are matched to the coefficients by
# position, so they must stay in the same order as coefficients.index
# (the raw names are available there).
features = np.array([
    "Intercept",
    "STAR: LAND1B",
    "STAR: NAPS1B",
    "STAR: ROKI1A",  # was "ROKIT1A"; aligned with the spelling used for the boxplot labels
    "WEEKDAY: Tuesday",
    "WEEKDAY: Wednesday",
    "WEEKDAY: Thursday",
    "WEEKDAY: Friday",
    "WEEKDAY: Saturday",
    "WEEKDAY: Sunday",
    "SEASON: Spring",
    "SEASON: Summer",
    "SEASON: Winter",
    "BODY TYPE: Regional jet",
    "RUSH HOUR: True",
    "Visibility",
    "Wind speed",
    "Pressure",
    "Temperature",
])

# Fail early if the hand-written labels no longer line up with the model;
# otherwise the bars would be mislabelled without any warning.
if len(features) != len(coefficients):
    raise ValueError(
        f"{len(features)} feature labels for {len(coefficients)} coefficients: "
        f"update `features` to match {list(coefficients.index)}"
    )

# Signed coefficient values (not absolute values), used as the bar lengths.
importance = coefficients.values

# Positions that sort the coefficients in ascending order.
indices = np.argsort(importance)

bar_color = 'steelblue'

def significance_stars(p_value):
    """Map a p-value to its significance marker.

    Args:
        p_value: The p-value to classify.

    Returns:
        str: '***' for p <= 0.01, '**' for p <= 0.05, '*' for p <= 0.10,
        and an empty string otherwise.
    """
    # Thresholds are checked from strictest to loosest; the first match wins.
    for threshold, stars in ((0.01, '***'), (0.05, '**'), (0.10, '*')):
        if p_value <= threshold:
            return stars
    return ''

# Horizontal bar chart of the regression coefficients in ascending order.
# Coefficients with p < 0.05 get a value label with significance stars;
# coefficients with p > 0.05 get a red tick label.

# Sort once, as plain arrays. `p_values` is a pandas Series indexed by name,
# and indexing it with integer positions (`p_values[indices][i]`) relies on a
# positional fallback that pandas has deprecated.
sorted_importance = importance[indices]
sorted_p_values = np.asarray(p_values)[indices]
bar_positions = range(len(importance))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, sorted_importance, align='center', color=bar_color)

for i, (coef, p_val) in enumerate(zip(sorted_importance, sorted_p_values)):
    if p_val < 0.05:
        label = f'{np.round(coef, 4)}{significance_stars(p_val)}'
        # Positive bars: label just past the bar end.
        # Other bars: label just right of zero.
        x_pos = coef * 1.01 if coef > 0 else 0.001
        plt.text(x_pos, i, label, va='center', fontsize=10)

# plt.yticks returns (locations, labels); the labels are recoloured below.
y_labels = plt.yticks(bar_positions, features[indices], fontsize=11)
for tick_label, p_val in zip(y_labels[1], sorted_p_values):
    if p_val > 0.05:
        tick_label.set_color('firebrick')

ax = plt.gca()
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
ax.set_axisbelow(True)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(True)
ax.tick_params(axis='y', which='both', length=0)

plt.xlabel('Coefficient Magnitude', fontsize=14, labelpad=10)
plt.title('Feature Importance for EDDM with Significance Levels', fontsize=16, pad=15)
plt.tight_layout()
plt.show()