In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp
from typing import Tuple
from scipy.stats import t
from flight_delay_utils import *

In [7]:
from IPython.display import display, HTML

# Widen the notebook container. A CSS length needs a unit: a bare `80` is an
# invalid declaration and is ignored, so the width is given as a percentage.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [8]:
# Load the on-time reporting CSV into `df`; the path is relative to the
# notebook's working directory (one level up).
df = pd.read_csv("../T_ONTIME_REPORTING.csv")

In [9]:
# Show the column labels available in the loaded frame.
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
       'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN',
       'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15',
       'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME',
       'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE'],
      dtype='object')

In [10]:
# Distinct airport codes seen on the arrival and departure side of the data.
destinations = df["DEST"].unique()
origins = df["ORIGIN"].unique()

# Per-route delay summaries computed by helpers that are not defined in this
# notebook (presumably provided by the `flight_delay_utils` star import).
# NOTE(review): the saved output of this cell is a NameError for `get_delays`;
# confirm the helper module actually exports it.
mean_arr, std_arr = get_delays(df, destinations, origins)
mean_dep, _ = get_delays(df, destinations, origins, delay_type="DEP_DELAY")
mean_flight = mean_dep - mean_arr
std_delay_diff = get_delay_statistics(df, destinations, origins)

# Flight counts per route: rows are ORIGIN, columns are DEST, and routes with
# no recorded flights are filled with 0.
sample_size = (
    df.groupby(["ORIGIN", "DEST"])
    .size()
    .unstack(fill_value=0)
)

NameError: name 'get_delays' is not defined

In [None]:
# Only the last bare expression of a cell is rendered, so a bare `mean_arr`
# line followed by `std_arr` shows just the latter. Display both explicitly.
display(mean_arr, std_arr)

In [None]:
# Build, per airport, the list of delay changes observed while an aircraft is
# on the ground there. Consecutive legs of the same aircraft (same TAIL_NUM)
# are matched when the previous leg's DEST equals the next leg's ORIGIN.
tail_numbers = df.TAIL_NUM.unique()
keys = ["DEST", "ORIGIN", "ARR_TIME", "DAY_OF_MONTH"]  # NOTE(review): not used in this cell

airport_names = df.DEST.unique()
airport_data = {name: [] for name in airport_names}

print(len(tail_numbers))

leg_columns = ["ORIGIN", "DEST", "DEP_DELAY", "ARR_DELAY"]

# groupby(sort=False) visits tail numbers / months / days in order of first
# appearance (the same order `.unique()` yields) and skips missing keys, which
# a `== NaN` filter could never match anyway. It avoids re-scanning the whole
# frame once per tail number.
for tail_number, all_flights in df.groupby("TAIL_NUM", sort=False):
    # No previous leg yet for this aircraft. `None` never equals an airport
    # code, so the first leg is only recorded as "previous".
    prev_dest = None
    prev_arr_delay = None
    for month, month_flights in all_flights.groupby("MONTH", sort=False):
        for day, day_flights in month_flights.groupby("DAY_OF_MONTH", sort=False):
            # NOTE(review): legs are ordered by ARR_TIME within a day; confirm
            # this is the intended ordering for arrivals that cross midnight.
            ordered = day_flights.sort_values(by="ARR_TIME")
            for origin_code, dest_code, dep_delay, arr_delay in ordered[leg_columns].itertuples(
                index=False, name=None
            ):
                if prev_dest == origin_code:
                    # Delay added on the ground at `origin_code`: departure
                    # delay of this leg minus the arrival delay of the leg that
                    # brought the aircraft in. (This leg's own ARR_DELAY is
                    # measured at the *next* airport, so it is not used here.)
                    airport_data[origin_code].append(dep_delay - prev_arr_delay)
                prev_dest, prev_arr_delay = dest_code, arr_delay

In [None]:



def _make_t_sampler(mean, std, n):
    """Return a callable ``f(size)`` drawing Student-t samples.

    The draw uses ``n - 1`` degrees of freedom, location ``mean`` and scale
    ``std``. The parameters are bound as keyword defaults so every sampler
    keeps the values it was created with.
    """
    return lambda size, mean=mean, std=std, n=n: t.rvs(n - 1, mean, std, size=size)


# Frames with the same labels as `mean_flight` whose cells hold sampler
# callables instead of numbers. They are cast to object dtype first: writing a
# function into a float column relies on an implicit upcast that pandas has
# deprecated.
flights_df = mean_flight.astype(object)
first_df = mean_flight.astype(object)

# Fill every cell with a sampler built from that cell's mean / std / count.
# Row labels come from the index and column labels from the columns, matching
# the `.at[dest, origin]` lookups used on these frames.
# NOTE(review): `sample_size` is built with ORIGIN as rows and DEST as columns;
# confirm its orientation matches the frames returned by `get_delays`,
# otherwise `n` below is the count for the reverse route.
for dest in flights_df.index:
    for origin in flights_df.columns:
        n = sample_size.at[dest, origin]
        flights_df.at[dest, origin] = _make_t_sampler(
            mean_flight.at[dest, origin], std_delay_diff.at[dest, origin], n
        )
        first_df.at[dest, origin] = _make_t_sampler(
            mean_arr.at[dest, origin], std_arr.at[dest, origin], n
        )

In [None]:
# Spot check: call the sampler stored in the ("BOS", "JFK") cell for one draw.
first_df.at["BOS", "JFK"](1)
# t.rvs(0, mean, n, size=1)

# Disabled scratch loop: draw once from every cell of first_df and print it.
# for origin in origins:
#     for dest in destinations:
#         ans = first_df.at[dest, origin](1)
#         print(ans)
#         if ans is not np.NaN:
#             print(ans)

In [None]:
# Pool every airport's samples to get the overall mean (NaN entries ignored).
ground = []
for airport in airport_data.keys():
    ground.extend(airport_data[airport])
arr = np.array(ground)
ground_mean = np.mean(arr[~np.isnan(arr)])


# Per-airport summaries:
#   means[airport]     - the airport's own mean when a one-sample t-test against
#                        `ground_mean` gives p < 0.05, otherwise `ground_mean`
#   std[airport]       - standard deviation of the airport's samples
#   functions[airport] - callable f(size) sampling from a Student-t fitted to
#                        the airport's samples
# Airports with no usable samples only get a `means` entry.
means = {}
std = {}
functions = {}
for airport, samples in airport_data.items():
    arr = np.array(samples)
    clean_arr = arr[~np.isnan(arr)]
    if clean_arr.size > 0:
        _, p_value = ttest_1samp(clean_arr, ground_mean)
        if p_value < 0.05:
            means[airport] = np.mean(clean_arr)
        else:
            means[airport] = ground_mean
        std[airport] = np.std(clean_arr)
        # Do NOT unpack into a variable called `df`: that would overwrite the
        # notebook's flight DataFrame and break every later cell that reads it.
        dof, loc, scale = t.fit(clean_arr)
        # Bind the fitted parameters as defaults. A plain closure would look the
        # names up at call time, so every airport would sample from the
        # parameters of the last airport fitted.
        functions[airport] = lambda size, dof=dof, loc=loc, scale=scale: t.rvs(dof, loc, scale, size=size)
        # print(f"Airport = {airport}, delay = {functions[airport](1)}")
    else:
        means[airport] = ground_mean

# means_series = pd.Series(means)
# # Sorting the series in descending order by value
# sorted_means = means_series.sort_values(ascending=False)

# # Selecting the top 20 entries
# top_20_airports = sorted_means.tail(20)

# # Plotting
# plt.figure(figsize=(12, 6))  # Larger figure size for clarity
# top_20_airports.plot(kind='bar', color='teal')  # Using a different color for distinction
# plt.title('Top 20 Airports by Average Value')
# plt.xlabel('Airport')
# plt.ylabel('Added Delay (Minutes)')
# plt.xticks(rotation=45)  # Rotate the labels to fit them nicely
# plt.tight_layout()  # Adjust subplots
# plt.show()

In [None]:
def _plot_delay_hist(ax, samples):
    """Draw a 50-bin histogram of `samples` on `ax` with a dashed line at the mean."""
    mean_delay = np.mean(samples)
    ax.hist(samples, bins=50, alpha=0.5, color='blue', edgecolor='navy')
    ax.axvline(mean_delay, color='red', linestyle='dashed', linewidth=3,
               label=f"mean: {np.round(mean_delay)}")
    ax.set_ylabel('Frequency')
    ax.legend()


path = ["BOS", "JFK", "MIA"]

# Simulated delay samples for the path: once with an explicit initial state of
# 0 and once with the helper's default initial state.
initial_state = 0
n_samples = int(3e3)
delay = distribution_delay(path, flights_df, first_df, functions, n_samples, initial_state = initial_state)
delay2 = distribution_delay(path, flights_df, first_df, functions, n_samples)
expected_delay = calculate_expected_delay(path, means, mean_arr, mean_flight)

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 15

# Human-readable route, e.g. "BOS -> JFK -> MIA"; used as the figure title.
path_label = ' -> '.join(path)

# Two stacked histograms sharing the x axis, each with its mean marked.
fig, (ax0, ax1) = plt.subplots(2, 1, sharex = True, figsize=(6, 9))
_plot_delay_hist(ax0, delay)
ax0.set_title(path_label)
_plot_delay_hist(ax1, delay2)
ax1.set_xlabel('Delay (minutes)')
# tight_layout must be *called*; a bare attribute reference does nothing.
fig.tight_layout()
plt.show()

In [None]:
# Draw one sample from the sampler stored for every (dest, origin) pair and
# print it, destination by destination.
for dest in destinations:
    for origin in origins:
        sampler = first_df.at[dest, origin]
        print(sampler(1))


In [None]:
# Flatten every airport's sample list into one pool, then report the mean of
# the non-NaN entries.
ground = [value for samples in airport_data.values() for value in samples]
arr = np.array(ground)
is_valid = ~np.isnan(arr)
ground_mean = np.mean(arr[is_valid])
print(ground_mean)

In [None]:
# Per-flight change in delay between departure and arrival, rows with a
# missing value dropped.
delay_change = df["ARR_DELAY"] - df["DEP_DELAY"]
flight_delays = delay_change.dropna()

# One-sample t-test of that change against the negated pooled ground mean;
# the printed figure is 100 * (1 - p).
_, p_value = ttest_1samp(flight_delays, -ground_mean)
print(f"{100 * (1 - p_value)}")

In [None]:
destinations = df.DEST.unique()
origins = df.ORIGIN.unique()

# Compute the (M, P) pair returned by `get_m_p` separately for each day of the
# week, keyed by a three-letter day name.
days = df.DAY_OF_WEEK.unique()
M_days = {}
P_days = {}
day_name = {1: "MON",
            2: "TUE",
            3: "WED",
            4: "THU",
            5: "FRI",
            6: "SAT",
            7: "SUN"
           }
for day_number in days:
    # Filter on the numeric code stored in DAY_OF_WEEK. Comparing the column to
    # the name string would never match and would hand get_m_p an empty frame.
    day = day_name.get(day_number, str(day_number))  # fall back to the raw code if unmapped
    M, P = get_m_p(df.loc[df.DAY_OF_WEEK == day_number], destinations, origins)
    M_days[day] = M
    P_days[day] = P
print(M_days)
print(P_days)

In [None]:
# Singular value decompositions of M and P (whatever the last cell that
# assigned them left in scope).
U_M, Sigma_M, VT_M = np.linalg.svd(M)
U_P, Sigma_P, VT_P = np.linalg.svd(P)

# Leading rank-1 component of P: sigma_0 * u_0 v_0^T. `*` and `@` share
# precedence and associate left to right, so `u * s @ v` on 1-D vectors is a
# scalar inner product, not this outer product.
# NOTE(review): assumes the rank-1 approximation is what was wanted here.
Sigma_P[0] * np.outer(U_P[:, 0], VT_P[0, :])

In [None]:
# Refresh the airport code arrays from the flight table.
destinations = df["DEST"].unique()
origins = df["ORIGIN"].unique()

# Display whatever `get_influences` returns for the current M.
get_influences(M)

In [None]:
# Inspect the left singular vectors obtained from the SVD of P.
U_P

In [None]:
tail_numbers = df.TAIL_NUM.unique()

# Columns describing when and where each leg was scheduled.
schedule_columns = ["MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK", "ORIGIN", "DEST", "CRS_ARR_TIME"]

# Select the schedule columns for each aircraft in turn. The brackets are now
# balanced and the loop variable is actually used in the filter.
# NOTE(review): `flights` is overwritten on every iteration and nothing else
# happens in the loop, so only the last aircraft's rows survive.
for number in tail_numbers:
    flights = df.loc[df.loc[:, "TAIL_NUM"] == number, schedule_columns]

# Schedule rows for one specific aircraft.
# NOTE(review): `tail_1` is not defined in any visible cell; confirm where it
# comes from (e.g. `tail_1 = tail_numbers[0]`).
df.loc[df.loc[:, "TAIL_NUM"] == tail_1, schedule_columns]