## Setup and load data

In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, to_timestamp, to_date, when
from prophet.make_holidays import make_holidays_df
from xgboost.spark import SparkXGBClassifier

In [0]:
# --- Storage locations ---
# Shared course data and this team's working area on DBFS.
data_BASE_DIR = "dbfs:/mnt/mids-w261/datasets_final_project_2022"
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# Dataset-size suffix; one of the following values ("", "_3m", "_6m", "_1y").
period = ""

# Spark checkpoints are written under the team directory.
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/checkpoints")

# --- Source DataFrames ---
# NOTE(review): the weather path has an underscore on both sides of `period`,
# so "_3m" resolves to "weather__3m_checkpoint" and "" to "weather__checkpoint"
# -- confirm this matches the name the checkpoint was saved under.
flights = spark.read.parquet(f"{data_BASE_DIR}/parquet_airlines_data{period}")
weather = spark.read.parquet(f"{team_BASE_DIR}/interim/weather_{period}_checkpoint")

## Get dataset sizes

In [0]:
# Dimensions of the flights DataFrame: the row count triggers a Spark job,
# the column count is read from the schema.
num_rows, num_columns = flights.count(), len(flights.columns)

print(f"The flights DataFrame has {num_rows} rows and {num_columns} columns.")

In [0]:
# Same size report, restricted to rows with YEAR before 2020
# (reported below as the "5 year" subset).
num_rows = flights.where(flights["YEAR"] < 2020).count()
num_columns = len(flights.columns)

print(f"The 5 year flights DataFrame has {num_rows} rows and {num_columns} columns.")

In [0]:
# Dimensions of the weather DataFrame: the row count triggers a Spark job,
# the column count is read from the schema.
num_rows, num_columns = weather.count(), len(weather.columns)

print(f"The weather DataFrame has {num_rows} rows and {num_columns} columns.")

In [0]:
# Same size report, restricted to rows with YEAR before 2020
# (reported below as the "5 year" subset).
num_rows = weather.where(weather["YEAR"] < 2020).count()
num_columns = len(weather.columns)

print(f"The 5 year weather DataFrame has {num_rows} rows and {num_columns} columns.")

In [0]:
# Drop rows that are exact duplicates across all columns. `flights` is rebound,
# so every later cell works on the de-duplicated frame; the size reports in the
# cells above were computed before this step.
flights = flights.dropDuplicates()

In [0]:
# Columns needed to describe flight outcomes: the five delay-reason columns,
# the departure delay, the cancellation flag and the year.
keep_me = [
    'CARRIER_DELAY',
    'WEATHER_DELAY',
    'NAS_DELAY',
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'DEP_DELAY',
    'CANCELLED',
    'YEAR',
]

# Collect the selected columns to the driver as a pandas frame, then derive the
# boolean flags:
#   is_delayed   - DEP_DELAY of at least 15
#   is_cancelled - CANCELLED greater than 0
#   outcome      - delayed or cancelled
outcome_info = (
    flights.select(keep_me)
    .toPandas()
    .assign(
        is_delayed=lambda frame: frame['DEP_DELAY'] >= 15,
        is_cancelled=lambda frame: frame['CANCELLED'] > 0,
        outcome=lambda frame: frame['is_delayed'] | frame['is_cancelled'],
    )
)

In [0]:
def _plot_two_part_bar(shares, title):
    """Draw one horizontal bar split into two stacked segments and show it.

    Parameters
    ----------
    shares : sequence of two floats
        Width of the black 'Delayed' segment, followed by the width of the
        gray 'Not Delayed' segment that is stacked to its right.
    title : str
        Title placed above the bar.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 2))
    ax.barh([''], shares[0], color='black', height=0.25, edgecolor='black', label='Delayed')
    ax.barh([''], shares[1], color='gray', height=0.25, edgecolor='black', left=shares[0], label='Not Delayed')
    ax.set_xlabel('Proportion')
    ax.set_title(title)
    ax.set_xlim((0, 1))
    # Legend goes below the axes so it does not cover the bar.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
    plt.show()


# Share of all flights whose `outcome` flag is set, and its complement.
data = [np.mean(outcome_info['outcome']), np.mean(np.logical_not(outcome_info['outcome']))]
_plot_two_part_bar(data, 'Proportion of delayed flights')

# Delay-reason columns: every *_DELAY column except the overall delay measures.
reason_cols = [c for c in outcome_info.columns if "_DELAY" in c and c not in ["DIV_ARR_DELAY","DEP_DELAY"]]

# Keep only flights flagged by `outcome`; a missing reason value counts as 0.
delayed = outcome_info[outcome_info['outcome']][reason_cols].fillna(0)

# One bar per reason: share of flagged flights with a positive value for it.
for v in delayed.columns:
    # Drop the trailing "DELAY" token so the title reads e.g. "delayed by
    # late aircraft" rather than "delayed by late aircraft delay".
    reason = ' '.join(v.split('_')[:-1]).lower()
    _plot_two_part_bar(
        [np.mean(delayed[v] > 0), np.mean(delayed[v] <= 0)],
        f'Proportion of delayed flights delayed by {reason}',
    )

# Share of the total delay amount attributed to each reason, largest first.
# `data` and `delayed` are reused by the stacked-bar cell that follows.
data = np.sum(delayed, axis=0)
data = data.sort_values(ascending=False) / np.sum(data)
data

In [0]:
# Single stacked horizontal bar: one segment per delay category, in the order
# of `data` (largest share first), each starting where the previous one ended.
fig, ax = plt.subplots(1, 1, figsize=(10, 2))

left_edge = 0.0  # running sum of the shares already drawn
for category in data.index:
    share = data[category]
    ax.barh([''], share, edgecolor='black', label=category.replace("_", " "), left=left_edge)
    left_edge += share

ax.set_xlabel('Proportion')
ax.set_title('Proportion of total delay minutes by DoT delay categories')
ax.set_xlim((0, 1))
# Legend goes below the axes, one column per category.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=len(data))
plt.show()

## Working with joined data

In [0]:
# Load the v1 joined parquet for the selected `period` from the team's interim
# join checkpoints (presumably the flights/weather join output, judging by the
# file name -- confirm against the notebook that wrote it).
df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather{period}_v1.parquet")

In [0]:
# Dimensions of the joined DataFrame: the row count triggers a Spark job,
# the column count is read from the schema.
num_rows, num_columns = df.count(), len(df.columns)

print(f"The joined DataFrame has {num_rows} rows and {num_columns} columns.")

In [0]:
# Same size report, restricted to rows with YEAR before 2020
# (reported below as the "5 year" subset).
num_rows = df.where(df["YEAR"] < 2020).count()
num_columns = len(df.columns)

print(f"The 5 year joined DataFrame has {num_rows} rows and {num_columns} columns.")

In [0]:
# Add two derived columns to the joined data:
#   dep_date_utc - date part of sched_depart_utc
#   outcome      - 1.0 when DEP_DELAY >= 15 or CANCELLED == 1, otherwise 0.0
df = (
    df
    .withColumn("dep_date_utc", to_date(col("sched_depart_utc")))
    .withColumn(
        "outcome",
        when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1)
        .otherwise(0)
        .cast("double"),
    )
)

In [0]:
from pyspark.sql.functions import col, avg, count, to_date, when

# Per-day summary statistics, one row per dep_date_utc:
# mean DEP_DELAY, mean outcome flag, and number of rows.
daily_metrics = [
    avg(col("DEP_DELAY")).alias("avg_dep_delay"),
    avg(col("outcome")).alias("avg_outcome"),
    count("*").alias("count"),
]
avg_delays_df = df.groupBy("dep_date_utc").agg(*daily_metrics)

# Bring the aggregated result to the driver as a pandas frame for plotting.
avg_delays_pd = avg_delays_df.toPandas()

In [0]:
# Order the daily aggregates by dep_date_utc (i.e. chronologically) so the
# line plots that use this frame are drawn in date order.
avg_delays_pd_sorted = avg_delays_pd.sort_values(by='dep_date_utc')

In [0]:
import matplotlib.pyplot as plt

# Daily average departure delay over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_delays_pd_sorted['dep_date_utc'], avg_delays_pd_sorted['avg_dep_delay'], linestyle='-')
ax.set(
    xlabel='Departure Date (UTC)',
    ylabel='Average Departure Delay',
    title='Average Departure Delay vs Departure Date (UTC)',
)
ax.grid(True)
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.tight_layout()
plt.show()

In [0]:
import matplotlib.pyplot as plt

# Daily mean of the outcome flag (share of flights delayed or cancelled) over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_delays_pd_sorted['dep_date_utc'], avg_delays_pd_sorted['avg_outcome'], linestyle='-')
ax.set(
    xlabel='Departure Date (UTC)',
    ylabel='Average Proportion of Flights Delayed or Cancelled',
    title='Proportion of Flights Disrupted vs Departure Date (UTC)',
)
ax.grid(True)
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.tight_layout()
plt.show()

In [0]:
import matplotlib.pyplot as plt

# Number of flights per day over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_delays_pd_sorted['dep_date_utc'], avg_delays_pd_sorted['count'], linestyle='-')
ax.set(
    xlabel='Departure Date (UTC)',
    ylabel='Number of Flights',
    title='Daily Flight Count vs Departure Date (UTC)',
)
ax.grid(True)
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.tight_layout()
plt.show()

In [0]:
# Three stacked panels sharing the same date axis values:
# daily flight count, daily share disrupted, daily mean departure delay.
lw = 0.8
font_size = 12

fig, ax = plt.subplots(3, 1, figsize=(10, 6))

# (column to plot, y-axis label) for each panel, top to bottom.
panels = [
    ('count', '# Flights'),
    ('avg_outcome', 'Daily Average\nProportion of\nFlights Disrupted'),
    ('avg_dep_delay', 'Daily Average\nFlight Delay\n(minutes)'),
]
dates = avg_delays_pd_sorted['dep_date_utc']

for axis, (column, y_label) in zip(ax, panels):
    axis.plot(dates, avg_delays_pd_sorted[column], linestyle='-', linewidth=lw)
    axis.set_ylabel(y_label, fontsize=font_size)
    axis.grid(True)
    axis.tick_params(axis='both', which='major', labelsize=font_size)

ax[0].set_title('Quantifying Flight Disruption')

# Only the bottom panel carries an x label and tick labels.
for axis in ax[:-1]:
    axis.set_xlabel('')
    axis.set_xticklabels([])
ax[-1].set_xlabel('Departure Date (UTC)', fontsize=font_size)

# plt.xticks acts on the current axes, which is the last panel created.
plt.xticks(rotation=45, fontsize=font_size)
plt.tight_layout()
plt.show()

In [0]:
from pyspark.sql.functions import col

# Class balance of the outcome flag on rows with YEAR before 2020:
# one row per outcome value with its row count.
filtered_df = df.where(col("YEAR") < 2020)
outcome_counts = filtered_df.groupBy("outcome").count()

display(outcome_counts)

## Seasonality Figures

In [0]:
# File names of the two saved seasonality tables for the selected `period`.
# NOTE(review): `period` sits between two underscores, so "_3m" produces
# "wd_seasonality_model__3m_train.parquet" and "" produces
# "wd_seasonality_model__train.parquet" -- confirm these match the saved names.
fn_wd = f"wd_seasonality_model_{period}_train.parquet"
fn_yh = f"yh_seasonality_model_{period}_train.parquet"

# Both tables are read from the team's interim directory.
model_wd, model_yh = (
    spark.read.parquet(f"{team_BASE_DIR}/interim/{file_name}")
    for file_name in (fn_wd, fn_yh)
)

In [0]:
# Pull the seasonality rows for a single origin airport into pandas.
airport = 'BOS'
seas1, seas2 = (
    model.filter(col('ORIGIN') == airport).toPandas()
    for model in (model_wd, model_yh)
)

In [0]:
# Four stacked panels of seasonality components for the selected airport:
# daily and weekly (from seas1), yearly and holidays (from seas2).
fig, ax = plt.subplots(4, 1, figsize=(8, 6), gridspec_kw={'hspace': 0.6})
font_size = 10

# seas1: x is the day of week plus the hour expressed as a fraction of a day.
seas1 = seas1.assign(x=seas1['dow'] + seas1['hour'] / 24).sort_values('x')
for axis, component in zip(ax[:2], ('daily', 'weekly')):
    seas1.plot(x='x', y=component, ax=axis, legend=False)

# seas2: order rows by (month, dom), then number them 1..N for the x axis.
seas2 = seas2.sort_values(['month', 'dom'])
seas2 = seas2.assign(x=range(1, seas2.shape[0] + 1)).sort_values('x')
for axis, component in zip(ax[2:], ('yearly', 'holidays')):
    seas2.plot(x='x', y=component, ax=axis, legend=False)

# NOTE(review): the training window in the title is hard-coded -- confirm it
# matches the data the loaded seasonality tables were fit on.
ax[0].set_title(f'{airport} Seasonality Components (minutes)\nTrained on 2015-2018 Data')

# (x label, y label) per panel, top to bottom; set after plotting so they
# replace the labels pandas puts on the axes.
panel_labels = [
    ('Day of week', 'Daily'),
    ('Day of week', 'Weekly'),
    ('Day of year', 'Yearly'),
    ('Day of year', 'Holiday'),
]
for axis, (x_label, y_label) in zip(ax, panel_labels):
    axis.set_xlabel(x_label, fontsize=font_size)
    axis.set_ylabel(y_label, fontsize=font_size)
    axis.tick_params(axis='both', which='major', labelsize=font_size)

plt.show()