# Flights data EDA and feature engineering

Erica Landreth

In [0]:
pip install prophet

In [0]:
# imports
import pandas as pd
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
import pytz
from datetime import datetime, timedelta
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt

In [0]:
# Team storage root and the dataset variant to analyze
folder_path = "dbfs:/student-groups/Group_4_1"
dataset = 'parquet_airlines_data_1y' # 1 year

# Read the cleaned flights table from the interim folder
clean_parquet_path = f"{folder_path}/interim/{dataset}_clean.parquet"
df = spark.read.parquet(clean_parquet_path)

In [0]:
# Preview the flights DataFrame loaded from the cleaned parquet file
display(df)

In [0]:
# Reminder of which columns were kept, grouped by the purpose they serve

# core features: useful for ML features and/or feature engineering
core_feats = [
    "FL_DATE", "OP_UNIQUE_CARRIER", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    "ORIGIN", "DEST",
    "CRS_DEP_TIME", "DEP_DELAY", "CRS_ARR_TIME", "ARR_DELAY",
    "CANCELLED", "DIVERTED",
    "CRS_ELAPSED_TIME", "AIR_TIME", "DISTANCE",
]

# we may or may not end up using these, but they can't easily be recreated later,
# so we'll keep them to be cautious
on_the_fence = [
    "ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID",
    "TAXI_OUT", "TAXI_IN",
]

# useful for time series analysis
time_series = [
    "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "DEP_TIME_BLK", "ARR_TIME_BLK", "YEAR",
]

# useful to sanity check that joins are successful
sanity_check = [
    "ORIGIN_CITY_NAME", "DEST_CITY_NAME",
    "ORIGIN_STATE_FIPS", "DEST_STATE_FIPS",
]

# provides reasoning for cancellations, delays, and returns to gate:
# every "*_DELAY" column that is not already a core feature, plus the
# cancellation code and the gate-return columns.
# (The loop variable is not called `col` so it cannot be mistaken for pyspark's `col`.)
delay_info = (
    [name for name in df.columns if name.endswith("_DELAY") and name not in core_feats]
    + ["CANCELLATION_CODE"]
    + ["FIRST_DEP_TIME", "LONGEST_ADD_GTIME", "TOTAL_ADD_GTIME"]
)
# Note: cancellation codes are: "A" for carrier-caused, "B" for weather,
# "C" for National Aviation System, and "D" for security

## Characterizing outcome

In [0]:
# Pull the outcome columns and the delay/cancellation detail columns into pandas
outcome_cols = ['DEP_DELAY','CANCELLED']
outcome_spark = df.select(outcome_cols + delay_info)
outcome_info = outcome_spark.toPandas()

# A flight counts as delayed when its departure delay is 15 or more,
# and as cancelled when the CANCELLED flag is positive
delayed_flag = outcome_info['DEP_DELAY'] >= 15
cancelled_flag = outcome_info['CANCELLED'] > 0

outcome_info['is_delayed'] = delayed_flag
outcome_info['is_cancelled'] = cancelled_flag
# Combined outcome: delayed OR cancelled
outcome_info['outcome'] = delayed_flag | cancelled_flag

In [0]:
# Inspect the pandas frame holding the outcome flags and delay-detail columns
display(outcome_info)

In [0]:
# Count flights for each (is_delayed, is_cancelled) combination
outcome_counts = outcome_info.groupby(['is_delayed','is_cancelled']).size()
display(outcome_counts)

In [0]:
# Show which delay / cancellation detail columns were selected
delay_info

Explore how many flights are delayed and, of the delayed flights, how many have delay time attributed to each reason. Understanding which reasons are most often listed for a delay can guide which types of features we design to capture those effects.

In [0]:
def _plot_proportion_bar(proportion, title):
    """Draw a single stacked horizontal bar split into 'Delayed' / 'Not Delayed'.

    Args:
        proportion: fraction (0 to 1) drawn as the black 'Delayed' segment; the
            remainder up to 1 is drawn as the gray 'Not Delayed' segment.
        title: title shown above the bar.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 2))
    ax.barh([''], proportion, color='black', height=0.25, edgecolor='black', label='Delayed')
    ax.barh([''], 1 - proportion, color='gray', height=0.25, edgecolor='black',
            left=proportion, label='Not Delayed')
    ax.set_xlabel('Proportion')
    ax.set_title(title)
    ax.set_xlim((0, 1))
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
    plt.show()
    # release the figure so figures created in a loop do not accumulate
    plt.close(fig)


# Overall share of flights whose `outcome` flag is set
_plot_proportion_bar(np.mean(outcome_info['outcome']), 'Proportion of delayed flights')

# Delay-reason columns: every "*_DELAY*" column except the diverted-arrival and departure delays
reason_cols = [c for c in outcome_info.columns if "_DELAY" in c and c not in ["DIV_ARR_DELAY","DEP_DELAY"]]
# Keep only flights flagged by `outcome`; a missing reason value is treated as 0 minutes
delayed = outcome_info[outcome_info['outcome']][reason_cols].fillna(0)
for v in delayed.columns:
    # "LATE_AIRCRAFT_DELAY" -> "late aircraft", so the title does not end in "... delayed by ... delay"
    reason = v.replace('_DELAY', '').replace('_', ' ').lower()
    _plot_proportion_bar(
        np.mean(delayed[v] > 0),
        f'Proportion of delayed flights delayed by {reason}',
    )

# share of total delay minutes attributed to each reason, sorted largest first
data = np.sum(delayed, axis=0)
data = data.sort_values(ascending=False) / np.sum(data)
data

In [0]:
# Stacked horizontal bar: one segment per delay reason, sized by that reason's
# share of total delay minutes (`data` is indexed by reason column name).
fig,ax = plt.subplots(1,1,figsize=(10,2))
left_edge = 0.0  # running sum of the shares already drawn; next segment starts here
for reason_name in data.index:
    share = data[reason_name]
    ax.barh([''],share,edgecolor='black',label=reason_name.replace("_"," "),left=left_edge)
    left_edge += share
ax.set_xlabel('Proportion')
ax.set_title('Proportion of total delay minutes by each reason')
ax.set_xlim((0,1))
ax.legend(loc='upper center',bbox_to_anchor=(0.5,-0.3),ncol=len(data))
plt.show()

Info on delay reasons from DoT data dictionary: https://www.transtats.bts.gov/Fields.asp?gnoyr_VQ=FGJ

The greatest proportion of delayed flights are delayed by carrier, NAS, or late aircraft.

More notes on what the delay categories mean: https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

Verbatim descriptions from that website:

"Air Carrier: The cause of the cancellation or delay was due to circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling, etc.).

Extreme Weather: Significant meteorological conditions (actual or forecasted) that, in the judgment of the carrier, delays or prevents the operation of a flight such as tornado, blizzard or hurricane.

National Aviation System (NAS): Delays and cancellations attributable to the national aviation system that refer to a broad set of conditions, such as non-extreme weather conditions, airport operations, heavy traffic volume, and air traffic control.

Late-arriving aircraft: A previous flight with same aircraft arrived late, causing the present flight to depart late.

Security: Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas."

NOTES:

Interesting that "weather delay" only includes extreme weather; otherwise weather delays are lumped into the NAS category.

## Prophet modeling for time series characterization

In [0]:
# Build the US holiday table for the ten years 2013 through 2022
holiday_years = list(range(2013, 2023))
us_holidays = make_holidays_df(year_list=holiday_years, country='US')
display(us_holidays)

In [0]:
# Departure delay and departure timestamp for flights originating at ORD,
# ordered by departure time, collected into pandas
cols = ['DEP_DELAY','dep_datetime']
ord_flights = df.filter(col('ORIGIN') == 'ORD').orderBy('dep_datetime')
P = ord_flights.select(*cols).toPandas()

In [0]:
# Prophet's fit() requires a frame with a datestamp column named "ds" and a
# value column named "y"; it raises ValueError otherwise. Build that frame from
# P without modifying P itself.
prophet_input = pd.DataFrame({'ds': P['dep_datetime'], 'y': P['DEP_DELAY']})

# fit prophet model with weekly, daily, yearly, and holiday seasonality features
model = Prophet(
    interval_width=0.9,   # width of the uncertainty interval
    growth='linear',
    weekly_seasonality=True,
    daily_seasonality=True,
    yearly_seasonality=True,
    holidays=us_holidays
)

model.fit(prophet_input)