# Flights data EDA and feature engineering

Erica Landreth

In [0]:
pip install prophet

In [0]:
# Report the Spark version of the active session.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably the
# SparkSession injected by the Databricks runtime; confirm.
spark_version = spark.version
print(f"Spark version: {spark_version}")

In [0]:
# imports
import pandas as pd
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
import pytz
from datetime import datetime, timedelta
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
import matplotlib.pyplot as plt
from datetime import datetime, time
from pyspark.sql.functions import current_date
from pyspark.sql.functions import lit


In [0]:
# DBFS folder for the group and the name of the flights extract to analyse.
folder_path = "dbfs:/student-groups/Group_4_1"
dataset = 'parquet_airlines_data_1y' # 1 year
# Read "<dataset>_clean.parquet" from the interim/ subfolder into a Spark DataFrame.
df = spark.read.parquet(f"{folder_path}/interim/{dataset}_clean.parquet")

In [0]:
# Preview the loaded flights DataFrame.
display(df)

In [0]:
# reminder of which columns were kept

# core features: useful for ML features and/or feature engineering
core_feats = [
    "FL_DATE", "OP_UNIQUE_CARRIER", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    "ORIGIN", "DEST",
    "CRS_DEP_TIME", "DEP_DELAY", "CRS_ARR_TIME", "ARR_DELAY",
    "CANCELLED", "DIVERTED",
    "CRS_ELAPSED_TIME", "AIR_TIME", "DISTANCE",
]

# we may or may not end up using these, but they can't easily be recreated later,
# so we'll keep them to be cautious
on_the_fence = ["ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID", "TAXI_OUT", "TAXI_IN"]

# useful for time series analysis
time_series = [
    "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "DEP_TIME_BLK", "ARR_TIME_BLK", "YEAR",
]

# useful to sanity check that joins are successful
sanity_check = ["ORIGIN_CITY_NAME", "DEST_CITY_NAME", "ORIGIN_STATE_FIPS", "DEST_STATE_FIPS"]

# provides reasoning for cancellations, delays, and returns to gate:
# every "*_DELAY" column that is not already a core feature, plus the cancellation
# code and the gate-return columns
# Note: cancellation codes are: "A" for carrier-caused, "B" for weather,
# "C" for National Aviation System, and "D" for security
extra_delay_cols = [
    name for name in df.columns
    if name.endswith("_DELAY") and name not in core_feats
]
delay_info = (
    extra_delay_cols
    + ["CANCELLATION_CODE"]
    + ["FIRST_DEP_TIME", "LONGEST_ADD_GTIME", "TOTAL_ADD_GTIME"]
)

## Characterizing outcome

In [0]:
# Pull the outcome columns plus the delay-detail columns to the driver as pandas.
outcome_cols = ['DEP_DELAY', 'CANCELLED']
outcome_info = df.select(outcome_cols + delay_info).toPandas()

# Flag each flight: delayed (departure delay of 15+ minutes), cancelled,
# and the combined outcome (either of the two).
outcome_info = outcome_info.assign(
    is_delayed=lambda frame: frame['DEP_DELAY'] >= 15,
    is_cancelled=lambda frame: frame['CANCELLED'] > 0,
    outcome=lambda frame: frame['is_delayed'] | frame['is_cancelled'],
)

In [0]:
# Inspect the pandas frame, including the is_delayed / is_cancelled / outcome flags.
display(outcome_info)

In [0]:
# Count flights for every (is_delayed, is_cancelled) combination.
display(outcome_info.groupby(['is_delayed','is_cancelled']).size())

In [0]:
# Show the list of delay-detail column names that were selected.
delay_info

Explore how many flights are delayed and, of the delayed flights, how many have delay time attributed to each reason. Understanding which "reasons" are most often listed for a delay can guide which types of features we design to capture those effects.

In [0]:
def _plot_share_bar(share_yes, share_no, title):
    """Draw one horizontal stacked bar on a 0-1 axis.

    share_yes is drawn first in black and labelled 'Delayed'; share_no is
    stacked after it in gray and labelled 'Not Delayed'. `title` is the
    figure title. The figure is shown immediately.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 2))
    ax.barh([''], share_yes, color='black', height=0.25, edgecolor='black', label='Delayed')
    ax.barh([''], share_no, color='gray', height=0.25, edgecolor='black', left=share_yes, label='Not Delayed')
    ax.set_xlabel('Proportion')
    ax.set_title(title)
    ax.set_xlim((0, 1))
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
    plt.show()


# overall share of flights whose `outcome` flag is set vs. not set
has_outcome = outcome_info['outcome']
_plot_share_bar(
    np.mean(has_outcome),
    np.mean(np.logical_not(has_outcome)),
    'Proportion of delayed flights',
)

# delay-reason columns: every "*_DELAY" column except the two that are not reasons
reason_cols = [c for c in outcome_info.columns if "_DELAY" in c and c not in ["DIV_ARR_DELAY", "DEP_DELAY"]]

# restrict to flights with the outcome; a missing reason value is treated as 0 minutes
delayed = outcome_info[outcome_info['outcome']][reason_cols].fillna(0)

# one bar per reason: share of those flights with any minutes attributed to it
for v in delayed.columns:
    reason = v.replace('_', ' ').lower()
    _plot_share_bar(
        np.mean(delayed[v] > 0),
        np.mean(delayed[v] <= 0),
        f'Proportion of delayed flights delayed by {reason}',
    )

# share of total delay minutes attributed to each reason, largest first
data = delayed.sum(axis=0)
data = data.sort_values(ascending=False) / data.sum()
data

In [0]:
# Single stacked bar: each segment is one delay category's share of total delay
# minutes, in the (descending) order of `data`.
fig, ax = plt.subplots(1, 1, figsize=(10, 2))
left_edge = 0.0  # running total of the shares already drawn
for category, share in data.items():
    ax.barh([''], share, edgecolor='black', label=category.replace("_", " "), left=left_edge)
    left_edge += share
ax.set_xlabel('Proportion')
ax.set_title('Proportion of total delay minutes by DoT delay categories')
ax.set_xlim((0, 1))
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=len(data))
plt.show()

Info on delay reasons from DoT data dictionary: https://www.transtats.bts.gov/Fields.asp?gnoyr_VQ=FGJ

The greatest proportion of delayed flights are delayed by carrier, NAS, or late-aircraft causes.

More notes on what the delay categories mean: https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

Verbatim descriptions from that website:

"Air Carrier: The cause of the cancellation or delay was due to circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling, etc.).

Extreme Weather: Significant meteorological conditions (actual or forecasted) that, in the judgment of the carrier, delays or prevents the operation of a flight such as tornado, blizzard or hurricane.

National Aviation System (NAS): Delays and cancellations attributable to the national aviation system that refer to a broad set of conditions, such as non-extreme weather conditions, airport operations, heavy traffic volume, and air traffic control.

Late-arriving aircraft: A previous flight with same aircraft arrived late, causing the present flight to depart late.

Security: Delays or cancellations caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas."

NOTES:

Interesting that "weather delay" only includes extreme weather; otherwise weather delays are lumped into the NAS category.

## Visualize airport and weather station locations

In [0]:
# load stations data
df_stations = spark.read.parquet("dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/stations_with_neighbors.parquet/")

# load airports data
df_airports = spark.read.option("header","true").csv("dbfs:/mnt/mids-w261/airport-codes_csv.csv")

# get the airport call signs that appear in the stations data
# (distinct: only membership is needed, so don't collect repeated values to the driver)
station_call = df_stations.select('neighbor_call').distinct().toPandas()

# get weather station locations
station_locs = df_stations.select('lat','lon','station_id').distinct().toPandas()

# get airport locations
airport_locs = df_airports.select('coordinates','ident').distinct().toPandas()

# filter airports to those in stations; take an explicit copy so the column
# assignments below write to an independent frame rather than a filtered slice
airport_locs = airport_locs[airport_locs['ident'].isin(station_call['neighbor_call'])].copy()

# get airport lat/lon coordinates: the comma-separated `coordinates` string is
# split with the first part taken as longitude and the second as latitude
airport_locs[['lon', 'lat']] = airport_locs['coordinates'].str.split(',', expand=True)
airport_locs['lat'] = airport_locs['lat'].astype(float)
airport_locs['lon'] = airport_locs['lon'].astype(float)


In [0]:
# plot weather stations and airports on one map

# (locations frame, legend name, marker size, marker colour) for each layer
layer_specs = [
    (station_locs, 'Weather Stations', 5, 'blue'),
    (airport_locs, 'Airports', 3, 'red'),
]

fig = go.Figure()
for locs, layer_name, marker_size, marker_color in layer_specs:
    layer = go.Scattergeo(
        lat=locs['lat'],
        lon=locs['lon'],
        marker=dict(size=marker_size, color=marker_color),
        name=layer_name,
    )
    fig.add_trace(layer)

# restrict the visible longitude/latitude window and fix the figure size
geo_window = dict(lonaxis_range=[-180, -60], lataxis_range=[10, 90])
fig.update_layout(geo=geo_window, width=1000, height=1000)

fig.show()

In [0]:
# Tag each location with its kind and a marker size, then stack both sets.
# `.assign` builds tagged copies, so neither source frame is modified (no
# write into the filtered airport slice).
station_tagged = station_locs.assign(type='station', size=0.5)
airport_tagged = airport_locs.assign(type='airport', size=0.3)
combined_locs = pd.concat([station_tagged, airport_tagged])

In [0]:
# Plot stations and airports together, coloured by the `type` column.
fig = px.scatter_geo(
    combined_locs,
    lat='lat',
    lon='lon',
    color='type',
    opacity=0.5,
    # marker sizing by the `size` column is currently disabled:
    # size=combined_locs['size']/10,
    scope='usa',
    title='Weather Stations and Airports'
)

fig.show()

## Prophet modeling for time series characterization

In [0]:
# get US holidays
us_holidays = make_holidays_df(
    year_list=[2013 + i for i in range(10)], country='US'
)
display(us_holidays)

In [0]:
def to_dt(yyyymmdd, hhmm, tz):
    """
    Build the departure date-time, in local wall-clock time, from flights table columns
    yyyymmdd = FL_DATE, as a 'YYYY-MM-DD' string
    hhmm = CRS_DEP_TIME, clock time encoded as HHMM (an hour of 24 is treated
           as 00:MM on the following day)
    tz = time zone name from the time zone table (must be accepted by pytz.timezone)

    Returns the local date-time as a string formatted 'YYYY-MM-DDTHH:MM:SS'.
    No conversion to UTC is performed and no UTC offset is included in the string.
    """

    hhmm = int(hhmm)

    # split the date into integer year / month / day
    yyyy, MM, dd = (int(part) for part in yyyymmdd.split('-'))

    # split the clock time into hour and minute
    hh, mm = divmod(hhmm, 100)

    # hour 24 is not a valid datetime hour: use hour 0 and move to the next day
    rolls_over = (hh == 24)
    if rolls_over:
        hh = 0

    # create datetime variable for departure
    dt_dep = datetime(yyyy, MM, dd, hh, mm)
    if rolls_over:
        dt_dep += timedelta(days=1)

    # attach the local time zone (raises for an unknown zone name); the
    # wall-clock fields are unchanged by this step
    local = pytz.timezone(tz).localize(dt_dep)

    dt_format = "%Y-%m-%dT%H:%M:%S"

    # return the local datetime, cast to string (the format has no offset field)
    return local.strftime(dt_format)

# Wrap to_dt as a Spark UDF whose result is a string column.
dt_udf = udf(to_dt, StringType())


In [0]:
# Build the departure date-time string with the UDF, parse it into a timestamp
# column named local_dep_datetime, and cache the result for the cells below.
local_dep_str = dt_udf(col("FL_DATE"), col("CRS_DEP_TIME"), col("origin_tz"))
out = df.withColumn('local_dep_datetime', to_timestamp(local_dep_str)).cache()

In [0]:
# Small sample for trying out the model: rename the columns to the `ds` / `y`
# names that the Prophet fit below is given.
# NOTE(review): limit(10000) is applied before the BOS/ORD filter, so the sample
# may contain few matching rows — confirm this order is intended.
tmp = (
    out.limit(10000)
    .filter(df.ORIGIN.isin(['BOS', 'ORD']))
    .withColumnRenamed("DEP_DELAY", "y")
    .withColumnRenamed("local_dep_datetime", "ds")
)
display(tmp)

In [0]:
# informed by: https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html

def forecast_delay(history_pd: pd.DataFrame) -> pd.DataFrame:
    """
    Fit a Prophet model to one group's history and return its weekly and daily
    seasonal components over a one-week hourly grid.

    Parameters
    ----------
    history_pd : pd.DataFrame
        History to fit. Must contain `ds` (timestamps) and `y` (values to
        model) for Prophet, plus `DAY_OF_WEEK` and `ORIGIN`. The first row
        supplies the reference date / day-of-week pair and the ORIGIN label
        copied onto the output.

    Returns
    -------
    pd.DataFrame
        One row per predicted hour (24*7 hourly periods, history excluded) with
        columns:
        dow    - day of week in the same numbering as DAY_OF_WEEK, wrapped to 1..7
        hour   - hour of day of the predicted timestamp
        weekly - Prophet's weekly seasonal component
        daily  - Prophet's daily seasonal component
        ORIGIN - ORIGIN value of the first history row
    """

    model = Prophet(
        interval_width=0.9,
        growth='linear',
        weekly_seasonality=True,
        daily_seasonality=True,
        yearly_seasonality=True,
        # holidays=us_holidays,
        # seasonality_mode='multiplicative'
    )

    # fit the model
    model.fit(history_pd)

    # configure predictions: one week of hourly timestamps, history excluded
    future_pd = model.make_future_dataframe(
        periods=24*7,
        freq='h',
        include_history=False
    )

    # make predictions
    results_pd = model.predict(future_pd)

    # reference date and its day of week, both read positionally from the first
    # row so they always describe the same record regardless of the index labels
    ref_date = history_pd.ds.iloc[0].date()
    ref_dow = history_pd.DAY_OF_WEEK.iloc[0]

    def get_dow(x, ref_date, dow):
        # shift the reference day of week by the number of elapsed days,
        # wrapping into 1..7 (a remainder of 0 maps to 7)
        elapsed_days = (x.date() - ref_date).days
        return (elapsed_days + dow - 1) % 7 + 1

    # day of week for every predicted timestamp
    results_pd['dow'] = results_pd.ds.apply(lambda x: get_dow(x, ref_date, ref_dow))

    # hour
    results_pd['hour'] = results_pd.ds.apply(lambda x: x.hour)

    # apply origin
    results_pd['ORIGIN'] = history_pd.ORIGIN.iloc[0]

    # return the seasonal components only
    return results_pd[['dow','hour','weekly','daily','ORIGIN']]

In [0]:
# Dry run: call forecast_delay on a 10-row sample collected to pandas.
tmp_out = forecast_delay(tmp.limit(10).toPandas())

In [0]:
# Convert the sample output to a Spark DataFrame and show its schema; this schema
# is what the grouped applyInPandas call further down is given as output schema.
tmp_out_spark = spark.createDataFrame(tmp_out)
tmp_out_spark.schema

In [0]:
# Fractional day-of-week axis (day + hour/24) so the week plots left to right.
tmp_out['x'] = tmp_out['dow'] + tmp_out['hour'] / 24
tmp_out.sort_values('x', inplace=True)

# One figure per seasonal component.
for component in ('daily', 'weekly'):
    tmp_out.plot(x='x', y=component)
    plt.show()

In [0]:
# Only flights before this cutoff are used to fit the models.
train_max = datetime(2019, 2, 1)

# informed by: https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html

# NOTE(review): the cutoff is applied to df.dep_datetime, while the model's `ds`
# is local_dep_datetime — confirm this is the intended column.
training_history = (
    out.filter(df.dep_datetime < train_max)
    .withColumnRenamed("DEP_DELAY", "y")
    .withColumnRenamed("local_dep_datetime", "ds")
)

# Fit one model per ORIGIN and stamp each row with the run date and the cutoff.
results = (
    training_history
    .groupBy('ORIGIN')
    .applyInPandas(forecast_delay, schema=tmp_out_spark.schema)
    .withColumn('model_training_date', current_date())
    .withColumn('model_training_max_dt', lit(train_max.strftime('%Y-%m-%d')))
)

In [0]:
# Show the per-ORIGIN seasonality table (displaying it runs the grouped model fits).
display(results)

In [0]:
# List the current contents of the interim folder.
display(dbutils.fs.ls(f"{folder_path}/interim"))

In [0]:
# Save the seasonality table to the interim folder; the file name embeds the
# training cutoff date.
# NOTE(review): no save mode is set, so Spark's default applies (presumably the
# write fails if the path already exists) — confirm before re-running.
results.write.parquet(f"{folder_path}/interim/{dataset}_seasonality_tr{train_max.date()}.parquet")