## Recency Features

Idea: For each flight, get the proportion of flights that were delayed and/or the average delay amount at the flight's origin airport, with stats taken over flights that departed between 4 hours and 2 hours before the flight's scheduled departure time (in UTC).

In [0]:
# imports
from pyspark.sql.functions import col, lag
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import time
from pyspark.sql.functions import col, when, to_timestamp, lit, udf, lag, pandas_udf, isnan, array, array_contains, explode, lit, countDistinct, first, last, unix_timestamp, to_date, to_timestamp, date_format, date_add
from datetime import datetime, timedelta, time
from pyspark.sql.types import TimestampType
from pyspark.sql.types import FloatType
import numpy as np
from pyspark.sql.functions import min
from pyspark.sql.window import Window
from pyspark.sql.functions import col, mean, when, collect_list
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
from pyspark.sql.window import Window
from pyspark.sql.functions import col

In [0]:
# define helper functions and corresponding udf's

def get_act_dep_time(sched_dep_time, DEP_DELAY):
    """Get actual departure time from scheduled departure time and dep delay.

    Args:
        sched_dep_time: scheduled departure as a datetime (or None).
        DEP_DELAY: departure delay in minutes, or None when unknown.

    Returns:
        ``sched_dep_time`` shifted by ``DEP_DELAY`` minutes. If ``DEP_DELAY``
        is None, a sentinel 9999 days after ``sched_dep_time`` is returned.
        If ``sched_dep_time`` is None, None is returned.
    """
    if sched_dep_time is None:
        # Nothing to offset from; propagate the null instead of raising.
        return None
    if DEP_DELAY is None:
        # This case should not be triggered. The sentinel must actually be
        # returned: falling through would call timedelta(minutes=None) and
        # raise TypeError. Presumably the far-future value keeps the record
        # out of any look-back comparison -- confirm with the feature owner.
        return sched_dep_time + timedelta(days=9999)
    return sched_dep_time + timedelta(minutes=DEP_DELAY)

# Spark UDF wrapper around get_act_dep_time; declared to return a timestamp column.
act_dep_udf = F.udf(get_act_dep_time, TimestampType())

def get_delay_mean(delays, act_dep, two_hours_prior, four_hours_prior):
    """Get average delay within window for records at least 2 hours stale.

    Args:
        delays: sequence of delay values, aligned element-wise with ``act_dep``.
        act_dep: sequence of actual departure times.
        two_hours_prior: exclusive upper bound on ``act_dep``.
        four_hours_prior: exclusive lower bound on ``act_dep``.

    Returns:
        The mean of the selected delays as a Python float, or None when no
        record falls inside the window.
    """
    delays = np.array(delays)
    act_dep = np.array(act_dep)
    # Strictly between the two bounds (both comparisons are exclusive).
    in_window = (act_dep < two_hours_prior) & (act_dep > four_hours_prior)
    # Entries equal to -99999 are excluded from the mean.
    usable = delays != -99999
    # np.float64 replaces the np.float alias, which was removed in NumPy 1.24.
    d = delays[in_window & usable].astype(np.float64)
    return float(np.mean(d)) if len(d) > 0 else None

# Spark UDF wrapper around get_delay_mean; declared to return a float column.
delay_mean_udf = F.udf(get_delay_mean, FloatType())

# def get_delay_proportion(outcome, act_dep, two_hours_prior, four_hours_prior):
#     outcome = np.array(outcome)
#     act_dep = np.array(act_dep)
#     d = outcome[(act_dep < two_hours_prior) & (act_dep > four_hours_prior)].astype(np.float)
#     return float(np.mean(d)) if len(d) > 0 else None

# delay_prop_udf = F.udf(get_delay_proportion, FloatType())

In [0]:
# data time period; interpolated into the input and output parquet file names
period = "" # one of the following values ("", "3m", "6m", "1y")

# define what departure time variable is called
# NOTE(review): the visible cells hardcode "sched_depart_utc" rather than
# reading this variable -- confirm whether it is still needed.
dep_utc_varname = "sched_depart_utc"

In [0]:
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
# spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/modeling_checkpoints")

# Load the joined, cleaned flights/weather dataset for the selected period.
joined_path = f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather{period}_v1.parquet"
df = spark.read.parquet(joined_path)

# outcome: 1.0 when the departure delay is >= 15 or the flight was cancelled, else 0.0.
delayed_or_cancelled = (col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1)
df = df.withColumn("outcome", when(delayed_or_cancelled, 1).otherwise(0).cast("double"))

# outcome0: same as outcome, but null wherever DEP_DELAY is null.
df = df.withColumn(
    "outcome0",
    when(col("DEP_DELAY").isNull(), None).otherwise(col("outcome")).cast("double"),
)

# DEP_DELAY0: copy of DEP_DELAY with nulls replaced by 0.
df = df.withColumn("DEP_DELAY0", col("DEP_DELAY")).fillna({"DEP_DELAY0": 0})

# act_dep_datetime: scheduled UTC departure shifted by the null-filled delay.
df = df.withColumn(
    "act_dep_datetime",
    act_dep_udf(col("sched_depart_utc"), col("DEP_DELAY0")),
)
df.cache()

In [0]:
def hours(i):
    """Convert a number of hours into seconds."""
    return i * 3600


# Range frame over the scheduled departure time cast to a long, looking back
# four hours from (and including) the current row's value.
# NOTE(review): partitioning by FL_DATE means a frame never includes rows with
# a different FL_DATE, so look-back stops at the date boundary -- confirm intended.
depart_as_long = col("sched_depart_utc").cast("timestamp").cast("long")
window_spec = (
    Window.partitionBy(col("ORIGIN"), col("FL_DATE"))
    .orderBy(depart_as_long)
    .rangeBetween(-hours(4), 0)
)
# we will eventually get just -4 to -2 hours, but using 0 in the window allows us to
# grab the utc-2 for the 0 hour offset case

# Repartition on the same columns the window is partitioned by.
df = df.repartition("ORIGIN", "FL_DATE")


Time windowing strategy informed by: https://stackoverflow.com/questions/33207164/spark-window-functions-rangebetween-dates

In [0]:
# Define a pandas UDF to calculate the NaN-ignoring mean of a value column within the window
@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def mean_dep_delay_udf(dep_delays: pd.Series, act_dep_times: pd.Series, sched_dep_utc2: pd.Series) -> float:
    """Mean of ``dep_delays`` over rows that departed before the frame's cutoff.

    Args:
        dep_delays: values to average (NaN entries are ignored).
        act_dep_times: actual departure time for each row in the frame.
        sched_dep_utc2: per-row cutoff times; the maximum in the frame is used
            as the single cutoff.

    Returns:
        NaN-ignoring mean of the values whose ``act_dep_times`` is strictly
        earlier than the cutoff; NaN when no value qualifies.
    """
    cutoff = np.max(sched_dep_utc2)
    # Builtin float replaces the np.float alias, which was removed in NumPy 1.24.
    d = dep_delays[act_dep_times < cutoff].astype(float)
    return np.nanmean(d)

# Apply the UDF over the window, once per value column.
# Both features share the actual-departure and two-hours-prior inputs.
mean_dep_delay_expr = mean_dep_delay_udf(
    col("DEP_DELAY"),
    col("act_dep_datetime"),
    col("two_hours_prior_depart_UTC"),
).over(window_spec)

prop_delayed_expr = mean_dep_delay_udf(
    col("outcome0"),
    col("act_dep_datetime"),
    col("two_hours_prior_depart_UTC"),
).over(window_spec)

df_with_mean_delay = (
    df.withColumn("mean_dep_delay", mean_dep_delay_expr)
    .withColumn("prop_delayed", prop_delayed_expr)
)

df_with_mean_delay.cache()
display(df_with_mean_delay)

In [0]:
# Persist the frame with the recency features alongside the other join checkpoints.
fn_out = f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat.parquet"
# Overwrite any previous output: the default save mode raises when the path
# already exists, which makes this cell fail on every re-run of the notebook.
df_with_mean_delay.write.mode("overwrite").parquet(fn_out)


In [0]:
# List the join_checkpoints directory to confirm the output was written.
display(dbutils.fs.ls(f"{team_BASE_DIR}/interim/join_checkpoints"))

## TODO: Rerun Sanity Check on 5y Data

In [0]:
# Pull only the columns needed for the sanity-check plots into pandas,
# ordered by scheduled departure time.
# NOTE(review): toPandas() collects every row to the driver; consider filtering
# to the plotted origins first when running on the full dataset.
sanity_columns = ["ORIGIN", "sched_depart_utc", "DEP_DELAY", "mean_dep_delay", "prop_delayed"]
data = (
    df_with_mean_delay
    .select(*sanity_columns)
    .orderBy("sched_depart_utc")
    .toPandas()
)
display(data)

In [0]:
import matplotlib.pyplot as plt

# Sanity check: per-flight departure delay against the windowed mean delay.
origins = ["BOS"]
for origin in origins:
    origin_data = data.loc[data.ORIGIN == origin]
    fig, ax = plt.subplots()
    ax.plot(origin_data.sched_depart_utc, origin_data.DEP_DELAY, label="DEP_DELAY")
    ax.plot(origin_data.sched_depart_utc, origin_data.mean_dep_delay, label="mean_dep_delay")
    ax.set(
        title=f"{origin}: DEP_DELAY vs. mean_dep_delay",
        xlabel="sched_depart_utc",
        ylabel="delay (minutes)",
    )
    # Without legend() the label= arguments above are never rendered.
    ax.legend()
    plt.show()

In [0]:
import matplotlib.pyplot as plt

# Sanity check: windowed proportion of delayed flights over time.
origins = ["BOS"]
for origin in origins:
    origin_data = data.loc[data.ORIGIN == origin]
    fig, ax = plt.subplots()
    ax.plot(origin_data.sched_depart_utc, origin_data.prop_delayed, label="prop_delayed")
    ax.set(
        title=f"{origin}: prop_delayed",
        xlabel="sched_depart_utc",
        ylabel="proportion delayed",
    )
    # Without legend() the label= argument above is never rendered.
    ax.legend()
    plt.show()