## Testing out recency features

Idea: For each flight, compute the proportion of flights that were delayed and/or the average delay (in minutes) at the flight's origin airport, taking the statistics over flights that departed between 4 and 2 hours before this flight's scheduled departure (all times in UTC).

In [0]:
from pyspark.sql.functions import col, lag
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import time
from pyspark.sql.functions import col, when, to_timestamp, lit, udf, lag, pandas_udf, isnan, array, array_contains, explode, lit, countDistinct, first, last, unix_timestamp, to_date, to_timestamp, date_format, date_add

In [0]:
# Data time period; this value is interpolated into the parquet file name when
# the joined dataset is read.
period = "1y" # one of the following values ("", "3m", "6m", "1y")

# Name of the scheduled-departure (UTC) column.
# NOTE(review): this variable is not referenced in the visible cells, which use
# the literal column name directly — confirm whether it is still needed.
dep_utc_varname = "sched_depart_utc"

In [0]:
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
checkpoint_dir = f"{team_BASE_DIR}/modeling_checkpoints"
spark.sparkContext.setCheckpointDir(checkpoint_dir)

# Load the joined + cleaned dataset for the configured period.
# (Alternative input: dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_cleaned_engineered.parquet)
joined_path = f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_weather_cleaned_combo.parquet"
flights_joined = spark.read.parquet(joined_path)

# Binary label: 1.0 when the departure delay is at least 15 or the flight was
# cancelled, otherwise 0.0 (a null DEP_DELAY on a non-cancelled flight yields 0.0).
delayed_15_plus = col("DEP_DELAY") >= 15
was_cancelled = col("CANCELLED") == 1
outcome_label = when(delayed_15_plus | was_cancelled, 1).otherwise(0).cast("double")
df = flights_joined.withColumn("outcome", outcome_label)

df.cache()


In [0]:
# Preview the loaded dataset, including the new `outcome` column.
display(df)

In [0]:
# Development subset: keep only BOS and ATL departures, ordered by scheduled
# departure time and capped at 20000 rows.
subset_columns = [
    "ORIGIN",
    "sched_depart_utc",
    "two_hours_prior_depart_UTC",
    "four_hours_prior_depart_UTC",
    "DEP_DELAY",
    "CANCELLED",
    "outcome",
]
is_test_origin = (col("ORIGIN") == "BOS") | (col("ORIGIN") == "ATL")
dd = (
    df.select(*subset_columns)
    .where(is_test_origin)
    .orderBy("sched_depart_utc")
    .limit(20000)
)
# TODO: rows with a null DEP_DELAY are still present; null handling needs fixing
# (option that was considered: dd.dropna(subset=["DEP_DELAY"])).
dd.cache()
display(dd)


In [0]:
from datetime import datetime, timedelta, time
from pyspark.sql.types import TimestampType

In [0]:
def get_act_dep_time(sched_dep_time, DEP_DELAY):
    """Return the actual departure time: scheduled time shifted by the delay.

    Args:
        sched_dep_time: scheduled departure as a ``datetime`` (or None when the
            source column is null).
        DEP_DELAY: departure delay in minutes; may be negative, None or NaN.

    Returns:
        ``sched_dep_time + DEP_DELAY`` minutes. When the delay is unknown
        (None or NaN) the scheduled time is returned unchanged. When the
        scheduled time itself is None, None is returned so a single null row
        does not fail the whole UDF evaluation.
    """
    if sched_dep_time is None:
        return None
    # `x != x` is only true for NaN; timedelta() raises ValueError on NaN.
    if DEP_DELAY is None or DEP_DELAY != DEP_DELAY:
        return sched_dep_time  # !!! open question: best fallback for unknown delay
    return sched_dep_time + timedelta(minutes=DEP_DELAY)

# Expose the helper as a Spark UDF producing a timestamp column.
act_def_udf = F.udf(f=get_act_dep_time, returnType=TimestampType())

# Actual departure = scheduled departure shifted by DEP_DELAY minutes.
actual_departure = act_def_udf(col("sched_depart_utc"), col("DEP_DELAY"))
dd = dd.withColumn("act_dep_datetime", actual_departure)
dd.cache()
display(dd)

In [0]:
# Get the minimum DEP_DELAY value
from pyspark.sql.functions import min
# `min` here is the pyspark.sql.functions aggregate imported in this cell.
min_delay_row = dd.agg(min(col("DEP_DELAY"))).first()
min_dep_delay = min_delay_row[0]

print(f"Minimum DEP_DELAY value: {min_dep_delay}")

# Replace null delays with the sentinel -99999; the window UDFs skip this value.
# Filling (rather than leaving nulls) keeps one entry per row in the collected lists.
dd = dd.fillna({'DEP_DELAY': -99999})
dd.cache()

In [0]:

from pyspark.sql.window import Window
from pyspark.sql.functions import col, mean, when, collect_list

# CURRENT KLUGE: the frame is row-based, not time-based. For every flight it
# covers the current row plus the 1000 preceding rows at the same ORIGIN
# (ordered by scheduled departure). Restricting to the 4h–2h look-back interval
# happens later, inside the Python UDFs that consume the collected lists.
window_spec = (
    Window.partitionBy("ORIGIN")
    .orderBy("sched_depart_utc")
    .rowsBetween(-1000, 0)
)

# (An earlier attempt computed mean(when(act_dep_datetime < two_hours_prior_depart_UTC,
#  DEP_DELAY).otherwise(0)) directly over the window; it is superseded by the lists below.)

# Collect, per row, the delays / actual departure times / outcomes in the frame.
windowed_sources = {
    "delays_within_window": "DEP_DELAY",
    "act_dep_within_window": "act_dep_datetime",
    "outcome_within_window": "outcome",
}
dd_with_delays = dd
for list_name, source_name in windowed_sources.items():
    dd_with_delays = dd_with_delays.withColumn(
        list_name, collect_list(col(source_name)).over(window_spec)
    )

display(dd_with_delays)



In [0]:
# Spot-check the collected window lists for flights scheduled after the given timestamp.
display(dd_with_delays.filter(dd_with_delays.sched_depart_utc > "2019-01-03T12:33:00.000+00:00"))

In [0]:
from pyspark.sql.types import FloatType
import numpy as np

def get_delay_mean(delays, act_dep, two_hours_prior, four_hours_prior):
    """Mean delay of flights that actually departed inside the look-back interval.

    Args:
        delays: sequence of delay values, aligned element-wise with ``act_dep``.
            Entries equal to -99999 are treated as missing and skipped.
        act_dep: sequence of actual departure ``datetime`` values.
        two_hours_prior: exclusive upper bound of the interval.
        four_hours_prior: exclusive lower bound of the interval.

    Returns:
        The mean of the selected delays as a Python float, or None when no
        entry qualifies.
    """
    # `np.float` was removed in NumPy 1.24; use the builtin float dtype instead.
    delays = np.asarray(delays, dtype=float)
    # Keep datetimes as Python objects so they compare against the scalar bounds.
    act_dep = np.asarray(act_dep, dtype=object)
    in_window = np.asarray(
        (act_dep < two_hours_prior) & (act_dep > four_hours_prior), dtype=bool
    )
    d = delays[in_window & (delays != -99999)]
    return float(np.mean(d)) if len(d) > 0 else None

# Spark UDF wrapper around get_delay_mean; the result column is FloatType.
delay_mean_udf = F.udf(f=get_delay_mean, returnType=FloatType())

def get_delay_proportion(outcome, act_dep, two_hours_prior, four_hours_prior):
    """Mean of the outcome flags for flights that departed inside the interval.

    Args:
        outcome: sequence of numeric outcome flags, aligned with ``act_dep``.
        act_dep: sequence of actual departure ``datetime`` values.
        two_hours_prior: exclusive upper bound of the interval.
        four_hours_prior: exclusive lower bound of the interval.

    Returns:
        The mean of the selected flags as a Python float, or None when no
        entry falls inside the interval.
    """
    # `np.float` was removed in NumPy 1.24; use the builtin float dtype instead.
    outcome = np.asarray(outcome, dtype=float)
    # Keep datetimes as Python objects so they compare against the scalar bounds.
    act_dep = np.asarray(act_dep, dtype=object)
    in_window = np.asarray(
        (act_dep < two_hours_prior) & (act_dep > four_hours_prior), dtype=bool
    )
    d = outcome[in_window]
    return float(np.mean(d)) if len(d) > 0 else None

# Spark UDF wrapper around get_delay_proportion; the result column is FloatType.
delay_prop_udf = F.udf(f=get_delay_proportion, returnType=FloatType())

In [0]:
# Both features use the same actual-departure list and the same interval bounds.
act_dep_list = col("act_dep_within_window")
interval_bounds = (col("two_hours_prior_depart_UTC"), col("four_hours_prior_depart_UTC"))

mean_delay_feature = delay_mean_udf(col("delays_within_window"), act_dep_list, *interval_bounds)
prop_delayed_feature = delay_prop_udf(col("outcome_within_window"), act_dep_list, *interval_bounds)

dd_with_mean_delay = (
    dd_with_delays
    .withColumn("mean_dep_delay", mean_delay_feature)
    .withColumn("prop_delayed", prop_delayed_feature)
)

display(dd_with_mean_delay)

In [0]:
# Inspect BOS flights scheduled after the given timestamp.
scheduled_after = dd_with_mean_delay.sched_depart_utc > "2019-01-03T12:33:00.000+00:00"
from_bos = dd_with_mean_delay.ORIGIN == "BOS"
display(dd_with_mean_delay.filter(scheduled_after & from_bos))

In [0]:
def get_delay_mean(delays, act_dep, two_hours_prior):
    """Mean of the non-NaN delays whose actual departure precedes ``two_hours_prior``.

    NOTE: this redefines the four-argument ``get_delay_mean`` declared earlier
    in the notebook; ``delay_mean_udf`` was created before this cell and keeps
    wrapping the earlier function.

    Args:
        delays: sequence of delay values (None/NaN entries are skipped),
            aligned element-wise with ``act_dep``.
        act_dep: sequence of actual departure ``datetime`` values.
        two_hours_prior: exclusive upper bound on the actual departure time.

    Returns:
        The mean as a Python float, or None when nothing qualifies or the
        inputs are malformed (best-effort behaviour retained from the original).
    """
    try:
        # `np.float` no longer exists (NumPy >= 1.24). The previous bare
        # `except:` hid the resulting AttributeError, so every call returned None.
        delays = np.asarray(delays, dtype=float)
        act_dep = np.asarray(act_dep, dtype=object)
        departed_before = np.asarray(act_dep < two_hours_prior, dtype=bool)
        d = delays[departed_before & ~np.isnan(delays)]
        return float(np.mean(d)) if len(d) > 0 else None
    except (TypeError, ValueError, IndexError):
        # Malformed input: non-numeric delays, incomparable timestamps, or
        # mismatched list lengths. Programming errors are no longer swallowed.
        return None

## Plot results to sanity check

In [0]:
# Bring the columns needed for plotting to the driver as a pandas DataFrame,
# ordered by scheduled departure.
plot_columns = ["ORIGIN", "sched_depart_utc", "DEP_DELAY", "mean_dep_delay", "prop_delayed"]
data = (
    dd_with_mean_delay
    .select(*plot_columns)
    .orderBy("sched_depart_utc")
    .toPandas()
)
display(data)

In [0]:
import matplotlib.pyplot as plt

origins = ["BOS","ATL"]

# x-range of the zoomed view.
zoom_window = [datetime.strptime("2019-01-02", "%Y-%m-%d"),
               datetime.strptime("2019-01-04", "%Y-%m-%d")]

# First pass: full time range for every origin; second pass: ZOOMED VIEW.
# The plots come out in the same order as with two separate loops.
for x_limits in (None, zoom_window):
    for o in origins:
        tmp = data.loc[data.ORIGIN == o]
        plt.plot(tmp.sched_depart_utc, tmp.DEP_DELAY, label="DEP_DELAY")
        plt.plot(tmp.sched_depart_utc, tmp.mean_dep_delay, label="mean_dep_delay")
        plt.ylim([-10,20])
        if x_limits is not None:
            plt.xlim(x_limits)
        # Without a title and legend the BOS and ATL figures (and the two
        # series within each) cannot be told apart.
        plt.title(f"{o}: DEP_DELAY vs. mean_dep_delay" + (" (zoomed)" if x_limits is not None else ""))
        plt.xlabel("sched_depart_utc")
        plt.ylabel("delay (minutes)")
        plt.legend()
        plt.show()

In [0]:
import matplotlib.pyplot as plt

origins = ["BOS","ATL"]
for o in origins:
    tmp = data.loc[data.ORIGIN == o]
    plt.plot(tmp.sched_depart_utc, tmp.prop_delayed, label="prop_delayed")
    # The label was set but never rendered; add legend/title so each
    # figure identifies its airport and series.
    plt.title(f"{o}: prop_delayed")
    plt.xlabel("sched_depart_utc")
    plt.ylabel("proportion delayed")
    plt.legend()
    plt.show()

# 4/8

In [0]:
from pyspark.sql.functions import col, lag
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import time
from pyspark.sql.functions import col, when, to_timestamp, lit, udf, lag, pandas_udf, isnan, array, array_contains, explode, lit, countDistinct, first, last, unix_timestamp, to_date, to_timestamp, date_format, date_add
from datetime import datetime, timedelta, time
from pyspark.sql.types import TimestampType
from pyspark.sql.types import FloatType
import numpy as np
from pyspark.sql.functions import min
from pyspark.sql.window import Window
from pyspark.sql.functions import col, mean, when, collect_list
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
from pyspark.sql.window import Window
from pyspark.sql.functions import col

In [0]:
def get_act_dep_time(sched_dep_time, DEP_DELAY):
    """Return the actual departure time: scheduled time shifted by the delay.

    Args:
        sched_dep_time: scheduled departure as a ``datetime`` (or None when the
            source column is null).
        DEP_DELAY: departure delay in minutes, or None when unknown.

    Returns:
        ``sched_dep_time + DEP_DELAY`` minutes. When the delay is None the
        result is pushed 9999 days into the future, so the flight never
        satisfies an "actual departure < cutoff" comparison. None is returned
        when the scheduled time itself is None.
    """
    if sched_dep_time is None:
        return None
    if DEP_DELAY is None:
        # The original computed this value but forgot to `return` it, then
        # crashed on timedelta(minutes=None) on the next line.
        return sched_dep_time + timedelta(days=9999) # !!! ???
    return sched_dep_time + timedelta(minutes=DEP_DELAY)

# Spark UDF wrapper around get_act_dep_time; the result column is a timestamp.
act_dep_udf = F.udf(f=get_act_dep_time, returnType=TimestampType())

def get_delay_mean(delays, act_dep, two_hours_prior, four_hours_prior):
    """Mean delay of flights that actually departed inside the look-back interval.

    Args:
        delays: sequence of delay values, aligned element-wise with ``act_dep``.
            Entries equal to -99999 are treated as a missing-value marker and skipped.
        act_dep: sequence of actual departure ``datetime`` values.
        two_hours_prior: exclusive upper bound of the interval.
        four_hours_prior: exclusive lower bound of the interval.

    Returns:
        The mean of the selected delays as a Python float, or None when no
        entry qualifies.
    """
    # `np.float` was removed in NumPy 1.24; use the builtin float dtype instead.
    delays = np.asarray(delays, dtype=float)
    # Keep datetimes as Python objects so they compare against the scalar bounds.
    act_dep = np.asarray(act_dep, dtype=object)
    in_window = np.asarray(
        (act_dep < two_hours_prior) & (act_dep > four_hours_prior), dtype=bool
    )
    d = delays[in_window & (delays != -99999)]
    return float(np.mean(d)) if len(d) > 0 else None

# Spark UDF wrapper around get_delay_mean; the result column is FloatType.
delay_mean_udf = F.udf(f=get_delay_mean, returnType=FloatType())

def get_delay_proportion(outcome, act_dep, two_hours_prior, four_hours_prior):
    """Mean of the outcome flags for flights that departed inside the interval.

    Args:
        outcome: sequence of numeric outcome flags, aligned with ``act_dep``.
        act_dep: sequence of actual departure ``datetime`` values.
        two_hours_prior: exclusive upper bound of the interval.
        four_hours_prior: exclusive lower bound of the interval.

    Returns:
        The mean of the selected flags as a Python float, or None when no
        entry falls inside the interval.
    """
    # `np.float` was removed in NumPy 1.24; use the builtin float dtype instead.
    outcome = np.asarray(outcome, dtype=float)
    # Keep datetimes as Python objects so they compare against the scalar bounds.
    act_dep = np.asarray(act_dep, dtype=object)
    in_window = np.asarray(
        (act_dep < two_hours_prior) & (act_dep > four_hours_prior), dtype=bool
    )
    d = outcome[in_window]
    return float(np.mean(d)) if len(d) > 0 else None

# Spark UDF wrapper around get_delay_proportion; the result column is FloatType.
delay_prop_udf = F.udf(f=get_delay_proportion, returnType=FloatType())

In [0]:
# Data time period; this value is interpolated into the parquet file names used
# for reading the input and writing the output.
period = "1y" # one of the following values ("", "3m", "6m", "1y")

# Name of the scheduled-departure (UTC) column.
# NOTE(review): this variable is not referenced in the visible cells, which use
# the literal column name directly — confirm whether it is still needed.
dep_utc_varname = "sched_depart_utc"

In [0]:
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
# Checkpoint directory setup is left disabled here:
# spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/modeling_checkpoints")

# Load the joined, cleaned and feature-engineered dataset for the configured period.
engineered_path = f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_cleaned_engineered.parquet"
flights_engineered = spark.read.parquet(engineered_path)

# outcome: 1.0 when DEP_DELAY >= 15 or the flight was cancelled, else 0.0.
delayed_or_cancelled = (col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1)
outcome_label = when(delayed_or_cancelled, 1).otherwise(0).cast("double")

# outcome0: same label, but null wherever DEP_DELAY is null.
outcome_or_null = when(col('DEP_DELAY').isNull(), None).otherwise(col("outcome")).cast("double")

df = (
    flights_engineered
    .withColumn("outcome", outcome_label)
    .withColumn("outcome0", outcome_or_null)
)

df.cache()

In [0]:
# # Get the minimum DEP_DELAY value
# min_dep_delay = df.agg(min(col("DEP_DELAY"))).collect()[0][0]

In [0]:
# # for testing, just look at BOS
# dd = df.select(df.ORIGIN,
#                df.sched_depart_utc,
#                df.two_hours_prior_depart_UTC,
#                df.four_hours_prior_depart_UTC,
#                df.DEP_DELAY,
#                df.FL_DATE,
#                df.CANCELLED,
#                df.outcome0) \
#         .where((df.ORIGIN=="BOS")).orderBy(df.sched_depart_utc) \
#         .filter(df.sched_depart_utc > "2019-01-29T12:33:00.000+00:00")
# # dd = dd.dropna(subset=["DEP_DELAY"]) # !!! FIX NULL HANDLING
# # dd.cache()
# display(dd)

In [0]:
# DEP_DELAY0 is a copy of DEP_DELAY with missing values replaced by 0; the
# original DEP_DELAY column is left untouched.
flights_with_delay_copy = df.withColumn("DEP_DELAY0", col("DEP_DELAY"))
dd = flights_with_delay_copy.fillna({'DEP_DELAY0': 0})
dd.cache()

In [0]:
# Actual departure = scheduled departure shifted by the zero-filled delay.
actual_departure = act_dep_udf(col("sched_depart_utc"), col("DEP_DELAY0"))
dd = dd.withColumn("act_dep_datetime", actual_departure)

In [0]:
def hours(i):
    """Convert a number of hours into seconds."""
    return i * 3600


# Range frame over the scheduled departure time expressed in epoch seconds:
# for each flight it spans rows with the same ORIGIN and FL_DATE that were
# scheduled in the 4 hours up to and including the current flight. Rows with a
# different FL_DATE are never part of the frame.
# The upper bound is 0 rather than -2h so that the current row is in the frame;
# the aggregation then takes its "2 hours prior" cutoff from the frame itself
# and discards flights that had not yet departed by that cutoff.
scheduled_epoch_seconds = col("sched_depart_utc").cast("timestamp").cast("long")
window_spec = (
    Window.partitionBy(col("ORIGIN"), col("FL_DATE"))
    .orderBy(scheduled_epoch_seconds)
    .rangeBetween(-hours(4), 0)
)

dd = dd.repartition("ORIGIN", "FL_DATE")

# # sanity check window
# dd_with_delays = dd.withColumn(
#     "act_dep_within_window",
#     collect_list(col("act_dep_datetime")).over(window_spec))
# display(dd_with_delays)



Time windowing strategy informed by: https://stackoverflow.com/questions/33207164/spark-window-functions-rangebetween-dates

In [0]:
# Grouped-aggregate pandas UDF: NaN-aware mean of a value column over the rows
# of a window frame that had actually departed before the frame's cutoff.
@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def mean_dep_delay_udf(dep_delays: pd.Series, act_dep_times: pd.Series, sched_dep_utc2: pd.Series) -> float:
    """NaN-ignoring mean of ``dep_delays`` for rows departed before the cutoff.

    Args:
        dep_delays: values to average (the callers pass DEP_DELAY or outcome0);
            missing values are ignored.
        act_dep_times: actual departure timestamps, aligned with ``dep_delays``.
        sched_dep_utc2: "two hours prior to scheduled departure" timestamps of
            the rows in the frame; the maximum is used as the cutoff.

    Returns:
        The mean of the qualifying values, or NaN when none qualify. Only the
        upper bound on the actual departure time is applied here; the lower
        bound comes from the window frame the UDF is evaluated over.
    """
    cutoff = np.max(sched_dep_utc2)
    # `np.float` was removed in NumPy 1.24; the builtin float is the equivalent dtype.
    d = dep_delays[(act_dep_times < cutoff)].astype(float)
    return float(np.nanmean(d))

# Evaluate the aggregate over the time-range window, once for the raw delay
# (DEP_DELAY, nulls included) and once for the nullable outcome flag.
actual_departure_col = col("act_dep_datetime")
cutoff_col = col("two_hours_prior_depart_UTC")

mean_delay_over_window = mean_dep_delay_udf(
    col("DEP_DELAY"), actual_departure_col, cutoff_col
).over(window_spec)
prop_delayed_over_window = mean_dep_delay_udf(
    col("outcome0"), actual_departure_col, cutoff_col
).over(window_spec)

dd_with_mean_delay = (
    dd
    .withColumn("mean_dep_delay", mean_delay_over_window)
    .withColumn("prop_delayed", prop_delayed_over_window)
)

dd_with_mean_delay.cache()
display(dd_with_mean_delay)

In [0]:
# Persist the dataset with the new recency features next to the input checkpoints.
fn_out = f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_cleaned_engineered_timefeat.parquet"
# NOTE(review): no save mode is specified, so Spark's default applies. Per the
# Spark docs that default raises if the path already exists, so re-running this
# cell would fail — confirm whether `.mode("overwrite")` is wanted.
dd_with_mean_delay.write.parquet(fn_out)


In [0]:
# List the checkpoint directory to confirm the output was written.
display(dbutils.fs.ls(f"{team_BASE_DIR}/interim/join_checkpoints"))

## Sanity Check

In [0]:
# dd_with_mean_delay now holds every origin for the whole period, while the
# sanity plots below only look at BOS. Filter on the cluster before toPandas()
# so the driver only receives the rows that are plotted.
# Keep this list in sync with `origins` in the plotting cells.
sanity_origins = ["BOS"]
data = (
    dd_with_mean_delay
    .filter(col("ORIGIN").isin(sanity_origins))
    .select("ORIGIN","sched_depart_utc","DEP_DELAY","mean_dep_delay","prop_delayed")
    .orderBy("sched_depart_utc")
    .toPandas()
)
display(data)

In [0]:
import matplotlib.pyplot as plt

origins = ["BOS"]

# x-range of the zoomed view.
zoom_window = [datetime.strptime("2019-01-29", "%Y-%m-%d"),
               datetime.strptime("2019-01-30", "%Y-%m-%d")]

# (x-limits, y-limits) per pass: full view first (auto-scaled), then ZOOMED VIEW.
# The plots come out in the same order as with two separate loops.
views = [(None, None), (zoom_window, [-10,20])]
for x_limits, y_limits in views:
    for o in origins:
        tmp = data.loc[data.ORIGIN == o]
        plt.plot(tmp.sched_depart_utc, tmp.DEP_DELAY, label="DEP_DELAY")
        plt.plot(tmp.sched_depart_utc, tmp.mean_dep_delay, label="mean_dep_delay")
        if y_limits is not None:
            plt.ylim(y_limits)
        if x_limits is not None:
            plt.xlim(x_limits)
        # The labels were set but never rendered; add legend/title so the
        # figure identifies its airport and series.
        plt.title(f"{o}: DEP_DELAY vs. mean_dep_delay" + (" (zoomed)" if x_limits is not None else ""))
        plt.xlabel("sched_depart_utc")
        plt.ylabel("delay (minutes)")
        plt.legend()
        plt.show()

In [0]:
import matplotlib.pyplot as plt

origins = ["BOS"]
for o in origins:
    tmp = data.loc[data.ORIGIN == o]
    plt.plot(tmp.sched_depart_utc, tmp.prop_delayed, label="prop_delayed")
    # The label was set but never rendered; add legend/title so the
    # figure identifies its airport and series.
    plt.title(f"{o}: prop_delayed")
    plt.xlabel("sched_depart_utc")
    plt.ylabel("proportion delayed")
    plt.legend()
    plt.show()

In [0]:
# Sanity check of the frame contents: for flights scheduled after the given
# timestamp, list the actual departure times that fall inside each row's window.
recent_flights = dd_with_mean_delay.filter(
    dd_with_mean_delay.sched_depart_utc > "2019-01-29T12:33:00.000+00:00"
)
act_dep_in_frame = collect_list(col("act_dep_datetime")).over(window_spec)
display(recent_flights.withColumn("act_dep_within_window", act_dep_in_frame))