# Lagged Features, Re-Engineered

## Setup & Imports

In [0]:
# imports

from pyspark.sql import functions as f
from pyspark.sql.functions import col, when, to_timestamp, lit, udf, lag, pandas_udf, isnan, array, array_contains, explode, lit, countDistinct, first, last, unix_timestamp, to_date, to_timestamp, date_format, date_add
import pytz
from datetime import datetime, timedelta
from pyspark.sql import types
from pyspark.sql.types import *
from pyspark.sql import Window
import re

In [0]:
team_BASE_DIR = f"dbfs:/student-groups/Group_4_1"
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/modeling_checkpoints")

# read in joined, cleaned, engineered dataset
df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_1y_weather_cleaned_combo.parquet")


## get arr_time column UTC-ed

In [0]:
def to_utc(yyyymmdd, dep_hhmm, arr_hhmm, dep_tz, arr_tz, flight_dur):
    """
    Create UTC timestamp from flights table columns
    yyyymmdd = FL_DATE
    dep_hhmm = CRS_DEP_TIME
    arr_hhmm = CRS_ARR_TIME
    dep_tz = origin_timezone
    arr_tz = dest_timezone
    flight_dur = CRS_ELAPSED_TIME (for sanity check of arrival time)

    Returns UTC time stamp, (cast to string)
    """

    dep_hhmm = int(dep_hhmm)
    arr_hhmm = int(arr_hhmm)

    yyyy,MM,dd = yyyymmdd.split('-')
    yyyy = int(yyyy) # get year
    MM = int(MM) # get month
    dd = int(dd) # get day

    dep_hh = dep_hhmm//100 # get hour
    dep_mm = dep_hhmm%100 # get minute
    if dep_hh == 24:
        dep_hh = 0
        dep_shift = True
    else:
        dep_shift = False

    arr_hh = arr_hhmm//100 # get hour
    arr_mm = arr_hhmm%100
    if arr_hh == 24:
        arr_hh = 0
        arr_shift = True
    else:
        arr_shift = False

    # create datetime variable for departure
    dt_dep = datetime(yyyy,MM,dd,dep_hh,dep_mm)
    if dep_shift:
        dt_dep += timedelta(days=1)
    # apply local time zone
    dep_local = pytz.timezone(dep_tz).localize(dt_dep)
    # convert to UTC
    dep_utc = dep_local.astimezone(pytz.utc)

    # create datetime variable for arrival
    dt_arr = datetime(yyyy,MM,dd,arr_hh,arr_mm)
    if arr_shift:
        dt_arr += timedelta(days=1)
    # apply local time zone
    arr_local = pytz.timezone(arr_tz).localize(dt_arr)
    # convert to UTC
    arr_utc = arr_local.astimezone(pytz.utc)

    if dep_utc > arr_utc:
        arr_utc += timedelta(days=1)

    # # sanity check
    # arr_utc_SC = dep_utc + timedelta(minutes=flight_dur)

    dt_format = "%Y-%m-%dT%H:%M:%S"

    # return UTC datetime, cast to string
    # return (dep_utc.strftime(dt_format), arr_utc.strftime(dt_format), arr_utc_SC.strftime(dt_format))
    return (dep_utc.strftime(dt_format), arr_utc.strftime(dt_format))

schema = StructType([
    StructField("dep_datetime", StringType(), False),
    StructField("arr_datetime", StringType(), False),
])

dt_udf = udf(to_utc, schema)

out = df.withColumn('processed', dt_udf(col("FL_DATE"), col("CRS_DEP_TIME"), col("CRS_ARR_TIME"), col("origin_timezone"), col("dest_timezone"), col("CRS_ELAPSED_TIME"))).cache()

cols = [c for c in out.columns if c != "processed"]
cols += ["processed.dep_datetime","processed.arr_datetime"]
out = out.select(cols)

display(out)

### Handle Nulls

In [0]:
out.columns

In [0]:
out=out.withColumn("CRS_ELAPSED_TIME",
              F.when(F.col("CRS_ELAPSED_TIME").isNull(),
                     (F.col("arr_datetime")-F.col("sched_depart_utc")).cast('int'))
              .otherwise(F.col("CRS_ELAPSED_TIME"))
              )

## Optimized Lag Columns Pipeline

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

WhenConditions = (f.col("ORIGIN") == f.col("priorflight_dest")) & (f.col("priorflight_sched_deptime") >= f.col("twentysix_hours_prior_depart_UTC"))

def add_lags_optimized(df):
    # Define windows once
    aircraft_window = Window.partitionBy("TAIL_NUM").orderBy("sched_depart_utc")
    route_window = Window.partitionBy("ORIGIN", "DEST").orderBy("sched_depart_utc").rowsBetween(-100, -1)

    # Precompute all lagged columns in single pass
    lagged_cols = [
        F.lag("ORIGIN").over(aircraft_window).alias("priorflight_origin"),
        F.lag("DEST").over(aircraft_window).alias("priorflight_dest"),
        F.lag("CANCELLED").over(aircraft_window).alias("priorflight_cancelled_true"),
        F.lag("sched_depart_utc").over(aircraft_window).alias("priorflight_sched_deptime"),
        F.lag("CRS_ELAPSED_TIME").over(aircraft_window).alias("priorflight_elapsed_time_calc_raw"),
        F.lag("DEP_DELAY").over(aircraft_window).alias("priorflight_depdelay_true_raw"),
        F.lag("arr_datetime").over(aircraft_window).alias("priorflight_arr_time_true")
    ]

    # Base transformations
    base_df = (df
        .withColumn("twentysix_hours_prior_depart_UTC", 
                   (F.col("two_hours_prior_depart_UTC") - F.expr("INTERVAL 24 HOURS")).cast("timestamp"))
        .select("*", *lagged_cols)
    )

    # Precompute common conditions
    WhenConditions = (
        (F.col("ORIGIN") == F.col("priorflight_dest")) & 
        (F.col("priorflight_sched_deptime") >= F.col("twentysix_hours_prior_depart_UTC"))
    )
    valid_prior = WhenConditions & (F.col("priorflight_cancelled_true") != 1)

    # Core calculations
    result_df = (base_df
        .withColumn("priorflight_elapsed_time_calc",
            F.when(valid_prior,
                F.expr("INTERVAL 1 MINUTE") * F.col("priorflight_elapsed_time_calc_raw")
            )
        )
        .withColumn("priorflight_depdelay_true",
            F.when(valid_prior, F.col("priorflight_depdelay_true_raw"))
        )
        .withColumn("priorflight_deptime_true",
            F.when(valid_prior,
                F.col("priorflight_sched_deptime") + 
                (F.expr("INTERVAL 1 MINUTE") * F.col("priorflight_depdelay_true"))
            )
        )
        .withColumn("priorflight_isdeparted",
            F.when(
                (F.col("priorflight_deptime_true") <= F.col("two_hours_prior_depart_UTC")) &
                valid_prior, 1
            ).otherwise(0)
        )
        .withColumn("priorflight_depdelay_calc",
            F.when(
                (F.col("priorflight_deptime_true") <= F.col("two_hours_prior_depart_UTC")) & valid_prior,
                F.col("priorflight_depdelay_true")
            ).when(
                (F.col("priorflight_sched_deptime") <= F.col("two_hours_prior_depart_UTC")) &
                (F.col("priorflight_deptime_true") > F.col("two_hours_prior_depart_UTC")) &
                valid_prior,
                (F.col("two_hours_prior_depart_UTC").cast('long') - 
                 F.col("priorflight_sched_deptime").cast('long')) / 60
            ).otherwise(F.lit(0.0))
        )
        .withColumn("priorflight_deptime_calc",
            F.col("priorflight_sched_deptime") + 
            (F.expr("INTERVAL 1 MINUTE") * F.col("priorflight_depdelay_calc"))
        )
        .withColumn("priorflight_isdelayed_calc",
            F.when(
                (F.col("priorflight_depdelay_calc") >= 15) | 
                (F.col('priorflight_cancelled_true') == 1), 1
            ).otherwise(0)
        )
        .withColumn("elapsed_time_true",
            F.when(valid_prior,
                (F.col("AIR_TIME") + F.col("TAXI_IN") + F.col("TAXI_OUT")).cast("int")
            )
        )
        .withColumn("arr_time_true",
            F.col("arr_datetime").cast("timestamp") +
            (F.expr("INTERVAL 1 MINUTE") * F.col("ARR_DELAY"))
        )
        .withColumn("priorflight_isarrived_calc",
            F.when(
                (F.col("priorflight_arr_time_true") <= F.col("two_hours_prior_depart_UTC")) &
                valid_prior, 1
            ).otherwise(0)
        )
        .withColumn("priorflight_arr_time_calc",
            F.when(
                F.col("priorflight_isarrived_calc") == 1,
                F.col("priorflight_arr_time_true")
            ).when(
                (F.col("priorflight_isarrived_calc") == 0) &
                (F.col("priorflight_deptime_true") <= F.col("two_hours_prior_depart_UTC")), 
                F.col("priorflight_deptime_true") + F.col("priorflight_elapsed_time_calc")
            ).otherwise(
                F.col("priorflight_deptime_calc") + F.col("priorflight_elapsed_time_calc")
            )
        )
        .withColumn("turnaround_time_calc",
            F.when(valid_prior,
                ((F.col("sched_depart_utc").cast("long") - 
                  F.col("priorflight_arr_time_calc").cast("long")) / 60).cast("double")
            )
        )
        # Edge case handling
        .withColumn("turnaround_time_calc",
            F.when(
                (~valid_prior) | (F.col("priorflight_cancelled_true") == 1),
                F.last("turnaround_time_calc", ignorenulls=True).over(route_window)
            ).otherwise(F.col("turnaround_time_calc"))
        )
        .withColumn("priorflight_depdelay_calc",
            F.when(
                (~valid_prior) | (F.col("priorflight_cancelled_true") == 1),
                F.last("priorflight_depdelay_calc", ignorenulls=True).over(route_window)
            ).otherwise(F.col("priorflight_depdelay_calc"))
        )
    ).cache()

    return result_df

# Optimized execution flow
def optimized_pipeline(input_df):
    # Single pass processing with conditional logic
    return add_lags_optimized(
        input_df.withColumn("is_cancelled", F.col("CANCELLED") == 1)
    )

# Execute pipeline
result = optimized_pipeline(out)
display(result)


In [0]:
result.checkpoint()

In [0]:
result.count() #sanity check

In [0]:
display(result.filter(f.col('TAIL_NUM').isNotNull()).orderBy('TAIL_NUM','sched_depart_utc'))

In [0]:


output_path = "dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_cleaned_engineered.parquet"
(
    result.write
    .mode("overwrite")
    .parquet(output_path)
)




In [0]:
print(f"Delay indicator null count: {result.filter(F.col('priorflight_isdelayed_calc').isNull()).count()}")

print(f"Departed indicator null count: {result.filter(F.col('priorflight_isdeparted').isNull()).count()}")

print(f"Arrival indicator null count: {result.filter(F.col('priorflight_isarrived_calc').isNull()).count()}")

print(f"Est. Turnaround Time Null Count: {result.filter(F.col('turnaround_time_calc').isNull()).count()}")


In [0]:
print(f"Delay estimate null count: {result.filter(F.col('priorflight_isdelayed_calc').isNull()).count()}")


In [0]:
display(result.filter(F.col('turnaround_time_calc').isNull()))

In [0]:

result = result.fillna({'turnaround_time_calc': 0}) #don't have enough information to calculate the turnaround time, so just assume something is going on that wont give us enough time

# Modeling

## setup and imports

In [0]:
df=spark.read.parquet("dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_cleaned_engineered.parquet")

In [0]:

df = df.fillna({'turnaround_time_calc': 0}) #don't have enough information to calculate the turnaround time, so just assume something is going on that wont give us enough time

In [0]:
pip install prophet

In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline

In [0]:
# data time period
period = "1y" # on of the following values ("", "3m", "6m", "1y")

# number of cross-validation folds
k = 3

# compute seasonality?
# (False if you've already saved out seasonality models for a given CV split setup) #using false since erica already computed
compute_seasonality = False

# define train/test split date
min_test_dt = "2019-10-01"

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

In [0]:
df_handled = df.withColumns(
    {
        "dep_hour_utc": 
            F.hour(col(dep_utc_varname)),
        "outcome":  
            (F.when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1).otherwise(0)).cast("double")
            }
)

In [0]:
# split into train and test

df_train = df_handled.filter(F.col(dep_utc_varname) < min_test_dt)
df_train.cache()
df_test = df_handled.filter(F.col(dep_utc_varname) >= min_test_dt)
df_test.cache()

### Helper functions

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def get_cv_time_limits(df, k=3, blocking=False, dep_utc_varname="dep_datetime", verbose=True):
    '''
    Get time bins for time-series cross validation
    '''
    n = df.count()
    df = df.withColumn("row_id", f.row_number()
            .over(Window.partitionBy().orderBy(dep_utc_varname)))
    chunk_size = np.floor(n/(k+1))

    idx = np.arange(0,)
    idx = np.arange(0,n,chunk_size)
    idx[-1] = n-1
    idx = [int(i)+1 for i in idx]
    
    if verbose:
        print('')
        print(f'Number of validation datapoints for each fold is {chunk_size:,}')
        print("************************************************************")

    bin_edges = df.filter(f.col("row_id").isin(idx)).select("row_id",dep_utc_varname).toPandas()

    out = []
    for i in range(k):
        # define minimum training time based on cross-validation style
        if not blocking:
            t_min_train = bin_edges[dep_utc_varname][0]
        else:
            t_min_train = bin_edges[dep_utc_varname][i]
        # define maximum training time
        t_max_train = bin_edges[dep_utc_varname][i+1]
        # define minimum test time
        t_min_test = bin_edges[dep_utc_varname][i+1]
        # define maximum test_time
        t_max_test = bin_edges[dep_utc_varname][i+2]

        out.append({"train_min":t_min_train, "train_max":t_max_train,
                    "test_min":t_min_test, "test_max":t_max_test})
    out = pd.DataFrame(out)
        
    if verbose:
        for i in range(k):
            print(f'    TRAIN set for fold {i} goes from {out["train_min"][i]} to {out["train_max"][i]}')
            print(f'    TEST set for fold {i} goes from {out["test_min"][i]} to {out["test_max"][i]}')
        
    return out

In [0]:
# get cross-validation split times
cv_cutoffs = get_cv_time_limits(df_train.select(dep_utc_varname), k=3, blocking=True, 
    dep_utc_varname=dep_utc_varname, verbose=True)
cv_cutoffs

In [0]:
def get_seasonality_data(df, fold, k):
  """
  Look up seasonlaity features from saved seasonality model.
  """
  if fold == 'full':
      fn_model = f"seasonality_model_{period}_train.parquet"
  else:
      fn_model = f"seasonality_model_{period}_cv{fold}of{k}.parquet"
  model = spark.read.parquet(f"{team_BASE_DIR}/interim/{fn_model}")

  joined_df = df.join(model, 
                    (df["ORIGIN"] == model["ORIGIN"]) & 
                    (df["DAY_OF_WEEK"] == model["dow"]) & 
                    (df["dep_hour_utc"] == model["hour"]),
                    how="left").drop(model["ORIGIN"])
  
  return joined_df

# CODE BELOW DERIVED FROM DEMO 11 NOTEBOOK

def upsample(train_df,verbose=False):
  '''Upsamples train_df to balance classes'''
  #balance classes in train
  delay_count = train_df.filter(f.col("outcome") == 1).count()
  non_delay_count = train_df.filter(f.col("outcome") == 0).count()

  total = delay_count + non_delay_count
  keep_percent = non_delay_count / delay_count

  train_delay = train_df.filter(f.col('outcome') == 0)
  train_non_delay = train_df.filter(f.col('outcome') == 1).sample(withReplacement=True, fraction=keep_percent,seed=42)
  train_upsampled = train_delay.union(train_non_delay)
  return train_upsampled


def downsample(train_df,verbose=False):
  '''Downsamples train_df to balance classes'''
  #balance classes in train
  delay_count = train_df.filter(f.col("outcome") == 1).count()
  non_delay_count = train_df.filter(f.col("outcome") == 0).count()

  total = delay_count + non_delay_count
  keep_percent = delay_count / non_delay_count
  
  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0).sample(withReplacement=False,fraction=keep_percent,seed=42)
  train_downsampled = train_delay.union(train_non_delay)
  return train_downsampled

def cv_eval(preds):
  """
  Input: transformed df with prediction and label
  Output: desired score 
  """
  rdd_preds_m = preds.select(['prediction', 'outcome']).rdd
  rdd_preds_b = preds.select('outcome','probability').rdd.map(lambda row: (float(row['probability'][1]), float(row['outcome'])))
  metrics_m = MulticlassMetrics(rdd_preds_m)
  metrics_b = BinaryClassificationMetrics(rdd_preds_b)
  F2 = np.round(metrics_m.fMeasure(label=1.0, beta=2.0), 4)
  pr = metrics_b.areaUnderPR
  return F2, pr

def timeSeriesSplitCV(df, pipeline, cv_info, sampling=None, metric='f2', verbose=True, dep_utc_varname=dep_utc_varname):
  '''
  Perform timSeriesSplit k-fold cross validation 
  '''

  k = len(cv_info)
  
  # Track score
  scores=[]
  
  # Start k-fold
  for i in range(k):
    
    # Create train set
    train_df = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
      
    # Create dev set
    dev_df = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache() 

    # Apply sampling on train if selected
    if sampling=='down':
      train_df = downsample(train_df)
      train_df = train_df.cache()
    elif sampling=='up':
      train_df = upsample(train_df)
      train_df = train_df.cache()
    # elif sampling=='weights':
    #   train_df = add_class_weights(train_df).cache()
      
    #print info on train and dev set for this fold
    if verbose:
      print('    TRAIN set for fold {} goes from {} to {}, count is {:,} flights ({})'.format((i+1), 
                                                                                      train_df.agg({dep_utc_varname:'min'}).collect()[0][0],
                                                                                      train_df.agg({dep_utc_varname:'max'}).collect()[0][0],
                                                                                      train_df.count(),
                                                                                      sampling + '-sampled' if sampling else 'no sampling'))
      print('    DEV set for fold {} goes from {} to {}, count is {:,} flights'.format((i+1), 
                                                                                      dev_df.agg({dep_utc_varname:'min'}).collect()[0][0],
                                                                                      dev_df.agg({dep_utc_varname:'max'}).collect()[0][0],
                                                                                      dev_df.count()))
      
    # TODO: remove once feat engineering applied outside
    train_df = get_seasonality_data(train_df, i, k)
    train_df = train_df.fillna({col:0 for col in ['daily','weekly']})
    dev_df = get_seasonality_data(dev_df, i, k)
    dev_df = dev_df.fillna({col:0 for col in ['daily','weekly']})

    # print(train_df.dtypes)
    # print(dev_df.dtypes)
        
    # Fit params on the model
    model = pipeline.fit(train_df)
    dev_pred = model.transform(dev_df)
    if metric=='f2':
      score = cv_eval(dev_pred)[0]
    elif metric=='pr':
      score = cv_eval(dev_pred)[1]
    scores.append(score)
    print(f'    Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
    print('------------------------------------------------------------')
  
  # Take average of all scores
  avg_score = np.average(scores)    
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")

  # # Train on full df
  # print('Training on full train dataset, and validating on dev dataset with best parameters from CV:')
  # print(best_parameters)
    
  # if verbose:
  #   print('    TRAIN set for best parameter fitted model goes from {} to {}, count is {:,} flights ({})'.format(train_df.agg({dep_utc_varname:'min'}).collect()[0][0],
  #                                                                                                    train_df.agg({dep_utc_varname:'max'}).collect()[0][0],
  #                                                                                                    train_df.count(),
  #                                                                                                    sampling + '-sampled' if sampling else 'no sampling'))
  return avg_score

## Baseline Model

### setup

In [0]:
team_BASE_DIR = f"dbfs:/student-groups/Group_4_1"
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/modeling_checkpoints")


In [0]:
# weather columns
weather_cols = [col for col in df.columns if "origin_Hourly" in col]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
num_weather_cols = [c for c in weather_cols if c not in remove_me]

# seasonality columns
seasonality_cols = ["daily","weekly"]

# date related columns
date_cols = ["YEAR","QUARTER","MONTH","DAY_OF_MONTH","DAY_OF_WEEK"]


# prior & current flight cols
num_flight_cols = ['turnaround_time_calc', 'priorflight_depdelay_calc','DISTANCE','CRS_ELAPSED_TIME','priorflight_elapsed_time_calc_raw']
bool_flight_cols = ['priorflight_isdeparted', 'priorflight_isarrived_calc','priorflight_isdelayed_calc']

# flight metadata
flight_metadata_cols = ["OP_UNIQUE_CARRIER","ORIGIN_ICAO","DEST_ICAO", "origin_type", "dest_type"]

# fields that will not be features but need to be kept for processing
keep_me = ["outcome","DAY_OF_WEEK","ORIGIN","dep_hour_utc",dep_utc_varname]


########## Define columns to be used as numeric and categorical features in the pipeline ##########
numeric_cols = [*num_weather_cols, *seasonality_cols, *num_flight_cols]
categorical_cols = [*bool_flight_cols, *flight_metadata_cols, *date_cols]

In [0]:
from pyspark.sql.functions import col, isnan, when, count
ncols = [*num_weather_cols, *num_flight_cols]
# Count NaNs in numeric columns
nan_counts_numeric = df_handled.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in ncols])

# Count NaNs in categorical columns
nan_counts_categorical = df_handled.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in categorical_cols])

display(nan_counts_numeric)
display(nan_counts_categorical)

In [0]:
# # List to hold the stages of the pipeline
# stages = []

# # Index and encode each categorical column
# for column in categorical_cols:
#     indexer = StringIndexer(inputCol=column, outputCol=column + "_index",handleInvalid="keep")
#     encoder = OneHotEncoder(inputCol=column + "_index", outputCol=column + "_vec",handleInvalid="keep")
#     stages += [indexer, encoder]
# # define encoded categorical feature names
# categorical_vec_columns = [col + "_vec" for col in categorical_cols]

# # assemble features
# features = numeric_cols + categorical_vec_columns
# assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')

# # scale features
# scaler = MinMaxScaler(inputCol="features", \
#     outputCol="features_scaled")

# # logistic regression model
# lr = LogisticRegression(featuresCol='features_scaled', \
#     labelCol='outcome',maxIter=50)

# # construct pipeline object from all components
# pipeline = Pipeline(stages=stages+[assembler,scaler,lr])

In [0]:
from pyspark.ml.feature import Imputer

# List to hold the stages of the pipeline
stages = []

# 1. Index and encode categorical columns
for column in categorical_cols:
    indexer = StringIndexer(
        inputCol=column, 
        outputCol=column + "_index", 
        handleInvalid="keep"
    )
    encoder = OneHotEncoder(
        inputCol=column + "_index", 
        outputCol=column + "_vec", 
        handleInvalid="keep"
    )
    stages += [indexer, encoder]

# 2. Median imputation for numerical columns
impute_cols = [
    "turnaround_time_calc", 
    "priorflight_depdelay_calc", 
    "priorflight_elapsed_time_calc_raw"
]
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=[col + "_imputed" for col in impute_cols],
    strategy="median"
)
stages += [imputer]


# 4. Update feature list to include imputed columns
categorical_vec_columns = [col + "_vec" for col in categorical_cols]
numeric_cols_imputed = [col + "_imputed" for col in impute_cols] + \
    [col for col in numeric_cols if col not in impute_cols]

features = numeric_cols_imputed + categorical_vec_columns

# 5. Assemble features
assembler = VectorAssembler(
    inputCols=features, 
    outputCol="features", 
    handleInvalid="keep"
)

# 6. Scale features
scaler = MinMaxScaler(
    inputCol="features", 
    outputCol="features_scaled"
)

# 7. Logistic regression model
lr = LogisticRegression(
    featuresCol="features_scaled", 
    labelCol="outcome", 
    maxIter=50
)

# Build final pipeline
pipeline = Pipeline(stages=stages + [assembler, scaler, lr])


### Evaluation

In [0]:
timeSeriesSplitCV(df_train, pipeline, cv_cutoffs, sampling='down', metric='f2', verbose=True, dep_utc_varname=dep_utc_varname)

In [0]:
timeSeriesSplitCV(df_train, pipeline, cv_cutoffs, sampling='down', metric='f2', verbose=True, dep_utc_varname=dep_utc_varname)

In [0]:
# final training and evaluation

df_train_downsampled = downsample(df_train).cache()
df_train_seasonal = get_seasonality_data(df_train_downsampled, 'full', k).cache()
df_test_seasonal = get_seasonality_data(df_test, 'full', k).cache()
model = pipeline.fit(df_train_seasonal)
dev_pred = model.transform(df_test_seasonal)
# get f2 score
score = cv_eval(dev_pred)[0]
print(score)


**F2 SCORE: Prior Features Model**

Train/cv: 0.5861666666666666

Test: 0.5727

## Interactions

In [0]:
from pyspark.ml.feature import Interaction
from pyspark.ml.feature import Imputer


In [0]:

# List to hold the stages of the pipeline
stages = []

# 1. Index and encode categorical columns
for column in categorical_cols:
    indexer = StringIndexer(
        inputCol=column, 
        outputCol=column + "_index", 
        handleInvalid="keep"
    )
    encoder = OneHotEncoder(
        inputCol=column + "_index", 
        outputCol=column + "_vec", 
        handleInvalid="keep"
    )
    stages += [indexer, encoder]

# 2. Median imputation for numerical columns
impute_cols = [
    "turnaround_time_calc", 
    "priorflight_depdelay_calc", 
    "priorflight_elapsed_time_calc_raw"
]
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=[col + "_imputed" for col in impute_cols],
    strategy="median"
)
stages += [imputer]

# 3. Interaction feature engineering
interaction = Interaction(
    inputCols=["origin_HourlyPrecipitation", "origin_HourlyWetBulbTemperature"],
    outputCol="interactedCol"
)

# 4. Update feature list to include imputed columns
categorical_vec_columns = [col + "_vec" for col in categorical_cols]
numeric_cols_imputed = [col + "_imputed" for col in impute_cols] + \
    [col for col in numeric_cols if col not in impute_cols]

features = numeric_cols_imputed + categorical_vec_columns + ["interactedCol"]

# 5. Assemble features
assembler = VectorAssembler(
    inputCols=features, 
    outputCol="features", 
    handleInvalid="keep"
)

# 6. Scale features
scaler = MinMaxScaler(
    inputCol="features", 
    outputCol="features_scaled"
)

# 7. Logistic regression model
lr = LogisticRegression(
    featuresCol="features_scaled", 
    labelCol="outcome", 
    maxIter=50
)

# Build final pipeline
pipeline = Pipeline(stages=stages + [interaction, assembler, scaler, lr])


In [0]:
# split into train and test



timeSeriesSplitCV(df_train, pipeline, cv_cutoffs, sampling='down', metric='f2', verbose=True, dep_utc_varname=dep_utc_varname)

In [0]:
# final training and evaluation
# split into train and test

df_train = df_handled.filter(F.col(dep_utc_varname) < min_test_dt)
df_train.cache()
df_test = df_handled.filter(F.col(dep_utc_varname) >= min_test_dt)
df_test.cache()

df_train_downsampled = downsample(df_train).cache() #downsample

df_train_seasonal= get_seasonality_data(df_train_downsampled, 'full', k).cache()
df_test_seasonal = get_seasonality_data(df_test, 'full', k).cache()

model = pipeline.fit(df_train_seasonal)
dev_pred = model.transform(df_test_seasonal)
# get f2 score
score = cv_eval(dev_pred)[0]
print(score)

### checking predictions

In [0]:
dev_pred.groupBy('prediction').count().show() #why :( 

In [0]:
dev_pred.groupBy('outcome').count().show()

In [0]:
df_train_seasonal.groupBy('outcome').count().show()

In [0]:
df_test_seasonal.groupBy('outcome').count().show()

## Train seasonality models for each fold

In [0]:
# informed by: https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html

def forecast_delay(history_pd: pd.DataFrame) -> pd.DataFrame: 
    
    # define Prophet model
    model = Prophet(
        interval_width=0.9,
        growth='linear',
        weekly_seasonality=True,
        daily_seasonality=True,
        yearly_seasonality=True,
        # holidays=us_holidays,
        # seasonality_mode='multiplicative'
    )
    
    # fit the model
    model.fit(history_pd)
    
    # configure predictions
    future_pd = model.make_future_dataframe(
        periods=24*7, 
        freq='h',
        include_history=False
    )
    
    # make predictions
    results_pd = model.predict(future_pd)

    # ref date and dow
    ref_date = history_pd.ds.iloc[0].date()
    ref_dow = history_pd.DAY_OF_WEEK[0]

    # helper function: get day of the week,
    # using reference date and dow
    def get_dow(x,ref_date,dow):
        d_days = (x.date() - ref_date).days + dow
        d_days = d_days%7
        if d_days == 0:
            d_days = 7
        return d_days

    # get dow for forecasted points
    results_pd['dow'] = results_pd.ds.apply(lambda x: get_dow(x,ref_date,ref_dow))

    # get hour for forecasted points
    results_pd['hour'] = results_pd.ds.apply(lambda x: x.hour)

    # store origin
    results_pd['ORIGIN'] = history_pd.ORIGIN.iloc[0]
        
    # return components
    return results_pd[['dow','hour','weekly','daily','ORIGIN']]

schema = StructType([StructField('dow', LongType(), True),
                     StructField('hour', LongType(), True),
                     StructField('weekly', DoubleType(), True),
                     StructField('daily', DoubleType(), True),
                     StructField('ORIGIN', StringType(), True)])

In [0]:
def get_seasonality_data(df, fold, k):
    if fold == 'full':
        fn_model = f"seasonality_model_{period}_train.parquet"
    else:
        fn_model = f"seasonality_model_{period}_cv{fold}of{k}.parquet"
    model = spark.read.parquet(f"{team_BASE_DIR}/interim/{fn_model}")

    joined_df = df.join(model, 
                     (df["ORIGIN"] == model["ORIGIN"]) & 
                     (df["DAY_OF_WEEK"] == model["dow"]) & 
                     (df["dep_hour_utc"] == model["hour"]),
                     how="left").drop(model["ORIGIN"])
    
    return joined_df

# display(get_seasonality_data(df_train.limit(10), 0, 3))

In [0]:
display(get_seasonality_data(df_test.limit(10), 0, 3))

In [0]:
if compute_seasonality:
    # train seasonality model for each cross validation split
    for i in range(k):
        # train seasonality model for this cross validation split
        model = get_seasonality(df_train, cv_cutoffs["train_min"][i], cv_cutoffs["train_max"][i])
        # write out
        fn_out = f"seasonality_model_{period}_cv{i}of{k}.parquet"
        model.write.parquet(f"{team_BASE_DIR}/interim/{fn_out}")

    # train seasonality model for full training data
    model = get_seasonality(df_train, datetime(1970,1,1), min_test_dt)
    # write out
    fn_out = f"seasonality_model_{period}_train.parquet"
    model.write.parquet(f"{team_BASE_DIR}/interim/{fn_out}")

# lag sandbox - ignore

In [0]:
WindowConditions = Window.partitionBy("TAIL_NUM").orderBy("sched_depart_utc")

WhenConditions = (f.col("ORIGIN") == f.col("priorflight_dest")) & (f.col("priorflight_sched_deptime") >= f.col("twentysix_hours_prior_depart_UTC"))


def add_lags(df):

    result_df = (df
                 .withColumn("priorflight_origin",
                             lag("ORIGIN").over(WindowConditions))
                 .withColumn("priorflight_dest",
                             lag("DEST").over(WindowConditions))
                 .withColumn('priorflight_cancelled_true',  #~~~~true cancellation status, assumed known (?)
                             lag('CANCELLED').over(WindowConditions))
                 .withColumn("twentysix_hours_prior_depart_UTC",
                             (f.col("two_hours_prior_depart_UTC") - f.expr("INTERVAL 24 HOURS")).cast("timestamp"))
                             
                 
                 .withColumn("priorflight_sched_deptime",
                             f.when(WhenConditions, lag("sched_depart_utc").over(WindowConditions))
                             .otherwise(None)
                             )
                 
                 .withColumn("priorflight_elapsed_time_calc", #~~~crs estimated
                             lag("CRS_ELAPSED_TIME").over(WindowConditions))
                    .withColumn("priorflight_elapsed_time_calc", #~~~turned into interval
                                f.when(WhenConditions,f.expr("INTERVAL 1 MINUTE") * f.col("priorflight_elapsed_time_calc"))
                                .otherwise(None)
                    )

                 .withColumn("priorflight_depdelay_true", #~~~true dep delay
                             f.when(WhenConditions,lag("DEP_DELAY").over(WindowConditions))
                             .otherwise(None)
                             )
                 

                 .withColumn("special_cases", f.when(WhenConditions, 1).otherwise(f.lit(0.0)))

                 .withColumn("priorflight_deptime_true", #~~~true dep time based on true dep delay
                     f.when(WhenConditions, (f.col("priorflight_sched_deptime") + 
                     (f.expr("INTERVAL 1 MINUTE") * f.col("priorflight_depdelay_true"))))
                     .otherwise(None)
                 )
                 
                 
                 .withColumn("priorflight_isdeparted", #~~~ only 1 when we definitely knew it left already
                             f.when((f.col("priorflight_deptime_true") <= f.col("two_hours_prior_depart_UTC")) 
                                    & WhenConditions, 1).otherwise(0) #we don't really know about the prior flight for the when conditions
                                 ) 
                 .withColumn("priorflight_depdelay_calc", #~~~estimated dep delay
                        f.when(
                            # Case 1: Flight departed BEFORE observation window ***** add in when conditions
                            ((f.col("priorflight_deptime_true") <= f.col("two_hours_prior_depart_UTC")) & WhenConditions),
                            f.col("priorflight_depdelay_true")  # Full delay known
                        ).when(
                            # Case 2: Flight scheduled to depart BEFORE window, but departed DURING observation window
                            (f.col("priorflight_sched_deptime") <= f.col("two_hours_prior_depart_UTC")) &
                            (f.col("priorflight_deptime_true") > f.col("two_hours_prior_depart_UTC")) &
                            WhenConditions,
                            (f.col("two_hours_prior_depart_UTC").cast('long') - f.col("priorflight_sched_deptime").cast('long')) / 60  # Partial delay
                        ).otherwise(
                            f.lit(0.0)  # Flight scheduled to depart AFTER window; we know nothing and assume departed on time
                        )
                    )
                 
                 .withColumn("priorflight_deptime_calc", #~~~estimated dep time based on estimated dep delay
                                f.col("priorflight_sched_deptime") + 
                                (f.expr("INTERVAL 1 MINUTE") * f.col("priorflight_depdelay_calc")) 
                            )
                 
                 .withColumn("priorflight_isdelayed_calc", #~~~estimated delay indicator ** ADJUSTED FOR CANCELLED
                             f.when( ((f.col("priorflight_depdelay_calc") >= 15) | (f.col('priorflight_cancelled_true') == 1)), 1).otherwise(0)
                             )
                 
                 .withColumn("elapsed_time_true", #~~~true elapsed time for current flight
                             f.when(WhenConditions,(f.col("AIR_TIME") + f.col("TAXI_IN") + f.col("TAXI_OUT")
                             ).cast("int")).otherwise(None)
                             )
                 
                 .withColumn("arr_time_true", #~~~true arrival time based on true dp time + true elased time

                                f.col("arr_datetime").cast("timestamp") +
                                (f.expr("INTERVAL 1 MINUTE") * f.col("ARR_DELAY"))
                        )
                 
                 .withColumn("priorflight_arr_time_true", #~~~true prior flight arrival time
                             lag("arr_time_true").over(WindowConditions)
                             )
                 
                 .withColumn("priorflight_isarrived_calc", #~~~estimated arrival indicator based on whether flight landed before window
                             f.when((f.col("priorflight_arr_time_true") <= f.col("two_hours_prior_depart_UTC")) & WhenConditions,1).otherwise(0)
                             )
                 
                 .withColumn("priorflight_arr_time_calc", #~~~estimated arrival time based on 3 scenarios
                        f.when(
                            f.col("priorflight_isarrived_calc") == 1,  # Case 1: Dep before window, arr after window
                            f.col("priorflight_arr_time_true") #so we know the info
                        ).when(
                            (f.col("priorflight_isarrived_calc") == 0) &  # Case 2: Dep before window, arr after window
                            (f.col("priorflight_deptime_true") <= f.col("two_hours_prior_depart_UTC")), 
                            f.col("priorflight_deptime_true") + f.col("priorflight_elapsed_time_calc")
                        ).otherwise(
                            f.col("priorflight_deptime_calc") + f.col("priorflight_elapsed_time_calc")  # dep after window, arr after window
                        ))
                 
                 .withColumn("turnaround_time_calc", 
                             #~~~estimated how much time we have between estimated arrival of prior flight and scheduled departure of current flight
                    (f.when(WhenConditions,
                        ((f.col("sched_depart_utc").cast("long") - 
                        f.col("priorflight_arr_time_calc").cast("long")) / 60
                    ).cast("double"))
                    ).otherwise(None))
    )


    #fill in edge case values
            #~~ 1. if prior flight is cancelled and there was nothing to pull,
            #~~ 2. if prior flight dest != current flight origin, 
            #~~ 3. if it has been >26 hrs since last flight,
        # impute:
            #turnaround time
            #prior flight delay estimation
    window2 =  Window.partitionBy("ORIGIN","DEST").orderBy("sched_depart_utc").rowsBetween(-10, -1)
    

    result_df = (result_df
                .withColumn("turnaround_time_calc", 
                            f.when(((~WhenConditions) | (f.col('priorflight_cancelled_true') == 1)),
                            last(f.col("turnaround_time_calc"), ignorenulls=True)
                            .over(window2)
                                ).otherwise(f.col("turnaround_time_calc"))
                )
                .withColumn("priorflight_depdelay_calc", 
                            f.when(((~WhenConditions) | (f.col('priorflight_cancelled_true') == 1)),
                            last(f.col("priorflight_depdelay_calc"), ignorenulls=True)
                            .over(window2)
                                ).otherwise(f.col("priorflight_depdelay_calc"))
                )
            )


    return result_df



full = add_lags(out) #first pass to correctly get the current cancelled flights
noncancelled=add_lags(out.filter(f.col("CANCELLED") == 0)) #next pass to correctly skip prior cancelled flights for current non cancelled flights
result=full.filter(f.col('CANCELLED')==1).unionByName(noncancelled) #combine the two 

display(result)


In [0]:
redundant=[ 'priorflight_origin',
 'priorflight_dest',
 'twentysix_hours_prior_depart_UTC',
 'priorflight_sched_deptime',
 'priorflight_elapsed_time_calc',
 'priorflight_depdelay_true',
 'priorflight_deptime_true',
 'priorflight_depdelay_calc',
 'priorflight_deptime_calc',
 'priorflight_isdelayed_calc',
 'elapsed_time_true',
 'arr_time_true',
 'priorflight_arr_time_true',
 'priorflight_isarrived_calc',
 'priorflight_arr_time_calc',
 'turnaround_time_calc',
]

out=out.select([c for c in out.columns if c not in redundant])

In [0]:
out = out.withColumn('arr_datetime', F.col('arr_datetime').cast("timestamp"))

### testing

In [0]:
display(result.filter(f.col('TAIL_NUM')=='259NV').filter(f.col('sched_depart_utc').contains('2019-01-02')))

In [0]:
display(result.filter(f.col("ORIGIN") != f.col("priorflight_dest")))

In [0]:
display(result)

In [0]:
WindowConditions = Window.partitionBy("TAIL_NUM").orderBy("sched_depart_utc")

display(result.withColumn("prior_cancelled", 
                  lag("CANCELLED").over(WindowConditions)).filter(f.col('prior_cancelled')==1))

impute missing turnaroudns based on current origin -> destination ema something

In [0]:
display(result.filter(f.col('TAIL_NUM')=="N102UW").filter(f.col('sched_depart_utc').contains('2019-04-09')))

In [0]:
result.filter(f.col('turnaround_time_calc').isNull()).groupBy('special_cases','priorflight_cancelled_true').count().show()

In [0]:
result.filter(f.col('turnaround_time_calc').isNull()).groupBy('special_cases','priorflight_cancelled_true').count().show()

In [0]:
display(df)

In [0]:
display(result.filter(f.col('TAIL_NUM')=='215NV').orderBy('sched_depart_utc')
        .select('sched_depart_utc','ORIGIN','DEST','priorflight_origin','priorflight_dest',
                'CANCELLED','priorflight_cancelled_true','priorflight_isarrived_calc','priorflight_deptime_true',
                'arr_time_true','priorflight_arr_time_true','priorflight_deptime_calc','priorflight_arr_time_calc','priorflight_elapsed_time_calc','turnaround_time_calc','special_cases'))

In [0]:
display(result.filter(f.col('turnaround_time_calc').isNull()).filter(f.col('priorflight_cancelled_true')==0).filter(f.col('CANCELLED')==0).orderBy('TAIL_NUM','sched_depart_utc'))

In [0]:
display(result.filter(f.col('TAIL_NUM')=="N102UW").filter(f.col('sched_depart_utc') >= '2019-03-20'))

In [0]:
display(result.filter(f.col('TAIL_NUM')=="N102UW").filter(f.col('sched_depart_utc') >= '2019-03-20'))

to clean:
prior flight is cancelled, current flight is cancelled: impute

1. A good, B good, C good
2. A good, B cancelled, C good
3. A cancelled, B cancelled, C good **

In [0]:

# prior_noncancelled=add_lags(full.filter(f.col("priorflight_cancelled_true") == 0))
# result2=full.filter(f.col('priorflight_cancelled_true')==1).unionByName(prior_noncancelled)
#doesnt resolve issue


In [0]:
result2.

In [0]:
result.filter(f.col('DEP_DELAY').isNull()).groupBy('CANCELLED').count().show()

In [0]:

null_counts = result.select(
    [f.count(f.when(f.col(c).cast("long").isNull() | f.isnan(f.col(c).cast("long")), c)).alias(c) 
     for c in result.columns]
)

display(null_counts)

In [0]:
display(result.filter(f.col('priorflight_arr_time_calc').isNull()))

In [0]:
df.filter(f.col('TAIL_NUM').isNull()).count()

In [0]:

output_path = "dbfs:/student-groups/Group_4_1/interim/join_checkpoints/joined_1y_weather_cleaned_combo_lags.parquet"
(
    result.write
    .mode("overwrite")
    .parquet(output_path)
)

In [0]:

# convert time variable to datetime
df = df.withColumn(dep_utc_varname, to_timestamp(col(dep_utc_varname)))

# add hour variable (needed for seasonality)
df = df.withColumn("dep_hour_utc", f.hour(col(dep_utc_varname)))

# define outcome variable
df = df.withColumn("outcome", (when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1).otherwise(0)).cast("double"))

# cast weather columns to double
weather_cols = [col for col in df.columns if "origin_Hourly" in col]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]

num_weather_cols = [c for c in weather_cols if c not in remove_me]

for column in num_weather_cols:
    df = df.withColumn(column, col(column).cast("double"))



In [0]:
display(df)