# Baseline Modeling on Joined Data

Erica Landreth

## Imports and Setup

In [0]:
# Configure Spark settings for better performance.
# If a session already exists, getOrCreate() returns it.
from pyspark.sql import SparkSession

# NOTE(review): an earlier version also set "spark.executor.cores" to "32g"
# before setting it to 4. "32g" is not a core count and the later call
# overwrote it, so the line had no effect and has been dropped. If a 32g
# memory setting (e.g. driver memory) was intended, add it here explicitly.
spark = SparkSession.builder\
    .config("spark.executor.memory", "16g")\
    .config("spark.executor.cores", 4)\
    .appName('Final Project Training')\
    .getOrCreate()

# Number of partitions used for DataFrame shuffles (joins, aggregations).
spark.conf.set("spark.sql.shuffle.partitions", "200")
# NOTE(review): spark.default.parallelism is presumably only read when the
# SparkContext starts, so setting it here may have no effect - TODO confirm.
spark.conf.set("spark.default.parallelism", "200")

In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, to_timestamp, to_date, when
from prophet.make_holidays import make_holidays_df
from xgboost.spark import SparkXGBClassifier

### Set options

In [0]:
# data time period
# one of "", "3m", "1y" ("6m" datasets exist, but no train/test split date is
# defined for them below, so selecting "6m" raises a ValueError)
period = ""

# number of cross-validation folds and overlap
k = 5
overlap = 0.2

# compute seasonality?
# (False if you've already saved out seasonality models for a given CV split setup)
compute_seasonality = False
apply_seasonality = False

# define train/test split date: first date (inclusive) that belongs to the test set
_MIN_TEST_DT_BY_PERIOD = {
    "3m": "2015-03-01",
    "1y": "2019-10-01",
    "": "2019-01-01",
}
try:
    min_test_dt = _MIN_TEST_DT_BY_PERIOD[period]
except KeyError:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        f"No train/test split date defined for period {period!r}; "
        f"expected one of {sorted(_MIN_TEST_DT_BY_PERIOD)}"
    ) from None
print(f"Min test set date for {period} dataset: {min_test_dt}")

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

## Load data and perform simple transformations

In [0]:
# Root of the team's shared storage
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# Earlier join checkpoints, kept for reference
# (all under {team_BASE_DIR}/interim/join_checkpoints/):
#   joined_flights_weather_{period}.parquet
#   joined_{period}_weather_cleaned_combo.parquet
#   joined_flights_weather{period}_v1.parquet
#   joined_{period}_timefeat.parquet
#   joined_{period}_timefeat_seasfeat.parquet
#   joined_{period}_timefeat_seasfeat_cleaned.parquet
df = (
    # read in joined, cleaned dataset
    spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr.parquet")
    # convert the departure time variable to a timestamp
    .withColumn(dep_utc_varname, to_timestamp(col(dep_utc_varname)))
    # hour is needed for seasonality, date for the CV splits
    .withColumn("dep_hour_utc", f.hour(col(dep_utc_varname)))
    .withColumn("dep_date_utc", to_date(col(dep_utc_varname)))
    # outcome is 1.0 when the flight departed >= 15 minutes late or was cancelled, else 0.0
    .withColumn(
        "outcome",
        when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1)
        .otherwise(0)
        .cast("double"),
    )
)

# cast the numeric hourly weather columns at the origin to double;
# the three columns in remove_me are left uncast
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
remove_me = [
    "origin_HourlyPresentWeatherType",
    "origin_HourlySkyConditions",
    "origin_HourlyWindDirection",
]
num_weather_cols = [name for name in weather_cols if name not in remove_me]
for column in num_weather_cols:
    df = df.withColumn(column, col(column).cast("double"))

df.cache()


In [0]:
# # Group by the year and count the number of records for each year
# df_year_counts = df.groupBy("YEAR").count()

# # Display the result
# display(df_year_counts)

In [0]:
# Train set: every flight scheduled before the split date.
df_train = df.where(f.col(dep_utc_varname) < min_test_dt)
df_train.cache()
# print(f"Train data: {df_train.count()} records")

# Test set: flights from the split date up to (not including) 2020-01-01.
df_test = df.where(
    (f.col(dep_utc_varname) >= min_test_dt)
    & (f.col(dep_utc_varname) < "2020-01-01")
)
df_test.cache()
# print(f"Test data: {df_test.count()} records")

## Get cross-validation splits

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def get_cv_time_limits_by_days_with_overlap(df, k=3, blocking=False, overlap=0, dep_utc_varname=dep_utc_varname, verbose=True):
    '''
    Get time bins for time-series cross validation, based on # days in dataset.

    Parameters:
        df: Spark DataFrame with a "dep_date_utc" date column.
        k: number of folds.
        blocking: if True, each fold's training window starts later than the
            previous one (sliding window); if False, every training window
            starts at the earliest date in df (expanding window).
        overlap: fraction of a chunk by which consecutive folds overlap.
        dep_utc_varname: currently unused; kept so existing callers keep working.
        verbose: if True, print the split configuration and every fold's limits.

    Returns:
        pandas DataFrame with one row per fold and the columns "train_min",
        "train_max", "test_min", "test_max". The max dates are non-inclusive.

    Raises:
        ValueError: if df has no non-null "dep_date_utc" values.
    '''

    # Fetch both bounds in a single aggregation job instead of two scans.
    min_date, max_date = df.agg(f.min("dep_date_utc"), f.max("dep_date_utc")).first()
    if min_date is None:
        raise ValueError('Cannot build CV splits: df has no non-null "dep_date_utc" values')

    n_days = (max_date - min_date).days + 1
    # Each fold spans one train chunk plus one test chunk, and consecutive
    # folds are shifted by (1 - overlap) chunks; this is the total span in chunks.
    total_width = k+1 - overlap*(k-1)
    chunk_size = np.ceil(n_days/total_width) # last chunk may be slightly smaller than the others

    if verbose:
        print(f'Splitting data into {k} folds with {overlap} overlap')
        print(f'Min date: {min_date}, max date: {max_date}')
        print(f'{chunk_size:,} days per fold')
        print("************************************************************")

    step = (1-overlap)*chunk_size
    # First date after the data ends; test windows are clipped to it.
    data_end = max_date + timedelta(1)

    out = []
    train_min_offset = 0
    train_max_offset = chunk_size
    for i in range(k):
        if i > 0:
            # NOTE(review): the start advances by ceil(step) and the end by
            # floor(step), so with a fractional step the blocked training
            # window shrinks slightly each fold. Left as is so that existing
            # split boundaries do not change - confirm whether intended.
            train_min_offset += np.ceil(step)
            train_max_offset += np.floor(step)
        test_min_offset = train_max_offset
        test_max_offset = test_min_offset + chunk_size

        # minimum training time depends on the cross-validation style
        t_min_train = min_date + timedelta(days=train_min_offset) if blocking else min_date
        t_max_train = min_date + timedelta(days=train_max_offset)
        t_min_test = min_date + timedelta(days=test_min_offset)
        # never let the test window run past the end of the data
        t_max_test = min(min_date + timedelta(days=test_max_offset), data_end)

        out.append({"train_min":t_min_train, "train_max":t_max_train,
                    "test_min":t_min_test, "test_max":t_max_test})
    out = pd.DataFrame(out)

    if verbose:
        for fold in out.itertuples():
            print(f'    TRAIN set for fold {fold.Index} goes from {fold.train_min} to {fold.train_max}')
            print(f'    TEST set for fold {fold.Index} goes from {fold.test_min} to {fold.test_max}')
        print("(Note that the max dates are non-inclusive)")

    return out

In [0]:
# Fixed cross-validation windows, one row per fold. In every fold the test
# window starts on the date the training window ends.
cv_cutoffs = pd.DataFrame(
    [
        ("2014-12-31", "2015-10-09", "2015-10-09", "2016-07-17"),
        ("2015-08-14", "2016-05-21", "2016-05-21", "2017-02-27"),
        ("2016-03-27", "2017-01-01", "2017-01-01", "2017-10-10"),
        ("2016-11-08", "2017-08-14", "2017-08-14", "2018-05-23"),
        ("2017-06-22", "2018-03-27", "2018-03-27", "2019-01-01"),
    ],
    columns=["train_min", "train_max", "test_min", "test_max"],
)
cv_cutoffs

In [0]:
# # get cross-validation split times
# cv_cutoffs = get_cv_time_limits_by_days_with_overlap(df_train.select("dep_date_utc"), k=k, blocking=True, overlap=overlap,
#     dep_utc_varname=dep_utc_varname, verbose=True)
# cv_cutoffs

## Modeling

### Define columns to be used in model.

In [0]:
# Weather: hourly origin columns, minus the three listed in remove_me
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
remove_me = [
    "origin_HourlyPresentWeatherType",
    "origin_HourlySkyConditions",
    "origin_HourlyWindDirection",
]
num_weather_cols = [name for name in weather_cols if name not in remove_me]

# Seasonality components
seasonality_cols = ["daily", "weekly", "yearly", "holidays"]

# Time-based delay features
time_cols = ["mean_dep_delay", "prop_delayed"]

# Calendar fields
date_cols = ["YEAR", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK"]

# Flight metadata
flight_metadata_cols = ["OP_UNIQUE_CARRIER", "ORIGIN_ICAO", "DEST_ICAO"]

# Prior-flight columns listed for reference (not all are used below):
#   priorflight_cancelled_true, priorflight_origin, priorflight_dest,
#   priorflight_sched_deptime, priorflight_true_deptime,
#   priorflight_sched_elapsed, priorflight_true_elapsed,
#   priorflight_true_depdelay, priorflight_sched_arrtime,
#   priorflight_true_arrtime, priorflight_depdelay_calc,
#   priorflight_isdeparted, priorflight_deptime_calc,
#   priorflight_isdelayed_calc, priorflight_isarrived_calc,
#   priorflight_arr_time_calc, turnaround_time_calc

# Prior & current flight features
# ('priorflight_elapsed_time_calc_raw' is currently left out)
num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
]

bool_flight_cols = [
    'priorflight_isdeparted',
    'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc',
    'priorflight_cancelled_true',
]

# Graph features
graph_cols = ["pagerank"]

# Fields that are not features but must be kept for processing
keep_me = ["outcome", "DAY_OF_WEEK", "ORIGIN", "dep_hour_utc", dep_utc_varname] # TODO: REVISIT WHICH ACTUALLY NEEDED

########## Columns used as numeric and categorical features in the pipeline ##########
numeric_cols = num_weather_cols + seasonality_cols + time_cols + num_flight_cols + graph_cols
categorical_cols = date_cols + flight_metadata_cols + bool_flight_cols

### Define modeling pipeline

In [0]:
## CANDIDATE ESTIMATORS
## Each one reads the "features_scaled" vector and predicts "outcome".

# logistic regression
lr = LogisticRegression(
    featuresCol='features_scaled',
    labelCol='outcome',
    maxIter=50,
)

# xgboost
xgb = SparkXGBClassifier(
    features_col="features_scaled",
    label_col="outcome",
    max_depth=10,
)

# random forest
rf = RandomForestClassifier(
    featuresCol="features_scaled",
    labelCol="outcome",
    numTrees=30,
    maxDepth=10,
    seed=42,
)

# multilayer perceptron (not defined yet)
mlp = None # FILL IN


In [0]:
# Pre-processing stages shared by every model pipeline
stages = []

# 1. For each categorical column: string-index it, then one-hot encode the index
for column in categorical_cols:
    indexer = StringIndexer(inputCol=column, outputCol=f"{column}_index", handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=f"{column}_index", outputCol=f"{column}_vec", handleInvalid="keep")
    stages.extend((indexer, encoder))

# 2. Full feature list: numeric columns plus the one-hot encoded vectors
categorical_vec_columns = [f"{name}_vec" for name in categorical_cols]
features = [*numeric_cols, *categorical_vec_columns]

# 3. Assemble all features into a single vector column
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="skip")

# 4. Min-max scale the assembled vector
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# 5. Logistic regression model
lr = LogisticRegression(featuresCol="features_scaled", labelCol="outcome", maxIter=50)

##############################################################################
# FILL IN DESIRED MODEL TYPE BELOW

# # construct pipeline object from all components
# pipeline = Pipeline(stages=stages+[assembler,scaler,xgb])

### Helper functions

In [0]:
# CODE BELOW DERIVED FROM DEMO 11 NOTEBOOK

def upsample(train_df,verbose=False):
  '''
  Upsample the delayed class (outcome == 1) to balance classes.

  Delayed rows are sampled with replacement at a fraction of
  (# non-delayed / # delayed), so their expected count matches the number of
  non-delayed rows. This assumes delayed flights are the minority class; with
  a ratio below 1 the delayed rows would be thinned out instead.

  Parameters:
      train_df: Spark DataFrame with a numeric "outcome" column (1 = delayed).
      verbose: currently unused; kept so existing callers keep working.

  Returns:
      Spark DataFrame: all non-delayed rows unioned with the resampled
      delayed rows.

  Raises:
      ValueError: if train_df has no rows with outcome == 1.
  '''
  # Count both classes in one aggregation job instead of two filtered scans.
  class_counts = train_df.agg(
    f.count(f.when(f.col("outcome") == 1, True)).alias("delayed"),
    f.count(f.when(f.col("outcome") == 0, True)).alias("non_delayed"),
  ).first()
  delay_count = class_counts["delayed"]
  non_delay_count = class_counts["non_delayed"]

  if delay_count == 0:
    raise ValueError("Cannot upsample: train_df has no rows with outcome == 1")

  # Expected number of copies of each delayed row in the resampled set.
  sampling_fraction = non_delay_count / delay_count

  train_non_delay = train_df.filter(f.col('outcome') == 0)
  train_delay_upsampled = train_df.filter(f.col('outcome') == 1) \
    .sample(withReplacement=True, fraction=sampling_fraction, seed=42)
  return train_non_delay.union(train_delay_upsampled)


def downsample(train_df,verbose=False):
  '''
  Downsample the non-delayed class (outcome == 0) to balance classes.

  Non-delayed rows are sampled without replacement at a fraction of
  (# delayed / # non-delayed), so their expected count matches the number of
  delayed rows. This assumes delayed flights are the minority class.
  NOTE(review): if delayed rows outnumber non-delayed rows the fraction
  exceeds 1, which Spark presumably rejects for sampling without
  replacement - confirm.

  Parameters:
      train_df: Spark DataFrame with a numeric "outcome" column (1 = delayed).
      verbose: currently unused; kept so existing callers keep working.

  Returns:
      Spark DataFrame: all delayed rows unioned with the sampled non-delayed
      rows.

  Raises:
      ValueError: if train_df has no rows with outcome == 0.
  '''
  # Count both classes in one aggregation job instead of two filtered scans.
  class_counts = train_df.agg(
    f.count(f.when(f.col("outcome") == 1, True)).alias("delayed"),
    f.count(f.when(f.col("outcome") == 0, True)).alias("non_delayed"),
  ).first()
  delay_count = class_counts["delayed"]
  non_delay_count = class_counts["non_delayed"]

  if non_delay_count == 0:
    raise ValueError("Cannot downsample: train_df has no rows with outcome == 0")

  # Share of non-delayed rows to keep.
  keep_percent = delay_count / non_delay_count

  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0) \
    .sample(withReplacement=False, fraction=keep_percent, seed=42)
  return train_delay.union(train_non_delay)

def cv_eval(preds):
  """
  Score a transformed prediction DataFrame.

  Input: df with 'prediction', 'outcome' and 'probability' columns
  Output: tuple (F2, pr) where F2 is the F-measure (beta=2) for label 1.0,
          rounded to 4 decimals, and pr is the area under the
          precision-recall curve
  """
  # (prediction, label) pairs for the label-based metrics
  prediction_label_rdd = preds.select(['prediction', 'outcome']).rdd
  # (probability at index 1, label) pairs for the threshold-free metrics
  score_label_rdd = preds.select('outcome','probability').rdd.map(
    lambda row: (float(row['probability'][1]), float(row['outcome'])))

  multiclass = MulticlassMetrics(prediction_label_rdd)
  binary = BinaryClassificationMetrics(score_label_rdd)

  f2_score = np.round(multiclass.fMeasure(label=1.0, beta=2.0), 4)
  area_under_pr = binary.areaUnderPR
  return f2_score, area_under_pr

def _prep_fold_features(fold_df, fold_idx):
  '''Rename fold-specific feature columns to their generic names and zero-fill nulls.'''
  renames = {
    f"daily_{fold_idx}": "daily",
    f"weekly_{fold_idx}": "weekly",
    f"yearly_{fold_idx}": "yearly",
    f"holidays_{fold_idx}": "holidays",
    f"train_{fold_idx}": "pagerank",
  }
  for old_name, new_name in renames.items():
    fold_df = fold_df.withColumnRenamed(old_name, new_name)
  return fold_df.fillna(dict.fromkeys(
    ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed'], 0))


def timeSeriesSplitCV(df, pipeline, cv_info, sampling=None, metric='f2', verbose=True, dep_utc_varname=dep_utc_varname):
  '''
  Perform time-series split k-fold cross validation.

  For every row of cv_info the pipeline is fit on the rows of df whose
  departure time is in [train_min, train_max) and scored on the rows in
  [test_min, test_max).

  Parameters:
      df: Spark DataFrame holding the features, the "outcome" label and the
          fold-specific columns (daily_<i>, weekly_<i>, yearly_<i>,
          holidays_<i>, train_<i>) for each fold i.
      pipeline: unfitted pipeline; it is re-fit on every fold.
      cv_info: table with "train_min", "train_max", "test_min" and "test_max"
          columns, indexable as cv_info[column][i] for i in range(len(cv_info)).
      sampling: 'down' or 'up' to rebalance the training fold; any other
          value leaves the training fold as is.
      metric: 'f2' (F2 score for the positive class) or 'pr' (area under the
          precision-recall curve).
      verbose: currently unused; the per-fold summary is always printed.
      dep_utc_varname: name of the departure-time column used to slice folds.

  Returns:
      Average of the per-fold scores.

  Raises:
      ValueError: if metric is not 'f2' or 'pr'.
  '''
  # Validate up front, before any expensive fitting has happened.
  if metric not in ('f2', 'pr'):
    raise ValueError(f"metric must be 'f2' or 'pr', got {metric!r}")

  k = len(cv_info)

  # Track score
  scores=[]

  # Start k-fold
  for i in range(k):

    # Create train set
    fold_train = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()

    # Create dev set
    fold_dev = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    try:
      # Apply sampling on train if selected
      if sampling=='down':
        train_df = downsample(fold_train)
      elif sampling=='up':
        train_df = upsample(fold_train)
      else:
        train_df = fold_train

      # prep seasonality/pagerank columns (rename, fill as needed)
      train_df = _prep_fold_features(train_df, i)
      dev_df = _prep_fold_features(fold_dev, i)

      # Fit on the training fold and score the dev fold
      model = pipeline.fit(train_df)
      dev_pred = model.transform(dev_df)
      f2, pr = cv_eval(dev_pred)
      score = f2 if metric=='f2' else pr
      scores.append(score)
      print(f'    Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
      print('------------------------------------------------------------')
    finally:
      # Release this fold's cached data so the folds do not pile up in memory.
      fold_train.unpersist()
      fold_dev.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)    
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")

  return avg_score

In [0]:
# final training and evaluation

def _prep_full_features(frame):
    '''Rename the full-training-set feature columns to their generic names and zero-fill nulls.'''
    renames = (
        ("daily_full", "daily"),
        ("weekly_full", "weekly"),
        ("yearly_full", "yearly"),
        ("holidays_full", "holidays"),
        ("train", "pagerank"),
    )
    for old_name, new_name in renames:
        frame = frame.withColumnRenamed(old_name, new_name)
    return frame.fillna(dict.fromkeys(
        ['daily','weekly','yearly','holidays','mean_dep_delay', 'prop_delayed'], 0))


def final_eval(df_train, df_test,pipeline):
    '''
    Fit the pipeline on the downsampled training set and score the test set.

    Both sets get their *_full seasonality columns and the "train" column
    renamed to the generic feature names ("daily", "weekly", "yearly",
    "holidays", "pagerank"), with nulls in the seasonality and time features
    filled with 0. The F2 score on the test set is printed.

    NOTE(review): the test set takes its "pagerank" from the "train" column
    too. The original author's note suggests this is deliberate ("I think we
    have to use the value trained on the training data, not test") - confirm.

    Parameters:
        df_train: Spark DataFrame used for fitting (downsampled here).
        df_test: Spark DataFrame used for evaluation.
        pipeline: unfitted pipeline.

    Returns:
        Spark DataFrame of predictions on the prepared test set.
    '''
    train_sampled = downsample(df_train).cache()
    try:
        model = pipeline.fit(_prep_full_features(train_sampled))
    finally:
        # The fitted model no longer needs the training rows; free the cache.
        train_sampled.unpersist()

    dev_pred = model.transform(_prep_full_features(df_test))
    # get f2 score
    score = cv_eval(dev_pred)[0]
    print(score)

    return dev_pred

In [0]:
# df_test.withColumnRenamed("test","pagerank")

### Train, cross validate, predict

In [0]:
# timeSeriesSplitCV(df_train, pipeline, cv_cutoffs, sampling='down', metric='f2', verbose=True, dep_utc_varname='sched_depart_utc')

In [0]:
# final_eval(df.limit(10), df.limit(10),pipeline)

## Logistic Regression

In [0]:
# from pyspark.sql.functions import expr

# # Generate a list of train_ columns
# train_columns = [col for col in df_train.columns if col.startswith('train_')]

# # Create an expression to calculate the average of train_ columns
# avg_expr = " + ".join(train_columns) + f" / {len(train_columns)}"

# # Add the new column 'train' with the average value
# df_train = df_train.withColumn("train", expr(avg_expr))
# display(df_train)

### THERE IS ALREADY A COLUMN CALLED TRAIN

In [0]:
# Logistic-regression pipeline: shared pre-processing stages, then assembly, scaling and the classifier.
pipeline_lr = Pipeline(stages=[*stages, assembler, scaler, lr])

In [0]:
# Cross-validate the logistic-regression pipeline on downsampled training folds, scored by F2.
timeSeriesSplitCV(
    df_train,
    pipeline_lr,
    cv_cutoffs,
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

In [0]:
# df_train.columns

In [0]:
# Fit logistic regression on the full training set and predict on the held-out test set.
dev_pred_lr = final_eval(df_train=df_train, df_test=df_test, pipeline=pipeline_lr)

## XGBoost

In [0]:
# XGBoost pipeline: shared pre-processing stages, then assembly, scaling and the classifier.
pipeline_xgb = Pipeline(stages=[*stages, assembler, scaler, xgb])

In [0]:
# Cross-validate the XGBoost pipeline on downsampled training folds, scored by F2.
timeSeriesSplitCV(
    df_train,
    pipeline_xgb,
    cv_cutoffs,
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

In [0]:
# Final XGBoost training and evaluation, done inline so that the prepared
# TRAIN / TEST frames stay available to later cells.

# Downsampled training set with the full-data feature columns renamed to the
# generic names the pipeline expects; nulls in those features become 0.
TRAIN = (
    downsample(df_train)
    .withColumnRenamed("daily_full", "daily")
    .withColumnRenamed("weekly_full", "weekly")
    .withColumnRenamed("yearly_full", "yearly")
    .withColumnRenamed("holidays_full", "holidays")
    .withColumnRenamed("train", "pagerank")
    .fillna(dict.fromkeys(
        ['daily','weekly','yearly','holidays','mean_dep_delay', 'prop_delayed'], 0))
)

# Test set prepared the same way (no resampling).
TEST = (
    df_test
    .withColumnRenamed("daily_full", "daily")
    .withColumnRenamed("weekly_full", "weekly")
    .withColumnRenamed("yearly_full", "yearly")
    .withColumnRenamed("holidays_full", "holidays")
    .withColumnRenamed("train", "pagerank")
    .fillna(dict.fromkeys(
        ['daily','weekly','yearly','holidays','mean_dep_delay', 'prop_delayed'], 0))
)

model = pipeline_xgb.fit(TRAIN)
dev_pred_xgb = model.transform(TEST)
# F2 score on the test set
score = cv_eval(dev_pred_xgb)[0]
print(score)

In [0]:
# Refit XGBoost after renaming a "test" column to "pagerank" on both sets.
# NOTE(review): the previous cell already renames "train" to "pagerank" on
# TRAIN and TEST. If a "test" column exists, this rename adds a second column
# with the same name; if it does not, the rename is a no-op and this cell
# repeats the previous fit. Confirm which is intended.
TRAIN = TRAIN.withColumnRenamed("test","pagerank")
TEST = TEST.withColumnRenamed("test","pagerank")

model = pipeline_xgb.fit(TRAIN)
dev_pred_xgb = model.transform(TEST)
# get f2 score
score = cv_eval(dev_pred_xgb)[0]
print(score)

In [0]:
# Fit XGBoost on the full training set and predict on the held-out test set.
dev_pred_xgb = final_eval(df_train=df_train, df_test=df_test, pipeline=pipeline_xgb)

## Random Forest

In [0]:
# Random-forest pipeline: shared pre-processing stages, then assembly, scaling and the classifier.
pipeline_rf = Pipeline(stages=[*stages, assembler, scaler, rf])

In [0]:
# Cross-validate the random-forest pipeline on downsampled training folds, scored by F2.
timeSeriesSplitCV(
    df_train,
    pipeline_rf,
    cv_cutoffs,
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

In [0]:
# Fit the random forest on the TRAIN / TEST frames prepared in the XGBoost
# section; those cells must have been run first, and any later renames applied
# to TRAIN / TEST there carry over to this fit.
model = pipeline_rf.fit(TRAIN)
dev_pred_rf = model.transform(TEST)
# get f2 score
score = cv_eval(dev_pred_rf)[0]
print(score)

In [0]:
# Fit the random forest on the full training set and predict on the held-out test set.
dev_pred_rf = final_eval(df_train=df_train, df_test=df_test, pipeline=pipeline_rf)