# Imports and Setup

In [0]:
# # Configure Spark settings for better performance
# from pyspark.sql import SparkSession
# spark = SparkSession.builder\
#     .config("spark.executor.memory", "16g")\
#     .config("spark.executor.cores", 4)\
#     .appName('Final Project Training')\
#     .getOrCreate()
# spark.conf.set("spark.sql.shuffle.partitions", "200")
# spark.conf.set("spark.default.parallelism", "200")

In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, to_timestamp, to_date, when
from prophet.make_holidays import make_holidays_df
from xgboost.spark import SparkXGBClassifier

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.functions import vector_to_array


In [0]:
spark.version

### Set options

In [0]:
# data time period
period = ""  # one of the following values: "", "3m", "6m", "1y"

# number of cross-validation folds and overlap
k = 5
overlap = 0.2

# compute seasonality?
# (False if you've already saved out seasonality models for a given CV split setup)
compute_seasonality = False
apply_seasonality = False

# define train/test split date
# Only the periods listed here have a known split date. Any other value
# (including "6m", for which no date is defined in this notebook) fails fast
# with a clear message instead of a NameError on the print below.
min_test_dt_by_period = {
    "3m": "2015-03-01",
    "1y": "2019-10-01",
    "": "2019-01-01",
}
if period not in min_test_dt_by_period:
    raise ValueError(
        f"No train/test split date defined for period={period!r}; "
        f"supported values: {sorted(min_test_dt_by_period)}"
    )
min_test_dt = min_test_dt_by_period[period]
print(f"Min test set date for {period} dataset: {min_test_dt}")

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

## Load data and perform simple transformations

In [0]:
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# read in joined, cleaned dataset
# (earlier checkpoints kept below for provenance)
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather_{period}.parquet") # !!!
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_weather_cleaned_combo.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather{period}_v1.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned.parquet")
df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr_v2.parquet")

# convert time variable to datetime
df = df.withColumn(dep_utc_varname, to_timestamp(col(dep_utc_varname)))

# add hour and date variables (needed for seasonality and CV splits, respectively)
df = df.withColumn("dep_hour_utc", f.hour(col(dep_utc_varname))) \
    .withColumn("dep_date_utc", to_date(col(dep_utc_varname)))

# define outcome variable: 1.0 when the flight departed >= 15 minutes late
# or was cancelled, 0.0 otherwise
df = df.withColumn("outcome", (when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1).otherwise(0)).cast("double"))

# cast weather columns to double
# (the comprehension variable is deliberately not called `col`, so it cannot be
# confused with pyspark.sql.functions.col used on the lines around it)
weather_cols = [c for c in df.columns if "origin_Hourly" in c]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
num_weather_cols = [c for c in weather_cols if c not in remove_me]
# One projection for all casts: calling withColumn once per column in a loop
# adds a projection to the plan on every iteration.
if num_weather_cols:
    df = df.withColumns({c: col(c).cast("double") for c in num_weather_cols})

df.cache()


In [0]:
# # Group by the year and count the number of records for each year
# df_year_counts = df.groupBy("YEAR").count()

# # Display the result
# display(df_year_counts)

In [0]:
# split into train and test
# Train: everything strictly before min_test_dt.
# Test: from min_test_dt (inclusive) up to 2020-01-01 (exclusive).
dep_ts = f.col(dep_utc_varname)
df_train = df.where(dep_ts < min_test_dt)
# df_train.cache()
# print(f"Train data: {df_train.count()} records")
df_test = df.where((dep_ts >= min_test_dt) & (dep_ts < "2020-01-01"))
# df_test.cache()
# print(f"Test data: {df_test.count()} records")

## Get cross-validation splits

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def get_cv_time_limits_by_days_with_overlap(df, k=3, blocking=False, overlap=0, dep_utc_varname=dep_utc_varname, verbose=True):
    '''
    Get time bins for time-series cross validation, based on # days in dataset.

    The date range of `df` is cut into `k` folds. Each fold has a training
    window and a test window of `chunk_size` days; the test window starts
    exactly where the training window ends. Consecutive folds are shifted by
    roughly (1 - overlap) * chunk_size days.

    Params:
      df: Spark DataFrame with a "dep_date_utc" column (only that column is read).
          Assumes the collected min/max are `datetime.date` values - TODO confirm.
      k: number of folds.
      blocking: if False every fold trains from the first date in `df`
          (expanding window); if True the training start slides forward with
          the fold.
      overlap: fraction of a chunk shared between consecutive folds.
      dep_utc_varname: accepted for signature compatibility; not used in the body.
      verbose: print the fold layout.

    Returns a pandas DataFrame with one row per fold and the columns
    "train_min", "train_max", "test_min", "test_max". The max dates are
    non-inclusive.
    '''
    
    min_date = df.select(f.min("dep_date_utc")).collect()[0][0]
    max_date = df.select(f.max("dep_date_utc")).collect()[0][0]
    n_days = (max_date - min_date).days + 1
    # k folds stepped by (1-overlap) chunks, plus one train and one test chunk:
    # (k-1)*(1-overlap) + 2 = k+1 - overlap*(k-1) chunk widths in total
    total_width = k+1 - overlap*(k-1)
    chunk_size = np.ceil(n_days/total_width) # last chunk may be slightly smaller than the others

    # idx = np.arange(0,)
    # idx = np.arange(0,n_days,chunk_size)
    # idx[-1] = n_days-1
    # idx = [int(i)+1 for i in idx]
    
    if verbose:
        print(f'Splitting data into {k} folds with {overlap} overlap')
        print(f'Min date: {min_date}, max date: {max_date}')
        print(f'{chunk_size:,} days per fold')
        print("************************************************************")

    out = []
    for i in range(k):
        # define indices based on chunk size and overlap
        if i == 0:
            train_min_offset = 0
            train_max_offset = chunk_size
        else:
            # NOTE(review): the start offset advances by ceil(...) but the end
            # offset by floor(...), so when (1-overlap)*chunk_size is not an
            # integer the blocked training window shrinks by one day per fold.
            # Confirm this is intended before changing it - the hard-coded
            # cut-offs used elsewhere in this notebook appear to match it.
            train_min_offset += np.ceil((1-overlap)*chunk_size)
            train_max_offset += np.floor((1-overlap)*chunk_size)
        # the test window starts where the training window ends
        test_min_offset = train_max_offset
        test_max_offset = test_min_offset + chunk_size

        # define minimum training time based on cross-validation style
        if not blocking:
            t_min_train = min_date
        else:
            t_min_train = min_date + timedelta(days=train_min_offset)
        # define maximum training time
        t_max_train = min_date + timedelta(days=train_max_offset)
        # define minimum test time
        t_min_test = min_date + timedelta(days=test_min_offset)
        # define maximum test_time
        t_max_test = min_date + timedelta(days=test_max_offset)

        # clamp the (exclusive) test end to one day past the last date in the data
        if t_max_test > max_date + timedelta(1):
            t_max_test = max_date + timedelta(1)

        out.append({"train_min":t_min_train, "train_max":t_max_train,
                    "test_min":t_min_test, "test_max":t_max_test})
    out = pd.DataFrame(out)
        
    if verbose:
        for i in range(k):
            print(f'    TRAIN set for fold {i} goes from {out["train_min"][i]} to {out["train_max"][i]}')
            print(f'    TEST set for fold {i} goes from {out["test_min"][i]} to {out["test_max"][i]}')
        print("(Note that the max dates are non-inclusive)")
        
    return out

In [0]:
# Hard-coded cross-validation windows, one row per fold (max dates exclusive).
# Presumably the saved output of get_cv_time_limits_by_days_with_overlap with
# k=5, overlap=0.2, blocking=True - confirm before regenerating.
cv_cutoffs = pd.DataFrame(
    [
        ("2014-12-31", "2015-10-09", "2015-10-09", "2016-07-17"),
        ("2015-08-14", "2016-05-21", "2016-05-21", "2017-02-27"),
        ("2016-03-27", "2017-01-01", "2017-01-01", "2017-10-10"),
        ("2016-11-08", "2017-08-14", "2017-08-14", "2018-05-23"),
        ("2017-06-22", "2018-03-27", "2018-03-27", "2019-01-01"),
    ],
    columns=["train_min", "train_max", "test_min", "test_max"],
)
cv_cutoffs

In [0]:
# # get cross-validation split times
# cv_cutoffs = get_cv_time_limits_by_days_with_overlap(df_train.select("dep_date_utc"), k=k, blocking=True, overlap=overlap,
#     dep_utc_varname=dep_utc_varname, verbose=True)
# cv_cutoffs

# Ensemble

## MLP

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
def downsample(train_df,verbose=False):
  '''
  Downsamples train_df to balance classes.

  Keeps every row with outcome == 1 and a random sample (without replacement,
  seed 42) of the outcome == 0 rows, sized so both classes end up roughly equal.

  Params:
    train_df: Spark DataFrame with a numeric "outcome" column (1 = positive).
    verbose: when True, print the class counts and the sampling fraction.

  Returns a new Spark DataFrame (positives unioned with sampled negatives).
  If there are no negative rows there is nothing to sample, so train_df is
  returned unchanged.
  '''
  # count both classes in a single pass over the data
  counts = train_df.agg(
    f.sum(f.when(f.col("outcome") == 1, 1).otherwise(0)).alias("delay"),
    f.sum(f.when(f.col("outcome") == 0, 1).otherwise(0)).alias("non_delay")
  ).collect()[0]
  delay_count = counts["delay"] or 0          # sum() is null on an empty frame
  non_delay_count = counts["non_delay"] or 0

  if non_delay_count == 0:
    # the original ratio would divide by zero here
    return train_df

  # sample() without replacement needs a fraction in [0, 1]; clamp for the case
  # where positives already outnumber negatives
  keep_percent = min(1.0, delay_count / non_delay_count)

  if verbose:
    print(f"downsample: {delay_count:,} positives, {non_delay_count:,} negatives, keeping {keep_percent:.4f} of negatives")

  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0).sample(withReplacement=False,fraction=keep_percent,seed=42)
  train_downsampled = train_delay.union(train_non_delay)
  return train_downsampled

In [0]:
def _prep_fold_columns(fold_df, fold_idx):
  '''Expose fold-specific feature columns under generic names and zero-fill gaps.'''
  renamed = fold_df \
    .withColumnRenamed(f"daily_{fold_idx}","daily") \
    .withColumnRenamed(f"weekly_{fold_idx}","weekly") \
    .withColumnRenamed(f"yearly_{fold_idx}","yearly") \
    .withColumnRenamed(f"holidays_{fold_idx}","holidays") \
    .withColumnRenamed(f"train_{fold_idx}","pagerank")
  return renamed.fillna({name:0 for name in \
    ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed']})


def getTSCVmods(df, 
                      pre_pipeline,
                      cv_info, 
                      hidden_layers,
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Perform timeSeriesSplit k-fold cross validation with an MLP classifier.

  Params:
  1) df: Spark DataFrame holding all folds; filtered per fold on dep_utc_varname
  2) pre_pipeline: indexers, encoders, and vector assembler (must output 'features')
  3) cv_info: cross validation info, indexable as cv_info[column][i] with the
     columns train_min / train_max / test_min / test_max (max is exclusive)
  4) hidden_layers: hidden layer sizes in a list
  5) sampling: 'down' downsamples the training fold with downsample();
     any other value trains on the fold as-is
  6) metric: only 'f2' (F-beta with beta=2 on label 1.0) is implemented
  7) verbose: print progress and per-fold scores
  8) dep_utc_varname: name of the timestamp column used for the fold filters

  note that the scaling+classification pipeline is initialized and fit in this method itself 

  NOTE(review): the fold-specific `train_{i}` column is exposed as `pagerank`,
  so it is only used if pre_pipeline's assembler lists `pagerank`. The assembler
  built in this notebook lists `train` instead - confirm which one is intended.

  returns scores, fitted encoder pipelines, fitted scaler+classifier pipelines
  (three lists, one entry per fold)

  raises ValueError for an unsupported metric (checked before any fitting)
  '''

  # fail before the expensive fits rather than after the first fold
  if metric != 'f2':
    raise ValueError(f"Unsupported metric {metric!r}; only 'f2' is implemented")

  log = print if verbose else (lambda *args, **kwargs: None)

  k = len(cv_info)
  
  # Track score
  scores=[]
  encoder_pipelines = []
  classifier_pipelines = []

  # the evaluator does not depend on the fold, so build it once
  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )
  
  # Start k-fold
  for i in range(k):
    log(f"processing for fold {i}")
    ppl = pre_pipeline # hopefully avoid getting the recursive depth issue
    
    # Create train set (keep the cached handle so it can be released below)
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
      
    # Create dev set
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache() 

    # Apply sampling on train if selected
    train_df = downsample(train_cached) if sampling=='down' else train_cached
    
    # prep seasonality columns (rename, fill as needed)
    train_df = _prep_fold_columns(train_df, i)
    dev_df = _prep_fold_columns(dev_cached, i)
        
    # Fit the first pipeline on the model to get feature encodings:

    log(f"fitting encoding pipeline for fold {i}")

    train_df_transformed_model = ppl.fit(train_df)
    encoder_pipelines.append(train_df_transformed_model)

    log(f"encoding train set for fold {i}")
    train_df_transformed= train_df_transformed_model.transform(train_df)
    
    log(f"encoding dev set for fold {i}")
    dev_df_transformed = train_df_transformed_model.transform(dev_df)

    # Fit the second pipeline on the model to get scaling and classification:

    log(f"getting layer sizes for fold {i}")
    #input features, hidden layers, classification head
    layers = [train_df_transformed.first()['features'].size] + hidden_layers + [2]

    scaler = MinMaxScaler(
        inputCol="features", 
        outputCol="features_scaled")
    
    classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                featuresCol='features_scaled',
                                                maxIter=200,
                                                layers=layers,
                                                blockSize=128,
                                                stepSize=.0524,
                                                seed=1234)
    pipeline_mlp = Pipeline(stages=[scaler, classifier])

    log(f"fitting encoded train df for fold {i}")
    mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))
    classifier_pipelines.append(mlp_model)
    log(f"transforming encoded dev df for fold {i}")
    dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

    score = evaluator.evaluate(dev_pred)
    scores.append(score)

    if verbose:
      # the count triggers a Spark job, so only pay for it when reporting
      print(f'Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
      print('------------------------------------------------------------')

    # release this fold's cached data before moving to the next fold
    train_cached.unpersist()
    dev_cached.unpersist()
  
  # Take average of all scores
  avg_score = np.average(scores)    
  log(f'Average {metric} score across all folds is {avg_score:.2f}')
  log("************************************************************")


  return scores, encoder_pipelines, classifier_pipelines

In [0]:
# categorical features (string-indexed and one-hot encoded by the pre-pipeline)
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted',
    'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc',
    'QUARTER',
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "origin_type",
    "priororigin_type",
    "priorflight_carrier",
    "origin_region",
]

# seasonality columns: "_full" names vs. the generic names used inside CV folds
seasonality_cols = ["daily_full", "weekly_full", "yearly_full", "holidays_full"]
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]

# numeric weather features
weather_cols = [
    "origin_HourlyDewPointTemperature",
    "origin_HourlyPrecipitation",
    "origin_HourlyWindGustSpeed",
    "origin_HourlyVisibility",
    "origin_HourlyPressureChange",
]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

# numeric flight features
num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]

graph_cols = ["pagerank"]

# NOTE(review): the CV list ends with "train" while the non-CV list ends with
# "pagerank" (graph_cols) - confirm which column the assembler should read.
numeric_cols = seasonality_cols + time_cols + num_flight_cols + weather_cols + graph_cols
numeric_cols_cv = seasonality_cols_cv + time_cols + num_flight_cols + weather_cols + ["train"]

In [0]:
#pre-pipeline
# handleInvalid='keep' on both stages gives labels unseen at fit time their own
# bucket instead of raising at transform time.
indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(
    column), handleInvalid='keep') for column in cat_cols]

encoders = [OneHotEncoder(
    inputCol='{0}_index'.format(column), 
    outputCol='{0}_ohe'.format(column),
    handleInvalid='keep'
    ) for column in cat_cols]

# sanity check: fails loudly instead of building a list nobody looks at
assert all(encoder.getHandleInvalid() == 'keep' for encoder in encoders)

# Fill missing values with 0 for the specified columns
# df_filled = df_train.fillna({c: 0 for c in numeric_cols_cv if c in df_train.columns})


# NOTE: handleInvalid='skip' makes the assembler drop every row that has a
# null/NaN in any input column, for train and dev data alike.
featuresCreator = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols_cv,
    outputCol='features', handleInvalid='skip')

stages = indexers + encoders
vec_pipeline_full = Pipeline(stages= stages + [featuresCreator])

scores, encoding_pipelines, classifier_pipelines = getTSCVmods(
    df_train,
    vec_pipeline_full,
    cv_cutoffs,
    hidden_layers = [128, 32],
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname=dep_utc_varname
)

In [0]:
scores

In [0]:
scores

In [0]:
# Persist the fitted per-fold pipelines so later sessions can reload them
# instead of refitting. The directory is built from team_BASE_DIR rather than
# repeating the literal, and the stray double slash in the classifier path is gone.
modeling_checkpoint_dir = f"{team_BASE_DIR}/interim/modeling_checkpoints"

for idx, pipeline in enumerate(encoding_pipelines):
    pipeline.write().overwrite().save(f"{modeling_checkpoint_dir}/encoding_pipeline_{idx}")

for idx, pipeline in enumerate(classifier_pipelines):
    pipeline.write().overwrite().save(f"{modeling_checkpoint_dir}/classifier_pipeline_{idx}")

In [0]:
import matplotlib.pyplot as plt

# One training-loss curve per fold, read from the last stage of each fitted
# scaler+classifier pipeline.
fig, ax = plt.subplots()
for fold_idx, fitted_pipeline in enumerate(classifier_pipelines):
    mlp_stage = fitted_pipeline.stages[-1]
    ax.plot(mlp_stage.summary().objectiveHistory, label=f'Fold {fold_idx}')

ax.set_xlabel('Iteration')
ax.set_ylabel('Log Loss')
ax.set_title('Objective History for Each Fold Pipeline')
ax.legend()
plt.show()

In [0]:
test0=encoding_pipelines[0].transform(df_test.withColumnsRenamed({'daily_full':'daily','weekly_full':'weekly','yearly_full':'yearly','holidays_full':'holidays'}))
test0=classifier_pipelines[0].transform(test0)

In [0]:
classifier_pipelines

In [0]:
starter=df_test.withColumnsRenamed({'daily_full':'daily','weekly_full':'weekly','yearly_full':'yearly','holidays_full':'holidays'})

In [0]:
filter_cols= numeric_cols_cv+seasonality_cols_cv+cat_cols+['outcome']

In [0]:
starter.columns

In [0]:
starter.columns

In [0]:
# (inspection removed) This cell read fold0_test_encoded.columns before any cell
# had defined fold0_test_encoded, which raises NameError on Restart & Run All.

In [0]:
starter.columns

In [0]:
df_test.columns

In [0]:
# Score the held-out test set with every fold's MLP, adding one probability
# column per fold: fold0_probs, fold1_probs, ...

# The fitted pipelines expect the generic seasonality names, so expose the
# full-training-set seasonality features under those names.
# NOTE(review): the CV folds zero-fill daily/weekly/yearly/holidays/
# mean_dep_delay/prop_delayed before encoding, but the test set is not filled
# here; with the assembler's handleInvalid='skip', test rows with nulls in any
# feature are dropped from the predictions. Confirm this is intended.
starter = df_test.withColumnsRenamed({'daily_full':'daily','weekly_full':'weekly','yearly_full':'yearly','holidays_full':'holidays'})


def add_mlp_fold_probs(scored_df, fold_idx):
    '''
    Append `fold{fold_idx}_probs` (probability of class 1) to scored_df.

    The encoder and classifier add intermediate columns (indexes, one-hot
    vectors, features, probability, ...). They are dropped again by selecting
    only the input's own columns plus the new probability, so the next fold's
    pipeline can recreate them. This replaces the earlier `columns[:-25]`
    slice, which silently depended on the number of pipeline stages.
    '''
    prob_col = f"fold{fold_idx}_probs"
    encoded = encoding_pipelines[fold_idx].transform(scored_df)
    return classifier_pipelines[fold_idx] \
        .transform(encoded) \
        .withColumn(prob_col, vector_to_array("probability")[1]) \
        .select(scored_df.columns + [prob_col])


mlp_test_scored = starter
for fold_idx in range(len(classifier_pipelines)):
    mlp_test_scored = add_mlp_fold_probs(mlp_test_scored, fold_idx)

# later cells refer to the fully scored frame by this name
fold4_test_transformed = mlp_test_scored

In [0]:
display(fold4_test_transformed)

In [0]:
alpha = 0.5  # decay rate; adjust as needed
num_folds = 5

# Exponentially decaying weights: the last fold gets alpha**0, the one before
# it alpha**1, and so on back to the first fold.
decay_exponents = np.arange(num_folds - 1, -1, -1)
raw_weights = np.power(alpha, decay_exponents)
weights = raw_weights / raw_weights.sum()  # normalize to sum to 1

In [0]:
weights

In [0]:
# Weighted average of the per-fold probabilities, using the normalized weights.
weighted_fold_probs = [weights[i] * col(f"fold{i}_probs") for i in range(num_folds)]
ewa_expr = sum(weighted_fold_probs)

final_df = fold4_test_transformed.withColumn("ewa_prob", ewa_expr)

In [0]:
# Hard label from the averaged probability: 1 at or above 0.5, else 0.
ewa_label = when(col('ewa_prob') >= 0.5, 1).otherwise(0)
final_df = final_df.withColumn('prediction', ewa_label)

In [0]:
display(final_df)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# F-beta (beta=2) for label 1.0 on the averaged MLP prediction.
evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome",
    predictionCol='prediction',
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0,
)

# the prediction column is built from integer literals, so cast it to double
# before handing it to the evaluator
mlp_scored_for_eval = final_df.withColumn("prediction", col("prediction").cast(DoubleType()))
evaluator.evaluate(mlp_scored_for_eval)


In [0]:
# Prefix the MLP outputs so they do not collide with other models' columns.
mlp_renames = {f'fold{i}_probs': f'mlp_fold{i}' for i in range(5)}
mlp_renames['ewa_prob'] = 'mlp_ewa_prob'
mlp_renames['prediction'] = 'mlp_prediction'
final_df = final_df.withColumnsRenamed(mlp_renames)

In [0]:
final_df.write.mode("overwrite").parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/mlp_results_df.parquet")

In [0]:


# Create a confusion matrix
confusion_matrix = final_df.groupBy("outcome", "mlp_prediction").count()

display(confusion_matrix)

## XGBoost

In [0]:
final_dfx = spark.read.parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/mlp_results_df.parquet")

In [0]:
display(final_dfx)

In [0]:
from pyspark.ml.pipeline import PipelineModel


In [0]:
# categorical features used by the XGBoost pipelines
categorical_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted',
    'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc',
    'QUARTER',
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "YEAR",
    "origin_type",
    "priororigin_type",
    "priorflight_carrier",
    "origin_region",
]

# seasonality columns
seasonality_cols = ["daily", "weekly", "yearly", "holidays"]

# every hourly origin weather column in df, minus the non-numeric ones
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
num_weather_cols = [name for name in weather_cols if name not in remove_me]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]

graph_cols = ["pagerank"]

# label and timestamp, carried along with the features
keep_me = ["outcome", dep_utc_varname]

numeric_cols = seasonality_cols + time_cols + num_flight_cols + num_weather_cols + graph_cols
# numeric_cols_cv = [*seasonality_cols_cv, *time_cols, *num_flight_cols, *weather_cols, *graph_cols]

In [0]:
# Load the five saved per-fold XGBoost pipelines (xgboost_fold_0 ... xgboost_fold_4)
# in one loop instead of five copy-pasted calls.
xgb_checkpoint_dir = "dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints"
xgb_fold_models = [
    PipelineModel.load(f"{xgb_checkpoint_dir}/xgboost_fold_{fold_idx}")
    for fold_idx in range(5)
]

# later cells refer to the models by these individual names
fold0_mod, fold1_mod, fold2_mod, fold3_mod, fold4_mod = xgb_fold_models




In [0]:
fold0_mod.stages[-1].get_booster().feature_names

In [0]:
mlp_cols = ['mlp_fold0','mlp_fold1','mlp_fold2','mlp_fold3','mlp_fold4','mlp_ewa_prob','mlp_prediction']
filter_cols = [*keep_me, *numeric_cols, *categorical_cols, *mlp_cols]


In [0]:
final_dfx.columns

In [0]:
xgb_final_df=final_dfx.withColumnRenamed('train','pagerank')

In [0]:
display(xgb_final_df)

In [0]:
def add_xgb_fold_probs(scored_df, fold_model, fold_idx):
    '''
    Append `fold{fold_idx}_probs` (probability of class 1) from one fitted
    XGBoost pipeline to scored_df.

    Only the input's own columns plus the new probability are kept, so the
    pipeline's intermediate columns do not clash when the next fold's pipeline
    is applied to the result.
    '''
    prob_col = f"fold{fold_idx}_probs"
    return fold_model.transform(scored_df) \
        .withColumn(prob_col, vector_to_array("probability")[1]) \
        .select(scored_df.columns + [prob_col])


# Chain the five fold models over the test frame: fold0_probs ... fold4_probs.
xgb_test_scored = xgb_final_df
for fold_idx, fold_model in enumerate([fold0_mod, fold1_mod, fold2_mod, fold3_mod, fold4_mod]):
    xgb_test_scored = add_xgb_fold_probs(xgb_test_scored, fold_model, fold_idx)

# later cells refer to the fully scored frame by this name
fold4_test = xgb_test_scored




In [0]:
# DataFrame.checkpoint() does not modify the frame it is called on: it returns
# a NEW DataFrame with truncated lineage. The result must be assigned, otherwise
# the checkpoint is computed and then ignored.
# NOTE(review): checkpoint() needs a checkpoint directory configured on the
# SparkContext - confirm one is set in this environment.
fold4_test = fold4_test.checkpoint()
display(fold4_test)

In [0]:
alpha = 0.5  # decay rate; adjust as needed
num_folds = 5

# Exponentially decaying weights, largest for the last fold.
decay_exponents = np.arange(num_folds - 1, -1, -1)
raw_weights = np.power(alpha, decay_exponents)
weights = raw_weights / raw_weights.sum()  # normalize to sum to 1

In [0]:
# Weighted average of the per-fold XGBoost probabilities, then a 0.5 threshold.
weighted_fold_probs = [weights[i] * col(f"fold{i}_probs") for i in range(num_folds)]
ewa_expr = sum(weighted_fold_probs)

xgb_with_ewa = fold4_test.withColumn("ewa_prob", ewa_expr)
xgb_final_df = xgb_with_ewa.withColumn(
    'prediction',
    when(col('ewa_prob') >= 0.5, 1).otherwise(0),
)

In [0]:
# Prefix the XGBoost outputs so they sit next to the mlp_* columns.
xgb_renames = {f'fold{i}_probs': f'xgb_fold{i}' for i in range(5)}
xgb_renames['ewa_prob'] = 'xgb_ewa_prob'
xgb_renames['prediction'] = 'xgb_prediction'
xgb_final_df = xgb_final_df.withColumnsRenamed(xgb_renames)

In [0]:
display(xgb_final_df.select('outcome','mlp_prediction','xgb_prediction'))

In [0]:
xgb_final_df.write.mode("overwrite").parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/xgb_mlp_results_df.parquet")

In [0]:
def _feature_name(feature_id, input_cols):
    '''
    Translate an XGBoost feature id into an assembler input column name.

    Ids of the form "f<index>" (or a bare index) are looked up by position in
    input_cols; anything else is returned unchanged.
    '''
    text = str(feature_id)
    digits = text[1:] if text.startswith("f") else text
    if digits.isdigit() and int(digits) < len(input_cols):
        return input_cols[int(digits)]
    return text


folds = [fold0_mod,fold1_mod, fold2_mod, fold3_mod, fold4_mod]  # Add all fold models here
feature_importance_dfs = []

for fold_mod in folds:
    va = fold_mod.stages[-3]   # presumably the VectorAssembler - verify against the pipeline definition
    tree = fold_mod.stages[-1]
    input_cols = va.getInputCols()

    avg_gain = pd.DataFrame(list(tree.get_feature_importances('gain').items()), columns=['id', 'avg_gain'])
    weight = pd.DataFrame(list(tree.get_feature_importances('weight').items()), columns=['id', 'weight'])
    avg_cover = pd.DataFrame(list(tree.get_feature_importances('cover').items()), columns=['id', 'avg_cover'])

    feature_importance_df = avg_gain.merge(weight, on='id').merge(avg_cover, on='id')
    # Resolve names from the index encoded in each id rather than zipping the
    # input columns against the dict keys: a positional zip shifts every later
    # name as soon as one feature is missing from the importance dict.
    # NOTE(review): assumes every assembler input is a scalar column, so vector
    # slot i corresponds to input_cols[i] - confirm for this pipeline.
    feature_importance_df['name'] = [
        _feature_name(feature_id, input_cols) for feature_id in feature_importance_df['id']
    ]
    feature_importance_dfs.append(feature_importance_df)

# Combine all feature importance dataframes if needed

# Combine all feature importance dataframes if needed


In [0]:
import matplotlib.pyplot as plt

# (importance column, axis label, bar colour) for the three panels, left to right
importance_panels = [
    ('avg_gain', 'Average Gain', 'skyblue'),
    ('weight', 'Weight', 'lightgreen'),
    ('avg_cover', 'Average Cover', 'salmon'),
]

for i, feature_importance_df in enumerate(feature_importance_dfs):
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    for ax, (metric_col, label, colour) in zip(axes, importance_panels):
        # Top 10 features for this importance measure, reversed so the
        # largest bar is drawn at the top
        top10 = feature_importance_df.nlargest(10, metric_col)
        ax.barh(top10['name'][::-1], top10[metric_col][::-1], color=colour)
        ax.set_title(f'Fold {i+1} - Top 10 Features by {label}')
        ax.set_xlabel(label)
        ax.set_ylabel('Features')

    plt.tight_layout()
    plt.show()

## LR

In [0]:
xgb_final_df= spark.read.parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/xgb_mlp_results_df.parquet")

In [0]:
import mlflow

In [0]:
import mlflow
lr_model = 'runs:/895cf8de5c794d7fb932457e7b48b831/model'

# Load model
lr_mod = mlflow.spark.load_model(lr_model)



In [0]:
# Score the test frame with the logistic regression pipeline and keep only the
# incoming columns plus the two LR outputs. Dropping the pipeline's intermediate
# columns matters: the next model's pipeline recreates columns with the same names.
lr_scored = lr_mod.transform(xgb_final_df.withColumnRenamed('train','pagerank'))

lr_df = lr_scored \
    .withColumn("lr_probs", vector_to_array("probability")[1]) \
    .withColumnRenamed('prediction','lr_prediction') \
    .select(xgb_final_df.columns + ['lr_prediction','lr_probs'])

# (A trailing cell used to recompute lr_probs / lr_prediction from `probability`
# and `prediction` after the select above had dropped both columns, which fails
# on a fresh Run All; it has been removed.)

## RF

In [0]:
import mlflow
rf_model = 'runs:/d62a6556ba1b42f5a9285c462a11ac33/rf_model'

# Load model
rf_mod = mlflow.spark.load_model(rf_model)




In [0]:
len(rf_mod.stages[-3].getInputCols())

In [0]:
# Score with the random forest pipeline, then expose the class-1 probability
# and the hard label under rf_* names.
rf_scored = rf_mod.transform(lr_df.withColumnRenamed('train','pagerank'))

rf_df = rf_scored \
    .withColumn("rf_probs", vector_to_array("probability")[1]) \
    .withColumnRenamed('prediction','rf_prediction')

In [0]:
rf_df.columns

# Assemble

In [0]:
# Hard vote: average the four models' 0/1 predictions and round.
# Spark's round is used through the `f` alias so Python's builtin round() is
# not shadowed for the rest of the notebook. It rounds half up, so a 2-2 tie
# (mean 0.5) becomes 1.
avg_prediction_df = rf_df.withColumn(
    "avg_prediction",
    f.round((col("mlp_prediction") + col("xgb_prediction") + col("lr_prediction") + col('rf_prediction')) / 4)
)

display(avg_prediction_df)

In [0]:

# F-beta (beta=2) for label 1.0 on the ensemble prediction.
evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome",
    predictionCol='avg_prediction',
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0,
)

ensemble_for_eval = avg_prediction_df.withColumn("avg_prediction", col("avg_prediction").cast(DoubleType()))
evaluator.evaluate(ensemble_for_eval)


In [0]:
rf_df.columns

In [0]:
# Soft vote: average the four models' class-1 probabilities and threshold at 0.5.
# The mean is compared to the threshold directly; rounding it first and then
# testing >= .5 gives the same labels. No `round` import is needed, so Python's
# builtin round() is not shadowed.
mean_prob = (col("mlp_ewa_prob") + col("xgb_ewa_prob") + col("lr_probs") + col('rf_probs')) / 4

avg_prediction_df = rf_df.withColumn(
    "avg_prediction",
    f.when(mean_prob >= .5, 1.0).otherwise(0.0)
)

display(avg_prediction_df)

In [0]:

# F-beta (beta=2) for label 1.0 on the ensemble prediction.
evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome",
    predictionCol='avg_prediction',
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0,
)

ensemble_for_eval = avg_prediction_df.withColumn("avg_prediction", col("avg_prediction").cast(DoubleType()))
evaluator.evaluate(ensemble_for_eval)


In [0]:
rf_df.write.mode("overwrite").parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/mlp_xgb_lr_rf_results.parquet")

# Ensemble CV

In [0]:
def get_val_preds_MLP(df, encoding_pipelines, classifier_pipelines,
                      cv_info,
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Score the held-out (test) window of every CV fold with that fold's already-trained
  encoder and classifier, and return all fold predictions stacked into one DataFrame.
  The result is meant to be fed to the other models to collect their predictions too.

  Args:
    df: training DataFrame. For fold i its columns daily_{i}, weekly_{i}, yearly_{i},
      holidays_{i} and train_{i} are temporarily exposed as daily, weekly, yearly,
      holidays and pagerank while the pipelines run.
    encoding_pipelines: fitted encoders, indexed by fold number.
    classifier_pipelines: fitted classifiers, indexed by fold number.
    cv_info: table indexable as cv_info["test_min"][i] / cv_info["test_max"][i];
      len(cv_info) is the number of folds.
    verbose: when True, print a progress line for each step of each fold.
    dep_utc_varname: name of the column compared against the fold cutoffs.

  Returns:
    The positional union of the per-fold prediction DataFrames, with the per-fold
    column names restored.

  Raises:
    ValueError: if cv_info describes no folds.
  '''

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info must describe at least one fold")

  # Columns filled with 0 where missing before the pipelines are applied
  zero_fill = dict.fromkeys(
    ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed'], 0)

  all_dev_preds = None

  # Start k-fold
  for i in range(k):

    # Fold-specific column name -> generic name used while the pipelines run
    to_generic = {f"daily_{i}": "daily",
                  f"weekly_{i}": "weekly",
                  f"yearly_{i}": "yearly",
                  f"holidays_{i}": "holidays",
                  f"train_{i}": "pagerank"}
    to_fold = {generic: fold_name for fold_name, generic in to_generic.items()}

    # Create dev set: rows inside the half-open window [test_min, test_max)
    # NOTE(review): the cached fold frame is never unpersisted.
    dev_df = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) &
                       (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    dev_df = dev_df.withColumnsRenamed(to_generic).fillna(zero_fill)

    if verbose:
      print(f"encoding dev set for fold {i}")
    dev_df_transformed = encoding_pipelines[i].transform(dev_df)

    if verbose:
      print(f"transforming encoded dev df for fold {i}")
    dev_pred = classifier_pipelines[i].transform(dev_df_transformed)

    # Restore the fold-specific names so every fold has the same columns for the union
    dev_pred = dev_pred.withColumnsRenamed(to_fold)

    all_dev_preds = dev_pred if all_dev_preds is None else all_dev_preds.union(dev_pred)
  return all_dev_preds

In [0]:
# Collect every fold's held-out MLP predictions into one frame, then show its columns.
dev_df = get_val_preds_MLP(df_train,
                  encoding_pipelines,
                  classifier_pipelines,
                  cv_cutoffs)
dev_df.columns

In [0]:
# Render the stacked fold predictions for a visual check.
display(dev_df)

In [0]:
# Inspect the last stage of the fold-0 classifier pipeline.
classifier_pipelines[0].stages[-1]

In [0]:
# sanity check: F-measure by label (beta=2.0, label 1.0) of the stacked fold predictions
# in 'prediction' against 'outcome'. Rebinds the shared `evaluator` name.
evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome",
    predictionCol='prediction',
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0)

# The prediction column is cast to double before it is handed to the evaluator.
evaluator.evaluate(dev_df.withColumn("prediction", col("prediction").cast(DoubleType())))

In [0]:
# DataFrame.checkpoint() returns a new, checkpointed DataFrame and leaves the one it was
# called on unchanged, so the result has to be rebound for later cells to reuse it.
dev_df = dev_df.checkpoint() #since i have to do this a bunch more times

In [0]:
# Keep only the original df_train columns plus the MLP 'prediction', renamed to
# 'mlp_prediction' (presumably so the next model can add its own 'prediction' -- TODO confirm).
dev_df_next= dev_df.select(df_train.columns + ['prediction']).withColumnRenamed('prediction','mlp_prediction')

In [0]:
def get_val_preds_XGB(df,
                      mod_pipelines,
                      cv_info = cv_cutoffs,
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Score the held-out (test) window of every CV fold with that fold's already-trained
  pipeline and return the input columns plus the XGB prediction and probability.
  The result is meant to be fed to the other models to collect their predictions too.

  Args:
    df: training DataFrame. For fold i its columns daily_{i}, weekly_{i}, yearly_{i},
      holidays_{i} and train_{i} are temporarily exposed as daily, weekly, yearly,
      holidays and pagerank while the pipeline runs.
    mod_pipelines: fitted pipelines, indexed by fold number.
    cv_info: table indexable as cv_info["test_min"][i] / cv_info["test_max"][i];
      len(cv_info) is the number of folds.
    verbose: when True, print a progress line for each fold.
    dep_utc_varname: name of the column compared against the fold cutoffs.

  Returns:
    The positional union of the per-fold results, restricted to df's columns plus
    "xgb_prediction" and "xgb_probability".

  Raises:
    ValueError: if cv_info describes no folds.
  '''

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info must describe at least one fold")

  # Columns filled with 0 where missing before the pipeline is applied
  zero_fill = dict.fromkeys(
    ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed'], 0)

  all_dev_preds = None

  # Start k-fold
  for i in range(k):

    # Fold-specific column name -> generic name used while the pipeline runs
    to_generic = {f"daily_{i}": "daily",
                  f"weekly_{i}": "weekly",
                  f"yearly_{i}": "yearly",
                  f"holidays_{i}": "holidays",
                  f"train_{i}": "pagerank"}
    # Inverse mapping, plus model-specific names for the pipeline's output columns
    to_fold = {generic: fold_name for fold_name, generic in to_generic.items()}
    to_fold["prediction"] = "xgb_prediction"
    to_fold["probability"] = "xgb_probability"

    # Create dev set: rows inside the half-open window [test_min, test_max)
    # NOTE(review): the cached fold frame is never unpersisted.
    dev_df = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) &
                       (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    dev_df = dev_df.withColumnsRenamed(to_generic).fillna(zero_fill)

    if verbose:
      print(f"encoding dev set for fold {i}")
    dev_pred = mod_pipelines[i].transform(dev_df)

    # Restore the fold-specific names so every fold has the same columns for the union
    dev_pred = dev_pred.withColumnsRenamed(to_fold)

    all_dev_preds = dev_pred if all_dev_preds is None else all_dev_preds.union(dev_pred)
  return all_dev_preds.select(df.columns + ["xgb_prediction", "xgb_probability"])

In [0]:
# Per-fold models in fold order: get_val_preds_XGB applies mod_pipelines[i] to fold i.
xgb_pipelines= [fold0_mod, fold1_mod, fold2_mod, fold3_mod, fold4_mod]

In [0]:
# Add the per-fold XGB prediction/probability to the frame that already carries 'mlp_prediction'.
dev_df_xgb = get_val_preds_XGB(dev_df_next, xgb_pipelines)

In [0]:
# Inspect the columns returned by the XGB scoring step.
dev_df_xgb.columns

In [0]:
# Row count inside fold 0's validation window, using the same half-open interval
# [test_min, test_max) that get_val_preds_MLP / get_val_preds_XGB use to build each fold.
dev_df_xgb.filter(f.col('sched_depart_utc') >= cv_cutoffs.loc[0]['test_min']).filter(f.col('sched_depart_utc') < cv_cutoffs.loc[0]['test_max']).count()

In [0]:
# Show the CV cutoff table whose 'test_min' / 'test_max' entries bound each fold's window.
cv_cutoffs

# Analysis

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql.functions import col


In [0]:
# DataFrame.checkpoint() returns a new, checkpointed DataFrame and leaves the one it was
# called on unchanged, so the result has to be rebound for the analysis cells to reuse it.
avg_prediction_df = avg_prediction_df.checkpoint()

In [0]:
# Persist the ensemble frame as parquet; mode("overwrite") replaces whatever is
# already stored at this path.
avg_prediction_df.write.mode("overwrite").parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/ensemble.parquet")

In [0]:
# Scratch check: the '_prediction' suffix is 11 characters long.
len('_prediction')

In [0]:
# Scratch check: dropping the last 11 characters and upper-casing leaves the model prefix ('MLP').
'mlp_prediction'[:-11].upper()

In [0]:

# Function to plot confusion matrix in percentage/proportion
def plot_confusion_matrix(predictions, label_col, prediction_col):
    """Plot a row-normalised confusion matrix for one prediction column.

    Args:
        predictions: Spark DataFrame holding both the label and the prediction column.
        label_col: name of the ground-truth column.
        prediction_col: name of the prediction column. A trailing '_prediction' is
            stripped and the remainder upper-cased for the plot title.

    Returns:
        None. The heatmap is drawn with matplotlib/seaborn and shown.
    """
    # MulticlassMetrics is fed (prediction, label) pairs. Both are cast to double so an
    # integer-typed label column is handled the same way as the prediction column.
    prediction_and_labels = (
        predictions
        .select(col(prediction_col).cast("double"), col(label_col).cast("double"))
        .rdd.map(tuple)
    )

    # Compute confusion matrix
    metrics = MulticlassMetrics(prediction_and_labels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Normalise every row so its cells are proportions of that row's total
    confusion_matrix = confusion_matrix / confusion_matrix.sum(axis=1, keepdims=True)

    # Distinct label values, sorted ascending, used as both axis tick labels
    labels = predictions.select(label_col).distinct().orderBy(label_col).rdd.flatMap(lambda x: x).collect()

    # Convert to pandas dataframe for easier plotting
    cm_df = pd.DataFrame(confusion_matrix, index=labels, columns=labels)

    # Title: the column name without its '_prediction' suffix. A name that lacks the
    # suffix is used whole rather than having its last characters cut off.
    suffix = '_prediction'
    if prediction_col.endswith(suffix):
        model_name = prediction_col[:-len(suffix)]
    else:
        model_name = prediction_col

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True, fmt=".2%", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix: {model_name.upper()}')
    plt.show()

# Every column whose name ends in '_prediction' gets its own confusion matrix
prediction_columns = list(
    filter(lambda name: name.endswith('_prediction'), avg_prediction_df.columns)
)

# One heatmap per prediction column, all compared against 'outcome'
for prediction_col in prediction_columns:
    plot_confusion_matrix(avg_prediction_df, "outcome", prediction_col)

In [0]:
# Ensemble false negatives (avg_prediction 0, outcome 1) counted per CANCELLED value.
avg_prediction_df.filter(col('avg_prediction')==0).filter(col("outcome")==1).groupBy('CANCELLED').count().show()

In [0]:
# Inspect the columns available for the error analysis.
avg_prediction_df.columns

In [0]:
# Ensemble false negatives (avg_prediction 0, outcome 1) counted per QUARTER, largest count first.
avg_prediction_df.filter(col('avg_prediction')==0).filter(col("outcome")==1).groupBy('QUARTER').count().orderBy(f.col('count').desc()).show()

In [0]:

# Split the ensemble output into the four confusion-matrix cells
# (avg_prediction vs. outcome); a row must satisfy both conditions to be kept.

# predicted 0, actually 1
false_negatives_df = avg_prediction_df.where(col('avg_prediction') == 0).where(col('outcome') == 1)

# predicted 1, actually 0
false_positives_df = avg_prediction_df.where(col('avg_prediction') == 1).where(col('outcome') == 0)

# predicted 0, actually 0
true_negatives_df = avg_prediction_df.where(col('avg_prediction') == 0).where(col('outcome') == 0)

# predicted 1, actually 1
true_positives_df = avg_prediction_df.where(col('avg_prediction') == 1).where(col('outcome') == 1)

In [0]:


# Per-QUARTER row counts for each confusion-matrix cell
false_negatives_quarter_df = (
    false_negatives_df
    .groupBy('QUARTER')
    .count()
    .withColumnRenamed('count', 'false_negatives')
)
false_positives_quarter_df = (
    false_positives_df
    .groupBy('QUARTER')
    .count()
    .withColumnRenamed('count', 'false_positives')
)
true_negatives_quarter_df = (
    true_negatives_df
    .groupBy('QUARTER')
    .count()
    .withColumnRenamed('count', 'true_negatives')
)
true_positives_quarter_df = (
    true_positives_df
    .groupBy('QUARTER')
    .count()
    .withColumnRenamed('count', 'true_positives')
)

# Outer joins keep a quarter even when one of the four tallies has no rows for it
time_series_df = (
    false_negatives_quarter_df
    .join(false_positives_quarter_df, 'QUARTER', 'outer')
    .join(true_negatives_quarter_df, 'QUARTER', 'outer')
    .join(true_positives_quarter_df, 'QUARTER', 'outer')
    .orderBy('QUARTER')
)

# Bring the small aggregated table to the driver for plotting
time_series_pd = time_series_df.toPandas()

# One line per confusion-matrix cell, drawn in the order listed
plt.figure(figsize=(12, 8))
for _count_col, _legend in (
    ('false_negatives', 'False Negatives'),
    ('false_positives', 'False Positives'),
    ('true_negatives', 'True Negatives'),
    ('true_positives', 'True Positives'),
):
    plt.plot(time_series_pd['QUARTER'], time_series_pd[_count_col], label=_legend, marker='o')
plt.title('Time Series of Predictions')
plt.xlabel('Quarter')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.show()

In [0]:

import matplotlib.dates as mdates

# Per-day row counts for each confusion-matrix cell; DAY is FL_DATE cast to a date
false_negatives_day_df = (
    false_negatives_df
    .withColumn('DAY', col('FL_DATE').cast('date'))
    .groupBy('DAY')
    .count()
    .withColumnRenamed('count', 'false_negatives')
)
false_positives_day_df = (
    false_positives_df
    .withColumn('DAY', col('FL_DATE').cast('date'))
    .groupBy('DAY')
    .count()
    .withColumnRenamed('count', 'false_positives')
)
true_negatives_day_df = (
    true_negatives_df
    .withColumn('DAY', col('FL_DATE').cast('date'))
    .groupBy('DAY')
    .count()
    .withColumnRenamed('count', 'true_negatives')
)
true_positives_day_df = (
    true_positives_df
    .withColumn('DAY', col('FL_DATE').cast('date'))
    .groupBy('DAY')
    .count()
    .withColumnRenamed('count', 'true_positives')
)

# Outer joins keep a day even when one of the four tallies has no rows for it
time_series_day_df = (
    false_negatives_day_df
    .join(false_positives_day_df, 'DAY', 'outer')
    .join(true_negatives_day_df, 'DAY', 'outer')
    .join(true_positives_day_df, 'DAY', 'outer')
    .orderBy('DAY')
)

# Bring the aggregated table to the driver for plotting
time_series_day_pd = time_series_day_df.toPandas()

# Only the two error types are drawn; the true_negatives / true_positives columns
# are computed above but deliberately left off this chart.
plt.figure(figsize=(15, 8))
for _count_col, _legend, _colour in (
    ('false_negatives', 'False Negatives', 'darkblue'),
    ('false_positives', 'False Positives', 'darkred'),
):
    plt.plot(time_series_day_pd['DAY'], time_series_day_pd[_count_col], label=_legend, color=_colour)

# Formatting
plt.title('Incorrect Predictions by Day and Type')
plt.xlabel('Day')
plt.ylabel('Count of Predictions')
plt.legend()
plt.grid(True)

# Major x ticks every 15 days, labelled like 01-Jan-2025
_x_axis = plt.gca().xaxis
_x_axis.set_major_locator(mdates.DayLocator(interval=15))
_x_axis.set_major_formatter(mdates.DateFormatter('%d-%b-%Y'))

plt.xticks(rotation=45)  # slanted labels stay readable
plt.tight_layout()       # keeps the rotated labels inside the figure
plt.show()

In [0]:
# Rows where the MLP prediction equals the outcome while the LR, RF and XGB
# predictions all differ from it
correct_mlp_df = (
    avg_prediction_df
    .where(col('mlp_prediction') == col('outcome'))
    .where(col('lr_prediction') != col('outcome'))
    .where(col('rf_prediction') != col('outcome'))
    .where(col('xgb_prediction') != col('outcome'))
)

display(correct_mlp_df)


In [0]:
# Rows per FL_DATE, largest count first.
# NOTE(review): groupBy().count() names its column 'count'; the upper-case 'COUNT' reference
# presumably relies on Spark's case-insensitive column resolution -- confirm the session setting.
display(avg_prediction_df.groupBy('FL_DATE').count().orderBy(f.col('COUNT').desc()))

In [0]:
# Ensemble false negatives (avg_prediction 0, outcome 1) counted per YEAR.
avg_prediction_df.filter(col('avg_prediction')==0).filter(col("outcome")==1).groupBy('YEAR').count().show()