# Imports and Setup

In [0]:
# # Configure Spark settings for better performance
# from pyspark.sql import SparkSession
# spark = SparkSession.builder\
#     .config("spark.executor.memory", "16g")\
#     .config("spark.executor.cores", 4)\
#     .appName('Final Project Training')\
#     .getOrCreate()
# spark.conf.set("spark.sql.shuffle.partitions", "200")
# spark.conf.set("spark.default.parallelism", "200")

In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, to_timestamp, to_date, when
from prophet.make_holidays import make_holidays_df
from xgboost.spark import SparkXGBClassifier

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.functions import vector_to_array


In [0]:
# Display the version of the Spark session attached to this notebook.
spark.version

### Set options

In [0]:
# data time period
period = "" # one of the following values ("", "3m", "6m", "1y")

# number of cross-validation folds and overlap
k = 5
overlap = 0.2

# compute seasonality?
# (False if you've already saved out seasonality models for a given CV split setup)
compute_seasonality = False
apply_seasonality = False

# train/test split date for every period that has one configured
# ("6m" is listed as a valid period above but has no split date yet)
min_test_dates = {
    "3m": "2015-03-01",
    "1y": "2019-10-01",
    "": "2019-01-01",
}
if period not in min_test_dates:
    # fail here with a clear message instead of a NameError on min_test_dt below
    raise ValueError(
        f"No train/test split date configured for period {period!r}; "
        f"configured periods: {sorted(min_test_dates)}"
    )
min_test_dt = min_test_dates[period]
print(f"Min test set date for {period} dataset: {min_test_dt}")

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

## Load data and perform simple transformations

In [0]:
# Root of the team's shared DBFS storage
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# Read the joined, cleaned dataset checkpoint for the configured `period`
checkpoint_path = f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr_v2.parquet"
df = spark.read.parquet(checkpoint_path)

# Parse the departure time, then derive the hour (used for seasonality) and
# the date (used for CV splits) from the parsed column
df = (
    df.withColumn(dep_utc_varname, to_timestamp(col(dep_utc_varname)))
      .withColumn("dep_hour_utc", f.hour(col(dep_utc_varname)))
      .withColumn("dep_date_utc", to_date(col(dep_utc_varname)))
)

# Outcome is 1.0 when the flight departed >= 15 minutes late or was cancelled, else 0.0
is_delayed_or_cancelled = (col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1)
df = df.withColumn("outcome", when(is_delayed_or_cancelled, 1).otherwise(0).cast("double"))

# Cast every origin hourly weather column to double, except the three listed in remove_me
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
num_weather_cols = [name for name in weather_cols if name not in remove_me]
for weather_col in num_weather_cols:
    df = df.withColumn(weather_col, col(weather_col).cast("double"))

df.cache()


In [0]:
# # Group by the year and count the number of records for each year
# df_year_counts = df.groupBy("YEAR").count()

# # Display the result
# display(df_year_counts)

In [0]:
# split into train and test
# Split on scheduled departure time: everything before min_test_dt is training data,
# everything from min_test_dt up to (not including) 2020-01-01 is the held-out test set.
dep_time = f.col(dep_utc_varname)
df_train = df.filter(dep_time < min_test_dt)
df_test = df.filter((dep_time >= min_test_dt) & (dep_time < "2020-01-01"))
# df_test.cache()
# print(f"Test data: {df_test.count()} records")

## Get cross-validation splits

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def get_cv_time_limits_by_days_with_overlap(df, k=3, blocking=False, overlap=0, dep_utc_varname=dep_utc_varname, verbose=True):
    '''
    Compute train/test date boundaries for k time-series cross-validation folds.

    The span of `dep_date_utc` in `df` is divided into equally sized chunks of days;
    each fold trains on one window and tests on the chunk that follows it.

    Args:
        df: Spark DataFrame with a `dep_date_utc` column.
        k: number of folds.
        blocking: if True each fold's training window starts at its own offset;
            otherwise every fold trains from the earliest date.
        overlap: fraction of a chunk by which consecutive folds overlap.
        dep_utc_varname: accepted for call compatibility; not used here.
        verbose: print the split configuration and the resulting windows.

    Returns:
        pandas DataFrame with one row per fold and columns
        train_min, train_max, test_min, test_max (max bounds are exclusive).
    '''
    first_day = df.select(f.min("dep_date_utc")).first()[0]
    last_day = df.select(f.max("dep_date_utc")).first()[0]
    n_days = (last_day - first_day).days + 1

    # k folds with the given overlap span (k+1 - overlap*(k-1)) chunk widths in total
    total_width = k+1 - overlap*(k-1)
    chunk_size = np.ceil(n_days/total_width) # last chunk may be slightly smaller than the others

    if verbose:
        print(f'Splitting data into {k} folds with {overlap} overlap')
        print(f'Min date: {first_day}, max date: {last_day}')
        print(f'{chunk_size:,} days per fold')
        print("************************************************************")

    # the test window may not extend past the day after the last observed date
    latest_test_end = last_day + timedelta(1)

    # day offsets (from first_day) of the current fold's training window
    train_lo = 0
    train_hi = chunk_size

    windows = []
    for _ in range(k):
        test_lo = train_hi
        test_hi = test_lo + chunk_size

        train_start = first_day + timedelta(days=train_lo) if blocking else first_day
        windows.append({
            "train_min": train_start,
            "train_max": first_day + timedelta(days=train_hi),
            "test_min": first_day + timedelta(days=test_lo),
            "test_max": min(first_day + timedelta(days=test_hi), latest_test_end),
        })

        # advance to the next fold: start rounds up, end rounds down
        train_lo += np.ceil((1-overlap)*chunk_size)
        train_hi += np.floor((1-overlap)*chunk_size)

    out = pd.DataFrame(windows)

    if verbose:
        for fold in range(k):
            print(f'    TRAIN set for fold {fold} goes from {out["train_min"][fold]} to {out["train_max"][fold]}')
            print(f'    TEST set for fold {fold} goes from {out["test_min"][fold]} to {out["test_max"][fold]}')
        print("(Note that the max dates are non-inclusive)")

    return out

In [0]:
# Fixed cross-validation windows, one row per fold. Dates are kept as strings;
# the CV loops treat the *_min bounds as inclusive and the *_max bounds as exclusive.
_cv_columns = ["train_min", "train_max", "test_min", "test_max"]
_cv_windows = [
    ("2014-12-31", "2015-10-09", "2015-10-09", "2016-07-17"),
    ("2015-08-14", "2016-05-21", "2016-05-21", "2017-02-27"),
    ("2016-03-27", "2017-01-01", "2017-01-01", "2017-10-10"),
    ("2016-11-08", "2017-08-14", "2017-08-14", "2018-05-23"),
    ("2017-06-22", "2018-03-27", "2018-03-27", "2019-01-01"),
]
cv_cutoffs = pd.DataFrame(_cv_windows, columns=_cv_columns)
cv_cutoffs

In [0]:
# # get cross-validation split times
# cv_cutoffs = get_cv_time_limits_by_days_with_overlap(df_train.select("dep_date_utc"), k=k, blocking=True, overlap=overlap,
#     dep_utc_varname=dep_utc_varname, verbose=True)
# cv_cutoffs

## MLP Hyperparameter Tuning

In [0]:
def downsample(train_df,verbose=False):
  '''
  Downsample the non-delayed class (outcome == 0) so it roughly matches the
  number of delayed rows (outcome == 1).

  Args:
    train_df: Spark DataFrame with a numeric `outcome` column.
    verbose: if True, print the class counts and the sampling fraction used.

  Returns:
    Spark DataFrame with all outcome == 1 rows unioned with a seeded random
    sample of the outcome == 0 rows. When the non-delayed class is empty or is
    not the larger class, its rows are kept without sampling.
  '''
  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0)

  # count both classes in a single aggregation instead of one job per class
  label_counts = {row['outcome']: row['count']
                  for row in train_df.groupBy('outcome').count().collect()}
  delay_count = label_counts.get(1, 0)
  non_delay_count = label_counts.get(0, 0)

  # Spark rejects fractions above 1 when sampling without replacement, and the
  # ratio is undefined with no non-delayed rows: only sample when it is a real
  # downsampling fraction.
  keep_percent = delay_count / non_delay_count if non_delay_count else 1.0
  if keep_percent < 1.0:
    train_non_delay = train_non_delay.sample(withReplacement=False,fraction=keep_percent,seed=42)

  if verbose:
    print(f"delayed: {delay_count:,}, non-delayed: {non_delay_count:,}, "
          f"non-delayed keep fraction: {min(keep_percent, 1.0):.4f}")

  return train_delay.union(train_non_delay)

In [0]:
def time_series_cv_folds(
    df,
    time_col: str,
    k: int=3,
    blocking: bool=False,
    overlap: float=0.0,
    verbose: bool=False
):
    """
    Split a time-series PySpark DataFrame into k train/validation folds.

    The time span of `time_col` is cut into chunks of whole days. Fold i is shifted
    by i * (1 - overlap) * chunk_size days from the start, trains on one window and
    validates on the chunk that follows it, so consecutive validation windows
    overlap by `overlap` of a chunk.

    Args:
        df (DataFrame): PySpark DataFrame with a timestamp column.
        time_col (str): Name of the timestamp column.
        k (int): Number of folds.
        blocking (bool): If True each training window starts at the fold's own
            offset; otherwise every fold trains from the earliest timestamp.
        overlap (float): Fraction of overlap between consecutive folds (e.g. 0.2 = 20%).
        verbose (bool): Whether to print the time splits (triggers a count per split).

    Returns:
        List of (train_df, val_df) tuples. May contain fewer than k entries if a
        fold's validation window would start at or after the last timestamp.
    """
    # Get time boundaries. Uses the `f` alias imported in the notebook's import cell;
    # the upper-case `F` alias is only imported further down the notebook.
    min_date = df.select(f.min(time_col)).first()[0]
    max_date = df.select(f.max(time_col)).first()[0]
    n_days = (max_date - min_date).days + 1

    # k folds shifted by (1 - overlap) chunks span k + 1 - overlap * (k - 1) chunk widths
    total_width = k + 1 - overlap * (k - 1)
    chunk_size = int(np.ceil(n_days / total_width))
    stride = (1 - overlap) * chunk_size

    if verbose:
        print(f"Splitting data into {k} folds with {overlap*100:.0f}% overlap")
        print(f"Min date: {min_date}, Max date: {max_date}")
        print(f"{chunk_size:,} days per fold")
        print("************************************************************")

    folds = []
    for i in range(k):
        # Every boundary of fold i is shifted by the same overlap-aware stride, so the
        # k folds exactly tile the span assumed by total_width.
        fold_offset = int(i * stride)
        train_start_offset = fold_offset if blocking else 0
        train_end_offset = fold_offset + chunk_size
        val_start_offset = train_end_offset
        val_end_offset = val_start_offset + chunk_size

        # Compute actual timestamps
        train_start = min_date + timedelta(days=train_start_offset)
        train_end = min_date + timedelta(days=train_end_offset)
        val_start = min_date + timedelta(days=val_start_offset)
        val_end = min_date + timedelta(days=val_end_offset)

        if val_start >= max_date:
            break
        if val_end > max_date:
            # exclusive upper bound one day past the last observation
            val_end = max_date + timedelta(days=1)

        # Apply filters (lower bound inclusive, upper bound exclusive)
        train_df = df.filter((f.col(time_col) >= train_start) & (f.col(time_col) < train_end))
        val_df = df.filter((f.col(time_col) >= val_start) & (f.col(time_col) < val_end))

        if verbose:
            print(f"Fold {i + 1}:")
            print(f"  TRAIN: {train_start.date()} → {train_end.date()} ({train_df.count():,} rows)")
            print(f"  VAL:   {val_start.date()} → {val_end.date()} ({val_df.count():,} rows)")
            print("------------------------------------------------------------")

        folds.append((train_df, val_df))

    return folds

In [0]:
# Testing Time Series CV function, using the fold count, overlap and time column
# configured in the options cell rather than repeating their values here
folds = time_series_cv_folds(
    df_train,
    time_col=dep_utc_varname,
    k=k,
    overlap=overlap,
    blocking=True,
    verbose=True
)

In [0]:
# Inspect the list of (train_df, val_df) tuples returned by time_series_cv_folds.
folds

In [0]:
# Number of folds in the fixed cv_cutoffs table.
len(cv_cutoffs)

In [0]:
def timeSeriesSplitCV(df, 
                      pre_pipeline,
                      hidden_layers,
                      stepSize,
                      maxIter,
                      blockSize,
                      cv_info= cv_cutoffs,
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Time-series cross-validation of a multilayer perceptron classifier.

  For every row of cv_info the data is filtered into a train and a dev window,
  the train window is optionally downsampled, pre_pipeline is fit on train and
  applied to both, and a MinMaxScaler + MultilayerPerceptronClassifier pipeline
  is fit on train and scored on dev.

  Params:
  1) df: Spark DataFrame with the `outcome` label, the feature columns and the
     per-fold columns daily_{i}/weekly_{i}/yearly_{i}/holidays_{i}/train_{i}
  2) pre_pipeline: unfitted Pipeline of indexers, encoders and vector assembler
     that writes a `features` column
  3) hidden_layers: hidden layer sizes in a list
  4) stepSize, maxIter, blockSize: passed to MultilayerPerceptronClassifier
  5) cv_info: cross validation info with train_min/train_max/test_min/test_max
     per fold (min inclusive, max exclusive)
  6) sampling: 'down' downsamples the train window with downsample(); any other
     value leaves it as is
  7) metric: only 'f2' (F-beta with beta=2 for label 1.0) is implemented
  8) verbose: currently unused
  9) dep_utc_varname: name of the time column used for the fold filters

  note that the scaling+classification pipeline is initialized and fit in this method itself 

  Returns the average score across folds.
  Raises ValueError for an unsupported metric.
  '''

  # Validate up front: previously an unsupported metric only surfaced as a
  # NameError on `score` after a whole fold had been fit.
  if metric != 'f2':
    raise ValueError(f"Unsupported metric {metric!r}; only 'f2' is implemented")

  # columns whose nulls are replaced with 0 before assembling features
  fill_cols = ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed','priororigin_mean_dep_delay']

  def _prep_fold_columns(fold_df, fold_idx):
    '''Rename this fold's per-fold columns to their generic names and zero-fill nulls.'''
    # NOTE(review): train_{i} is renamed to "pagerank", but the assembler column
    # lists defined in this notebook read a column called "train" - confirm which
    # column is intended to feed the model.
    renamed = fold_df \
      .withColumnRenamed(f"daily_{fold_idx}","daily") \
      .withColumnRenamed(f"weekly_{fold_idx}","weekly") \
      .withColumnRenamed(f"yearly_{fold_idx}","yearly") \
      .withColumnRenamed(f"holidays_{fold_idx}","holidays") \
      .withColumnRenamed(f"train_{fold_idx}","pagerank")
    return renamed.fillna({name: 0 for name in fill_cols})

  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )

  k = len(cv_info)
  
  # Track score
  scores=[]
  
  # Start k-fold
  for i in range(k):
    print(f"processing for fold {i}")
    
    # Keep handles on the cached fold frames so they can be released afterwards
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache() 

    try:
      # Apply sampling on train if selected
      train_df = downsample(train_cached) if sampling=='down' else train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      # Fit the encoding pipeline on train only, then apply it to train and dev
      print(f"fitting encoding pipeline for fold {i}")
      train_df_transformed_model = pre_pipeline.fit(train_df)

      print(f"encoding train set for fold {i}")
      train_df_transformed= train_df_transformed_model.transform(train_df)
    
      print(f"encoding dev set for fold {i}")
      dev_df_transformed = train_df_transformed_model.transform(dev_df)

      # The input layer width is the length of the assembled feature vector
      print(f"getting layer sizes for fold {i}")
      layers = [train_df_transformed.first()['features'].size] + list(hidden_layers) + [2]
      #input features, hidden layers, classification head

      scaler = MinMaxScaler(
          inputCol="features", 
          outputCol="features_scaled")
    
      classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                  featuresCol='features_scaled',
                                                  maxIter=maxIter,
                                                  stepSize=stepSize,
                                                  layers=layers,
                                                  blockSize=blockSize,
                                                  seed=1234)
      pipeline_mlp = Pipeline(stages=[scaler, classifier])

      print(f"fitting encoded train df for fold {i}")
      mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))

      print(f"transforming encoded dev df for fold {i}")
      dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

      score = evaluator.evaluate(dev_pred)
      scores.append(score)
      print(f'Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
      print('------------------------------------------------------------')
    finally:
      # release the cached fold data so it does not accumulate across folds
      # (and across repeated hyperparameter evaluations)
      train_cached.unpersist()
      dev_cached.unpersist()
  
  # Take average of all scores
  avg_score = np.average(scores)    
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")


  return avg_score

In [0]:
# Categorical features (string-indexed and one-hot encoded by the pre-pipeline)
cat_cols = [
    'OP_UNIQUE_CARRIER', 'priorflight_isdeparted', 'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc', 'QUARTER', "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "origin_type", "priororigin_type", "priorflight_carrier", "origin_region",
]

# Seasonality features: the "_full" names and the generic names used inside CV folds
seasonality_cols = ["daily_full", "weekly_full", "yearly_full", "holidays_full"]
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]

# Weather features at the origin airport
weather_cols = [
    "origin_HourlyDewPointTemperature",
    "origin_HourlyPrecipitation",
    "origin_HourlyWindGustSpeed",
    "origin_HourlyVisibility",
    "origin_HourlyPressureChange",
]

# Time-based delay features
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

# Numeric flight features
num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]

# Graph feature column
graph_cols = ["train"]

# Everything after the seasonality block is shared by both numeric lists
_shared_numeric_cols = time_cols + num_flight_cols + weather_cols + graph_cols
numeric_cols = seasonality_cols + _shared_numeric_cols
numeric_cols_cv = seasonality_cols_cv + _shared_numeric_cols

In [0]:
from typing import List, Dict, Tuple, Any, Union,Callable
import numpy as np
import random
from datetime import timedelta

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    MultilayerPerceptronClassifier
)
from xgboost.spark import SparkXGBClassifier
import mlflow
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

In [0]:
    # Encoding pipeline: string-index each categorical column, one-hot encode the
    # index, then assemble the encoded and numeric columns into `features`.
    # Unseen / invalid categories are kept as an extra bucket at both stages.
    indexers = [
        StringIndexer(inputCol=column, outputCol='{0}_index'.format(column), handleInvalid='keep')
        for column in cat_cols
    ]

    # handleInvalid is set through the constructor instead of a side-effect loop
    encoders = [
        OneHotEncoder(
            inputCol='{0}_index'.format(column),
            outputCol='{0}_ohe'.format(column),
            handleInvalid='keep'
        )
        for column in cat_cols
    ]

    # handleInvalid='skip' drops rows that contain a null/NaN in any input column
    featuresCreator = VectorAssembler(
        inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols_cv,
        outputCol='features', handleInvalid='skip')

    stages = indexers + encoders

    pre_pipeline = Pipeline(stages= stages + [featuresCreator])

In [0]:
def model_tuner(
    model_name: str,
    model_params: Dict[str, Any],
    mlflow_run_name: str = "/Users/m.bakr@berkeley.edu/flight_delay_tuning",
    metric: str = "F2",
    verbose: bool = True
) -> Dict[str, Union[float, str, Dict[str, Any]]]:
    """
    Run one time-series cross-validation with the given parameters and log it to MLflow.

    The encoding pre-pipeline is built here and handed to timeSeriesSplitCV together
    with `model_params`, using the notebook-level `df_train`. Note that
    timeSeriesSplitCV always trains a multilayer perceptron: `model_name` is
    validated and logged but does not select the estimator.

    Args:
        model_name (str): One of ['logreg', 'rf', 'mlp', 'xgb']
        model_params (Dict[str, Any]): Keyword arguments forwarded to timeSeriesSplitCV
            (hidden_layers, stepSize, maxIter, blockSize, ...)
        mlflow_run_name (str): Name given to the MLflow run
        metric (str): Label used in the logged metric name and the printed summary;
            it is not forwarded to timeSeriesSplitCV
        verbose (bool): Whether to print the summary line

    Returns:
        Dict with keys "model", "params" and "avg_f2_score"

    Raises:
        AssertionError: if model_name is not one of the supported names
    """

    # Supported model names
    model_factory = {
        "logreg": LogisticRegression,
        "rf": RandomForestClassifier,
        "mlp": MultilayerPerceptronClassifier,
        "xgb": SparkXGBClassifier
    }

    assert model_name in model_factory, f"Unsupported model: {model_name}"

    with mlflow.start_run(run_name=mlflow_run_name):

        indexers = [
            StringIndexer(inputCol=column, outputCol='{0}_index'.format(column), handleInvalid='keep')
            for column in cat_cols
        ]

        # handleInvalid is set through the constructor instead of a side-effect loop
        encoders = [
            OneHotEncoder(
                inputCol='{0}_index'.format(column),
                outputCol='{0}_ohe'.format(column),
                handleInvalid='keep'
            )
            for column in cat_cols
        ]

        featuresCreator = VectorAssembler(
            inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols_cv,
            outputCol='features', handleInvalid='skip')

        pre_pipeline = Pipeline(stages=indexers + encoders + [featuresCreator])

        # timeSeriesSplitCV returns the score already averaged over folds
        cv_score = timeSeriesSplitCV(df=df_train,
                                     pre_pipeline=pre_pipeline, **model_params)

        avg_score = float(np.mean(cv_score))
        mlflow.log_param("model", model_name)
        mlflow.log_params(model_params)
        # f-string: the metric name was previously logged with literal braces
        mlflow.log_metric(f"avg_{metric}_score", avg_score)

        if verbose:
            print(f"✅ Average {metric} Score: {avg_score:.4f} | Model: {model_name}")

    return {
        "model": model_name,
        "params": model_params,
        "avg_f2_score": avg_score
    }

In [0]:
# Display the fold boundaries that the CV functions use by default.
cv_cutoffs

In [0]:


def make_hyperopt_objective(
    model_name: str,
    param_space_converter: Callable[[Dict[str, Any]], Dict[str, Any]],
    mlflow_experiment_name: str = "Hyperopt_Universal_Tuning",
    verbose: bool = True
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
    """
    Build an objective function suitable for hyperopt.fmin().

    Args:
        model_name (str): One of 'logreg', 'rf', 'mlp', 'xgb'; forwarded to model_tuner.
        param_space_converter (Callable): Converts a Hyperopt sample into the
            parameter dict passed to model_tuner.
        mlflow_experiment_name (str): Accepted for call compatibility; not used here.
        verbose (bool): Forwarded to model_tuner.

    Returns:
        Callable taking one sampled parameter dict and returning a dict with
        "loss" (negative average F2, so that minimizing maximizes F2),
        "status" and "params".
    """

    def objective(sampled_params: Dict[str, Any]) -> Dict[str, Any]:
        # Translate the sample, then run one full cross-validated evaluation
        tuned = model_tuner(
            model_name=model_name,
            model_params=param_space_converter(sampled_params),
            mlflow_run_name=f"hyperopt_{model_name}",
            verbose=verbose
        )

        # Hyperopt minimizes, so the score is negated
        loss = -tuned["avg_f2_score"]
        return {"loss": loss, "status": STATUS_OK, "params": tuned["params"]}

    return objective


In [0]:
from hyperopt import space_eval

# Search space for the MLP hyperparameters
mlp_space = {
    "hidden_layers": hp.choice("hidden_layers", [[64, 32], [32, 8, 4], [128, 32]]),
    "stepSize": hp.uniform("stepSize", 0.01, 0.3),
    "maxIter": hp.choice("maxIter", [100, 200]),
    "blockSize": hp.choice("blockSize", [64, 128])
}

def mlp_param_mapper(sampled):
    """Convert a Hyperopt sample into the keyword arguments expected by timeSeriesSplitCV."""
    return {
        "hidden_layers": list(sampled["hidden_layers"]),
        "stepSize": sampled["stepSize"],
        "maxIter": sampled["maxIter"],
        "blockSize": sampled["blockSize"]
    }

mlp_obj = make_hyperopt_objective(
    model_name="mlp",
    param_space_converter=mlp_param_mapper,
    mlflow_experiment_name="MLP_Hyperopt",
    verbose=True
)

# Keep the Trials object so every evaluated configuration can be inspected afterwards
mlp_trials = Trials()

best_mlp = fmin(
    fn=mlp_obj,
    space=mlp_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=mlp_trials
)

# fmin reports hp.choice parameters as the index of the selected option;
# space_eval maps the result back to the actual parameter values.
best_mlp_params = space_eval(mlp_space, best_mlp)

print("Best MLP params:", best_mlp_params)

Must show training and performance scores, including training curves by epoch.

# Ensemble

## MLP

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
def downsample(train_df,verbose=False):
  '''
  Downsample the non-delayed class (outcome == 0) so it roughly matches the
  number of delayed rows (outcome == 1).

  Args:
    train_df: Spark DataFrame with a numeric `outcome` column.
    verbose: if True, print the class counts and the sampling fraction used.

  Returns:
    Spark DataFrame with all outcome == 1 rows unioned with a seeded random
    sample of the outcome == 0 rows. When the non-delayed class is empty or is
    not the larger class, its rows are kept without sampling.
  '''
  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0)

  # count both classes in a single aggregation instead of one job per class
  label_counts = {row['outcome']: row['count']
                  for row in train_df.groupBy('outcome').count().collect()}
  delay_count = label_counts.get(1, 0)
  non_delay_count = label_counts.get(0, 0)

  # Spark rejects fractions above 1 when sampling without replacement, and the
  # ratio is undefined with no non-delayed rows: only sample when it is a real
  # downsampling fraction.
  keep_percent = delay_count / non_delay_count if non_delay_count else 1.0
  if keep_percent < 1.0:
    train_non_delay = train_non_delay.sample(withReplacement=False,fraction=keep_percent,seed=42)

  if verbose:
    print(f"delayed: {delay_count:,}, non-delayed: {non_delay_count:,}, "
          f"non-delayed keep fraction: {min(keep_percent, 1.0):.4f}")

  return train_delay.union(train_non_delay)

In [0]:
def getTSCVmods(df, 
                      pre_pipeline,
                      cv_info, 
                      hidden_layers,
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname,
                      maxIter=200,
                      blockSize=128,
                      stepSize=.0524):
  '''
  Fit one encoder pipeline and one MLP pipeline per time-series CV fold and
  keep the fitted models.

  For every row of cv_info the data is filtered into a train and a dev window,
  the train window is optionally downsampled, pre_pipeline is fit on train and
  applied to both, and a MinMaxScaler + MultilayerPerceptronClassifier pipeline
  is fit on train and scored on dev.

  Params:
  1) df: Spark DataFrame with the `outcome` label, the feature columns and the
     per-fold columns daily_{i}/weekly_{i}/yearly_{i}/holidays_{i}/train_{i}
  2) pre_pipeline: unfitted Pipeline of indexers, encoders and vector assembler
     that writes a `features` column
  3) cv_info: cross validation info with train_min/train_max/test_min/test_max
     per fold (min inclusive, max exclusive)
  4) hidden_layers: hidden layer sizes in a list
  5) sampling: 'down' downsamples the train window with downsample(); any other
     value leaves it as is
  6) metric: only 'f2' (F-beta with beta=2 for label 1.0) is implemented
  7) verbose: currently unused
  8) dep_utc_varname: name of the time column used for the fold filters
  9) maxIter, blockSize, stepSize: passed to MultilayerPerceptronClassifier;
     the defaults are the values that were previously hard-coded here

  note that the scaling+classification pipeline is initialized and fit in this method itself 

  returns scores and pipelines: (scores, encoder_pipelines, classifier_pipelines),
  each a list with one entry per fold.
  Raises ValueError for an unsupported metric.
  '''

  # Validate up front: previously an unsupported metric only surfaced as a
  # NameError on `score` after a whole fold had been fit.
  if metric != 'f2':
    raise ValueError(f"Unsupported metric {metric!r}; only 'f2' is implemented")

  # columns whose nulls are replaced with 0 before assembling features
  # NOTE(review): timeSeriesSplitCV also zero-fills 'priororigin_mean_dep_delay';
  # here nulls in that column are left for the assembler to handle - confirm
  # which preprocessing is intended.
  fill_cols = ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed']

  def _prep_fold_columns(fold_df, fold_idx):
    '''Rename this fold's per-fold columns to their generic names and zero-fill nulls.'''
    renamed = fold_df \
      .withColumnRenamed(f"daily_{fold_idx}","daily") \
      .withColumnRenamed(f"weekly_{fold_idx}","weekly") \
      .withColumnRenamed(f"yearly_{fold_idx}","yearly") \
      .withColumnRenamed(f"holidays_{fold_idx}","holidays") \
      .withColumnRenamed(f"train_{fold_idx}","pagerank")
    return renamed.fillna({name: 0 for name in fill_cols})

  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )

  k = len(cv_info)
  
  # Track score and fitted models per fold
  scores=[]
  encoder_pipelines = []
  classifier_pipelines = []
  
  # Start k-fold
  for i in range(k):
    print(f"processing for fold {i}")
    
    # Keep handles on the cached fold frames so they can be released afterwards
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache() 

    try:
      # Apply sampling on train if selected
      train_df = downsample(train_cached) if sampling=='down' else train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      # Fit the encoding pipeline on train only, then apply it to train and dev
      print(f"fitting encoding pipeline for fold {i}")
      train_df_transformed_model = pre_pipeline.fit(train_df)
      encoder_pipelines.append(train_df_transformed_model)

      print(f"encoding train set for fold {i}")
      train_df_transformed= train_df_transformed_model.transform(train_df)
    
      print(f"encoding dev set for fold {i}")
      dev_df_transformed = train_df_transformed_model.transform(dev_df)

      # The input layer width is the length of the assembled feature vector
      print(f"getting layer sizes for fold {i}")
      layers = [train_df_transformed.first()['features'].size] + list(hidden_layers) + [2]
      #input features, hidden layers, classification head

      scaler = MinMaxScaler(
          inputCol="features", 
          outputCol="features_scaled")
    
      classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                  featuresCol='features_scaled',
                                                  maxIter=maxIter,
                                                  layers=layers,
                                                  blockSize=blockSize,
                                                  stepSize=stepSize,
                                                  seed=1234)
      pipeline_mlp = Pipeline(stages=[scaler, classifier])

      print(f"fitting encoded train df for fold {i}")
      mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))
      classifier_pipelines.append(mlp_model)

      print(f"transforming encoded dev df for fold {i}")
      dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

      score = evaluator.evaluate(dev_pred)
      scores.append(score)
      print(f'Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
      print('------------------------------------------------------------')
    finally:
      # release the cached fold data so it does not accumulate across folds
      train_cached.unpersist()
      dev_cached.unpersist()
  
  # Take average of all scores
  avg_score = np.average(scores)    
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")


  return scores, encoder_pipelines, classifier_pipelines

In [0]:
# Categorical features (string-indexed and one-hot encoded by the pre-pipeline)
cat_cols = [
    'OP_UNIQUE_CARRIER', 'priorflight_isdeparted', 'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc', 'QUARTER', "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "origin_type", "priororigin_type", "priorflight_carrier", "origin_region",
]

# Seasonality features: the "_full" names and the generic names used inside CV folds
seasonality_cols = ["daily_full", "weekly_full", "yearly_full", "holidays_full"]
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]

# Weather features at the origin airport
weather_cols = [
    "origin_HourlyDewPointTemperature",
    "origin_HourlyPrecipitation",
    "origin_HourlyWindGustSpeed",
    "origin_HourlyVisibility",
    "origin_HourlyPressureChange",
]

# Time-based delay features
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

# Numeric flight features
num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]

# Graph feature column used by numeric_cols
graph_cols = ["pagerank"]

# Columns shared by both numeric lists, between the seasonality and graph columns
_core_numeric_cols = time_cols + num_flight_cols + weather_cols
numeric_cols = seasonality_cols + _core_numeric_cols + graph_cols
# the CV list ends with the "train" column rather than graph_cols
numeric_cols_cv = seasonality_cols_cv + _core_numeric_cols + ["train"]

In [0]:
# Pre-pipeline: string-index each categorical column, one-hot encode the index,
# then assemble the encoded and numeric columns into `features`.
# Unseen / invalid categories are kept as an extra bucket at both stages.
indexers = [
    StringIndexer(inputCol=column, outputCol='{0}_index'.format(column), handleInvalid='keep')
    for column in cat_cols
]

# handleInvalid is set through the constructor instead of a side-effect loop
encoders = [
    OneHotEncoder(
        inputCol='{0}_index'.format(column),
        outputCol='{0}_ohe'.format(column),
        handleInvalid='keep'
    )
    for column in cat_cols
]

# handleInvalid='skip' drops rows that contain a null/NaN in any input column
featuresCreator = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols_cv,
    outputCol='features', handleInvalid='skip')

stages = indexers + encoders
vec_pipeline_full = Pipeline(stages= stages + [featuresCreator])

# Fit and keep one encoder + classifier pipeline per CV fold
scores, encoding_pipelines, classifier_pipelines = getTSCVmods(
    df_train,
    vec_pipeline_full,
    cv_cutoffs,
    hidden_layers = [128, 32],
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname=dep_utc_varname  # configured in the options cell
)

In [0]:
# Per-fold scores returned by getTSCVmods.
scores

In [0]:
# Same inspection as the previous cell (leftover duplicate).
scores

In [0]:
# Persist the fitted per-fold pipelines under the team's shared storage root,
# overwriting any previously saved copy. The location is built from team_BASE_DIR
# so it stays in sync with the data-loading cell (and no longer contains a doubled "/").
checkpoint_dir = f"{team_BASE_DIR}/interim/modeling_checkpoints"

for idx, pipeline in enumerate(encoding_pipelines):
    pipeline.write().overwrite().save(f"{checkpoint_dir}/encoding_pipeline_{idx}")

for idx, pipeline in enumerate(classifier_pipelines):
    pipeline.write().overwrite().save(f"{checkpoint_dir}/classifier_pipeline_{idx}")

In [0]:
# Plot the objective history reported by the last stage of each fold's classifier pipeline.
fig, ax = plt.subplots()

for fold_idx, fitted_pipeline in enumerate(classifier_pipelines):
    final_stage = fitted_pipeline.stages[-1]
    ax.plot(final_stage.summary().objectiveHistory, label=f'Fold {fold_idx}')

ax.set_xlabel('Iteration')
ax.set_ylabel('Log Loss')
ax.set_title('Objective History for Each Fold Pipeline')
ax.legend()
plt.show()

In [0]:
# Score the test set with the fold-0 encoder followed by the fold-0 classifier.
# The *_full seasonality columns are first renamed to their unsuffixed names.
test0 = classifier_pipelines[0].transform(
    encoding_pipelines[0].transform(
        df_test.withColumnsRenamed({
            'daily_full': 'daily',
            'weekly_full': 'weekly',
            'yearly_full': 'yearly',
            'holidays_full': 'holidays',
        })
    )
)

In [0]:
# Show the fitted classifier pipelines returned by getTSCVmods.
classifier_pipelines

In [0]:
# Test frame with the *_full seasonality columns renamed to their unsuffixed names.
starter = df_test.withColumnsRenamed(
    dict(
        daily_full='daily',
        weekly_full='weekly',
        yearly_full='yearly',
        holidays_full='holidays',
    )
)

In [0]:
# numeric_cols_cv already starts with seasonality_cols_cv, so plain concatenation listed
# those columns twice. dict.fromkeys removes repeats while keeping first-seen order.
filter_cols = list(dict.fromkeys(
    [*numeric_cols_cv, *seasonality_cols_cv, *cat_cols, 'outcome']
))

In [0]:
# Column names of the renamed test frame.
starter.columns

In [0]:
# NOTE(review): repeats the preceding `starter.columns` cell.
starter.columns

In [0]:
# NOTE(review): within the visible notebook, fold0_test_encoded is first assigned in a
# later cell, so this cell only works after out-of-order execution -- confirm, then
# move it below that assignment or remove it.
fold0_test_encoded.columns[:-25]

In [0]:
# NOTE(review): another repeat of the `starter.columns` inspection cell.
starter.columns

In [0]:
# Column names of the test frame before the seasonality columns are renamed.
df_test.columns

In [0]:
# Rename the *_full seasonality columns to their unsuffixed names.
starter = df_test.withColumnsRenamed(
    {f'{name}_full': name for name in ('daily', 'weekly', 'yearly', 'holidays')}
)

# Fold 0: encode the test frame, score it, and keep the class-1 probability
# (index 1 of the probability vector) as `fold0_probs`.
fold0_test_encoded = encoding_pipelines[0].transform(starter)
fold0_test_transformed = (
    classifier_pipelines[0]
    .transform(fold0_test_encoded)
    .withColumn("fold0_probs", vector_to_array("probability")[1])
)

In [0]:
# Column names of the fold-0 encoded frame with the last 25 entries dropped.
fold0_test_encoded.columns[:-25]

In [0]:
# Fold 1: feed the original test columns plus the fold-0 probability into the fold-1
# pipelines. `starter.columns` replaces the positional slice
# `fold0_test_encoded.columns[:-25]`, which dropped the last 25 columns of the fold-0
# encoded frame by position and would break if the encoder's output column count changed.
fold1_test_encoded = encoding_pipelines[1].transform(
    fold0_test_transformed.select(starter.columns + ['fold0_probs'])
)

# Keep the class-1 probability as `fold1_probs` and drop the pipeline's working columns.
fold1_test_transformed = (
    classifier_pipelines[1]
    .transform(fold1_test_encoded)
    .withColumn("fold1_probs", vector_to_array("probability")[1])
    .select(starter.columns + ['fold0_probs', 'fold1_probs'])  # fold 1 preds
)

In [0]:
# Render the test frame carrying the fold-0 and fold-1 probabilities.
display(fold1_test_transformed)

In [0]:
# Fold 2: feed the original test columns plus earlier fold probabilities into the fold-2
# pipelines. `starter.columns` replaces the positional slice
# `fold0_test_encoded.columns[:-25]`, which dropped the last 25 columns of the fold-0
# encoded frame by position and would break if the encoder's output column count changed.
fold2_test_encoded = encoding_pipelines[2].transform(
    fold1_test_transformed.select(starter.columns + ['fold0_probs', 'fold1_probs'])
)

# Keep the class-1 probability as `fold2_probs` and drop the pipeline's working columns.
fold2_test_transformed = (
    classifier_pipelines[2]
    .transform(fold2_test_encoded)
    .withColumn("fold2_probs", vector_to_array("probability")[1])
    .select(starter.columns + ['fold0_probs', 'fold1_probs', 'fold2_probs'])  # fold 2 preds
)

In [0]:
# Fold 3: feed the original test columns plus earlier fold probabilities into the fold-3
# pipelines. `starter.columns` replaces the positional slice
# `fold0_test_encoded.columns[:-25]`, which dropped the last 25 columns of the fold-0
# encoded frame by position and would break if the encoder's output column count changed.
fold3_test_encoded = encoding_pipelines[3].transform(
    fold2_test_transformed.select(
        starter.columns + ['fold0_probs', 'fold1_probs', 'fold2_probs']
    )
)

# Keep the class-1 probability as `fold3_probs` and drop the pipeline's working columns.
fold3_test_transformed = (
    classifier_pipelines[3]
    .transform(fold3_test_encoded)
    .withColumn("fold3_probs", vector_to_array("probability")[1])
    .select(
        starter.columns + ['fold0_probs', 'fold1_probs', 'fold2_probs', 'fold3_probs']
    )  # fold 3 preds
)

In [0]:
# Fold 4: feed the original test columns plus earlier fold probabilities into the fold-4
# pipelines. `starter.columns` replaces the positional slice
# `fold0_test_encoded.columns[:-25]`, which dropped the last 25 columns of the fold-0
# encoded frame by position and would break if the encoder's output column count changed.
fold4_test_encoded = encoding_pipelines[4].transform(
    fold3_test_transformed.select(
        starter.columns + ['fold0_probs', 'fold1_probs', 'fold2_probs', 'fold3_probs']
    )
)

# Keep the class-1 probability as `fold4_probs` and drop the pipeline's working columns.
fold4_test_transformed = (
    classifier_pipelines[4]
    .transform(fold4_test_encoded)
    .withColumn("fold4_probs", vector_to_array("probability")[1])
    .select(
        starter.columns
        + ['fold0_probs', 'fold1_probs', 'fold2_probs', 'fold3_probs', 'fold4_probs']
    )  # fold 4 preds
)

In [0]:
# Render the test frame carrying all five fold probabilities.
display(fold4_test_transformed)

In [0]:
alpha = 0.5  # decay rate; adjust as needed
num_folds = 5

# Exponents run from num_folds-1 (fold 0) down to 0 (last fold), so later folds weigh more.
raw_weights = np.fromiter(
    (alpha ** exponent for exponent in reversed(range(num_folds))),
    dtype=float,
)
weights = raw_weights / raw_weights.sum()  # normalize to sum to 1

In [0]:
# Show the normalized per-fold weights.
weights

In [0]:
# Weighted sum of the per-fold probability columns, using `weights`.
ewa_expr = 0  # same starting value sum() uses
for fold_idx in range(num_folds):
    ewa_expr = ewa_expr + weights[fold_idx] * col(f"fold{fold_idx}_probs")

final_df = fold4_test_transformed.withColumn("ewa_prob", ewa_expr)

In [0]:
# Threshold the weighted probability at 0.5 to get a 0/1 prediction.
is_positive = col('ewa_prob') >= 0.5
final_df = final_df.withColumn('prediction', when(is_positive, 1).otherwise(0))

In [0]:
# Render the frame with the weighted probability and thresholded prediction.
display(final_df)

In [0]:
# F-measure for label 1.0 with beta=2.0, computed on `outcome` vs `prediction`.
evaluator = MulticlassClassificationEvaluator(
    metricName="fMeasureByLabel",
    metricLabel=1.0,
    beta=2.0,
    labelCol="outcome",
    predictionCol='prediction',
)

# The prediction column is cast to double before evaluation.
prediction_as_double = col("prediction").cast(DoubleType())
evaluator.evaluate(final_df.withColumn("prediction", prediction_as_double))


In [0]:
# Prefix the MLP outputs with `mlp_`.
mlp_renames = {f'fold{fold}_probs': f'mlp_fold{fold}' for fold in range(5)}
mlp_renames['ewa_prob'] = 'mlp_ewa_prob'
mlp_renames['prediction'] = 'mlp_prediction'

final_df = final_df.withColumnsRenamed(mlp_renames)

In [0]:
# Write the MLP results, replacing any previous output at this path.
(
    final_df.write
    .mode("overwrite")
    .parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/mlp_results_df.parquet")
)

## XGBoost

In [0]:
# Reload the MLP results written by the previous section.
final_dfx = spark.read.parquet(
    "dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/mlp_results_df.parquet"
)

In [0]:
# Render the reloaded MLP results frame.
display(final_dfx)

In [0]:
from pyspark.ml.pipeline import PipelineModel


In [0]:
# categorical columns
categorical_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted',
    'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc',
    'QUARTER',
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "YEAR",
    "origin_type",
    "priororigin_type",
    "priorflight_carrier",
    "origin_region",
]

# seasonality columns
seasonality_cols = ["daily", "weekly", "yearly", "holidays"]

# weather columns: every `df` column containing "origin_Hourly"; the numeric subset
# excludes the three names in remove_me
remove_me = [
    "origin_HourlyPresentWeatherType",
    "origin_HourlySkyConditions",
    "origin_HourlyWindDirection",
]
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
num_weather_cols = [name for name in weather_cols if name not in remove_me]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

# numeric flight-level columns
num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]

# graph columns
graph_cols = ["pagerank"]

# non-feature columns to carry along
keep_me = ["outcome", dep_utc_varname]

numeric_cols = seasonality_cols + time_cols + num_flight_cols + num_weather_cols + graph_cols

In [0]:
# Load the five saved XGBoost fold pipelines, in fold order.
xgb_checkpoint_prefix = "dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/xgboost_fold_"

fold0_mod, fold1_mod, fold2_mod, fold3_mod, fold4_mod = [
    PipelineModel.load(f"{xgb_checkpoint_prefix}{fold}") for fold in range(5)
]




In [0]:
# MLP output columns carried through the XGBoost stage.
mlp_cols = [f'mlp_fold{fold}' for fold in range(5)] + ['mlp_ewa_prob', 'mlp_prediction']
filter_cols = keep_me + numeric_cols + categorical_cols + mlp_cols


In [0]:
# Column names of the reloaded MLP results frame.
final_dfx.columns

In [0]:
# Rename the `train` column to `pagerank`.
# NOTE(review): presumably the saved XGBoost pipelines read `pagerank` while the MLP
# frame carries that value as `train` -- confirm.
xgb_final_df = final_dfx.withColumnRenamed(
    'train',
    'pagerank',
)

In [0]:
# Render the frame that is fed to the XGBoost fold pipelines.
display(xgb_final_df)

In [0]:
# Fold 0: score with the first XGBoost pipeline and keep the class-1 probability.
fold0_test = (
    fold0_mod.transform(xgb_final_df)
    .withColumn("fold0_probs", vector_to_array("probability")[1])
    .select(xgb_final_df.columns + ['fold0_probs'])  # fold 0 preds
)


In [0]:
def _score_xgb_fold(model, frame, fold_idx):
    """Score `frame` with `model` and keep base columns plus fold0..fold_idx probabilities.

    The class-1 probability (index 1 of the probability vector) is stored as
    `fold{fold_idx}_probs`; the pipeline's other output columns are dropped by the select.
    """
    prob_cols = [f'fold{k}_probs' for k in range(fold_idx + 1)]
    return (
        model.transform(frame)
        .withColumn(f"fold{fold_idx}_probs", vector_to_array("probability")[1])
        .select(xgb_final_df.columns + prob_cols)
    )


# Each fold is scored on the previous fold's output so the probabilities accumulate.
fold1_test = _score_xgb_fold(fold1_mod, fold0_test, 1)  # fold 1 preds
fold2_test = _score_xgb_fold(fold2_mod, fold1_test, 2)  # fold 2 preds
fold3_test = _score_xgb_fold(fold3_mod, fold2_test, 3)  # fold 3 preds
fold4_test = _score_xgb_fold(fold4_mod, fold3_test, 4)  # fold 4 preds




In [0]:
# DataFrame.checkpoint() returns a new, checkpointed DataFrame and leaves the receiver
# unchanged, so the result has to be assigned back; previously it was discarded and the
# following cells kept using the frame with the full five-model lineage.
fold4_test = fold4_test.checkpoint()
display(fold4_test)

In [0]:
alpha = 0.5  # decay rate; adjust as needed
num_folds = 5

# One weight per fold: alpha raised to num_folds-1 for fold 0, down to 0 for the last fold.
fold_exponents = range(num_folds - 1, -1, -1)
raw_weights = np.array([alpha ** exponent for exponent in fold_exponents])
weights = raw_weights / raw_weights.sum()  # normalize to sum to 1

In [0]:
# Weighted sum of the per-fold XGBoost probabilities, then a 0.5 threshold.
weighted_terms = [weights[k] * col(f"fold{k}_probs") for k in range(num_folds)]
ewa_expr = sum(weighted_terms)

xgb_final_df = (
    fold4_test
    .withColumn("ewa_prob", ewa_expr)
    .withColumn('prediction', when(col('ewa_prob') >= 0.5, 1).otherwise(0))
)

In [0]:
# Prefix the XGBoost outputs with `xgb_`.
xgb_renames = {f'fold{fold}_probs': f'xgb_fold{fold}' for fold in range(5)}
xgb_renames['ewa_prob'] = 'xgb_ewa_prob'
xgb_renames['prediction'] = 'xgb_prediction'

xgb_final_df = xgb_final_df.withColumnsRenamed(xgb_renames)

In [0]:
# This cell previously held the incomplete expression `final_df.`, which is a
# SyntaxError. It is completed as a column listing so the notebook can run top to bottom.
# NOTE(review): the intended attribute is unknown -- confirm or delete this cell.
final_df.columns

In [0]:
# Write the combined MLP + XGBoost results, replacing any previous output at this path.
(
    xgb_final_df.write
    .mode("overwrite")
    .parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/model_results_df.parquet")
)

## Logistic Regression (LR)

In [0]:
# Reload the combined MLP + XGBoost results written by the previous section.
xgb_final_df = spark.read.parquet(
    "dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/model_results_df.parquet"
)

In [0]:
import mlflow

In [0]:
# Load the Spark model logged under this MLflow run.
run_id = "182e305fc13048549e73bcc23fb6af78"
model_uri = "runs:/{}/model".format(run_id)
loaded_lr_model = mlflow.spark.load_model(model_uri)
loaded_lr_model

In [0]:
# Score the combined results frame with the loaded model.
# NOTE(review): the frame written by the XGBoost section already had `train` renamed to
# `pagerank`, so this rename looks like a no-op on that file -- confirm.
lr_df = loaded_lr_model.transform(
    xgb_final_df.withColumnRenamed('train', 'pagerank')
)

In [0]:
# NOTE(review): this counts final_df (assigned in the MLP section), not the
# lr_df / xgb_final_df frames this section works with -- confirm which was intended.
final_df.count()

In [0]:
# Keep the class-1 probability (index 1 of the probability vector) as `lr_probs`.
class1_probability = vector_to_array("probability")[1]
lr_df = lr_df.withColumn("lr_probs", class1_probability)

In [0]:
# Give the model's `prediction` column an `lr_` prefix, matching mlp_/xgb_ naming.
lr_df = lr_df.withColumnRenamed(
    'prediction',
    'lr_prediction',
)

In [0]:
# Column names of the scored frame.
lr_df.columns

In [0]:
# Columns whose name contains one of the model tags.
[
    name
    for name in lr_df.columns
    if any(tag in name for tag in ('mlp', 'xgb', 'lr'))
]

In [0]:
# Write the MLP + XGBoost + LR results, replacing any previous output at this path.
(
    lr_df.write
    .mode("overwrite")
    .parquet("dbfs:/student-groups/Group_4_1/interim/modeling_checkpoints/mlp_xgb_lr_results.parquet")
)