# Imports and Setup

In [0]:
# # Configure Spark settings for better performance
# from pyspark.sql import SparkSession
# spark = SparkSession.builder\
#     .config("spark.executor.memory", "16g")\
#     .config("spark.executor.cores", 4)\
#     .appName('Final Project Training')\
#     .getOrCreate()
# spark.conf.set("spark.sql.shuffle.partitions", "200")
# spark.conf.set("spark.default.parallelism", "200")

In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, to_timestamp, to_date, when
from prophet.make_holidays import make_holidays_df
from xgboost.spark import SparkXGBClassifier

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
# Show the Spark version of the active session.
# NOTE(review): `spark` is never created in this notebook (the builder cell above is
# commented out) — presumably the runtime injects it; confirm.
spark.version

### Set options

In [0]:
# data time period
period = "" # one of the following values ("", "3m", "6m", "1y")

# number of cross-validation folds and overlap
k = 5
overlap = 0.2

# compute seasonality?
# (False if you've already saved out seasonality models for a given CV split setup)
compute_seasonality = False
apply_seasonality = False

# define train/test split date
# Only the periods listed here have a split date configured; any other value
# (including "6m") fails fast here with a clear message instead of a NameError
# further down when `min_test_dt` is first used.
min_test_dt_by_period = {
    "3m": "2015-03-01",
    "1y": "2019-10-01",
    "": "2019-01-01",
}
if period not in min_test_dt_by_period:
    raise ValueError(
        f"No train/test split date configured for period {period!r}; "
        f"supported periods: {sorted(min_test_dt_by_period)}"
    )
min_test_dt = min_test_dt_by_period[period]
print(f"Min test set date for {period} dataset: {min_test_dt}")

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

## Load data and perform simple transformations

In [0]:
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# read in joined, cleaned dataset
# (earlier checkpoints kept for reference)
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather_{period}.parquet") # !!!
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_weather_cleaned_combo.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_flights_weather{period}_v1.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat.parquet")
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned.parquet")
df = spark.read.parquet(
    f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr_v2.parquet"
)

df = (
    df
    # departure time as a timestamp
    .withColumn(dep_utc_varname, to_timestamp(col(dep_utc_varname)))
    # hour and date of departure (needed for seasonality and CV splits, respectively)
    .withColumn("dep_hour_utc", f.hour(col(dep_utc_varname)))
    .withColumn("dep_date_utc", to_date(col(dep_utc_varname)))
    # outcome is 1.0 when the flight departed >= 15 late or was cancelled, else 0.0
    .withColumn(
        "outcome",
        when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1).otherwise(0).cast("double"),
    )
)

# cast weather columns to double (all "origin_Hourly*" columns except the three excluded below)
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
num_weather_cols = [name for name in weather_cols if name not in remove_me]
for column in num_weather_cols:
    df = df.withColumn(column, col(column).cast("double"))

df.cache()


In [0]:
# # Group by the year and count the number of records for each year
# df_year_counts = df.groupBy("YEAR").count()

# # Display the result
# display(df_year_counts)

In [0]:
# Mean of the five per-fold columns train_0 ... train_4, stored as `pagerank_train`.
pagerank_total = df['train_0']
for fold_idx in range(1, 5):
    pagerank_total = pagerank_total + df[f'train_{fold_idx}']
df = df.withColumn('pagerank_train', pagerank_total / 5)

In [0]:
# split into train and test on the scheduled departure time:
#   train = everything before `min_test_dt`
#   test  = [`min_test_dt`, 2020-01-01)
df_train = df.where(f.col(dep_utc_varname) < min_test_dt)
# df_train.cache()
# print(f"Train data: {df_train.count()} records")
df_test = df.where(
    (f.col(dep_utc_varname) >= min_test_dt) & (f.col(dep_utc_varname) < "2020-01-01")
)
# df_test.cache()
# print(f"Test data: {df_test.count()} records")

## Get cross-validation splits

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def get_cv_time_limits_by_days_with_overlap(df, k=3, blocking=False, overlap=0, dep_utc_varname=dep_utc_varname, verbose=True):
    '''
    Get time bins for time-series cross validation, based on # days in dataset.

    The date span of `df` is divided into equally sized chunks (in days). Fold 0
    trains on the first chunk and tests on the following one; every later fold
    shifts its windows forward by roughly (1 - overlap) of a chunk.

    Parameters
    ----------
    df : Spark DataFrame containing a "dep_date_utc" date column.
    k : number of folds to generate.
    blocking : if True, each fold's training window starts at its own shifted
        offset; if False, every training window starts at the earliest date.
    overlap : fraction of a chunk by which consecutive folds overlap.
    dep_utc_varname : not used by this function; kept so existing calls keep working.
    verbose : print the split configuration and the resulting windows.

    Returns
    -------
    pandas.DataFrame with one row per fold and the columns
    train_min, train_max, test_min, test_max (max dates are non-inclusive).

    Raises
    ------
    ValueError if "dep_date_utc" holds no non-null values.
    '''

    # Fetch both bounds in a single aggregation instead of two separate jobs.
    bounds = df.agg(f.min("dep_date_utc"), f.max("dep_date_utc")).collect()[0]
    min_date, max_date = bounds[0], bounds[1]
    if min_date is None or max_date is None:
        raise ValueError('Cannot build CV splits: "dep_date_utc" has no non-null values')

    n_days = (max_date - min_date).days + 1
    total_width = k+1 - overlap*(k-1)
    chunk_size = np.ceil(n_days/total_width) # last chunk may be slightly smaller than the others

    if verbose:
        print(f'Splitting data into {k} folds with {overlap} overlap')
        print(f'Min date: {min_date}, max date: {max_date}')
        print(f'{chunk_size:,} days per fold')
        print("************************************************************")

    out = []
    for i in range(k):
        # define indices based on chunk size and overlap
        if i == 0:
            train_min_offset = 0
            train_max_offset = chunk_size
        else:
            # NOTE: the start advances by ceil(step) and the end by floor(step), so
            # with a fractional step a blocked training window shrinks by a day per
            # fold. Left unchanged so previously generated split dates stay reproducible.
            train_min_offset += np.ceil((1-overlap)*chunk_size)
            train_max_offset += np.floor((1-overlap)*chunk_size)
        # the test window starts exactly where the training window ends
        test_min_offset = train_max_offset
        test_max_offset = test_min_offset + chunk_size

        # define minimum training time based on cross-validation style
        if not blocking:
            t_min_train = min_date
        else:
            t_min_train = min_date + timedelta(days=train_min_offset)
        # define maximum training time
        t_max_train = min_date + timedelta(days=train_max_offset)
        # define minimum test time
        t_min_test = min_date + timedelta(days=test_min_offset)
        # define maximum test_time
        t_max_test = min_date + timedelta(days=test_max_offset)

        # never let a test window run past the day after the last observed date
        if t_max_test > max_date + timedelta(1):
            t_max_test = max_date + timedelta(1)

        out.append({"train_min":t_min_train, "train_max":t_max_train,
                    "test_min":t_min_test, "test_max":t_max_test})
    out = pd.DataFrame(out)

    if verbose:
        for i in range(k):
            print(f'    TRAIN set for fold {i} goes from {out["train_min"][i]} to {out["train_max"][i]}')
            print(f'    TEST set for fold {i} goes from {out["test_min"][i]} to {out["test_max"][i]}')
        print("(Note that the max dates are non-inclusive)")

    return out

In [0]:
# Hard-coded cross-validation windows, one row per fold.
# Each tuple is (train_min, train_max, test_min, test_max); dates are kept as strings.
cv_cutoffs = pd.DataFrame(
    [
        ("2014-12-31", "2015-10-09", "2015-10-09", "2016-07-17"),
        ("2015-08-14", "2016-05-21", "2016-05-21", "2017-02-27"),
        ("2016-03-27", "2017-01-01", "2017-01-01", "2017-10-10"),
        ("2016-11-08", "2017-08-14", "2017-08-14", "2018-05-23"),
        ("2017-06-22", "2018-03-27", "2018-03-27", "2019-01-01"),
    ],
    columns=["train_min", "train_max", "test_min", "test_max"],
)
cv_cutoffs

In [0]:
# # get cross-validation split times
# cv_cutoffs = get_cv_time_limits_by_days_with_overlap(df_train.select("dep_date_utc"), k=k, blocking=True, overlap=overlap,
#     dep_utc_varname=dep_utc_varname, verbose=True)
# cv_cutoffs

# Modeling Setup

### Define columns to be used in model.

In [0]:
# weather columns (every "origin_Hourly*" column except the three excluded below)
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [name for name in df.columns if "origin_Hourly" in name]
num_weather_cols = [c for c in weather_cols if c not in remove_me]

# seasonality columns
seasonality_cols = ["daily","weekly","yearly","holidays"]

# time columns
time_cols = ["mean_dep_delay","prop_delayed"]

# date related columns
date_cols = ["YEAR","MONTH","DAY_OF_MONTH","DAY_OF_WEEK"]

# flight metadata
flight_metadata_cols = ["OP_UNIQUE_CARRIER","ORIGIN_ICAO","DEST_ICAO"]

# prior & current flight cols
num_flight_cols = ['turnaround_time_calc',
                   'priorflight_depdelay_calc',
                   'DISTANCE',
                   'CRS_ELAPSED_TIME',
                ]

bool_flight_cols = ['priorflight_isdeparted',
                    'priorflight_isarrived_calc',
                    'priorflight_isdelayed_calc',
                    'priorflight_cancelled_true']

# graph columns
graph_cols = ["pagerank"]

# fields that will not be features here but need to be kept for processing:
#   - the label and the departure timestamp
#   - every per-fold / full-period column (names containing train/test/daily/weekly/yearly/holidays)
#   - categorical inputs that the MLP encoding pipeline (`cat_cols`, defined further down)
#     indexes but that are not part of `categorical_cols`; without them the column
#     selection on df_train/df_test drops them and the StringIndexer fit fails.
#     Names absent from `df` are filtered out when `filter_cols` is built.
fold_markers = ("train", "test", "daily", "weekly", "yearly", "holidays")
mlp_only_categoricals = ["QUARTER", "origin_type", "origin_region"]
keep_me = (
    ["outcome", dep_utc_varname]
    + [name for name in df.columns if any(marker in name for marker in fold_markers)]
    + mlp_only_categoricals
)

########## Define columns to be used as numeric and categorical features in the pipeline ##########
numeric_cols = [*num_weather_cols, *seasonality_cols, *time_cols, *num_flight_cols, *graph_cols]
categorical_cols = [*date_cols, *flight_metadata_cols, *bool_flight_cols]

In [0]:
# Keep only the bookkeeping + feature columns that actually exist in `df`,
# then cache the trimmed train and test frames.
available_cols = set(df.columns)
filter_cols = [c for c in (keep_me + numeric_cols + categorical_cols) if c in available_cols]

df_train = df_train.select(*filter_cols).cache()
df_test = df_test.select(*filter_cols).cache()
df_test

### Helper functions

In [0]:
# Inspect which columns survived the selection on the training frame.
df_train.columns

In [0]:
# CODE BELOW DERIVED FROM DEMO 11 NOTEBOOK

def upsample(train_df,verbose=False):
  '''Upsamples train_df to balance classes.

  Rows with outcome == 1 are sampled with replacement (seed 42) at a rate of
  (#outcome==0 / #outcome==1) and unioned with all outcome == 0 rows.

  Args:
    train_df: Spark DataFrame with an "outcome" column holding 0/1 labels.
    verbose: unused; kept for interface compatibility.

  Returns:
    Spark DataFrame: every outcome == 0 row followed by the resampled outcome == 1 rows.

  Raises:
    ValueError: if train_df has no outcome == 1 rows (the sampling rate is undefined).
  '''
  # One grouped count instead of two separate filter+count jobs.
  class_counts = {row["outcome"]: row["count"] for row in train_df.groupBy("outcome").count().collect()}
  delay_count = class_counts.get(1.0, 0)
  non_delay_count = class_counts.get(0.0, 0)
  if delay_count == 0:
    raise ValueError("Cannot upsample: train_df contains no rows with outcome == 1")

  # fraction may exceed 1 because sampling is done with replacement
  replication_fraction = non_delay_count / delay_count

  non_delayed = train_df.filter(f.col('outcome') == 0)
  delayed_resampled = train_df.filter(f.col('outcome') == 1).sample(withReplacement=True, fraction=replication_fraction,seed=42)
  return non_delayed.union(delayed_resampled)


def downsample(train_df,verbose=False):
  '''Downsamples train_df to balance classes.

  All outcome == 1 rows are kept; outcome == 0 rows are sampled without
  replacement (seed 42) at a rate of (#outcome==1 / #outcome==0).

  Args:
    train_df: Spark DataFrame with an "outcome" column holding 0/1 labels.
    verbose: unused; kept for interface compatibility.

  Returns:
    Spark DataFrame: every outcome == 1 row followed by the sampled outcome == 0 rows.

  Raises:
    ValueError: if train_df has no outcome == 0 rows (the sampling rate is undefined).
  '''
  # One grouped count instead of two separate filter+count jobs.
  class_counts = {row["outcome"]: row["count"] for row in train_df.groupBy("outcome").count().collect()}
  delay_count = class_counts.get(1.0, 0)
  non_delay_count = class_counts.get(0.0, 0)
  if non_delay_count == 0:
    raise ValueError("Cannot downsample: train_df contains no rows with outcome == 0")

  # Sampling without replacement needs a fraction in [0, 1]; cap it so a frame in
  # which outcome == 1 is already the majority keeps all rows instead of erroring.
  keep_fraction = min(1.0, delay_count / non_delay_count)

  delayed = train_df.filter(f.col('outcome') == 1)
  non_delayed_sampled = train_df.filter(f.col('outcome') == 0).sample(withReplacement=False,fraction=keep_fraction,seed=42)
  return delayed.union(non_delayed_sampled)

def cv_eval(preds):
  """
  Score a frame of model predictions.

  Input: transformed df with 'prediction', 'probability' and 'outcome' columns
  Output: tuple (F2 for label 1.0 rounded to 4 decimals, area under the PR curve)
  """
  # (prediction, label) pairs for the multiclass metrics
  pred_label_rdd = preds.select(['prediction', 'outcome']).rdd
  # (probability of class 1, label) pairs for the binary metrics
  score_label_rdd = (
    preds.select('outcome','probability')
    .rdd
    .map(lambda row: (float(row['probability'][1]), float(row['outcome'])))
  )

  multiclass = MulticlassMetrics(pred_label_rdd)
  binary = BinaryClassificationMetrics(score_label_rdd)

  f2_score = np.round(multiclass.fMeasure(label=1.0, beta=2.0), 4)
  return f2_score, binary.areaUnderPR

def timeSeriesSplitCV(df, pipeline, cv_info, sampling=None, metric='f2', verbose=True, dep_utc_varname=dep_utc_varname):
  '''
  Perform timeSeriesSplit k-fold cross validation.

  For every row of `cv_info` the train/dev windows are cut from `df`, the train
  window is optionally re-sampled, the fold-specific columns are renamed to their
  generic names, `pipeline` is fit on train and scored on dev.

  Args:
    df: Spark DataFrame holding `dep_utc_varname`, "outcome" and the per-fold columns
      daily_{i}, weekly_{i}, yearly_{i}, holidays_{i}, train_{i}.
    pipeline: Spark ML Pipeline whose output has prediction/probability columns.
    cv_info: table with train_min, train_max, test_min, test_max per fold,
      indexable as cv_info[column][i] (max bounds are exclusive).
    sampling: 'down', 'up' or anything else for no re-sampling of the train window.
    metric: 'f2' or 'pr' (area under the precision-recall curve).
    verbose: unused; kept for interface compatibility.
    dep_utc_varname: name of the departure timestamp column used for windowing.

  Returns:
    Average score over all folds.

  Raises:
    ValueError: if `metric` is unsupported or `cv_info` has no folds.

  NOTE: later cells of this notebook redefine a function with this same name and a
  different signature; whichever definition ran last is the one in effect.
  '''
  if metric not in ('f2', 'pr'):
    raise ValueError(f"Unsupported metric {metric!r}; expected 'f2' or 'pr'")

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info contains no folds")

  seasonality_names = ['daily','weekly','yearly','holidays']
  zero_fill_cols = [*seasonality_names, 'mean_dep_delay', 'prop_delayed']

  def _prep_fold_columns(fold_df, fold):
    # rename this fold's seasonality/pagerank columns to generic names, then fill nulls with 0
    for name in seasonality_names:
      fold_df = fold_df.withColumnRenamed(f"{name}_{fold}", name)
    fold_df = fold_df.withColumnRenamed(f"train_{fold}", "pagerank")
    return fold_df.fillna({c: 0 for c in zero_fill_cols})

  # Track score
  scores=[]

  # Start k-fold
  for i in range(k):

    # Create train set
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()

    # Create dev set
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    try:
      # Apply sampling on train if selected
      if sampling=='down':
        train_df = downsample(train_cached)
      elif sampling=='up':
        train_df = upsample(train_cached)
      else:
        train_df = train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      print(f"training for fold {i}")

      # Fit params on the model
      model = pipeline.fit(train_df)
      dev_pred = model.transform(dev_df)
      f2_score, pr_score = cv_eval(dev_pred)
      score = f2_score if metric=='f2' else pr_score
      scores.append(score)
      print(f'    Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}')
      print('------------------------------------------------------------')
    finally:
      # release this fold's cached windows so they do not pile up across folds
      train_cached.unpersist()
      dev_cached.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")

  return avg_score

In [0]:
# final training and evaluation

def final_eval(df_train, df_test,pipeline):
    """
    Fit `pipeline` on the down-sampled training frame and score it on the test frame.

    Both frames get the same column preparation: the "*_full" seasonality columns
    and "train" are renamed to their generic names and nulls in the
    seasonality / delay-history columns are filled with 0.

    Prints the F2 score and returns the test-set predictions.
    """
    rename_pairs = [
        ("daily_full", "daily"),
        ("weekly_full", "weekly"),
        ("yearly_full", "yearly"),
        ("holidays_full", "holidays"),
        ("train", "pagerank"),
    ]
    zero_fill = {name: 0 for name in
                 ['daily','weekly','yearly','holidays','mean_dep_delay', 'prop_delayed']}

    def _prepare(frame):
        # apply the renames in order, then fill the nulls
        for old_name, new_name in rename_pairs:
            frame = frame.withColumnRenamed(old_name, new_name)
        return frame.fillna(zero_fill)

    train_ready = _prepare(downsample(df_train).cache())
    test_ready = _prepare(df_test)

    fitted = pipeline.fit(train_ready)
    dev_pred = fitted.transform(test_ready)

    # get f2 score
    score = cv_eval(dev_pred)[0]
    print(score)

    return dev_pred

# Base MLP Model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [0]:
def timeSeriesSplitCV(df, 
                      pre_pipeline,
                      cv_info, 
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Perform timeSeriesSplit k-fold cross validation with a fixed-shape MLP. Params:
  1) df: Spark DataFrame holding `dep_utc_varname`, "outcome" and the per-fold
     columns daily_{i}, weekly_{i}, yearly_{i}, holidays_{i}, train_{i}
  2) pre_pipeline: indexers, encoders, and vector assembler (must output "features")
  3) cv_info: cross validation info, indexable as cv_info[column][i] with columns
     train_min, train_max, test_min, test_max (max bounds are exclusive)
  4) sampling: 'down' to down-sample the train window; anything else = no sampling
  5) metric: only 'f2' is supported
  6) verbose: unused; kept for interface compatibility
  7) dep_utc_varname: name of the departure timestamp column used for windowing

  Returns the average F2 (label 1.0) over all folds.
  Raises ValueError for an unsupported metric or an empty cv_info.

  note that the scaling+classification pipeline is initialized and fit in this method itself;
  the network shape is [n_features, 4, 2, 2].

  NOTE: this definition replaces the earlier function of the same name in this notebook.
  '''
  if metric != 'f2':
    # fail before any Spark work instead of hitting an undefined `score` in the loop
    raise ValueError(f"Unsupported metric {metric!r}; only 'f2' is implemented")

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info contains no folds")

  seasonality_names = ['daily','weekly','yearly','holidays']
  zero_fill_cols = [*seasonality_names, 'mean_dep_delay', 'prop_delayed']

  def _prep_fold_columns(fold_df, fold):
    # rename this fold's seasonality/pagerank columns to generic names, then fill nulls with 0
    for name in seasonality_names:
      fold_df = fold_df.withColumnRenamed(f"{name}_{fold}", name)
    fold_df = fold_df.withColumnRenamed(f"train_{fold}", "pagerank")
    return fold_df.fillna({c: 0 for c in zero_fill_cols})

  # The evaluator does not depend on the fold, so build it once.
  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )

  # Track score
  scores=[]

  # Start k-fold
  for i in range(k):
    print(f"processing for fold {i}")

    # Create train set
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()

    # Create dev set
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    try:
      # Apply sampling on train if selected
      train_df = downsample(train_cached) if sampling=='down' else train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      # Fit the first pipeline on the train window to get feature encodings:
      print(f"fitting encoding pipeline for fold {i}")
      encoding_model = pre_pipeline.fit(train_df)

      print(f"encoding train set for fold {i}")
      train_df_transformed = encoding_model.transform(train_df)

      print(f"encoding dev set for fold {i}")
      dev_df_transformed = encoding_model.transform(dev_df)

      # Fit the second pipeline to get scaling and classification:
      print(f"getting layer sizes for fold {i}")
      # input width is read from the first assembled feature vector
      layers = [train_df_transformed.first()['features'].size, 4, 2, 2]

      scaler = MinMaxScaler(
          inputCol="features", 
          outputCol="features_scaled")

      classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                  featuresCol='features_scaled',
                                                  maxIter=50,
                                                  layers=layers,
                                                  blockSize=128,
                                                  seed=1234)
      pipeline_mlp = Pipeline(stages=[scaler, classifier])

      print(f"fitting encoded train df for fold {i}")
      mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))

      print(f"transforming encoded dev df for fold {i}")
      dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

      score = evaluator.evaluate(dev_pred)

      scores.append(score)
      print(f'    Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
      print('------------------------------------------------------------')
    finally:
      # release this fold's cached windows so they do not pile up across folds
      train_cached.unpersist()
      dev_cached.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)    
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")


  return avg_score

In [0]:
# Categorical inputs for the MLP encoding pipeline (string-indexed, then one-hot encoded).
cat_cols = [
    # carrier
    "OP_UNIQUE_CARRIER",
    # prior-flight status flags
    "priorflight_isdeparted",
    "priorflight_isarrived_calc",
    "priorflight_isdelayed_calc",
    # calendar fields
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    # origin airport attributes
    "origin_type",
    "origin_region",
]

In [0]:
# Categorical inputs for the MLP encoding pipeline (string-indexed, then one-hot encoded).
cat_cols = [
    "OP_UNIQUE_CARRIER",
    "priorflight_isdeparted",
    "priorflight_isarrived_calc",
    "priorflight_isdelayed_calc",
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "origin_type",
    "origin_region",
]

# Numeric inputs, grouped by source.
# seasonality columns
seasonality_cols = [f"{component}_full" for component in ("daily", "weekly", "yearly", "holidays")]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed"]

# prior & current flight cols
num_flight_cols = [
    "turnaround_time_calc",
    "priorflight_depdelay_calc",
    "DISTANCE",
    "CRS_ELAPSED_TIME",
]

# graph columns
graph_cols = ["pagerank_train"]

numeric_cols = seasonality_cols + time_cols + num_flight_cols + graph_cols

In [0]:
# One StringIndexer + OneHotEncoder pair per categorical column; both are set to
# handleInvalid='keep'.
indexers = [
    StringIndexer(inputCol=name, outputCol=f'{name}_index', handleInvalid='keep')
    for name in cat_cols
]

encoders = [
    OneHotEncoder(inputCol=f'{name}_index', outputCol=f'{name}_ohe').setHandleInvalid('keep')
    for name in cat_cols
]

[enc.getHandleInvalid() for enc in encoders] #sanity check

In [0]:


# Fill missing values with 0 for the numeric feature columns present in the training frame
df_filled = df_train.na.fill({c: 0 for c in numeric_cols if c in df_train.columns})

# # Ensure the dataframe is properly formatted and has no issues with dimensions
# df_filled = df_filled.repartition(200)  # Adjust the number of partitions as needed

# Assemble the one-hot outputs and the numeric columns into a single 'features' vector;
# rows with invalid values are skipped by the assembler.
featuresCreator = VectorAssembler(
    inputCols=[enc.getOutputCol() for enc in encoders] + numeric_cols,
    outputCol='features',
    handleInvalid='skip',
)

stages = indexers + encoders
vec_pipeline_full = Pipeline(stages=[*stages, featuresCreator])

# Cross-validate the base MLP on the down-sampled folds (F2 score).
timeSeriesSplitCV(
    df_filled,
    vec_pipeline_full,
    cv_cutoffs,
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

In [0]:
# NOTE(review): `pipeline_mlp` is first assigned at module level in a later cell of this
# notebook, so on a fresh kernel (Restart & Run All) this call raises NameError. That
# pipeline also starts at MinMaxScaler(inputCol="features"), while final_eval never
# assembles a "features" column — confirm whether this cell should be dropped in favour
# of the step-by-step final evaluation in the cells below.
# NOTE(review): the test-frame fill dict is filtered on df_train.columns, not df_test.columns.
dev_pred_mlp = final_eval(df_train.fillna({c: 0 for c in numeric_cols if c in df_train.columns}), df_test.fillna({c: 0 for c in numeric_cols if c in df_train.columns}), pipeline_mlp)

In [0]:
# Fill missing values with 0 for the numeric feature columns present in the training frame
df_filled = df_train.na.fill({c: 0 for c in numeric_cols if c in df_train.columns})

# # Ensure the dataframe is properly formatted and has no issues with dimensions
# df_filled = df_filled.repartition(200)  # Adjust the number of partitions as needed

# Encoding pipeline for the final fit: indexers -> one-hot encoders -> vector assembler.
featuresCreator = VectorAssembler(
    inputCols=[enc.getOutputCol() for enc in encoders] + numeric_cols,
    outputCol='features',
    handleInvalid='skip',
)

stages = indexers + encoders
vec_pipeline_full = Pipeline(stages=[*stages, featuresCreator])




In [0]:
# Zero-fill the numeric feature columns of the held-out test frame (same rule as for train).
test_filled = df_test.fillna({c: 0 for c in numeric_cols if c in df_test.columns})


In [0]:
# Restrict the training frame to the model inputs plus the label.
df_train_filled = df_filled.select(*numeric_cols, *cat_cols, 'outcome')

In [0]:

# Fit the indexers/encoders/assembler on the full (not yet down-sampled) training frame.
encoding_model = vec_pipeline_full.fit(df_train_filled)

In [0]:

# Apply the fitted encoding pipeline to the training frame (adds the 'features' vector).
df_train_encoded = encoding_model.transform(df_train_filled)

In [0]:

# MLP layer sizes: input width read from the first encoded row, hidden layers 4 and 2, 2 outputs.
layers = [df_train_encoded.first()['features'].size, 4, 2, 2]

In [0]:
# Display the layer sizes chosen above.
layers

In [0]:

# Encode the test frame with the pipeline fitted on the training frame.
df_test_encoded = encoding_model.transform(test_filled)

In [0]:
# Sanity check: feature-vector width of the first encoded test row.
df_test_encoded.first()['features'].size

In [0]:
# Exploratory only: shows an alternative layer layout; the result is not assigned or used.
[df_test_encoded.first()['features'].size] + [4, 8, 16, 2]

In [0]:
# Inspect the columns of the encoded test frame.
df_test_encoded.columns

In [0]:

# Second-stage pipeline for the final model: min-max scale the assembled features,
# then train the multilayer perceptron with the layer sizes computed above.
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

classifier = MultilayerPerceptronClassifier(
    featuresCol='features_scaled',
    labelCol='outcome',
    layers=layers,
    maxIter=50,
    blockSize=128,
    seed=1234,
)

pipeline_mlp = Pipeline(stages=[scaler, classifier])


In [0]:

# Keep only the assembled feature vector and the label for model fitting.
df_train_encoded_selected = df_train_encoded.select('features','outcome')

In [0]:
def downsample(train_df,verbose=False):
  '''Downsamples train_df to balance classes.

  All outcome == 1 rows are kept; outcome == 0 rows are sampled without
  replacement (seed 42) at a rate of (#outcome==1 / #outcome==0).

  Args:
    train_df: Spark DataFrame with an "outcome" column holding 0/1 labels.
    verbose: unused; kept for interface compatibility.

  Returns:
    Spark DataFrame: every outcome == 1 row followed by the sampled outcome == 0 rows.

  Raises:
    ValueError: if train_df has no outcome == 0 rows (the sampling rate is undefined).

  NOTE: this repeats the definition of `downsample` from the helper-functions cell.
  '''
  # One grouped count instead of two separate filter+count jobs.
  class_counts = {row["outcome"]: row["count"] for row in train_df.groupBy("outcome").count().collect()}
  delay_count = class_counts.get(1.0, 0)
  non_delay_count = class_counts.get(0.0, 0)
  if non_delay_count == 0:
    raise ValueError("Cannot downsample: train_df contains no rows with outcome == 0")

  # Sampling without replacement needs a fraction in [0, 1]; cap it so a frame in
  # which outcome == 1 is already the majority keeps all rows instead of erroring.
  keep_fraction = min(1.0, delay_count / non_delay_count)

  delayed = train_df.filter(f.col('outcome') == 1)
  non_delayed_sampled = train_df.filter(f.col('outcome') == 0).sample(withReplacement=False,fraction=keep_fraction,seed=42)
  return delayed.union(non_delayed_sampled)

In [0]:

# Balance the classes of the encoded training frame before fitting the MLP.
downsampled_train = downsample(df_train_encoded_selected)

In [0]:

# Fit scaler + MLP on the balanced, encoded training data.
mlp_model = pipeline_mlp.fit(downsampled_train)

In [0]:

# Keep only the assembled feature vector and the label for scoring.
df_test_encoded_selected = df_test_encoded.select('features','outcome')

In [0]:

# Score the held-out test frame with the fitted model.
test_preds = mlp_model.transform(df_test_encoded_selected)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# F-beta (beta = 2) for label 1.0 on the held-out test predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome",
    metricName="fMeasureByLabel",
    metricLabel=1.0,
    beta=2.0,
)
evaluator.evaluate(test_preds)

In [0]:
# Render the test-set predictions for manual inspection.
display(test_preds)

# Upgrading MLP Model

In [0]:
# Cast 'priorflight_sched_elapsed' to int and divide it by 60, overwriting the column in place.
# NOTE: not idempotent — re-running this cell divides the already-scaled values by 60 again.
# NOTE(review): presumably a seconds-to-minutes conversion; confirm the source unit.
df = df.withColumn('priorflight_sched_elapsed', f.col('priorflight_sched_elapsed').cast('int')/60)

In [0]:


# Replace missing 'priorflight_sched_elapsed' values with 60 (applied after the /60 scaling above).
df=df.fillna({'priorflight_sched_elapsed': 60})


In [0]:
# Re-split into train and test so both frames pick up the columns just added to `df`:
#   train = everything before `min_test_dt`
#   test  = [`min_test_dt`, 2020-01-01)
df_train = df.where(f.col(dep_utc_varname) < min_test_dt)
# df_train.cache()
# print(f"Train data: {df_train.count()} records")
df_test = df.where(
    (f.col(dep_utc_varname) >= min_test_dt) & (f.col(dep_utc_varname) < "2020-01-01")
)
# df_test.cache()
# print(f"Test data: {df_test.count()} records")

In [0]:
# Categorical inputs for the MLP encoding pipeline (string-indexed, then one-hot encoded).
cat_cols = [
    "OP_UNIQUE_CARRIER",
    "priorflight_isdeparted",
    "priorflight_isarrived_calc",
    "priorflight_isdelayed_calc",
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "origin_type",
    "origin_region",
]

# seasonality columns: "*_full" names for the final fit, generic names for cross-validation
# (the generic names are produced by the per-fold rename inside timeSeriesSplitCV)
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]
seasonality_cols = [f"{component}_full" for component in seasonality_cols_cv]

# weather columns: every "origin_Hourly*" column except the three excluded below
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [name for name in df.columns if "origin_Hourly" in name and name not in remove_me]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed"]

# prior & current flight cols
num_flight_cols = [
    "turnaround_time_calc",
    "priorflight_depdelay_calc",
    "DISTANCE",
    "CRS_ELAPSED_TIME",
    "priorflight_sched_elapsed",
]

# graph columns
graph_cols = ["pagerank_train"]

shared_numeric_cols = [*time_cols, *num_flight_cols, *weather_cols, *graph_cols]
numeric_cols = seasonality_cols + shared_numeric_cols
numeric_cols_cv = seasonality_cols_cv + shared_numeric_cols

In [0]:
def timeSeriesSplitCV(df, 
                      pre_pipeline,
                      cv_info, 
                      hidden_layers,
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Perform timeSeriesSplit k-fold cross validation with a configurable MLP. Params:
  1) df: Spark DataFrame holding `dep_utc_varname`, "outcome" and the per-fold
     columns daily_{i}, weekly_{i}, yearly_{i}, holidays_{i}, train_{i}
  2) pre_pipeline: indexers, encoders, and vector assembler (must output "features")
  3) cv_info: cross validation info, indexable as cv_info[column][i] with columns
     train_min, train_max, test_min, test_max (max bounds are exclusive)
  4) hidden_layers: hidden layer sizes (any sequence of ints, e.g. [8, 4, 4])
  5) sampling: 'down' to down-sample the train window; anything else = no sampling
  6) metric: only 'f2' is supported
  7) verbose: unused; kept for interface compatibility
  8) dep_utc_varname: name of the departure timestamp column used for windowing

  Returns the average F2 (label 1.0) over all folds.
  Raises ValueError for an unsupported metric or an empty cv_info.

  note that the scaling+classification pipeline is initialized and fit in this method itself 

  NOTE: this definition replaces the earlier functions of the same name in this notebook.
  '''
  if metric != 'f2':
    # fail before any Spark work instead of hitting an undefined `score` in the loop
    raise ValueError(f"Unsupported metric {metric!r}; only 'f2' is implemented")

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info contains no folds")

  # accept tuples/other sequences as well as lists
  hidden_layers = list(hidden_layers)

  seasonality_names = ['daily','weekly','yearly','holidays']
  zero_fill_cols = [*seasonality_names, 'mean_dep_delay', 'prop_delayed']

  def _prep_fold_columns(fold_df, fold):
    # rename this fold's seasonality/pagerank columns to generic names, then fill nulls with 0
    for name in seasonality_names:
      fold_df = fold_df.withColumnRenamed(f"{name}_{fold}", name)
    fold_df = fold_df.withColumnRenamed(f"train_{fold}", "pagerank")
    return fold_df.fillna({c: 0 for c in zero_fill_cols})

  # The evaluator does not depend on the fold, so build it once.
  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )

  # Track score
  scores=[]

  # Start k-fold
  for i in range(k):
    print(f"processing for fold {i}")

    # Create train set
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()

    # Create dev set
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    try:
      # Apply sampling on train if selected
      train_df = downsample(train_cached) if sampling=='down' else train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      # Fit the first pipeline on the train window to get feature encodings:
      print(f"fitting encoding pipeline for fold {i}")
      encoding_model = pre_pipeline.fit(train_df)

      print(f"encoding train set for fold {i}")
      train_df_transformed = encoding_model.transform(train_df)

      print(f"encoding dev set for fold {i}")
      dev_df_transformed = encoding_model.transform(dev_df)

      # Fit the second pipeline to get scaling and classification:
      print(f"getting layer sizes for fold {i}")
      # input features, hidden layers, classification head
      layers = [train_df_transformed.first()['features'].size] + hidden_layers + [2]

      scaler = MinMaxScaler(
          inputCol="features", 
          outputCol="features_scaled")

      classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                  featuresCol='features_scaled',
                                                  maxIter=50,
                                                  layers=layers,
                                                  blockSize=128,
                                                  seed=1234)
      pipeline_mlp = Pipeline(stages=[scaler, classifier])

      print(f"fitting encoded train df for fold {i}")
      mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))

      print(f"transforming encoded dev df for fold {i}")
      dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

      score = evaluator.evaluate(dev_pred)

      scores.append(score)
      print(f'Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
      print('------------------------------------------------------------')
    finally:
      # release this fold's cached windows so they do not pile up across folds
      train_cached.unpersist()
      dev_cached.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)    
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")


  return avg_score

In [0]:
# Hard-coded cross-validation windows, one row per fold.
# Each tuple is (train_min, train_max, test_min, test_max); dates are kept as strings.
cv_cutoffs = pd.DataFrame(
    [
        ("2014-12-31", "2015-10-09", "2015-10-09", "2016-07-17"),
        ("2015-08-14", "2016-05-21", "2016-05-21", "2017-02-27"),
        ("2016-03-27", "2017-01-01", "2017-01-01", "2017-10-10"),
        ("2016-11-08", "2017-08-14", "2017-08-14", "2018-05-23"),
        ("2017-06-22", "2018-03-27", "2018-03-27", "2019-01-01"),
    ],
    columns=["train_min", "train_max", "test_min", "test_max"],
)
cv_cutoffs

In [0]:
# Pre-pipeline: string-index every categorical column, one-hot encode the
# index, then assemble the encoded vectors plus the numeric CV columns into a
# single `features` vector. Both stages are configured with handleInvalid='keep'.
indexers = [
    StringIndexer(inputCol=name, outputCol=f'{name}_index', handleInvalid='keep')
    for name in cat_cols
]
encoders = [
    OneHotEncoder(inputCol=f'{name}_index', outputCol=f'{name}_ohe', handleInvalid='keep')
    for name in cat_cols
]
assert all(enc.getHandleInvalid() == 'keep' for enc in encoders)  # sanity check

# Zero-fill the numeric CV columns that are present in df_train. The assembler
# below uses handleInvalid='skip', so rows that still carry a null in any of
# its inputs are dropped rather than raising.
df_filled = df_train.fillna(
    dict.fromkeys((c for c in numeric_cols_cv if c in df_train.columns), 0)
)

featuresCreator = VectorAssembler(
    inputCols=[*(enc.getOutputCol() for enc in encoders), *numeric_cols_cv],
    outputCol='features',
    handleInvalid='skip',
)

stages = [*indexers, *encoders]
vec_pipeline_full = Pipeline(stages=[*stages, featuresCreator])

# Cross-validate an MLP with hidden layers 8-4-4; the call is the cell's last
# expression so the returned average score is displayed.
timeSeriesSplitCV(
    df_filled,
    vec_pipeline_full,
    cv_cutoffs,
    hidden_layers=[8, 4, 4],
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

In [0]:
# Categorical inputs (string-indexed and one-hot encoded by the pre-pipeline).
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted', 'priorflight_isarrived_calc', 'priorflight_isdelayed_calc',
    'QUARTER', "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "origin_type", "origin_region",
]

# Seasonality components: the generic names are used inside cross validation,
# the *_full names for the full training period.
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]
seasonality_cols = [f"{component}_full" for component in seasonality_cols_cv]

# Weather: every origin_Hourly* column of df except the three excluded below.
remove_me = ["origin_HourlyPresentWeatherType", "origin_HourlySkyConditions", "origin_HourlyWindDirection"]
weather_cols = [
    name for name in df.columns
    if "origin_Hourly" in name and name not in remove_me
]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed"]

num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]
graph_cols = ["pagerank_train"]

# Final numeric feature lists; only the seasonality block differs between them.
_shared_numeric = time_cols + num_flight_cols + weather_cols + graph_cols
numeric_cols = seasonality_cols + _shared_numeric
numeric_cols_cv = seasonality_cols_cv + _shared_numeric
del _shared_numeric

# Features Round 2

In [0]:
# Cast priorflight_sched_elapsed to int, divide it by 60, then replace the
# remaining nulls with 60.
# NOTE(review): this rebinds `df` from itself, so re-running the cell divides
# the column by 60 a second time — run it exactly once per kernel session.
df = (
    df
    .withColumn('priorflight_sched_elapsed', f.col('priorflight_sched_elapsed').cast('int') / 60)
    .fillna({'priorflight_sched_elapsed': 60})
)

# split into train and test: train is everything before min_test_dt, test is
# [min_test_dt, 2020-01-01). Caching and row-count diagnostics stay disabled.
df_train = df.where(f.col(dep_utc_varname) < min_test_dt)
df_test = df.where(
    (f.col(dep_utc_varname) >= min_test_dt)
    & (f.col(dep_utc_varname) < "2020-01-01")
)

In [0]:
# Categorical inputs (string-indexed and one-hot encoded by the pre-pipeline).
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted', 'priorflight_isarrived_calc', 'priorflight_isdelayed_calc',
    'QUARTER', "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "origin_type", "priororigin_type", "priorflight_carrier", "origin_region",
]

# Seasonality components: the generic names are used inside cross validation,
# the *_full names for the full training period.
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]
seasonality_cols = [f"{component}_full" for component in seasonality_cols_cv]

# Explicit weather subset (replaces the earlier "all origin_Hourly*" selection).
weather_cols = [
    "origin_HourlyDewPointTemperature",
    "origin_HourlyPrecipitation",
    "origin_HourlyWindGustSpeed",
    "origin_HourlyVisibility",
    "origin_HourlyPressureChange",
]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]
graph_cols = ["pagerank_train"]

# Final numeric feature lists; only the seasonality block differs between them.
_shared_numeric = time_cols + num_flight_cols + weather_cols + graph_cols
numeric_cols = seasonality_cols + _shared_numeric
numeric_cols_cv = seasonality_cols_cv + _shared_numeric
del _shared_numeric

In [0]:
def timeSeriesSplitCV(df, 
                      pre_pipeline,
                      cv_info, 
                      hidden_layers,
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Cross-validate an MLP classifier over pre-defined time-series folds.

  For every fold in cv_info the function filters df into a train and a dev
  window, optionally downsamples the train window, fits pre_pipeline on the
  train rows, trains MinMaxScaler + MultilayerPerceptronClassifier on the
  resulting `features` column and scores the dev predictions.

  Params:
    df: Spark DataFrame holding the `outcome` label, the dep_utc_varname
      timestamp column and the columns pre_pipeline consumes. For fold i the
      columns daily_{i}, weekly_{i}, yearly_{i}, holidays_{i} and train_{i}
      are renamed to daily, weekly, yearly, holidays and pagerank.
    pre_pipeline: unfitted Pipeline (indexers, encoders, vector assembler)
      that must produce a `features` vector column.
    cv_info: table indexable as cv_info[name][i] with train_min, train_max,
      test_min and test_max bounds; each window is half-open [min, max).
    hidden_layers: hidden layer sizes; the input size and the 2-unit output
      layer are added here.
    sampling: 'down' applies downsample() to the train window; any other
      value trains on the window unchanged.
    metric: only 'f2' (fMeasureByLabel, beta=2, label 1.0) is implemented.
    verbose: when True, print progress messages and per-fold scores.
    dep_utc_varname: name of the timestamp column used for splitting.

  Returns:
    The mean of the per-fold scores.

  Raises:
    ValueError: unsupported metric, empty cv_info, or a fold whose train
      window yields no assembled rows.

  NOTE(review): the rename produces a column called `pagerank`, while the
  feature lists in this notebook assemble `pagerank_train` — confirm which
  column is meant to feed the model.
  '''
  # Fail before any Spark work instead of hitting an undefined `score`
  # after the first fold has already been trained.
  if metric != 'f2':
    raise ValueError(f"Unsupported metric: {metric!r} (only 'f2' is implemented)")

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info contains no folds")

  log = print if verbose else (lambda *args, **kwargs: None)

  zero_fill_cols = ['daily','weekly','yearly','holidays',
                    'mean_dep_delay','prop_delayed','priororigin_mean_dep_delay']

  def _prep_fold_columns(frame, fold):
    '''Give fold-specific columns their generic names, then zero-fill nulls.'''
    for src, dst in ((f"daily_{fold}", "daily"),
                     (f"weekly_{fold}", "weekly"),
                     (f"yearly_{fold}", "yearly"),
                     (f"holidays_{fold}", "holidays"),
                     (f"train_{fold}", "pagerank")):
      frame = frame.withColumnRenamed(src, dst)
    return frame.fillna({c: 0 for c in zero_fill_cols})

  # The evaluator does not depend on the fold, so build it once.
  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )

  # Track score
  scores = []

  # Start k-fold
  for i in range(k):
    log(f"processing for fold {i}")

    # Keep handles to the cached windows so they can be released afterwards.
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) &
                             (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) &
                           (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    try:
      # Apply sampling on train if selected
      train_df = downsample(train_cached) if sampling == 'down' else train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      # Fit the encoding pipeline on train only, then apply it to both sets.
      log(f"fitting encoding pipeline for fold {i}")
      encoding_model = pre_pipeline.fit(train_df)

      log(f"encoding train set for fold {i}")
      train_df_transformed = encoding_model.transform(train_df)

      log(f"encoding dev set for fold {i}")
      dev_df_transformed = encoding_model.transform(dev_df)

      # Layer sizes: input features, hidden layers, 2-class output head.
      log(f"getting layer sizes for fold {i}")
      first_row = train_df_transformed.first()
      if first_row is None:
        raise ValueError(f"fold {i} has no assembled training rows")
      layers = [first_row['features'].size] + list(hidden_layers) + [2]

      scaler = MinMaxScaler(
          inputCol="features", 
          outputCol="features_scaled")

      classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                  featuresCol='features_scaled',
                                                  maxIter=50,
                                                  layers=layers,
                                                  blockSize=256,
                                                  seed=1234)
      pipeline_mlp = Pipeline(stages=[scaler, classifier])

      log(f"fitting encoded train df for fold {i}")
      mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))

      log(f"transforming encoded dev df for fold {i}")
      dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

      score = evaluator.evaluate(dev_pred)
      scores.append(score)

      if verbose:
        # The count is a separate Spark job, so it only runs when reporting.
        print(f'Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
        print('------------------------------------------------------------')
    finally:
      # Release the fold's cached windows so later folds can use the memory.
      train_cached.unpersist()
      dev_cached.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)    
  log(f'Average {metric} score across all folds is {avg_score:.2f}')
  log("************************************************************")

  return avg_score

In [0]:
# List the column names of df_filled (the zero-filled training frame).
df_filled.columns

In [0]:
# Pre-pipeline for the round-2 feature lists: string-index every categorical
# column, one-hot encode the index, then assemble the encoded vectors plus the
# numeric CV columns into `features`. Both stages use handleInvalid='keep'.
indexers = [
    StringIndexer(inputCol=name, outputCol=f'{name}_index', handleInvalid='keep')
    for name in cat_cols
]
encoders = [
    OneHotEncoder(inputCol=f'{name}_index', outputCol=f'{name}_ohe', handleInvalid='keep')
    for name in cat_cols
]
assert all(enc.getHandleInvalid() == 'keep' for enc in encoders)  # sanity check

# df_filled is not rebuilt here; the frame created by the earlier pre-pipeline
# cell is reused as-is.

featuresCreator = VectorAssembler(
    inputCols=[*(enc.getOutputCol() for enc in encoders), *numeric_cols_cv],
    outputCol='features',
    handleInvalid='skip',
)

stages = [*indexers, *encoders]
vec_pipeline_full = Pipeline(stages=[*stages, featuresCreator])

# Cross-validate an MLP with hidden layers 4-2; the call is the cell's last
# expression so the returned average score is displayed.
timeSeriesSplitCV(
    df_filled,
    vec_pipeline_full,
    cv_cutoffs,
    hidden_layers=[4, 2],
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

In [0]:


# Final fit: encode with the full-period feature list (numeric_cols), train the
# MLP on the downsampled training period and produce predictions for the test
# period. `indexers` / `encoders` are the unfitted stages built in the
# preceding pre-pipeline cell.
featuresCreator = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols,
    outputCol='features', handleInvalid='skip')

stages = indexers + encoders
vec_pipeline_full = Pipeline(stages=stages + [featuresCreator])

test_filled = df_test.fillna({c: 0 for c in numeric_cols if c in df_test.columns})

# Zero-fill the same numeric columns on the training side. df_filled was only
# filled for the CV column names, so without this the *_full seasonality
# columns could still hold nulls and the 'skip' assembler would silently drop
# those training rows while the matching test rows are kept.
df_train_filled = (
    df_filled
    .select(*numeric_cols, *cat_cols, 'outcome')
    .fillna({c: 0 for c in numeric_cols})
)

encoding_model = vec_pipeline_full.fit(df_train_filled)
print("Fit the encoding model")


df_train_encoded = encoding_model.transform(df_train_filled)
print("Encoded train set")


# Input width comes from the assembled vector; two hidden layers of 4 units
# and a 2-class output head.
layers = [df_train_encoded.first()['features'].size,4,4,2]
print(f"Got layer sizes: {layers}")

df_test_encoded = encoding_model.transform(test_filled)
print("Encoded test set")


scaler = MinMaxScaler(
        inputCol="features", 
        outputCol="features_scaled")
    
classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                featuresCol='features_scaled',
                                                maxIter=50,
                                                layers=layers,
                                                blockSize=128,
                                                seed=1234)
pipeline_mlp = Pipeline(stages=[scaler, classifier])

# Downsampling happens after encoding, so the encoders are fit on the full
# (unbalanced) training period and only the classifier sees balanced classes.
df_train_encoded_selected = df_train_encoded.select('features','outcome')
downsampled_train = downsample(df_train_encoded_selected)
print("Downsampled train set")


mlp_model = pipeline_mlp.fit(downsampled_train)
print("Fit train set")


df_test_encoded_selected = df_test_encoded.select('features','outcome')

test_preds = mlp_model.transform(df_test_encoded_selected)
print("Transformed test set")



In [0]:
# Categorical inputs (string-indexed and one-hot encoded by the pre-pipeline).
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted', 'priorflight_isarrived_calc', 'priorflight_isdelayed_calc',
    'QUARTER', "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "origin_type", "priororigin_type", "priorflight_carrier", "origin_region",
]

# Seasonality components: the generic names are used inside cross validation,
# the *_full names for the full training period.
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]
seasonality_cols = [f"{component}_full" for component in seasonality_cols_cv]

# Explicit weather subset.
weather_cols = [
    "origin_HourlyDewPointTemperature",
    "origin_HourlyPrecipitation",
    "origin_HourlyWindGustSpeed",
    "origin_HourlyVisibility",
    "origin_HourlyPressureChange",
]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]
graph_cols = ["pagerank_train"]

# Final numeric feature lists; only the seasonality block differs between them.
_shared_numeric = time_cols + num_flight_cols + weather_cols + graph_cols
numeric_cols = seasonality_cols + _shared_numeric
numeric_cols_cv = seasonality_cols_cv + _shared_numeric
del _shared_numeric

In [0]:
# Pre-pipeline: string-index every categorical column, one-hot encode the
# index, then assemble the encoded vectors plus the numeric CV columns into
# `features`. Both stages use handleInvalid='keep'.
indexers = [
    StringIndexer(inputCol=name, outputCol=f'{name}_index', handleInvalid='keep')
    for name in cat_cols
]
encoders = [
    OneHotEncoder(inputCol=f'{name}_index', outputCol=f'{name}_ohe', handleInvalid='keep')
    for name in cat_cols
]
assert all(enc.getHandleInvalid() == 'keep' for enc in encoders)  # sanity check

# df_filled is not rebuilt here; the frame created by the earlier pre-pipeline
# cell is reused as-is.

featuresCreator = VectorAssembler(
    inputCols=[*(enc.getOutputCol() for enc in encoders), *numeric_cols_cv],
    outputCol='features',
    handleInvalid='skip',
)

stages = [*indexers, *encoders]
vec_pipeline_full = Pipeline(stages=[*stages, featuresCreator])

# Cross-validate an MLP with hidden layers 4-4; the call is the cell's last
# expression so the returned average score is displayed.
timeSeriesSplitCV(
    df_filled,
    vec_pipeline_full,
    cv_cutoffs,
    hidden_layers=[4, 4],
    sampling='down',
    metric='f2',
    verbose=True,
    dep_utc_varname='sched_depart_utc',
)

# Hyperparameter Tuning

In [0]:
def downsample(train_df,verbose=False):
  '''
  Balance classes by randomly dropping non-delayed (outcome == 0) rows.

  Every outcome == 1 row is kept. Outcome == 0 rows are sampled without
  replacement (seed 42) at a rate of delay_count / non_delay_count. When the
  non-delayed class is not the majority (including when it is empty) nothing
  is sampled, which avoids a division by zero and a sampling fraction above 1.
  Rows whose outcome is neither 0 nor 1 are not part of the result.

  Params:
    train_df: Spark DataFrame with an `outcome` column.
    verbose: when True, print the class counts and the fraction that is kept.

  Returns:
    Union of the delayed rows and the (possibly sampled) non-delayed rows.
  '''
  outcome = train_df["outcome"]
  train_delay = train_df.filter(outcome == 1)
  train_non_delay = train_df.filter(outcome == 0)

  delay_count = train_delay.count()
  non_delay_count = train_non_delay.count()

  if non_delay_count > delay_count:
    # non_delay_count > delay_count >= 0 guarantees a non-zero denominator
    # and a fraction strictly below 1.
    keep_percent = delay_count / non_delay_count
    train_non_delay = train_non_delay.sample(withReplacement=False,fraction=keep_percent,seed=42)
  else:
    keep_percent = 1.0

  if verbose:
    print(f"downsample: {delay_count:,} delayed, {non_delay_count:,} non-delayed, "
          f"keeping {keep_percent:.3f} of non-delayed rows")

  return train_delay.union(train_non_delay)

In [0]:
def time_series_cv_folds(
    df,
    time_col: str,
    k: int=3,
    blocking: bool=False,
    overlap: float=0.0,
    verbose: bool=False
):
    """
    Split a time-series PySpark DataFrame into k train/test folds with optional overlap and blocking.

    The date range of `time_col` is divided into windows of `chunk_size` days.
    Fold i starts at i * (1 - overlap) * chunk_size days after the first date;
    its train window ends one chunk later and its validation window covers the
    chunk after that, so consecutive validation windows overlap by `overlap`.
    With overlap=0 this is the plain consecutive-chunk split.

    Args:
        df (DataFrame): PySpark DataFrame with a timestamp column.
        time_col (str): Name of the timestamp column.
        k (int): Number of folds (>= 1).
        blocking (bool): If True each train window starts at the fold offset
            (fixed width); if False it always starts at the first date
            (expanding window).
        overlap (float): Fraction of overlap between validation windows
            (e.g. 0.2 = 20% overlap); must satisfy 0 <= overlap < 1.
        verbose (bool): Whether to print the time splits (this counts the rows
            of every fold, which triggers Spark jobs).

    Returns:
        List of (train_df, val_df) tuples. Fewer than k tuples are returned
        when a validation window would start at or after the last date.

    Raises:
        ValueError: invalid k or overlap, or no non-null values in time_col.
    """
    # Imported locally so the function does not depend on an `F` alias that
    # may only be defined by a later notebook cell.
    from pyspark.sql import functions as F

    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if not 0.0 <= overlap < 1.0:
        raise ValueError(f"overlap must be in [0, 1), got {overlap}")

    # Get time boundaries (both aggregates in a single job)
    bounds = df.select(F.min(time_col), F.max(time_col)).first()
    min_date, max_date = bounds[0], bounds[1]
    if min_date is None or max_date is None:
        raise ValueError(f"column {time_col!r} has no non-null values")
    n_days = (max_date - min_date).days + 1

    # Adjust chunk sizing: k folds of two chunks each, stepping by
    # (1 - overlap) chunks, span k + 1 - overlap * (k - 1) chunks in total.
    total_width = k + 1 - overlap * (k - 1)
    chunk_size = int(np.ceil(n_days / total_width))

    if verbose:
        print(f"Splitting data into {k} folds with {overlap*100:.0f}% overlap")
        print(f"Min date: {min_date}, Max date: {max_date}")
        print(f"{chunk_size:,} days per fold")
        print("************************************************************")

    folds = []
    for i in range(k):
        # Offset calculation with overlap: every boundary of the fold is
        # derived from the same fold offset so the windows actually slide.
        fold_offset = int(i * (1 - overlap) * chunk_size)
        train_start_offset = fold_offset if blocking else 0
        train_end_offset = fold_offset + chunk_size
        val_start_offset = train_end_offset
        val_end_offset = val_start_offset + chunk_size

        # Compute actual timestamps
        train_start = min_date + timedelta(days=train_start_offset)
        train_end = min_date + timedelta(days=train_end_offset)
        val_start = min_date + timedelta(days=val_start_offset)
        val_end = min_date + timedelta(days=val_end_offset)

        if val_start >= max_date:
            break
        if val_end > max_date:
            # Clip to one day past the last date so the final rows are included
            # by the half-open filter below.
            val_end = max_date + timedelta(days=1)

        # Apply filters (half-open windows)
        train_df = df.filter((F.col(time_col) >= train_start) & (F.col(time_col) < train_end))
        val_df = df.filter((F.col(time_col) >= val_start) & (F.col(time_col) < val_end))

        if verbose:
            print(f"Fold {i + 1}:")
            print(f"  TRAIN: {train_start.date()} → {train_end.date()} ({train_df.count():,} rows)")
            print(f"  VAL:   {val_start.date()} → {val_end.date()} ({val_df.count():,} rows)")
            print("------------------------------------------------------------")

        folds.append((train_df, val_df))

    return folds

In [0]:
# Testing Time Series CV function
folds = time_series_cv_folds(
    df_train,
    time_col="sched_depart_utc",
    k=5,
    overlap=.2,  # 20% overlap between windows
    blocking=True,  # train window starts at the fold offset instead of the first date
    verbose=True  # print each fold's date range and row counts
)

In [0]:
# Display the list of (train_df, val_df) tuples returned by time_series_cv_folds.
folds

In [0]:
# Number of rows in cv_cutoffs, i.e. the number of CV folds (one row per fold).
len(cv_cutoffs)

In [0]:
def timeSeriesSplitCV(df, 
                      pre_pipeline,
                      hidden_layers,
                      stepSize,
                      maxIter,
                      blockSize,
                      cv_info= cv_cutoffs,
                      sampling='down', 
                      metric='f2', 
                      verbose=True,
                      dep_utc_varname=dep_utc_varname):
  '''
  Cross-validate a tunable MLP classifier over pre-defined time-series folds.

  For every fold in cv_info the function filters df into a train and a dev
  window, optionally downsamples the train window, fits pre_pipeline on the
  train rows, trains MinMaxScaler + MultilayerPerceptronClassifier on the
  resulting `features` column and scores the dev predictions.

  Params:
    df: Spark DataFrame holding the `outcome` label, the dep_utc_varname
      timestamp column and the columns pre_pipeline consumes. For fold i the
      columns daily_{i}, weekly_{i}, yearly_{i}, holidays_{i} and train_{i}
      are renamed to daily, weekly, yearly, holidays and pagerank.
    pre_pipeline: unfitted Pipeline (indexers, encoders, vector assembler)
      that must produce a `features` vector column.
    hidden_layers: hidden layer sizes; the input size and the 2-unit output
      layer are added here.
    stepSize, maxIter, blockSize: passed straight to
      MultilayerPerceptronClassifier.
    cv_info: table indexable as cv_info[name][i] with train_min, train_max,
      test_min and test_max bounds; each window is half-open [min, max).
      Defaults to the notebook-level cv_cutoffs captured at definition time.
    sampling: 'down' applies downsample() to the train window; any other
      value trains on the window unchanged.
    metric: only 'f2' (fMeasureByLabel, beta=2, label 1.0) is implemented.
    verbose: when True, print progress messages and per-fold scores.
    dep_utc_varname: name of the timestamp column used for splitting.

  Returns:
    The mean of the per-fold scores.

  Raises:
    ValueError: unsupported metric, empty cv_info, or a fold whose train
      window yields no assembled rows.

  NOTE(review): the rename produces a column called `pagerank`; confirm it
  matches the graph column name the caller's assembler expects.
  '''
  # Fail before any Spark work instead of hitting an undefined `score`
  # after the first fold has already been trained.
  if metric != 'f2':
    raise ValueError(f"Unsupported metric: {metric!r} (only 'f2' is implemented)")

  k = len(cv_info)
  if k == 0:
    raise ValueError("cv_info contains no folds")

  log = print if verbose else (lambda *args, **kwargs: None)

  zero_fill_cols = ['daily','weekly','yearly','holidays',
                    'mean_dep_delay','prop_delayed','priororigin_mean_dep_delay']

  def _prep_fold_columns(frame, fold):
    '''Give fold-specific columns their generic names, then zero-fill nulls.'''
    for src, dst in ((f"daily_{fold}", "daily"),
                     (f"weekly_{fold}", "weekly"),
                     (f"yearly_{fold}", "yearly"),
                     (f"holidays_{fold}", "holidays"),
                     (f"train_{fold}", "pagerank")):
      frame = frame.withColumnRenamed(src, dst)
    return frame.fillna({c: 0 for c in zero_fill_cols})

  # The evaluator does not depend on the fold, so build it once.
  evaluator = MulticlassClassificationEvaluator(
    labelCol="outcome", 
    metricName="fMeasureByLabel",
    beta=2.0,
    metricLabel=1.0
  )

  # Track score
  scores = []

  # Start k-fold
  for i in range(k):
    log(f"processing for fold {i}")

    # Keep handles to the cached windows so they can be released afterwards.
    train_cached = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) &
                             (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
    dev_cached = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) &
                           (df[dep_utc_varname] < cv_info["test_max"][i])).cache()

    try:
      # Apply sampling on train if selected
      train_df = downsample(train_cached) if sampling == 'down' else train_cached

      # prep seasonality columns (rename, fill as needed)
      train_df = _prep_fold_columns(train_df, i)
      dev_df = _prep_fold_columns(dev_cached, i)

      # Fit the encoding pipeline on train only, then apply it to both sets.
      log(f"fitting encoding pipeline for fold {i}")
      encoding_model = pre_pipeline.fit(train_df)

      log(f"encoding train set for fold {i}")
      train_df_transformed = encoding_model.transform(train_df)

      log(f"encoding dev set for fold {i}")
      dev_df_transformed = encoding_model.transform(dev_df)

      # Layer sizes: input features, hidden layers, 2-class output head.
      log(f"getting layer sizes for fold {i}")
      first_row = train_df_transformed.first()
      if first_row is None:
        raise ValueError(f"fold {i} has no assembled training rows")
      layers = [first_row['features'].size] + list(hidden_layers) + [2]

      scaler = MinMaxScaler(
          inputCol="features", 
          outputCol="features_scaled")

      # NOTE(review): no solver is set, so Spark's default applies. Spark
      # documents stepSize as a gradient-descent ('gd') setting — confirm it
      # has any effect here before spending tuning budget on it.
      classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                                  featuresCol='features_scaled',
                                                  maxIter=maxIter,
                                                  stepSize=stepSize,
                                                  layers=layers,
                                                  blockSize=blockSize,
                                                  seed=1234)
      pipeline_mlp = Pipeline(stages=[scaler, classifier])

      log(f"fitting encoded train df for fold {i}")
      mlp_model = pipeline_mlp.fit(train_df_transformed.select('features','outcome'))

      log(f"transforming encoded dev df for fold {i}")
      dev_pred = mlp_model.transform(dev_df_transformed.select('features','outcome'))

      score = evaluator.evaluate(dev_pred)
      scores.append(score)

      if verbose:
        # The count is a separate Spark job, so it only runs when reporting.
        print(f'Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}') 
        print('------------------------------------------------------------')
    finally:
      # Release the fold's cached windows so later folds can use the memory.
      train_cached.unpersist()
      dev_cached.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)    
  log(f'Average {metric} score across all folds is {avg_score:.2f}')
  log("************************************************************")

  return avg_score

In [0]:
# Categorical inputs (string-indexed and one-hot encoded by the pre-pipeline).
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted', 'priorflight_isarrived_calc', 'priorflight_isdelayed_calc',
    'QUARTER', "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "origin_type", "priororigin_type", "priorflight_carrier", "origin_region",
]

# Seasonality components: the generic names are used inside cross validation,
# the *_full names for the full training period.
seasonality_cols_cv = ["daily", "weekly", "yearly", "holidays"]
seasonality_cols = [f"{component}_full" for component in seasonality_cols_cv]

# Explicit weather subset.
weather_cols = [
    "origin_HourlyDewPointTemperature",
    "origin_HourlyPrecipitation",
    "origin_HourlyWindGustSpeed",
    "origin_HourlyVisibility",
    "origin_HourlyPressureChange",
]

# time columns
time_cols = ["mean_dep_delay", "prop_delayed", "priororigin_mean_dep_delay"]

num_flight_cols = [
    'turnaround_time_calc',
    'priorflight_depdelay_calc',
    'DISTANCE',
    'CRS_ELAPSED_TIME',
    'priorflight_sched_elapsed',
]
# Unlike the earlier feature cells, the graph feature here is the column
# named "train".
graph_cols = ["train"]

# Final numeric feature lists; only the seasonality block differs between them.
_shared_numeric = time_cols + num_flight_cols + weather_cols + graph_cols
numeric_cols = seasonality_cols + _shared_numeric
numeric_cols_cv = seasonality_cols_cv + _shared_numeric
del _shared_numeric

In [0]:
from typing import List, Dict, Tuple, Any, Union,Callable
import numpy as np
import random
from datetime import timedelta

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    MultilayerPerceptronClassifier
)
from xgboost.spark import SparkXGBClassifier
import mlflow
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

In [0]:
    # Stand-alone pre-pipeline for the tuning section: string-index every
    # categorical column, one-hot encode the index, then assemble the encoded
    # vectors plus the numeric CV columns into `features`.
    # Both stages use handleInvalid='keep'.
    indexers = [
        StringIndexer(inputCol=name, outputCol=f'{name}_index', handleInvalid='keep')
        for name in cat_cols
    ]
    encoders = [
        OneHotEncoder(inputCol=f'{name}_index', outputCol=f'{name}_ohe', handleInvalid='keep')
        for name in cat_cols
    ]
    assert all(enc.getHandleInvalid() == 'keep' for enc in encoders)  # sanity check

    # No fillna is applied in this cell; the assembler skips rows with nulls.
    featuresCreator = VectorAssembler(
        inputCols=[*(enc.getOutputCol() for enc in encoders), *numeric_cols_cv],
        outputCol='features',
        handleInvalid='skip',
    )

    stages = [*indexers, *encoders]

    pre_pipeline = Pipeline(stages=[*stages, featuresCreator])

In [0]:
def model_tuner(
    model_name: str,
    model_params: Dict[str, Any],
    mlflow_run_name: str = "/Users/m.bakr@berkeley.edu/flight_delay_tuning",
    metric: str = "F2",
    verbose: bool = True
) -> Dict[str, Union[float, str, Dict[str, Any]]]:
    """
    Run one time-series cross-validation with the given parameters inside an MLflow run.

    The function builds a fresh indexer / one-hot / assembler pre-pipeline from
    the notebook-level `cat_cols` and `numeric_cols_cv`, calls
    `timeSeriesSplitCV(df=df_train, pre_pipeline=..., **model_params)` and logs
    the parameters and the average score to MLflow.

    Args:
        model_name (str): One of ['logreg', 'rf', 'mlp', 'xgb']. The name is
            validated and logged only; the classifier that is trained is
            whatever `timeSeriesSplitCV` builds.
        model_params (Dict[str, Any]): Keyword arguments forwarded unchanged to
            `timeSeriesSplitCV`.
        mlflow_run_name (str): Name given to the MLflow run.
        metric (str): Label used in the MLflow metric name and the printed
            summary; it is not forwarded to `timeSeriesSplitCV`.
        verbose (bool): Whether to print the summary line.

    Returns:
        Dict with keys "model", "params" and "avg_f2_score".

    Raises:
        AssertionError: if model_name is not a supported key.

    NOTE(review): df_train is passed without the zero-fill applied to
    df_filled elsewhere in the notebook, so the 'skip' assembler drops rows
    with nulls in columns that timeSeriesSplitCV does not fill — confirm this
    is intended.
    """

    # Supported model keys (used for validation only).
    model_factory = {
        "logreg": LogisticRegression,
        "rf": RandomForestClassifier,
        "mlp": MultilayerPerceptronClassifier,
        "xgb": SparkXGBClassifier
    }

    assert model_name in model_factory, f"Unsupported model: {model_name}"

    with mlflow.start_run(run_name=mlflow_run_name):

        indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column), handleInvalid='keep') for column in cat_cols]

        encoders = [OneHotEncoder(
        inputCol='{0}_index'.format(column), 
        outputCol='{0}_ohe'.format(column)
        ) for column in cat_cols]

        for encoder in encoders:
            encoder.setHandleInvalid('keep')

        featuresCreator = VectorAssembler(
            inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols_cv,
            outputCol='features', handleInvalid='skip')

        pre_pipeline = Pipeline(stages=indexers + encoders + [featuresCreator])

        # timeSeriesSplitCV already returns the average across folds.
        cv_score = timeSeriesSplitCV(df=df_train,
                                     pre_pipeline=pre_pipeline, **model_params)

        avg_score = float(np.mean(cv_score))
        mlflow.log_param("model", model_name)
        mlflow.log_params(model_params)
        # f-string so the metric is logged as e.g. "avg_F2_score" rather than
        # under a literal "{metric}" name.
        mlflow.log_metric(f"avg_{metric}_score", avg_score)

        if verbose:
            print(f"✅ Average {metric} Score: {avg_score:.4f} | Model: {model_name}")

    return {
        "model": model_name,
        "params": model_params,
        "avg_f2_score": avg_score
    }

In [0]:
# Display the fold cutoff table (timeSeriesSplitCV's default cv_info).
cv_cutoffs

In [0]:


def make_hyperopt_objective(
    model_name: str,
    param_space_converter: Callable[[Dict[str, Any]], Dict[str, Any]],
    mlflow_experiment_name: str = "Hyperopt_Universal_Tuning",
    verbose: bool = True
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
    """
    Build the objective function handed to hyperopt.fmin() for one model type.

    Args:
        model_name (str): One of 'logreg', 'rf', 'mlp', 'xgb'; forwarded to model_tuner.
        param_space_converter (Callable): Turns a Hyperopt sample into the
            parameter dict passed to model_tuner.
        mlflow_experiment_name (str): Accepted for interface compatibility;
            not used by this function.
        verbose (bool): Forwarded to model_tuner.

    Returns:
        A callable mapping a sampled parameter dict to a Hyperopt result dict
        whose loss is the negated average F2 score.
    """
    run_name = f"hyperopt_{model_name}"

    def objective(sampled_params: Dict[str, Any]) -> Dict[str, Any]:
        # Translate the raw sample, evaluate it with cross validation, and
        # report the negated score because Hyperopt minimizes the loss.
        tuned = model_tuner(
            model_name=model_name,
            model_params=param_space_converter(sampled_params),
            mlflow_run_name=run_name,
            verbose=verbose
        )
        loss = -tuned["avg_f2_score"]
        return {"loss": loss, "status": STATUS_OK, "params": tuned["params"]}

    return objective


In [0]:
# space_eval turns fmin's result (which stores hp.choice picks as list
# indices) back into actual parameter values.
from hyperopt import space_eval

# Search space for the MLP: architecture, step size, iterations, block size.
mlp_space = {
    "hidden_layers": hp.choice("hidden_layers", [[64, 32], [32, 8, 4], [128, 32]]),
    "stepSize": hp.uniform("stepSize", 0.01, 0.3),
    "maxIter": hp.choice("maxIter", [100, 200]),
    "blockSize": hp.choice("blockSize", [64, 128])
}

def mlp_param_mapper(sampled):
    """Convert one Hyperopt sample into keyword arguments for timeSeriesSplitCV."""
    return {
        # list(...) so the layer sizes can be concatenated with other lists
        "hidden_layers": list(sampled["hidden_layers"]),
        "stepSize": sampled["stepSize"],
        "maxIter": sampled["maxIter"],
        "blockSize": sampled["blockSize"]
    }

mlp_obj = make_hyperopt_objective(
    model_name="mlp",
    param_space_converter=mlp_param_mapper,
    mlflow_experiment_name="MLP_Hyperopt",
    verbose=True
)

# Keep the Trials object so the per-evaluation losses can be inspected later.
mlp_trials = Trials()

best_mlp = fmin(
    fn=mlp_obj,
    space=mlp_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=mlp_trials
)

# best_mlp holds indices for the hp.choice dimensions; resolve them so the
# printed parameters are the real layer sizes / iteration counts.
best_mlp_params = space_eval(mlp_space, best_mlp)

print("Best MLP params:", best_mlp_params)

Must show training and performance scores, including training curves by epoch.

# Debugging

## Downsampling strategy

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def downsample_time_buckets(train_df, time_col='sched_depart_utc', bucket='week', verbose=False):
    '''
    Downsample the majority class (outcome == 0) within time buckets.

    Within each time bucket, rows with outcome == 0 are randomly kept with
    probability (#outcome==1 / #outcome==0) whenever class 0 outnumbers class 1;
    otherwise every row of the bucket is kept. Rows with outcome == 1 are always
    kept. Buckets that contain only one of the two classes are left untouched.

    The whole computation stays in Spark: per-bucket fractions are computed with
    one aggregation and joined back, instead of collecting counts to the driver
    and unioning one filtered DataFrame per (bucket, class) pair.

    Inputs:
      train_df: Spark DataFrame with an 'outcome' column (0 / 1) and `time_col`
      time_col: column name with datetime/timestamp
      bucket: 'day', 'week', 'month' - time bucket for stratification
      verbose: if True, print the row count of the result (triggers a Spark job)

    Returns:
      Spark DataFrame with the same columns as `train_df` (any pre-existing
      'time_bucket' column is dropped). Rows whose outcome is not 0 or 1, or
      whose `time_col` is null, are not part of the result.

    Raises:
      ValueError: if `bucket` is not one of 'day', 'week', 'month'.
    '''
    # Expression that maps every row to its time bucket
    if bucket == 'day':
        bucket_expr = F.to_date(time_col)
    elif bucket in ('week', 'month'):
        bucket_expr = F.date_trunc(bucket, time_col)
    else:
        raise ValueError("bucket must be 'day', 'week', or 'month'")

    bucketed_df = train_df.withColumn('time_bucket', bucket_expr)

    # One row per bucket holding both class counts
    bucket_counts = bucketed_df.groupBy('time_bucket').agg(
        F.sum(F.when(F.col('outcome') == 1, 1).otherwise(0)).alias('_delay_count'),
        F.sum(F.when(F.col('outcome') == 0, 1).otherwise(0)).alias('_non_delay_count'),
    )

    # Keep-probability for class 0 in each bucket; 1.0 means "keep everything"
    # (single-class bucket, or class 0 is not the majority).
    bucket_fractions = bucket_counts.select(
        'time_bucket',
        F.when(
            (F.col('_delay_count') > 0) & (F.col('_non_delay_count') > F.col('_delay_count')),
            F.col('_delay_count') / F.col('_non_delay_count')
        ).otherwise(F.lit(1.0)).alias('_majority_frac')
    )

    # The fractions table has one row per bucket, so broadcast it and thin out
    # class 0 with a seeded Bernoulli draw per row.
    downsampled_df = (
        bucketed_df
        .filter(F.col('outcome').isin(0, 1))
        .join(F.broadcast(bucket_fractions), on='time_bucket', how='inner')
        .filter((F.col('outcome') == 1) | (F.rand(seed=42) < F.col('_majority_frac')))
        .drop('time_bucket', '_majority_frac')
    )

    if verbose:
        print(f"Downsampled training set size: {downsampled_df.count()}")

    return downsampled_df


In [0]:
# Preview how each scheduled departure timestamp maps onto its week bucket.
# NOTE(review): train0 is assigned in the cell that follows this one, so on a
# fresh kernel this cell only works after that cell has been run.
display(train0.withColumn('time_bucket', F.date_trunc('week', 'sched_depart_utc')).select('sched_depart_utc','time_bucket'))

In [0]:
# Debug slice of the training data: scheduled departures from "2014-12-31"
# through "2015-10-09" (both comparisons are inclusive).
_sched_depart = F.col("sched_depart_utc")
train0 = df_train.where((_sched_depart >= "2014-12-31") & (_sched_depart <= "2015-10-09"))

In [0]:
# Run both downsampling strategies on the same slice so they can be compared.
downsample1 = downsample(train0)
# Same call as relying on the defaults; spelled out to make the bucketing explicit.
downsample2 = downsample_time_buckets(
    train0,
    time_col='sched_depart_utc',
    bucket='week',
)

## MLP

In [0]:
numeric_cols

In [0]:
df_train.columns

In [0]:
# SOLUTION 1
# Categorical columns that are string-indexed and then one-hot encoded.
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted',
    'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc',
    'QUARTER',
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "origin_type",
    "origin_region"
]

# One StringIndexer per categorical column: <col> -> <col>_index
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol='{0}_index'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

# One OneHotEncoder per categorical column: <col>_index -> <col>_ohe.
# handleInvalid must be an argument of OneHotEncoder itself; it was previously
# passed to str.format(), where it was silently ignored, leaving the encoders
# on their default invalid-handling mode.
encoders = [
    OneHotEncoder(
        inputCol='{0}_index'.format(column),
        outputCol='{0}_ohe'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

# Concatenate the one-hot vectors with the numeric columns into 'features'.
featuresCreator = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols,
    outputCol='features', handleInvalid='skip')

scaler = MinMaxScaler(inputCol='features',outputCol='features_scaled')


In [0]:
df_train

In [0]:
# SOLUTION 2
# Fit the indexing / encoding / assembling stages on the training data after
# renaming the "*_full" seasonality columns (and "train") to their short names.
pipeline0 = Pipeline(stages=indexers + encoders + [featuresCreator])
model0 = pipeline0.fit(
    df_train
    .withColumnRenamed("daily_full", "daily")
    .withColumnRenamed("weekly_full", "weekly")
    .withColumnRenamed("yearly_full", "yearly")
    .withColumnRenamed("holidays_full", "holidays")
    .withColumnRenamed("train", "pagerank")
)

In [0]:
# Transform a single row and pull it into pandas to read off the width of the
# assembled feature vector.
# NOTE(review): this cell renames "test" to "pagerank" while the fit cell
# renames "train" - confirm which source column is intended.
tmp = (
    model0.transform(
        df_train
        .withColumnRenamed("daily_full", "daily")
        .withColumnRenamed("weekly_full", "weekly")
        .withColumnRenamed("yearly_full", "yearly")
        .withColumnRenamed("holidays_full", "holidays")
        .withColumnRenamed("test", "pagerank")
    )
    .limit(1)
    .select('features')
    .toPandas()
)
# Length of the feature vector held in the only cell of the one-row frame
L = tmp.iloc[0, 0].size

In [0]:
L # 47

In [0]:
# SOLUTION 3
# The first layer must equal the width of the assembled feature vector. Use the
# width measured above (L) rather than a hard-coded 47, so the network stays in
# sync when the categorical/numeric column lists or the encoders change.
layers = [int(L), 4, 2, 2]
classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                            featuresCol='features_scaled',
                                            maxIter=50,
                                            layers=layers,
                                            blockSize=128,
                                            seed=1234)
# Full pipeline: index -> one-hot -> assemble -> min-max scale -> MLP
pipeline_mlp = Pipeline(stages=indexers + encoders + [featuresCreator, scaler, classifier])

In [0]:
# Give the seasonality columns (and "train_1") their short names on df_train.
df_train = (
    df_train
    .withColumnRenamed("daily_full", "daily")
    .withColumnRenamed("weekly_full", "weekly")
    .withColumnRenamed("yearly_full", "yearly")
    .withColumnRenamed("holidays_full", "holidays")
    .withColumnRenamed("train_1", "pagerank")
)

# Two 100-row debugging samples, one per year.
df1 = df_train.where(f.col('YEAR') == 2015).limit(100)
df2 = df_train.where(f.col('YEAR') == 2016).limit(100)

In [0]:

# ALTERNATE
# SOLUTION 1
# Reduced set of categorical columns; the preprocessing pipeline is fit
# separately so the feature width can be measured before the MLP is built.
stages = []
cat_cols = [
    'OP_UNIQUE_CARRIER',
    'priorflight_isdeparted',
    'priorflight_isarrived_calc',
    'priorflight_isdelayed_calc'
]

# <col> -> <col>_index
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol='{0}_index'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

# <col>_index -> <col>_ohe. handleInvalid is passed to OneHotEncoder directly
# (it used to be swallowed by str.format() and then patched afterwards with a
# side-effect list comprehension calling setHandleInvalid).
encoders = [
    OneHotEncoder(
        inputCol='{0}_index'.format(column),
        outputCol='{0}_ohe'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

featuresCreator = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols,
    outputCol='features', handleInvalid='skip')

pre_pipeline = Pipeline(stages=indexers + encoders + [featuresCreator])
pre_model = pre_pipeline.fit(df1)

# Width of the assembled feature vector. A Spark DataFrame has no .get()
# method, so read the first row with .first() instead.
dim_size = int(pre_model.transform(df1).select('features').first()['features'].size)

scaler = MinMaxScaler(inputCol='features',outputCol='features_scaled')

# Input layer sized from the data; two hidden layers; two output units.
layers = [dim_size, 4, 2, 2]
classifier = MultilayerPerceptronClassifier(labelCol='outcome',
                                            featuresCol='features_scaled',
                                            maxIter=50,
                                            layers=layers,
                                            blockSize=128,
                                            seed=1234)
# Modelling pipeline only; it expects data already transformed by pre_model.
pipeline_mlp = Pipeline(stages=[scaler, classifier])

In [0]:
pre_model.stages

In [0]:
pre_model.stages[1].getHandleInvalid()

In [0]:
# tmp is the one-row pandas frame whose only column is 'features'; index it by
# that column label (label 0 does not exist and raises KeyError).
tmp.loc[0, 'features'].size

In [0]:
# tmp was converted with toPandas(), so the Spark-only .limit()/.first() calls
# are not available on it; read the first row of the 'features' column instead.
tmp['features'].iloc[0].size

In [0]:
# Apply the preprocessing model fitted on df1 to df2 and report the width of
# the resulting feature vector.
df2_tmp = pre_model.transform(df2)  # ERROR HAPPENS HERE?
L2 = df2_tmp.limit(1).first()['features'].size
# The f prefix was missing, so the literal text "{L2}" was printed.
print(f'df2 size: {L2}')

In [0]:
# Single-column version of the categorical preprocessing, for debugging:
# carrier code -> index -> one-hot vector, both stages with handleInvalid="keep".
indexer = StringIndexer(inputCol="OP_UNIQUE_CARRIER", outputCol="OP_UNIQUE_CARRIER_index", handleInvalid="keep")

encoder = OneHotEncoder(inputCol="OP_UNIQUE_CARRIER_index", outputCol="OP_UNIQUE_CARRIER_vec", handleInvalid="keep")

In [0]:
df1.columns

In [0]:
# Carrier code -> index -> one-hot vector, both stages with handleInvalid="keep".
indexer = StringIndexer(inputCol="OP_UNIQUE_CARRIER", outputCol="OP_UNIQUE_CARRIER_index", handleInvalid="keep")
encoder = OneHotEncoder(inputCol="OP_UNIQUE_CARRIER_index", outputCol="OP_UNIQUE_CARRIER_vec", handleInvalid="keep")

# Step 1: fit the indexer on df1 and add the index column.
df1_indexed_0_model = indexer.fit(df1)
df1_indexed_0 = df1_indexed_0_model.transform(df1)

# Step 2: fit the encoder on the indexed frame and add the one-hot column.
df1_encoded_0_model = encoder.fit(df1_indexed_0)
df1_encoded_0 = df1_encoded_0_model.transform(df1_indexed_0)

In [0]:
display(df1_encoded_0)

In [0]:
df1_indexed_0.printSchema()
df1_encoded_0.printSchema()


In [0]:
df1_indexed_0.printSchema()
df1_encoded_0.printSchema()


In [0]:
df1.filter(f.col('OP_UNIQUE_CARRIER').isNull()).count()

In [0]:
display(df1_encoded_0.select('OP_UNIQUE_CARRIER_index').limit(1))

In [0]:
display(df1_encoded_0.select('OP_UNIQUE_CARRIER_vec').limit(1))

In [0]:
display(df1_encoded_0)

In [0]:

# <col> -> <col>_index for every categorical column
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol='{0}_index'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

# <col>_index -> <col>_ohe. handleInvalid belongs to OneHotEncoder; it was
# previously passed to str.format(), where it was silently ignored.
encoders = [
    OneHotEncoder(
        inputCol='{0}_index'.format(column),
        outputCol='{0}_ohe'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

# Indexing + encoding only (no vector assembly yet).
prepre_pipeline = Pipeline(stages = indexers + encoders)


In [0]:
df1_prepre_model = prepre_pipeline.fit(df1) 

In [0]:
df1_prepre_transformed = df1_prepre_model.transform(df1)

In [0]:
df1_prepre_transformed.printSchema()


In [0]:
df1.filter(f.col('priorflight_isdelayed_calc').isNull()).count()

In [0]:
display(df1_prepre_transformed)

In [0]:
df2_prepre_transformed = df1_prepre_model.transform(df2)

In [0]:
df2_prepre_transformed.printSchema()

In [0]:
display(df2_prepre_transformed)

In [0]:
# Assemble the one-hot outputs followed by the numeric columns into 'features';
# handleInvalid='skip' is set on the assembler.
featuresCreator = VectorAssembler(
    inputCols=[stage.getOutputCol() for stage in encoders] + numeric_cols,
    outputCol='features',
    handleInvalid='skip',
)

# Assembly-only pipeline, applied to frames that are already indexed/encoded.
vec_pipeline = Pipeline(stages=[featuresCreator])


In [0]:
df1_pre_transformed_model = vec_pipeline.fit(df1_prepre_transformed)
df1_pre_transformed = df1_pre_transformed_model.transform(df1_prepre_transformed)


In [0]:
df1_pre_transformed.printSchema()
display(df1_pre_transformed)

In [0]:
df2_pre_transformed = df1_pre_transformed_model.transform(df2_prepre_transformed)


In [0]:
display(df2_pre_transformed)

In [0]:
df2_pre_transformed.first()['features'].size

In [0]:
# Input layer width is read from the first assembled feature vector of df2.
layers = [df2_pre_transformed.first()['features'].size, 4, 2, 2]

classifier = MultilayerPerceptronClassifier(
    featuresCol='features_scaled',
    labelCol='outcome',
    layers=layers,
    maxIter=50,
    blockSize=128,
    seed=1234,
)

# Scale the assembled features, then train the MLP on the scaled column.
pipeline_mlp = Pipeline(stages=[scaler, classifier])

In [0]:
mod_model = pipeline_mlp.fit(df1_pre_transformed)
df2preds = mod_model.transform(df2_pre_transformed)

In [0]:
display(df2preds)

In [0]:
# Assemble the one-hot outputs followed by the numeric columns into 'features'.
featuresCreator = VectorAssembler(
    inputCols=[stage.getOutputCol() for stage in encoders] + numeric_cols,
    outputCol='features',
    handleInvalid='skip',
)

# End-to-end preprocessing: index -> one-hot -> assemble.
stages = indexers + encoders
vec_pipeline_full = Pipeline(stages=stages + [featuresCreator])


In [0]:
encoders

In [0]:

df1_vec_transformed_model = vec_pipeline_full.fit(df1)
df2_vec_transformed = df1_vec_transformed_model.transform(df2)

In [0]:
df2_vec_transformed.printSchema()

In [0]:
display(df2_vec_transformed)

In [0]:
df2_vec_transformed.first()['features'].size

In [0]:
df1_vec_transformed = df1_vec_transformed_model.transform(df1)


In [0]:
display(df1_vec_transformed)

In [0]:
# Input layer width is read from the first assembled feature vector of df1.
layers = [df1_vec_transformed.first()['features'].size, 4, 2, 2]

classifier = MultilayerPerceptronClassifier(
    featuresCol='features_scaled',
    labelCol='outcome',
    layers=layers,
    maxIter=50,
    blockSize=128,
    seed=1234,
)

# Scale the assembled features, then train the MLP on the scaled column.
pipeline_mlp = Pipeline(stages=[scaler, classifier])

In [0]:
vec_mod_model = pipeline_mlp.fit(df1_vec_transformed)

In [0]:
df2_vec_preds = vec_mod_model.transform(df2_vec_transformed)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# F-beta for a single label: beta=2.0, measured on label 1.0.
evaluator = MulticlassClassificationEvaluator(
    metricName="fMeasureByLabel",
    metricLabel=1.0,
    beta=2.0,
    labelCol="outcome",
)

# Score the df2 predictions; the value is shown as the cell output.
evaluator.evaluate(df2_vec_preds)

In [0]:
evaluator.getPredictionCol()

In [0]:
evaluator.getProbabilityCol()

In [0]:
df2_vec_preds.columns

In [0]:
display(df2_vec_preds)

## More stages

In [0]:
# <col> -> <col>_index for every categorical column
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol='{0}_index'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]

# <col>_index -> <col>_ohe. handleInvalid is given to OneHotEncoder directly
# instead of being swallowed by str.format() and then patched with a
# side-effect list comprehension calling setHandleInvalid.
encoders = [
    OneHotEncoder(
        inputCol='{0}_index'.format(column),
        outputCol='{0}_ohe'.format(column),
        handleInvalid='keep',
    )
    for column in cat_cols
]


In [0]:
[encoders[i].getHandleInvalid() for i in range(len(encoders))]

In [0]:
# Assemble the one-hot outputs followed by the numeric columns into 'features'.
featuresCreator = VectorAssembler(
    inputCols=[stage.getOutputCol() for stage in encoders] + numeric_cols,
    outputCol='features',
    handleInvalid='skip',
)

# End-to-end preprocessing: index -> one-hot -> assemble.
stages = indexers + encoders
vec_pipeline_full = Pipeline(stages=stages + [featuresCreator])

# Fit the preprocessing on the full training frame.
df_train_vec_transformed_model = vec_pipeline_full.fit(df_train)
#df2_vec_transformed = df1_vec_transformed_model.transform(df2)

In [0]:

specs= df_train_vec_transformed_model.transform(df_train.limit(10))

In [0]:
display(specs)

In [0]:

layers = [specs.first()['features'].size, 4, 2, 2]
layers

In [0]:
layers

In [0]:
df_train_vec_transformed = df_train_vec_transformed_model.transform(df_train)


In [0]:
display(df_train_vec_transformed)

In [0]:
# Inspect the scaler's parameters. The bare "scaler." left here was an
# incomplete attribute access, i.e. a syntax error that stops Run All.
print(scaler.explainParams())

In [0]:
# Min-max scale the assembled feature vector into 'features_scaled'.
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# MLP trained on the scaled features; `layers` comes from an earlier cell.
classifier = MultilayerPerceptronClassifier(
    featuresCol='features_scaled',
    labelCol='outcome',
    layers=layers,
    maxIter=50,
    blockSize=128,
    seed=1234,
)

pipeline_mlp = Pipeline(stages=[scaler, classifier])



In [0]:
df_train_vec_transformed.columns

In [0]:
df_train_vec_transformed

In [0]:
train_vec_mod_model = pipeline_mlp.fit(df_train_vec_transformed.select('features','outcome'))

## Run the MLP for real (maybe)