In [0]:
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline

In [0]:
# Base DBFS directory under which this notebook reads and writes its artifacts.
# (Plain string literal: the original f-prefix had no placeholders to format.)
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"


In [0]:
# Direct Spark checkpoints to a folder under the team directory.
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/modeling_checkpoints")
# Load the joined dataset (cleaned and feature-engineered, per its file name).
# NOTE(review): "1y" is hard-coded in this path, while `period` is configured
# separately in the next cell — confirm the two are meant to stay in sync.
df=spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_1y_cleaned_engineered.parquet")


In [0]:
# data time period; used below to build the saved seasonality-model file names
period = "1y" # one of the following values ("", "3m", "6m", "1y")

# number of cross-validation folds
k = 5
# overlap value of the CV split setup; used below only inside the per-fold
# seasonality-model file name (presumably the fractional overlap between folds)
overlap = 0.2


# compute seasonality?
# (False if seasonality models have already been saved out for this CV split setup;
#  left False here because they were computed previously)
compute_seasonality = False

# define train/test split date: departures before it are train, the rest are test
min_test_dt = "2019-10-01"

# define what departure time variable is called
dep_utc_varname = "sched_depart_utc"

In [0]:
# Derive the two columns every later cell relies on:
#   dep_hour_utc - hour of the scheduled UTC departure (seasonality join key)
#   outcome      - 1.0 when the flight departed >= 15 min late or was cancelled, else 0.0
df = df.withColumns({
    "dep_hour_utc": F.hour(F.col(dep_utc_varname)),
    "outcome": (
        F.when((F.col("DEP_DELAY") >= 15) | (F.col("CANCELLED") == 1), 1)
        .otherwise(0)
        .cast("double")
    ),
})

In [0]:
# split into train and test
# Rows departing before min_test_dt form the training set; rows departing on or
# after it form the test set. Both are marked for caching because they are reused.

df_train = df.filter(F.col(dep_utc_varname) < min_test_dt)
df_train.cache()
df_test = df.filter(F.col(dep_utc_varname) >= min_test_dt)
df_test.cache()

In [0]:
def get_seasonality_data(df, fold, k):
  """
  Attach the columns of a saved seasonality model to df.

  The model parquet is chosen by fold: 'full' selects the model saved for the
  whole training set, any other value selects the model saved for that CV fold
  (the file name also embeds k and the module-level `period` and `overlap`).

  The model is left-joined on origin airport, day of week and UTC departure
  hour, so rows of df without a match are kept with nulls in the model columns.
  The model's duplicate ORIGIN column is dropped; its 'dow' and 'hour' columns
  remain in the result.
  """
  fn_model = (
      f"seasonality_model_{period}_train.parquet"
      if fold == 'full'
      else f"seasonality_model_{period}_cv{fold}of{k}_overlap{overlap}.parquet"
  )
  seasonality = spark.read.parquet(f"{team_BASE_DIR}/interim/{fn_model}")

  same_origin = df["ORIGIN"] == seasonality["ORIGIN"]
  same_dow = df["DAY_OF_WEEK"] == seasonality["dow"]
  same_hour = df["dep_hour_utc"] == seasonality["hour"]

  with_seasonality = df.join(seasonality, same_origin & same_dow & same_hour, how="left")
  return with_seasonality.drop(seasonality["ORIGIN"])


# CODE BELOW DERIVED FROM DEMO 11 NOTEBOOK
import pyspark.sql.functions as f


def upsample(train_df,verbose=False):
  '''Upsamples train_df to balance classes.

  All rows with outcome == 0 are kept; rows with outcome == 1 are sampled with
  replacement at a fraction of (count of outcome 0) / (count of outcome 1), so
  the two classes end up approximately the same size (Spark sampling is
  probabilistic, not an exact row count).

  Args:
    train_df: Spark DataFrame with an 'outcome' column holding 0 / 1.
    verbose: accepted for interface compatibility; not used.

  Returns:
    Spark DataFrame: the outcome == 0 rows unioned with the resampled
    outcome == 1 rows.

  Raises:
    ValueError: if either class has no rows, since balancing is then impossible
      (the ratio would divide by zero or discard every row).
  '''
  # A single grouped count yields both class sizes in one Spark job.
  class_counts = {row["outcome"]: row["count"]
                  for row in train_df.groupBy("outcome").count().collect()}
  delay_count = class_counts.get(1.0, 0)
  non_delay_count = class_counts.get(0.0, 0)

  if delay_count == 0 or non_delay_count == 0:
    raise ValueError(
      "upsample needs rows in both classes; got "
      f"{delay_count} with outcome == 1 and {non_delay_count} with outcome == 0")

  upsample_fraction = non_delay_count / delay_count

  train_non_delay = train_df.filter(f.col('outcome') == 0)
  train_delay_upsampled = train_df.filter(f.col('outcome') == 1).sample(
    withReplacement=True, fraction=upsample_fraction, seed=42)
  return train_non_delay.union(train_delay_upsampled)


def downsample(train_df,verbose=False):
  '''Downsamples train_df to balance classes.

  All rows with outcome == 1 are kept; rows with outcome == 0 are sampled
  without replacement at a fraction of (count of outcome 1) / (count of
  outcome 0), so the two classes end up approximately the same size (Spark
  sampling is probabilistic, not an exact row count).

  Args:
    train_df: Spark DataFrame with an 'outcome' column holding 0 / 1.
    verbose: accepted for interface compatibility; not used.

  Returns:
    Spark DataFrame: the outcome == 1 rows unioned with the sampled
    outcome == 0 rows.

  Raises:
    ValueError: if either class has no rows, since balancing is then impossible
      (the ratio would divide by zero or discard every row).
  '''
  # A single grouped count yields both class sizes in one Spark job.
  class_counts = {row["outcome"]: row["count"]
                  for row in train_df.groupBy("outcome").count().collect()}
  delay_count = class_counts.get(1.0, 0)
  non_delay_count = class_counts.get(0.0, 0)

  if delay_count == 0 or non_delay_count == 0:
    raise ValueError(
      "downsample needs rows in both classes; got "
      f"{delay_count} with outcome == 1 and {non_delay_count} with outcome == 0")

  # Sampling without replacement only accepts fractions in [0, 1]. When the
  # outcome == 1 rows already outnumber the outcome == 0 rows there is nothing
  # to remove, so every outcome == 0 row is kept.
  keep_fraction = min(1.0, delay_count / non_delay_count)

  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay_sampled = train_df.filter(f.col('outcome') == 0).sample(
    withReplacement=False, fraction=keep_fraction, seed=42)
  return train_delay.union(train_non_delay_sampled)

def cv_eval(preds):
  """
  Score a DataFrame of model predictions.

  Input: transformed df with 'prediction', 'probability' and 'outcome' columns
  Output: tuple (F2, pr)
    F2 - F-measure with beta=2 for label 1.0, rounded to 4 decimals
    pr - area under the precision-recall curve, using element 1 of
         'probability' as the score
  """
  # (prediction, label) pairs feed the multiclass metrics.
  prediction_and_label = preds.select(['prediction', 'outcome']).rdd
  # (score, label) pairs feed the binary metrics.
  score_and_label = (
    preds.select('outcome','probability')
    .rdd
    .map(lambda row: (float(row['probability'][1]), float(row['outcome'])))
  )

  multiclass = MulticlassMetrics(prediction_and_label)
  binary = BinaryClassificationMetrics(score_and_label)

  f2_score = np.round(multiclass.fMeasure(label=1.0, beta=2.0), 4)
  area_under_pr = binary.areaUnderPR
  return f2_score, area_under_pr

def timeSeriesSplitCV(df, pipeline, cv_info, sampling=None, metric='f2', verbose=True, dep_utc_varname=dep_utc_varname):
  '''
  Perform time series split k-fold cross validation.

  For each fold described by cv_info, df is cut into a train window and a dev
  window on dep_utc_varname, the train window is optionally resampled, the
  fold's seasonality features are joined onto both windows (nulls in 'daily'
  and 'weekly' become 0), pipeline is fit on train and scored on dev.

  Args:
    df: Spark DataFrame containing dep_utc_varname plus the columns read by the
      sampling helpers, get_seasonality_data and the pipeline.
    pipeline: object with fit(); the fitted model must offer transform() and
      produce the columns cv_eval reads.
    cv_info: indexable as cv_info[name][i] for name in 'train_min',
      'train_max', 'test_min', 'test_max', with one entry per fold.
    sampling: 'down' or 'up' to rebalance the train window; any other value
      leaves it as is.
    metric: 'f2' or 'pr', selecting which cv_eval score is tracked.
    verbose: when True, print date range and row count of each window.
    dep_utc_varname: name of the departure timestamp column.

  Returns:
    The average of the per-fold scores.

  Raises:
    ValueError: if metric is neither 'f2' nor 'pr'.
  '''
  # Reject an unknown metric up front. Otherwise `score` would be unbound (or
  # left over from the previous fold) only after a fold had already been fit.
  if metric not in ('f2', 'pr'):
    raise ValueError(f"metric must be 'f2' or 'pr', got {metric!r}")

  # Count folds from one boundary column so a plain dict of lists works as well
  # as a table; len() of a dict would be its number of keys, not of folds.
  k = len(cv_info["train_min"])

  # Track score
  scores=[]

  # Start k-fold
  for i in range(k):
    # Every DataFrame cached for this fold; released once the fold is scored.
    cached_frames = []

    # Create train set
    train_df = df.filter((df[dep_utc_varname] >= cv_info["train_min"][i]) & \
      (df[dep_utc_varname] < cv_info["train_max"][i])).cache()
    cached_frames.append(train_df)

    # Create dev set
    dev_df = df.filter((df[dep_utc_varname] >= cv_info["test_min"][i]) & \
      (df[dep_utc_varname] < cv_info["test_max"][i])).cache()
    cached_frames.append(dev_df)

    # Apply sampling on train if selected
    if sampling=='down':
      train_df = downsample(train_df).cache()
      cached_frames.append(train_df)
    elif sampling=='up':
      train_df = upsample(train_df).cache()
      cached_frames.append(train_df)

    #print info on train and dev set for this fold
    if verbose:
      print('    TRAIN set for fold {} goes from {} to {}, count is {:,} flights ({})'.format(
        (i+1),
        train_df.agg({dep_utc_varname:'min'}).collect()[0][0],
        train_df.agg({dep_utc_varname:'max'}).collect()[0][0],
        train_df.count(),
        sampling + '-sampled' if sampling else 'no sampling'))
      print('    DEV set for fold {} goes from {} to {}, count is {:,} flights'.format(
        (i+1),
        dev_df.agg({dep_utc_varname:'min'}).collect()[0][0],
        dev_df.agg({dep_utc_varname:'max'}).collect()[0][0],
        dev_df.count()))

    # TODO: remove once feat engineering applied outside
    # The seasonality join is a left join, so unmatched rows carry nulls that
    # are replaced with 0 before the pipeline sees them.
    seasonal_defaults = {'daily': 0, 'weekly': 0}
    train_df = get_seasonality_data(train_df, i, k).fillna(seasonal_defaults)
    dev_df = get_seasonality_data(dev_df, i, k).fillna(seasonal_defaults)

    # Fit params on the model
    model = pipeline.fit(train_df)
    dev_pred = model.transform(dev_df)
    f2_score, pr_score = cv_eval(dev_pred)
    score = f2_score if metric=='f2' else pr_score
    scores.append(score)
    print(f'    Number of training datapoints for fold number {i+1} is {train_df.count():,} with a {metric} score of {score:.2f}')
    print('------------------------------------------------------------')

    # All actions for this fold have run; free its cached data so memory is
    # not held across the remaining folds.
    for frame in cached_frames:
      frame.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")

  return avg_score

In [0]:

# Balance the full training period by downsampling, then attach seasonality
# features from the model saved for the full training set ('full') to both the
# balanced train data and the untouched test data.
# NOTE(review): timeSeriesSplitCV fills null 'daily'/'weekly' values with 0
# after this same left join; no fillna is applied here — confirm that unmatched
# rows are handled by the loaded models or cannot occur.
df_train_downsampled = downsample(df_train).cache()
df_train_seasonal = get_seasonality_data(df_train_downsampled, 'full', k).cache()
df_test_seasonal = get_seasonality_data(df_test, 'full', k).cache()


In [0]:
import mlflow


# Baseline evaluation

In [0]:
# MLflow run URI of the logged baseline model.
base_run = 'runs:/5b1caaedee434f1fa84c4b0723099f5e/model'
base_model = mlflow.spark.load_model(base_run)
# Score the test set (with seasonality features attached).
base_pred = base_model.transform(df_test_seasonal)
# get f2 score (cv_eval returns (F2, PR-AUC); index 0 is F2)
base_score = cv_eval(base_pred)[0]
print(base_score) #sanity check

In [0]:
from pyspark.sql.functions import col
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Count test rows per (actual outcome, predicted label) pair in Spark, then
# bring the handful of resulting rows to pandas for plotting.
confusion_df = (
    base_pred
    .groupBy('outcome', 'prediction')
    .count()
    .orderBy('outcome', 'prediction')
)
confusion_pd = confusion_df.toPandas()

# Rows are actual outcomes, columns are predictions; a pair that never
# occurred shows as 0 instead of NaN.
confusion_matrix = (
    confusion_pd
    .pivot(index='outcome', columns='prediction', values='count')
    .fillna(0)
)

sns.heatmap(confusion_matrix, annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Outcome')
plt.title('Confusion Matrix')
plt.show()

In [0]:
# Display the raw (outcome, prediction) counts that the heatmap was built from.
confusion_pd

# Added features evaluation

In [0]:
# MLflow run URI of the logged model for the "added features" stage.
feats_run = 'runs:/b734b75c14d44c05a96782540f3a4868/model'

feats_model = mlflow.spark.load_model(feats_run)
# Score the same test set used for the baseline.
feats_pred = feats_model.transform(df_test_seasonal) #df for analysis
# get f2 score (cv_eval returns (F2, PR-AUC); index 0 is F2)
feats_score = cv_eval(feats_pred)[0]
print(feats_score) #sanity check

# Added interactions evaluation

In [0]:
# MLflow run URI of the logged model for the "added interactions" stage.
interacts_run = 'runs:/7da4c9d4d4cd4c418c56268729962bda/model'

# Load the Spark model logged under that run
interacts_model = mlflow.spark.load_model(interacts_run)

# Perform inference on the test set via model.transform()
interacts_pred = interacts_model.transform(df_test_seasonal)

# F2 score (cv_eval returns (F2, PR-AUC); index 0 is F2)
interacts_score = cv_eval(interacts_pred)[0]
print(interacts_score) #sanity check

# Added regularization evaluation

In [0]:

# MLflow run URI of the logged model for the "added regularization" stage.
reg_run= 'runs:/d6c4162bb8fd4818bbb714c2c711d628/model'

# Load the Spark model logged under that run
reg_model = mlflow.spark.load_model(reg_run)

# Perform inference on the test set via model.transform()
reg_preds=reg_model.transform(df_test_seasonal)
# F2 score (cv_eval returns (F2, PR-AUC); index 0 is F2)
reg_score = cv_eval(reg_preds)[0]
print(reg_score)

In [0]:
# Inspection cell: echo the test DataFrame (before the seasonality join).
df_test
# Alternative to inspect the version with seasonality features attached:
# df_test_seasonal