In [0]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pytz

import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import functions as f
from pyspark.sql.functions import col
from pyspark.sql.functions import col, when, to_timestamp
from pyspark.sql.functions import udf
from pyspark.sql.functions import when
from pyspark.sql.types import StringType
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType
from pyspark.sql.window import Window

In [0]:
# Location of the team's shared storage and the name of the dataset to load.
folder_path = "dbfs:/student-groups/Group_4_1"
dataset = "OTPW_3M_2015"

# Read the cleaned interim parquet for that dataset and preview it.
clean_parquet_path = f"{folder_path}/interim/{dataset}_clean.parquet"
df = spark.read.parquet(clean_parquet_path)
display(df)

In [0]:
# Chronological hold-out: departures before the cutoff go to train, the rest to test.
# FOR FULL YEAR DATASET, TEST DATA IS OCT THROUGH DEC
df_train = df.where(col("dep_datetime") < "2015-03-01")
df_test = df.where(col("dep_datetime") >= "2015-03-01")

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def get_cv_time_limits(df, k=3, blocking=False, dep_utc_varname="dep_datetime", verbose=True):
    '''
    Get time bins for time-series cross validation.

    Rows of ``df`` are ranked by ``dep_utc_varname`` and cut into ``k + 1``
    consecutive chunks of ``n // (k + 1)`` rows; the last chunk also absorbs
    any remainder so that the final fold ends at the last row. Fold ``i``
    trains up to the start of chunk ``i + 1`` and tests on chunk ``i + 1``.

    Parameters
    ----------
    df : Spark DataFrame containing the column ``dep_utc_varname``.
    k : number of folds (must be >= 1).
    blocking : if False every fold's training window starts at the first row
        (expanding window); if True fold ``i`` trains on chunk ``i`` only.
    dep_utc_varname : name of the column used to order the rows.
    verbose : print the chunk size and the limits of every fold.

    Returns
    -------
    pandas.DataFrame with ``k`` rows and the columns ``train_min``,
    ``train_max``, ``test_min`` and ``test_max``. ``train_max`` and
    ``test_min`` of a fold are the same edge value.

    Raises
    ------
    ValueError if ``k < 1`` or ``df`` has fewer than ``k + 1`` rows.
    '''
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    n = df.count()
    chunk_size = n // (k + 1)
    if chunk_size < 1:
        raise ValueError(f"need at least k + 1 = {k + 1} rows to build {k} folds, got {n}")

    # NOTE: a window without partition columns ranks all rows in one partition.
    df = df.withColumn("row_id", f.row_number()
            .over(Window.partitionBy().orderBy(dep_utc_varname)))

    # 1-based row ids of the k + 2 bin edges: the first row of each of the
    # k + 1 chunks, followed by the very last row. Building the list
    # explicitly guarantees exactly k + 2 edges for every n.
    idx = [i * chunk_size + 1 for i in range(k + 1)] + [n]

    if verbose:
        print('')
        print(f'Number of validation datapoints for each fold is {chunk_size:,}')
        print("************************************************************")

    bin_edges = df.filter(f.col("row_id").isin(idx)).select("row_id",dep_utc_varname).toPandas()

    # Look edges up by row id rather than by position: the collected rows are
    # not guaranteed to arrive sorted, and duplicated ids (possible when the
    # last chunk holds a single row) come back only once.
    edge_by_row = {int(row_id): t for row_id, t in zip(bin_edges["row_id"], bin_edges[dep_utc_varname])}
    edges = [edge_by_row[row_id] for row_id in idx]

    out = []
    for i in range(k):
        # expanding window starts at the first edge, blocked window at chunk i
        t_min_train = edges[i] if blocking else edges[0]
        out.append({"train_min":t_min_train, "train_max":edges[i+1],
                    "test_min":edges[i+1], "test_max":edges[i+2]})
    out = pd.DataFrame(out)

    if verbose:
        for i in range(k):
            print(f'    TRAIN set for fold {i} goes from {out["train_min"][i]} to {out["train_max"][i]}')
            print(f'    TEST set for fold {i} goes from {out["test_min"][i]} to {out["test_max"][i]}')

    return out

In [0]:
# Blocked fold boundaries for the training window; the table is shown as the cell output.
cv_cutoffs = get_cv_time_limits(
    df_train,
    k=3,
    blocking=True,
    dep_utc_varname="dep_datetime",
    verbose=True,
)
cv_cutoffs

## Example pipeline

In [0]:
# Simplistic cleaning, done as one chain:
#   * weather readings cast to double, with nulls replaced by 0
#   * binary outcome = 1.0 when DEP_DELAY >= 15 or the flight was cancelled, else 0.0
#   * departure time cast to a timestamp
#   * only the modelling columns are kept
df = (
    df.withColumn("precip", F.col("HourlyPrecipitation").cast("double"))
    .withColumn("wind_speed", F.col("HourlyWindSpeed").cast("double"))
    .withColumn(
        "outcome",
        F.when((F.col("DEP_DELAY") >= 15) | (F.col("CANCELLED") == 1), 1)
        .otherwise(0)
        .cast("double"),
    )
    .fillna({"precip":0, "wind_speed":0})
    .withColumn("dep_datetime", F.to_timestamp(F.col("dep_datetime")))
    .select("outcome","precip","wind_speed","DEP_TIME_BLK","dep_datetime")
)

In [0]:
# Baseline pipeline: a very simple model using time of day and simple weather info.
# Resources:
#   https://spark.apache.org/docs/latest/ml-pipeline.html
#   https://www.analyticsvidhya.com/blog/2021/05/a-complete-guide-for-creating-machine-learning-pipelines-using-pyspark-mllib-on-google-colab/

# Departure time block: string -> category index -> one-hot vector.
# NOTE(review): StringIndexer keeps its default handleInvalid="error", so
# transform() fails on a DEP_TIME_BLK value (or null) not seen during fit.
indexer = StringIndexer(inputCol="DEP_TIME_BLK", outputCol="DEP_TIME_BLK_index")
encoder = OneHotEncoder(inputCol="DEP_TIME_BLK_index", outputCol="DEP_TIME_BLK_vec")

# Assemble the one-hot block with the numeric weather columns, then standardise.
features = ["DEP_TIME_BLK_vec","precip","wind_speed"]
assembler = VectorAssembler(inputCols=features, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="features_scaled", withMean=True)

# Logistic regression on the scaled features against the binary "outcome" label.
lr = LogisticRegression(featuresCol='features_scaled', labelCol='outcome', maxIter=50)

pipeline = Pipeline(stages=[indexer,encoder,assembler,scaler,lr])

In [0]:
# Re-split on the cleaned frame: departures before the cutoff train, the rest test.
df_train = df.where(F.col("dep_datetime") < "2015-03-01")
df_test = df.where(F.col("dep_datetime") >= "2015-03-01")

In [0]:
# CODE IN THIS CELL DERIVED FROM DEMO 11 NOTEBOOK

def upsample(train_df,verbose=False):
  '''Upsamples train_df to balance classes.

  Keeps every row with outcome == 0 and samples the outcome == 1 rows with
  replacement at a fraction of (count of outcome 0) / (count of outcome 1).
  Sampling is random with a fixed seed, so the resulting class sizes are
  approximately, not exactly, equal.

  Args:
    train_df: Spark DataFrame with a numeric "outcome" column holding 0 / 1.
    verbose: if True, print the class counts found before sampling.

  Returns:
    Spark DataFrame: the outcome == 0 rows followed by the resampled
    outcome == 1 rows. If either class is empty there is nothing to balance
    against and train_df is returned unchanged.
  '''
  delay_count = train_df.filter(f.col("outcome") == 1).count()
  non_delay_count = train_df.filter(f.col("outcome") == 0).count()

  if verbose:
    print(f'    upsample: {delay_count:,} rows with outcome == 1, {non_delay_count:,} rows with outcome == 0')

  # An empty class would give a division by zero or a sampling fraction of 0
  # (which would silently drop every outcome == 1 row).
  if delay_count == 0 or non_delay_count == 0:
    return train_df

  upsample_fraction = non_delay_count / delay_count

  train_non_delay = train_df.filter(f.col('outcome') == 0)
  train_delay = train_df.filter(f.col('outcome') == 1).sample(withReplacement=True, fraction=upsample_fraction, seed=42)
  return train_non_delay.union(train_delay)


def downsample(train_df,verbose=False):
  '''Downsamples train_df to balance classes.

  Keeps every row with outcome == 1 and samples the outcome == 0 rows without
  replacement at a fraction of (count of outcome 1) / (count of outcome 0),
  capped at 1.0. Sampling is random with a fixed seed, so the resulting class
  sizes are approximately, not exactly, equal.

  Args:
    train_df: Spark DataFrame with a numeric "outcome" column holding 0 / 1.
    verbose: if True, print the class counts found before sampling.

  Returns:
    Spark DataFrame: the outcome == 1 rows followed by the sampled
    outcome == 0 rows. If either class is empty there is nothing to balance
    against and train_df is returned unchanged.
  '''
  delay_count = train_df.filter(f.col("outcome") == 1).count()
  non_delay_count = train_df.filter(f.col("outcome") == 0).count()

  if verbose:
    print(f'    downsample: {delay_count:,} rows with outcome == 1, {non_delay_count:,} rows with outcome == 0')

  # An empty class would give a division by zero or a sampling fraction of 0
  # (which would silently drop every outcome == 0 row).
  if delay_count == 0 or non_delay_count == 0:
    return train_df

  # Sampling without replacement only accepts fractions up to 1; when
  # outcome == 1 is already the larger class keep all outcome == 0 rows.
  keep_fraction = min(1.0, delay_count / non_delay_count)

  train_delay = train_df.filter(f.col('outcome') == 1)
  train_non_delay = train_df.filter(f.col('outcome') == 0).sample(withReplacement=False, fraction=keep_fraction, seed=42)
  return train_delay.union(train_non_delay)

def cv_eval(preds):
  """
  Score a transformed DataFrame.

  Input: DataFrame holding the 'prediction', 'probability' and 'outcome' columns
  Output: tuple (F2 for label 1.0 rounded to 4 decimals, area under the PR curve)
  """
  # rows of (prediction, label) for the multiclass metrics
  pred_and_label = preds.select('prediction', 'outcome').rdd

  # (probability at index 1, label) pairs for the binary metrics
  def to_score_and_label(row):
    return (float(row['probability'][1]), float(row['outcome']))

  score_and_label = preds.select('outcome', 'probability').rdd.map(to_score_and_label)

  multiclass_metrics = MulticlassMetrics(pred_and_label)
  binary_metrics = BinaryClassificationMetrics(score_and_label)

  f2_score = np.round(multiclass_metrics.fMeasure(label=1.0, beta=2.0), 4)
  pr_auc = binary_metrics.areaUnderPR
  return f2_score, pr_auc

def timeSeriesSplitCV(dataset, pipeline, k=3, blocking=False, sampling=None, metric='f2', verbose=True):
  '''
  Perform time-series split k-fold cross validation.

  Rows are ranked by "dep_datetime" and cut into consecutive chunks of
  int(n / (k + 1)) rows; rows beyond the last full chunk are not used.
  Fold i (0-based) trains on chunks 0..i (blocking=False) or on chunk i only
  (blocking=True) and is scored on chunk i + 1.

  Args:
    dataset: Spark DataFrame with "dep_datetime" and "outcome" columns plus
      whatever columns `pipeline` consumes.
    pipeline: unfitted Spark ML Pipeline; it is re-fitted on every fold.
    k: number of folds (>= 1).
    blocking: False for an expanding training window, True for blocked folds.
    sampling: 'down' or 'up' to rebalance the training fold with
      downsample / upsample; any other value leaves the fold untouched.
    metric: 'f2' or 'pr', selecting which value of cv_eval is averaged.
    verbose: print the time range and size of every train / dev fold.

  Returns:
    Average of the selected metric over the k folds.

  Raises:
    ValueError: unsupported metric, k < 1, or fewer than k + 1 rows.
  '''
  if metric not in ('f2', 'pr'):
    raise ValueError(f"metric must be 'f2' or 'pr', got {metric!r}")
  if k < 1:
    raise ValueError(f"k must be at least 1, got {k}")

  n = dataset.count()
  chunk_size = int(n/(k+1))
  if chunk_size < 1:
    raise ValueError(f"need at least k + 1 = {k + 1} rows to build {k} folds, got {n}")

  # row_number() breaks ties on dep_datetime arbitrarily. Caching the ranked
  # frame lets the train and dev filters below read the same row_id
  # assignment instead of re-ranking (possibly differently) for each of them.
  # NOTE: a window without partition columns ranks all rows in one partition.
  df = dataset.withColumn("row_id", f.row_number().over(Window.partitionBy().orderBy("dep_datetime"))).cache()

  print('')
  print(f'Number of validation datapoints for each fold is {chunk_size:,}')
  print("************************************************************")

  # Track score
  scores=[]

  try:
    # Start k-fold
    for i in range(k):
      fold_cached = []  # frames cached for this fold, released in the finally below
      try:
        # row_id is 1-based, so a lower bound of 0 keeps everything up to train_end
        train_start = chunk_size * i if blocking else 0
        train_end = chunk_size * (i+1)
        dev_end = chunk_size * (i+2)

        fold_df = df.filter((f.col('row_id') > train_start)&(f.col('row_id') <= train_end)).cache()
        fold_cached.append(fold_df)

        # Create dev set
        dev_df = df.filter((f.col('row_id') > train_end)&(f.col('row_id') <= dev_end)).cache()
        fold_cached.append(dev_df)

        # Apply sampling on train if selected
        if sampling=='down':
          train_df = downsample(fold_df).cache()
          fold_cached.append(train_df)
        elif sampling=='up':
          train_df = upsample(fold_df).cache()
          fold_cached.append(train_df)
        else:
          train_df = fold_df

        # counted once and reused by both messages below
        train_count = train_df.count()

        #print info on train and dev set for this fold
        if verbose:
          train_min, train_max = train_df.agg(f.min('dep_datetime'), f.max('dep_datetime')).first()
          dev_min, dev_max = dev_df.agg(f.min('dep_datetime'), f.max('dep_datetime')).first()
          print('    TRAIN set for fold {} goes from {} to {}, count is {:,} flights ({})'.format(
              (i+1), train_min, train_max, train_count,
              sampling + '-sampled' if sampling else 'no sampling'))
          print('    DEV set for fold {} goes from {} to {}, count is {:,} flights'.format(
              (i+1), dev_min, dev_max, dev_df.count()))

        # Fit on the train fold and score the dev fold
        model = pipeline.fit(train_df)
        dev_pred = model.transform(dev_df)
        f2_score, pr_score = cv_eval(dev_pred)
        score = f2_score if metric=='f2' else pr_score
        scores.append(score)
        print(f'    Number of training datapoints for fold number {i+1} is {train_count:,} with a {metric} score of {score:.2f}')
        print('------------------------------------------------------------')
      finally:
        # release derived frames before the frames they were built from
        for frame in reversed(fold_cached):
          frame.unpersist()
  finally:
    df.unpersist()

  # Take average of all scores
  avg_score = np.average(scores)
  print(f'Average {metric} score across all folds is {avg_score:.2f}')
  print("************************************************************")

  return avg_score

In [0]:
# Expanding-window CV of the baseline pipeline: 3 folds, no resampling, F2 as the metric.
score = timeSeriesSplitCV(
    df_train,
    pipeline,
    k=3,
    blocking=False,
    sampling=None,
    metric='f2',
    verbose=True,
)