# PLEASE CLONE THIS NOTEBOOK INTO YOUR PERSONAL FOLDER IF NEEDED
# DO NOT RUN OR EDIT CODE IN THE SHARED FOLDER

### Demo for using custom CV function
- LDC implementation of custom CV function described in this [blog post](https://www.timlrx.com/blog/creating-a-custom-cross-validation-function-in-pyspark)
- Custom CV has same functionality as PySpark CrossValidator - see documentation [here](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.tuning.CrossValidator.html)
- Toy model using 3m airline dataset

In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler

from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator



##### Load Data

In [0]:
# inspect mount's final project folder
display(dbutils.fs.ls("/mnt/mids-w261/datasets_final_project_2022"))

path,name,size,modificationTime
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data/,parquet_airlines_data/,0,1656618287000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_1y/,parquet_airlines_data_1y/,0,1656630272000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_3m/,parquet_airlines_data_3m/,0,1656630114000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_6m/,parquet_airlines_data_6m/,0,1656630205000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data/,parquet_weather_data/,0,1656622074000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_1y/,parquet_weather_data_1y/,0,1656631614000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_3m/,parquet_weather_data_3m/,0,1656630651000
dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_6m/,parquet_weather_data_6m/,0,1656631047000
dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/,stations_data/,0,1656713663000


In [0]:
# load 3m airline dataset
df = spark.read.parquet("/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_3m/")

# drop null values in outcome
df = df.na.drop(subset=['DEP_DEL15', 'DEP_DELAY_NEW'])

# keep only needed columns
include = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DEP_DEL15']
df = df[include]

display(df)

MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15
2,19,4,1.0
2,20,5,0.0
2,21,6,1.0
2,22,7,0.0
2,24,2,0.0
2,25,3,0.0
2,26,4,0.0
2,27,5,0.0
2,1,7,0.0
2,2,1,0.0


In [0]:
df.count()

Out[4]: 2722232

##### Prepare Data for Modeling

In [0]:
# create inpute vectors for modeling
feature_cols = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK']

assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
rf_input = assemble.transform(df) \
                   .withColumnRenamed('DEP_DEL15', 'label') \
                   .cache()

# sanity check
display(rf_input)

MONTH,DAY_OF_MONTH,DAY_OF_WEEK,label,features
2,19,4,1.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 19.0, 4.0))"
2,20,5,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 20.0, 5.0))"
2,21,6,1.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 21.0, 6.0))"
2,22,7,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 22.0, 7.0))"
2,24,2,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 24.0, 2.0))"
2,25,3,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 25.0, 3.0))"
2,26,4,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 26.0, 4.0))"
2,27,5,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 27.0, 5.0))"
2,1,7,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 1.0, 7.0))"
2,2,1,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 2.0, 1.0))"


##### Run Grid Search with Custom CV
Steps:
- Import custom CV module
- Set up grid search
- Create dictionary of dataframes for custom CV function to loop through
- Run cross validation
- Retrieve best model from CV

In [0]:
# Import all functions from custom_cv module

In [0]:
%run "/Shared/custom_cv"

In [0]:
# set up grid search: estimator, set of params, and evaluator
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
grid = ParamGridBuilder()\
            .addGrid(rf.maxDepth, [5, 10])\
            .addGrid(rf.numTrees, [10, 15])\
            .build()

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5)

In [0]:
# create dictionary of dataframes for custom cv fn to loop through
# assign train and test based on time series split 
# train: month 1, months 1 & 2, test: month 2, month 3
d = {}

d['df1'] = rf_input.filter(rf_input.MONTH <= 2)\
                   .withColumn('cv', F.when(rf_input.MONTH == 1, 'train')
                                         .otherwise('test'))

d['df2'] = rf_input.filter(rf_input.MONTH <= 3)\
                   .withColumn('cv', F.when(rf_input.MONTH <= 2, 'train')
                                         .otherwise('test'))

In [0]:
d['df1'].createOrReplaceTempView('tester_view')

In [0]:
d['df1'].groupby('MONTH','cv').count().orderBy('MONTH').show()
d['df2'].groupby('MONTH','cv').count().orderBy('MONTH').show()

+-----+-----+------+
|MONTH|   cv| count|
+-----+-----+------+
|    1|train|916622|
|    2| test|818264|
+-----+-----+------+

+-----+-----+------+
|MONTH|   cv| count|
+-----+-----+------+
|    1|train|916622|
|    2|train|818264|
|    3| test|987346|
+-----+-----+------+



In [0]:
display(d['df1'])

MONTH,DAY_OF_MONTH,DAY_OF_WEEK,label,features,cv
2,19,4,1.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 19.0, 4.0))",test
2,20,5,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 20.0, 5.0))",test
2,21,6,1.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 21.0, 6.0))",test
2,22,7,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 22.0, 7.0))",test
2,24,2,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 24.0, 2.0))",test
2,25,3,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 25.0, 3.0))",test
2,26,4,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 26.0, 4.0))",test
2,27,5,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 27.0, 5.0))",test
2,1,7,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 1.0, 7.0))",test
2,2,1,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 2.0, 1.0))",test


In [0]:
# run cross validation & return the crossvalidation F0.5 score for 'test' set
cv = CustomCrossValidator(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator,
     splitWord = ('train', 'test'), cvCol = 'cv', parallelism=4)

cvModel = cv.fit(d)

fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
Best Model:  {Param(parent='RandomForestClassifier_4308e3b278f7', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5, Param(parent='RandomForestClassifier_4308e3b278f7', name='numTrees', doc='Number of trees to train (>= 1).'): 10} Detailed Score [0.8346288268940373, 0.8245311795644182] Avg Score 0.8295800032292278


In [0]:
rf_input

Out[17]: DataFrame[MONTH: int, DAY_OF_MONTH: int, DAY_OF_WEEK: int, label: double, features: vector]

In [0]:
# make predictions
predictions = cvModel.transform(d['df1'])

display(predictions.groupby('label', 'prediction').count())

label,prediction,count
1.0,0.0,364532
0.0,0.0,1370354


In [0]:
display(predictions)

MONTH,DAY_OF_MONTH,DAY_OF_WEEK,label,features,cv,rawPrediction,probability,prediction
2,19,4,1.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 19.0, 4.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,20,5,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 20.0, 5.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,21,6,1.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 21.0, 6.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,22,7,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 22.0, 7.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,24,2,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 24.0, 2.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,25,3,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 25.0, 3.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,26,4,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 26.0, 4.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,27,5,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 27.0, 5.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,1,7,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 1.0, 7.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
2,2,1,0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 2.0, 1.0))",test,"Map(vectorType -> dense, length -> 2, values -> List(8.064742437067611, 1.9352575629323883))","Map(vectorType -> dense, length -> 2, values -> List(0.8064742437067611, 0.19352575629323882))",0.0
