In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap


from pyspark.sql import functions as f
from pyspark.sql import SQLContext
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, avg, mean
from pyspark.sql.functions import min
from pyspark.sql.functions import col, max
from pyspark.sql.functions import format_string
from pyspark.sql.functions import substring
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import concat
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import lit
from pyspark.sql.functions import to_utc_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import instr
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer,OneHotEncoder
from pyspark.ml.classification import MultilayerPerceptronClassifier


from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator



"is" with a literal. Did you mean "=="?
"is" with a literal. Did you mean "=="?
"is" with a literal. Did you mean "=="?
"is" with a literal. Did you mean "=="?
"is not" with a literal. Did you mean "!="?
"is not" with a literal. Did you mean "!="?


In [0]:
%run "/Shared/w261_Section4_Group2/Phase 3/custom_cv_module"

In [0]:
#Initializes blob storage credentials/location
blob_container = "w261-sec4-group2" # The name of your container created in https://portal.azure.com
storage_account = "kdevery" # The name of your Storage account created in https://portal.azure.com
secret_scope = "sec4-group2" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261-key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

#Points to SAS token
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# Read in training and test data
train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features").cache()

#feature processing of dfs
train_df=train_df.select("*", f.row_number().over(Window.partitionBy().orderBy("Date_Time_sched_dep_utc")).alias("Index"))
train_df = train_df.withColumn("holiday_period", train_df["holiday_period"].cast(IntegerType()))
test_df = test_df.withColumn("holiday_period", test_df["holiday_period"].cast(IntegerType()))

carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
train_df = carrier_indexer.fit(train_df).transform(train_df)

#one hot encoding
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
train_df = onehotencoder_carrier_vector.fit(train_df).transform(train_df)
                             

In [0]:
processed_train_df = spark.read.parquet(f"{blob_url}/processed_train")
processed_test_df = spark.read.parquet(f"{blob_url}/processed_test")

In [0]:
def undersample(data, verbose = False):
    """
    Under samples the majority class
    """

    delay_count = data.filter(f.col('label') == 1 ).count()
    non_delay_count = data.filter(f.col('label') == 0 ).count()
    #   print(f' total count : {data.count()}')
    #   print(f' delayed count : {delay_count}')
    #   print(f' non delayed count : {non_delay_count}')

    fraction_undersample = delay_count / non_delay_count
    #   print(f' delayed / non delayed: {fraction_undersample}')

    train_delayed = data.filter(f.col('label') == 1)
    #   print(f' non delayed count df : {train_delayed.count()}')

    train_non_delay_undersample = data.filter(f.col('label') == 0).sample(withReplacement=True, fraction=fraction_undersample, seed = 261)
    #   print(f' oversampled delayed count : {train_non_delay_undersample.count()}')

    data_undersampled = train_delayed.union(train_non_delay_undersample)
    #   print(f' train_df Oversampled : {train_undersampled.count()}')

    return data_undersampled

In [0]:
p_fold_1 = spark.read.parquet(f"{blob_url}/processed_fold_1")
p_fold_2 = spark.read.parquet(f"{blob_url}/processed_fold_2")
p_fold_3 = spark.read.parquet(f"{blob_url}/processed_fold_3")
p_fold_4 = spark.read.parquet(f"{blob_url}/processed_fold_4")
p_fold_5 = spark.read.parquet(f"{blob_url}/processed_fold_5")

d_undersampled = {}
d_undersampled['df1'] = undersample(p_fold_1)
d_undersampled['df2'] = undersample(p_fold_2)
d_undersampled['df3'] = undersample(p_fold_3)
d_undersampled['df4'] = undersample(p_fold_4)
d_undersampled['df5'] = undersample(p_fold_5)

In [0]:
processed_train_df_undersampled = undersample(processed_train_df)

In [0]:
#Multi Layer Perceptron with undersampling

# set up grid search: estimator, set of params, and evaluator
MLPC_model = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector", maxIter = 100, layers = [39,26,2], blockSize = 64, solver = 'l-bfgs')

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)

#run to select best mode
MLPCmodel = MLPC_model.fit(processed_train_df_undersampled)


In [0]:
#make predictions
MLPC_predictions = MLPCmodel.transform(processed_test_df)
display(MLPC_predictions.groupby('label', 'prediction').count())

#save predictions to blob
MLPC_predictions.write.parquet(f"{blob_url}/MLPC_predictions_df")

label,prediction,count
1.0,1.0,475553
0.0,1.0,359006
1.0,0.0,543150
0.0,0.0,4481597


In [0]:
display(MLPC_predictions)

label,scaled_feature_vector,index,rawPrediction,probability,prediction
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 1.1771165614071244, 1.1807240188784864, 2.001448286999518, 4.804541102597839, 0.5290014255667469, 2.5358333967474525, 0.3529775526800816, 2.2518971367325515, 0.2623600525700863, 2.453487744696793, 0.4321511055947388))",0,"Map(vectorType -> dense, length -> 2, values -> List(1.4963778040813425, 0.17501286815648714))","Map(vectorType -> dense, length -> 2, values -> List(0.7894087066552684, 0.21059129334473156))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.8181980723075531, 1.518073738558054, 2.001448286999518, 5.254269120180973, 0.34977829198919363, 0.9689112552858259, 0.433041426600483, 2.3371183222456193, 0.27964920300085777, 2.453487744696793, 0.4321511055947388))",1,"Map(vectorType -> dense, length -> 2, values -> List(1.390235393915282, 0.02563228034230247))","Map(vectorType -> dense, length -> 2, values -> List(0.7965068058349072, 0.20349319416509282))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.8181980723075531, 1.1807240188784864, 2.001448286999518, 5.254269120180973, 0.4050167164495406, 0.9689112552858259, 0.433041426600483, 2.3371183222456193, 0.27964920300085777, 2.453487744696793, 0.4321511055947388))",2,"Map(vectorType -> dense, length -> 2, values -> List(1.3664643855095564, 0.0022611376038675635))","Map(vectorType -> dense, length -> 2, values -> List(0.7964419864381027, 0.2035580135618973))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.8181980723075531, 2.001448286999518, 5.254269120180973, 0.34688490538266537, 0.9689112552858259, 0.433041426600483, 2.3371183222456193, 0.27964920300085777, 2.453487744696793, 0.4321511055947388))",3,"Map(vectorType -> dense, length -> 2, values -> List(1.3651408543683816, -0.1393656475497651))","Map(vectorType -> dense, length -> 2, values -> List(0.8182456432893342, 0.18175435671066564))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.722940084679231, 2.361448037756973, 2.001448286999518, 4.804541102597839, 0.38154845811321203, 0.7047198828462189, 0.5403663049134857, 1.8916360906620093, 0.4360534659923761, 2.453487744696793, 0.4321511055947388))",4,"Map(vectorType -> dense, length -> 2, values -> List(1.8008258832076554, 0.3844621106234475))","Map(vectorType -> dense, length -> 2, values -> List(0.8047677376829427, 0.1952322623170573))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.722940084679231, 0.8433742991989189, 2.001448286999518, 4.804541102597839, 0.3154302374467945, 0.7047198828462189, 0.5403663049134857, 1.8916360906620093, 0.4360534659923761, 2.453487744696793, 0.4321511055947388))",5,"Map(vectorType -> dense, length -> 2, values -> List(1.7759023290613731, 0.16280198392210726))","Map(vectorType -> dense, length -> 2, values -> List(0.833841383741017, 0.16615861625898296))",0.0
1.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 7, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.722940084679231, 2.1927731779171893, 10.563083864358715, 2.001448286999518, 4.804541102597839, 0.608351639401759, 0.7047198828462189, 0.5403663049134857, 1.8916360906620093, 0.4360534659923761, 2.453487744696793, 0.4321511055947388))",6,"Map(vectorType -> dense, length -> 2, values -> List(1.0402246529944632, 0.3728279756467573))","Map(vectorType -> dense, length -> 2, values -> List(0.6609199868461985, 0.3390800131538016))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 26, 30, 31, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 1.5003133051460746, 0.5060245795193513, 7.296613042424572, 1.3850953356442304, 1.5473254271491774, 2.506155999639365, 0.1854260324295884, 2.453487744696793, 0.4321511055947388))",7,"Map(vectorType -> dense, length -> 2, values -> List(0.8104965844869096, 0.030669358664644403))","Map(vectorType -> dense, length -> 2, values -> List(0.6856428759527085, 0.3143571240472916))",0.0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 26, 30, 31, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 1.6500044285620095, 1.0120491590387026, 7.296613042424572, 1.5574891757029543, 1.0216639197618846, 3.020166622012421, 0.35300129827896976, 2.453487744696793, 0.4321511055947388))",8,"Map(vectorType -> dense, length -> 2, values -> List(0.751891329667302, 0.17805502951008423))","Map(vectorType -> dense, length -> 2, values -> List(0.6396479093273493, 0.3603520906726508))",0.0
1.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(1.5157167525696686, 1.251849207495928, 0.9981490683130829, 0.8181980723075531, 1.0120491590387026, 2.001448286999518, 5.254269120180973, 1.5143994442576951, 0.9689112552858259, 0.756889388763837, 2.374154468913133, 0.4757177694132144, 2.364497822755491, 0.2879023802938084))",9,"Map(vectorType -> dense, length -> 2, values -> List(1.0369896619056398, 0.23092006598649467))","Map(vectorType -> dense, length -> 2, values -> List(0.6912713272152036, 0.3087286727847963))",0.0


In [0]:
evaluator.evaluate(MLPC_predictions)

Out[20]: 0.5457420909496324

In [0]:
#Multi Layer Perceptron without undersampling

# set up grid search: estimator, set of params, and evaluator
MLPC_model = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector", maxIter = 50, layers = [39,26,2], blockSize = 64, solver = 'l-bfgs')

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)

#run to select best mode
MLPCmodel = MLPC_model.fit(processed_train_df)

#make predictions
MLPC_predictions = MLPCmodel.transform(processed_test_df)
display(MLPC_predictions.groupby('label', 'prediction').count())

#save predictions to blob
#MLPC_predictions.write.parquet(f"{blob_url}/MLPC_predictions_df_no_undersampling")


label,prediction,count
1.0,1.0,354947
0.0,1.0,145932
1.0,0.0,663756
0.0,0.0,4694671


In [0]:
#no undersampling evaluation
evaluator.evaluate(MLPC_predictions)

Out[13]: 0.5872291187369281

In [0]:
#splitting training dataframe into five folds contained in dictionary "d"

d = {}
folds = ['df1','df2','df3','df4','df5']

each_len = train_df.count()/5
start = 1
val_size = each_len/5
stop = each_len
precision_list = []

for fold in folds:
    d[fold] = train_df.filter(col('Index').between(start,stop))\
                                  .withColumn('cv', F.when(col('Index').between(start,(stop-val_size)), 'train')
                                         .otherwise('val'))
    start += each_len
    stop += each_len

                                  

In [0]:
train_df.createOrReplaceTempView('train_view')

In [0]:
def process_fold_df(fold_df):
    
    
    #imputation
    fold_df.createOrReplaceTempView("fold_view")
    
    imputation_columns = ['CRS_ELAPSED_TIME','HourlyAltimeterSetting','HourlyDewPointTemperature',
             'HourlyDryBulbTemperature','HourlyRelativeHumidity','HourlySeaLevelPressure',
             'HourlyStationPressure','HourlyVisibility','HourlyWetBulbTemperature',
             'HourlyWindDirection','mean_carrier_delay','ORIGIN_Prophet_trend',
             'ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred',]

    means = {}

    for impute_col in imputation_columns:
        mean = spark.sql(f"SELECT AVG({impute_col}) FROM fold_view").collect()[0][0]
        means[impute_col] = mean
    
    print(means)
    
    #fill Nones and Nans - Seems to error sometimes?
    fold_df = fold_df.fillna(0,["HourlyWindGustSpeed"]) \
         .fillna(means["CRS_ELAPSED_TIME"],["CRS_ELAPSED_TIME"]) \
         .fillna(means["HourlyAltimeterSetting"],["HourlyAltimeterSetting"]) \
         .fillna(means["HourlyDewPointTemperature"],["HourlyDewPointTemperature"]) \
         .fillna(means["HourlyDryBulbTemperature"],["HourlyDryBulbTemperature"]) \
         .fillna(0,["HourlyPrecipitation"]) \
         .fillna(means["HourlyRelativeHumidity"],["HourlyRelativeHumidity"]) \
         .fillna(means["HourlySeaLevelPressure"],["HourlySeaLevelPressure"]) \
         .fillna(means["HourlyStationPressure"],["HourlyStationPressure"]) \
         .fillna(means["HourlyVisibility"],["HourlyVisibility"]) \
         .fillna(means["HourlyWetBulbTemperature"],["HourlyWetBulbTemperature"]) \
         .fillna(means["HourlyWindDirection"],["HourlyWindDirection"]) \
         .fillna(0,["HourlyWindSpeed"]) \
         .fillna("",["TAIL_NUM"])\
         .fillna(0,['holiday_period'])\
         .fillna(means['mean_carrier_delay'],['mean_carrier_delay'])\
         .fillna(0,['PREV_FLIGHT_DELAYED'])\
         .fillna(0,['origin_percent_delayed'])\
         .fillna(0,['dest_percent_delayed'])\
         .fillna(means['ORIGIN_Prophet_trend'],['ORIGIN_Prophet_trend'])\
         .fillna(means['ORIGIN_Prophet_pred'],['ORIGIN_Prophet_pred'])\
         .fillna(means['DEST_Prophet_trend'],['DEST_Prophet_trend'])\
         .fillna(means['DEST_Prophet_pred'],['DEST_Prophet_pred'])
         

    
    #vector assembler
    feature_cols = ['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','DISTANCE','HourlyWindSpeed','Rain','Blowing','Snow','Thunder','CloudySkyCondition','carrier_vec',         'holiday_period','mean_carrier_delay','Pagerank_Score','PREV_FLIGHT_DELAYED','origin_percent_delayed','dest_percent_delayed','ORIGIN_Prophet_trend','ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred']
    #assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
    #outputCol = "features"
    df_va = VectorAssembler(inputCols = feature_cols, outputCol = 'feature_vector')
    model_input = df_va.transform(fold_df)
    
    #rename delay flag to label
    model_input = model_input.withColumnRenamed("DEP_DEL15","label")
    #model_input = assemble.transform(fold_df) \
    #               .withColumnRenamed('DEP_DEL15', 'label')
    
    #scaling
    scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
    model_input = scaler.fit(model_input).transform(model_input)
    model_input = model_input.select('label', 'scaled_feature_vector','cv')
    
     #undersample
    model_input = undersample(model_input)
    model_input = model_input.withColumn("label", model_input["label"].cast(IntegerType()))
    
    return model_input

In [0]:
def process_train_df(fold_df):
    
    
    #imputation
    fold_df.createOrReplaceTempView("fold_view")
    
    imputation_columns = ['CRS_ELAPSED_TIME','HourlyAltimeterSetting','HourlyDewPointTemperature',
             'HourlyDryBulbTemperature','HourlyRelativeHumidity','HourlySeaLevelPressure',
             'HourlyStationPressure','HourlyVisibility','HourlyWetBulbTemperature',
             'HourlyWindDirection','mean_carrier_delay','ORIGIN_Prophet_trend',
             'ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred',]

    means = {}

    for impute_col in imputation_columns:
        mean = spark.sql(f"SELECT AVG({impute_col}) FROM fold_view").collect()[0][0]
        means[impute_col] = mean
    
    print(means)
    
    #fill Nones and Nans - Seems to error sometimes?
    fold_df = fold_df.fillna(0,["HourlyWindGustSpeed"]) \
         .fillna(means["CRS_ELAPSED_TIME"],["CRS_ELAPSED_TIME"]) \
         .fillna(means["HourlyAltimeterSetting"],["HourlyAltimeterSetting"]) \
         .fillna(means["HourlyDewPointTemperature"],["HourlyDewPointTemperature"]) \
         .fillna(means["HourlyDryBulbTemperature"],["HourlyDryBulbTemperature"]) \
         .fillna(0,["HourlyPrecipitation"]) \
         .fillna(means["HourlyRelativeHumidity"],["HourlyRelativeHumidity"]) \
         .fillna(means["HourlySeaLevelPressure"],["HourlySeaLevelPressure"]) \
         .fillna(means["HourlyStationPressure"],["HourlyStationPressure"]) \
         .fillna(means["HourlyVisibility"],["HourlyVisibility"]) \
         .fillna(means["HourlyWetBulbTemperature"],["HourlyWetBulbTemperature"]) \
         .fillna(means["HourlyWindDirection"],["HourlyWindDirection"]) \
         .fillna(0,["HourlyWindSpeed"]) \
         .fillna("",["TAIL_NUM"])\
         .fillna(0,['holiday_period'])\
         .fillna(means['mean_carrier_delay'],['mean_carrier_delay'])\
         .fillna(0,['PREV_FLIGHT_DELAYED'])\
         .fillna(0,['origin_percent_delayed'])\
         .fillna(0,['dest_percent_delayed'])\
         .fillna(means['ORIGIN_Prophet_trend'],['ORIGIN_Prophet_trend'])\
         .fillna(means['ORIGIN_Prophet_pred'],['ORIGIN_Prophet_pred'])\
         .fillna(means['DEST_Prophet_trend'],['DEST_Prophet_trend'])\
         .fillna(means['DEST_Prophet_pred'],['DEST_Prophet_pred'])
         

    
    #vector assembler
    feature_cols = ['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','DISTANCE','HourlyWindSpeed','Rain','Blowing','Snow','Thunder','CloudySkyCondition','carrier_vec',         'holiday_period','mean_carrier_delay','Pagerank_Score','PREV_FLIGHT_DELAYED','origin_percent_delayed','dest_percent_delayed','ORIGIN_Prophet_trend','ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred']
    #assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
    #outputCol = "features"
    df_va = VectorAssembler(inputCols = feature_cols, outputCol = 'feature_vector')
    model_input = df_va.transform(fold_df)
    
    #rename delay flag to label
    model_input = model_input.withColumnRenamed("DEP_DEL15","label")
    
     #undersample
    model_input = undersample(model_input)
    model_input = model_input.withColumn("label", model_input["label"].cast(IntegerType()))
    
    
    return model_input

In [0]:
d_processed = {}
for key in d.keys():
    print(key)
    d_processed[key] = process_fold_df(d[key])


df1
{'CRS_ELAPSED_TIME': 142.12910992829543, 'HourlyAltimeterSetting': 30.036580778251157, 'HourlyDewPointTemperature': 45.00854378313608, 'HourlyDryBulbTemperature': 60.17423843195577, 'HourlyRelativeHumidity': 62.37591717155816, 'HourlySeaLevelPressure': 30.02557336966036, 'HourlyStationPressure': 29.139293369258898, 'HourlyVisibility': 9.306115460749048, 'HourlyWetBulbTemperature': 52.37834875710346, 'HourlyWindDirection': 171.99296985916052, 'mean_carrier_delay': 0.1791215060089807, 'ORIGIN_Prophet_trend': 0.18222279885720952, 'ORIGIN_Prophet_pred': 0.18032802015747373, 'DEST_Prophet_trend': 0.18219394063971914, 'DEST_Prophet_pred': 0.18030541467777475}
df2
{'CRS_ELAPSED_TIME': 145.44278663656297, 'HourlyAltimeterSetting': 30.015902928331997, 'HourlyDewPointTemperature': 49.019680502804114, 'HourlyDryBulbTemperature': 65.21152453001328, 'HourlyRelativeHumidity': 61.39521148028556, 'HourlySeaLevelPressure': 29.998395626822127, 'HourlyStationPressure': 29.109979631699638, 'HourlyVisi

In [0]:
#d_processed['df1'].write.parquet(f"{blob_url}/processed_fold_1")
#d_processed['df2'].write.parquet(f"{blob_url}/processed_fold_2")
#d_processed['df3'].write.parquet(f"{blob_url}/processed_fold_3")
#d_processed['df4'].write.parquet(f"{blob_url}/processed_fold_4")
#d_processed['df5'].write.parquet(f"{blob_url}/processed_fold_5")

### Model Building

In [0]:
%run "/Shared/w261_Section4_Group2/Phase 3/custom_cv_module"

In [0]:
#for individual testing

#test_train = d_processed['df1'].filter(col('cv')=='train')
#test_val = d_processed['df1'].filter(col('cv')=='val')

#test_logistic_model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
#evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
#lrModel = test_logistic_model.fit(test_train)
#predictions = lrModel.transform(test_val)
#evaluator.evaluate(predictions)

### MLPC Model

In [0]:
#Multi Layer Perceptron Grid Search Hyperparameter selection

# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

# set up grid search: estimator, set of params, and evaluator
MLPC_model = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector")
grid = ParamGridBuilder()\
            .addGrid(MLPC_model.maxIter, [50,100,200])\
            .addGrid(MLPC_model.layers, [[38,26,2],[38,26,26,2]])\
            .addGrid(MLPC_model.blockSize, [32, 64])\
            .addGrid(MLPC_model.solver, ['gd', 'l-bfgs'] )\
            .build() 

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)

In [0]:
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")
display(test_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,two_hrs_pre_flight_utc,Date_Time_sched_dep_utc,Date_Time_sched_arrival_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_flight_per_day,origin_delays_per_day,dest_flight_per_day,dest_delays_per_day,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T23:00:00.000+0000,2021-02-05T01:00:00.000+0000,2021-02-05T03:00:00.000+0000,9E,N184GJ,ATL,ABE,2030,2220,-12.0,110.0,692.0,3,308.2,29.92,25,51,0.0,36,29.93,28.82,10.0,40,160.0,7,0,ATL_ABE,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0.0583090379008746,0.0405444716972231,0,661.0,29.0,3,0.0,0.0438729198184568,0.0,0.1208630143893774,0.070931832195133,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T18:00:00.000+0000,2021-02-04T20:00:00.000+0000,2021-02-04T21:00:00.000+0000,OH,N560NN,CLT,ABE,1505,1648,-3.0,103.0,481.0,2,222.6,30.07,17,48,0.0,29,30.08,29.25,10.0,36,210.0,9,0,CLT_ABE,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0.0385542168674698,0.015491552014988,0,353.0,19.0,3,0.0,0.0538243626062322,0.0,0.1254369752523904,0.075606138001747,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T23:00:00.000+0000,2021-02-05T01:00:00.000+0000,2021-02-05T03:00:00.000+0000,OH,N609NN,CLT,ABE,2020,2208,2.0,108.0,481.0,2,222.6,29.97,21,46,0.0,37,29.98,29.15,10.0,36,170.0,7,0,CLT_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.0446428571428571,0.015491552014988,0,353.0,19.0,3,0.0,0.0538243626062322,0.0,0.1254369752523904,0.075606138001747,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T14:00:00.000+0000,2021-02-04T16:00:00.000+0000,2021-02-04T17:00:00.000+0000,OH,N600NN,CLT,ABE,1110,1250,-6.0,100.0,481.0,2,222.6,30.09,22,32,0.0,66,30.1,29.26,10.0,28,0.0,0,0,CLT_ABE,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0.038235294117647,0.015491552014988,0,353.0,19.0,3,0.0,0.0538243626062322,0.0,0.1254369752523904,0.075606138001747,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T18:00:00.000+0000,2021-02-04T20:00:00.000+0000,2021-02-04T22:00:00.000+0000,9E,N8775A,DTW,ABE,1555,1726,-11.0,91.0,425.0,2,191.9,29.85,22,31,0.0,69,29.87,29.14,6.0,28,170.0,14,0,DTW_ABE,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0.0420560747663551,0.0112674970607994,0,268.0,18.0,3,0.0,0.0671641791044776,0.0,0.1015272129067507,0.117891694924153,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T13:00:00.000+0000,2021-02-04T15:00:00.000+0000,2021-02-04T16:00:00.000+0000,9E,N8688C,DTW,ABE,1000,1141,-30.0,101.0,425.0,2,191.9,29.98,16,22,0.0,78,30.01,29.27,6.0,20,170.0,5,0,DTW_ABE,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0.0347682119205298,0.0112674970607994,0,268.0,18.0,3,0.0,0.0671641791044776,0.0,0.1015272129067507,0.117891694924153,0.1322035001594901,0.1167872615990532
1.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-05T00:00:00.000+0000,2021-02-05T02:00:00.000+0000,2021-02-05T03:00:00.000+0000,9E,N8694A,DTW,ABE,2120,2259,107.0,99.0,425.0,2,191.9,29.57,21,32,,64,29.59,28.87,1.25,28,150.0,13,25,DTW_ABE,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0.0670553935860058,0.0112674970607994,0,268.0,18.0,3,0.0,0.0671641791044776,0.0,0.1015272129067507,0.117891694924153,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T15:00:00.000+0000,2021-02-04T17:00:00.000+0000,2021-02-04T19:00:00.000+0000,G4,220NV,SFB,ABE,1224,1448,2.0,144.0,882.0,4,14.9,30.17,34,53,0.0,48,30.17,30.11,10.0,44,150.0,3,0,SFB_ABE,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1526717557251908,0.024739595301454,0,,,3,0.0,,0.0,0.1345095047662523,0.0501319011338104,0.1322035001594901,0.1167872615990532
0.0,2021,1,2,4,4,2021-02-04 00:00:00,2021-02-04T22:00:00.000+0000,2021-02-05T00:00:00.000+0000,2021-02-05T02:00:00.000+0000,G4,220NV,PIE,ABE,1903,2128,15.0,145.0,970.0,4,0.1,30.07,30,63,0.0,29,30.07,30.06,10.0,48,230.0,6,0,PIE_ABE,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1716738197424892,0.0163349942200422,0,,,3,0.0,,0.0,0.1620972982914527,0.095437657558401,0.1322035001594901,0.1167872615990532
1.0,2021,2,5,11,2,2021-05-11 00:00:00,2021-05-11T16:00:00.000+0000,2021-05-11T18:00:00.000+0000,2021-05-11T20:00:00.000+0000,OH,N510AE,CLT,ABE,1450,1644,5.0,114.0,481.0,2,222.6,30.13,51,70,0.0,51,30.11,29.3,10.0,59,100.0,6,0,CLT_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1669242658423493,0.015491552014988,0,574.0,54.0,15,0.0,0.0940766550522648,0.0,0.1274247660153814,0.1286153614535008,0.1274083756739534,0.0778045692052313


In [0]:
# run cross validation & return the crossvalidation F0.5 score for 'validation' set
cv = CustomCrossValidator(estimator=MLPC_model, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=10)

#run to select best model
MLPC_Model = cv.fit(d_processed)


fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
fold 3 start...
fold 3 end
fold 4 start...
fold 4 end
fold 5 start...
fold 5 end
Best Model:  {Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='maxIter', doc='max number of iterations (>= 0).'): 100, Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.'): [38, 26, 2], Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='blockSize', doc='block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data.'): 64, Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='solver', doc='The solver algorithm for optimization. Supported options: l-bfgs, gd.'): 'l-bfgs'} Detailed Score [0.72607578076622, 0

In [0]:
bestModel = MLPC_Model.bestModel

In [0]:
#MLPC Evaluation

# Read in training and test data
train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

#string indexing of carrier for train
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
indexer_transformer = carrier_indexer.setHandleInvalid("keep").fit(train_df)
train_df = indexer_transformer.transform(train_df)

#one hot encoding for train
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
onehotencoder_transformer = onehotencoder_carrier_vector.fit(train_df)
train_df = onehotencoder_transformer.transform(train_df)

#string indexing of carrier for test
test_df = indexer_transformer.transform(test_df)
#one hot encoding for test
test_df = onehotencoder_transformer.transform(test_df)

#cast holiday to integer
train_df = train_df.withColumn("holiday_period", train_df["holiday_period"].cast(IntegerType()))
test_df = test_df.withColumn("holiday_period", test_df["holiday_period"].cast(IntegerType()))

processed_train_df = process_train_df(train_df)

#scale to train on train set
scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
scaler_transformer = scaler.fit(processed_train_df)
processed_train_df = scaler_transformer.transform(processed_train_df)

processed_test_df = process_train_df(test_df)
#scale to train on test set
processed_test_df = scaler_transformer.transform(processed_test_df)

#make predictions
MLPC_predictions = MLPC_Model.transform(processed_test_df)
display(MLPC_predictions.groupby('label', 'prediction').count())



{'CRS_ELAPSED_TIME': 142.19873196344375, 'HourlyAltimeterSetting': 30.033807209268396, 'HourlyDewPointTemperature': 47.031581851427795, 'HourlyDryBulbTemperature': 62.17946108108702, 'HourlyRelativeHumidity': 62.826167693631106, 'HourlySeaLevelPressure': 30.020612570533853, 'HourlyStationPressure': 29.138857040175477, 'HourlyVisibility': 9.348308673776204, 'HourlyWetBulbTemperature': 54.20766773299253, 'HourlyWindDirection': 172.5585708721293, 'mean_carrier_delay': 0.16986784944061004, 'ORIGIN_Prophet_trend': 0.17208792718152008, 'ORIGIN_Prophet_pred': 0.1687493872510451, 'DEST_Prophet_trend': 0.17206664389874615, 'DEST_Prophet_pred': 0.16873035480876158}
{'CRS_ELAPSED_TIME': 142.7358402172544, 'HourlyAltimeterSetting': 30.028331599000225, 'HourlyDewPointTemperature': 48.10941381755681, 'HourlyDryBulbTemperature': 64.10941143119706, 'HourlyRelativeHumidity': 61.50895857992817, 'HourlySeaLevelPressure': 30.011542257892327, 'HourlyStationPressure': 29.048171030881075, 'HourlyVisibility':

In [0]:
#save models
grid

Out[149]: [{Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='maxIter', doc='max number of iterations (>= 0).'): 50,
  Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.'): [38,
   26,
   2],
  Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='blockSize', doc='block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data.'): 32,
  Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='solver', doc='The solver algorithm for optimization. Supported options: l-bfgs, gd.'): 'gd'},
 {Param(parent='MultilayerPerceptronClassifier_32c7231b62f1', name='maxIter', doc='max number of iterations (>= 0).'): 50,
  Param(parent='MultilayerPerceptronClass

In [0]:
MLPC_Model.avgMetrics

Out[150]: [0.4539306917378712,
 0.7131275126885845,
 0.45379570281203874,
 0.7089419832586354,
 0.36904892762761093,
 0.6996300649822036,
 0.3690126629812791,
 0.7022380552997372,
 0.48549713160806635,
 0.7236391227120451,
 0.48571019999748477,
 0.7239959977965652,
 0.3816689678480065,
 0.7176455598397585,
 0.381595650776068,
 0.7199314348145786,
 0.5562254059149399,
 0.7203370372087137,
 0.5561209761251071,
 0.7198224560526588,
 0.4118251795283854,
 0.7203973044020815,
 0.41158473416869257,
 0.7170990972686524]