In [0]:
import pyspark
from pyspark.sql.types import StringType, BooleanType, IntegerType, DoubleType
import pyspark.sql.functions as F
from pyspark.ml.functions import vector_to_array
import airporttime
from datetime import datetime, timedelta

import numpy as np

In [0]:
# --- Azure Blob Storage coordinates (both created in https://portal.azure.com) ---
blob_container = "w261-scrr"    # container name
storage_account = "midsw261rv"  # storage account name

# --- Databricks secret holding the storage credential (created locally with the Databricks CLI) ---
secret_scope = "w261scrr"       # secret scope name
secret_key = "w261scrrkey"      # secret key name

# Base wasbs:// URL that every parquet path in this notebook is built from.
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
# NOTE(review): mount_path is not referenced by any cell visible here — confirm it is still needed.
mount_path = "/mnt/mids-w261"

# Register the credential for this container/account pair on the Spark session.
# The secret is passed straight through so it is never bound to a notebook variable.
sas_conf_key = f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net"
spark.conf.set(sas_conf_key, dbutils.secrets.get(scope = secret_scope, key = secret_key))

In [0]:
%run "../libs/error_analysis"

In [0]:
%run "../libs/model_helper_functions"

In [0]:
# Load the saved test-set predictions of the three base models:
# random forest, logistic regression and XGBoost.
# Note: the XGBoost path is read without the trailing "/*" glob used for the other two.
read_parquet = spark.read.parquet

rf_test_pred = read_parquet(f"{blob_url}/rf_test_0410b/*")
lr_test_pred = read_parquet(f"{blob_url}/lr_test_0410/*")
xgb_test_pred = read_parquet(f"{blob_url}/xgb_test_0410b")

In [0]:
# Combine the three models' test predictions into one frame, matching rows on the
# flight identifier (TAIL_NUM, OP_CARRIER_FL_NUM, TIMESTAMP_UTC). For every model we
# keep its predicted label and its class-probability vector converted to an array.

# Random forest: key columns get an _RF suffix so they stay distinguishable after the joins.
rf_ensemble = (
    rf_test_pred
    .select(
        F.col("prediction").alias("pred_rf"),
        F.col("probability").alias("prob_rf"),
        F.col("TAIL_NUM").alias("TAIL_NUM_RF"),
        F.col("OP_CARRIER_FL_NUM").alias("OP_CARRIER_FL_NUM_RF"),
        F.col("TIMESTAMP_UTC").alias("TIMESTAMP_UTC_RF"),
    )
    .withColumn("prob_rf", vector_to_array("prob_rf"))
)

# Logistic regression: also carries the true label DEP_DEL15; keys get an _LR suffix.
lr_ensemble = (
    lr_test_pred
    .select(
        "DEP_DEL15",
        F.col("prediction").alias("pred_lr"),
        F.col("probability").alias("prob_lr"),
        F.col('OP_CARRIER_FL_NUM').alias("OP_CARRIER_FL_NUM_LR"),
        F.col('TAIL_NUM').alias("TAIL_NUM_LR"),
        F.col('TIMESTAMP_UTC').alias("TIMESTAMP_UTC_LR"),
    )
    .withColumn("prob_lr", vector_to_array("prob_lr"))
)

# XGBoost: key columns keep their original names.
xgb_ensemble = (
    xgb_test_pred
    .select(
        F.col("prediction").alias("pred_xgb"),
        F.col("probability").alias("prob_xgb"),
        "TAIL_NUM",
        "OP_CARRIER_FL_NUM",
        "TIMESTAMP_UTC",
    )
    .withColumn("prob_xgb", vector_to_array("prob_xgb"))
)

# Step 1: RF left-joined with LR, then trimmed to the RF keys only.
rf_lr_match = [
    lr_ensemble["TAIL_NUM_LR"] == rf_ensemble["TAIL_NUM_RF"],
    lr_ensemble["OP_CARRIER_FL_NUM_LR"] == rf_ensemble["OP_CARRIER_FL_NUM_RF"],
    lr_ensemble["TIMESTAMP_UTC_LR"] == rf_ensemble["TIMESTAMP_UTC_RF"],
]
ensemble1_2 = rf_ensemble.join(lr_ensemble, rf_lr_match, how = "left")
ensemble1_2 = ensemble1_2.select(
    'pred_rf',
    'prob_rf',
    'DEP_DEL15',
    'pred_lr',
    'prob_lr',
    'OP_CARRIER_FL_NUM_RF',
    'TAIL_NUM_RF',
    'TIMESTAMP_UTC_RF',
)

# Step 2: left-join XGBoost onto the RF+LR frame. Both the _RF keys and the
# un-suffixed XGBoost keys remain in the result.
rf_lr_xgb_match = [
    xgb_ensemble["TAIL_NUM"] == ensemble1_2["TAIL_NUM_RF"],
    xgb_ensemble["OP_CARRIER_FL_NUM"] == ensemble1_2["OP_CARRIER_FL_NUM_RF"],
    xgb_ensemble["TIMESTAMP_UTC"] == ensemble1_2["TIMESTAMP_UTC_RF"],
]
ensemble_join = ensemble1_2.join(xgb_ensemble, rf_lr_xgb_match, how = 'left')

# RF/LR-only projection keyed by the un-suffixed identifier columns.
# NOTE(review): ensemble1_join is not referenced by any later cell visible here — confirm it is still needed.
ensemble1_join = ensemble_join.select(
    'pred_rf',
    'prob_rf',
    'DEP_DEL15',
    'pred_lr',
    'prob_lr',
    'OP_CARRIER_FL_NUM',
    'TAIL_NUM',
    'TIMESTAMP_UTC',
)

In [0]:
# Hard vote: majority decision over the three models' predicted labels.
#   hard_prediction - array of the three labels (LR, XGBoost, RF)
#   mean_pred       - each label divided by 3, then summed
#   hard_vote       - 1 when mean_pred exceeds 0.5, otherwise 0
votes = F.col('hard_prediction')
vote_share = sum([votes.getItem(i) / 3 for i in range(3)]).cast(DoubleType())

ensemble_join = (
    ensemble_join
    .withColumn("hard_prediction", F.array("pred_lr", "pred_xgb", "pred_rf"))
    .withColumn('mean_pred', vote_share)
    .withColumn('hard_vote', F.when(F.col('mean_pred') > .5, 1).otherwise(0))
)

In [0]:
# Soft vote: add up the three models' probabilities per class and pick the class
# with the larger total.
#   soft_sum0 / soft_sum1 - summed probability at array index 0 / 1
#   soft_vote             - 0 only when soft_sum0 is strictly larger, otherwise 1
prob_cols = ["prob_lr", "prob_xgb", "prob_rf"]
for class_idx, sum_name in enumerate(["soft_sum0", "soft_sum1"]):
    class_total = sum(ensemble_join[c].getItem(class_idx) for c in prob_cols)
    ensemble_join = ensemble_join.withColumn(sum_name, class_total)

class0_wins = F.col("soft_sum0") > F.col("soft_sum1")
ensemble_join = ensemble_join.withColumn("soft_vote", F.when(class0_wins, 0).otherwise(1))
display(ensemble_join)

pred_rf,prob_rf,DEP_DEL15,pred_lr,prob_lr,OP_CARRIER_FL_NUM_RF,TAIL_NUM_RF,TIMESTAMP_UTC_RF,pred_xgb,prob_xgb,TAIL_NUM,OP_CARRIER_FL_NUM,TIMESTAMP_UTC,hard_prediction,mean_pred,hard_vote,soft_sum0,soft_sum1,soft_vote
0.0,"List(0.5854466075315637, 0.41455339246843625)",0.0,0.0,"List(0.8668094794533888, 0.13319052054661118)",1206,215NV,2019-01-02T20:38:00.000+0000,0.0,"List(0.6638182401657104, 0.33618178963661194)",215NV,1206,2019-01-02T20:38:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,2.1160743271506632,0.8839257026516594,0
0.0,"List(0.5413304483461343, 0.4586695516538656)",1.0,0.0,"List(0.8259598070742146, 0.17404019292578543)",1206,215NV,2019-01-21T12:30:00.000+0000,0.0,"List(0.5282003879547119, 0.4717995822429657)",215NV,1206,2019-01-21T12:30:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,1.895490643375061,1.1045093268226167,0
0.0,"List(0.6134203020437453, 0.38657969795625474)",0.0,0.0,"List(0.866930775483658, 0.13306922451634196)",1700,215NV,2019-04-22T11:40:00.000+0000,0.0,"List(0.8831183314323425, 0.11688168346881866)",215NV,1700,2019-04-22T11:40:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,2.363469408959746,0.6365306059414153,0
0.0,"List(0.6082268218253586, 0.39177317817464147)",0.0,0.0,"List(0.8630792933233954, 0.1369207066766046)",1702,215NV,2019-03-07T11:00:00.000+0000,0.0,"List(0.8487657308578491, 0.15123425424098969)",215NV,1702,2019-03-07T11:00:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,2.320071846006603,0.6799281390922358,0
1.0,"List(0.4665645998492464, 0.5334354001507536)",0.0,0.0,"List(0.8056099703649887, 0.19439002963501129)",1704,215NV,2019-03-11T22:07:00.000+0000,1.0,"List(0.4968176484107971, 0.5031823515892029)",215NV,1704,2019-03-11T22:07:00.000+0000,"List(0.0, 1.0, 1.0)",0.6666666666666666,1,1.7689922186250324,1.2310077813749678,0
0.0,"List(0.5698657532818051, 0.43013424671819495)",0.0,0.0,"List(0.7846054702275851, 0.2153945297724149)",1712,215NV,2019-03-22T17:53:00.000+0000,0.0,"List(0.6574671864509583, 0.34253281354904175)",215NV,1712,2019-03-22T17:53:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,2.011938409960349,0.9880615900396515,0
1.0,"List(0.15431876190818225, 0.8456812380918177)",0.0,1.0,"List(0.29716039484725704, 0.702839605152743)",1713,215NV,2019-03-02T02:04:00.000+0000,1.0,"List(0.13152849674224854, 0.8684715032577515)",215NV,1713,2019-03-02T02:04:00.000+0000,"List(1.0, 1.0, 1.0)",1.0,1,0.5830076534976878,2.416992346502312,1
0.0,"List(0.6179598258458064, 0.3820401741541936)",1.0,0.0,"List(0.8662141080237648, 0.13378589197623525)",1718,215NV,2019-02-10T21:24:00.000+0000,0.0,"List(0.6034029722213745, 0.3965970277786255)",215NV,1718,2019-02-10T21:24:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,2.0875769060909457,0.9124230939090544,0
0.0,"List(0.6144408119229793, 0.3855591880770206)",0.0,0.0,"List(0.8895223370201215, 0.11047766297987849)",1718,215NV,2019-03-12T12:00:00.000+0000,0.0,"List(0.7347662448883057, 0.2652337849140167)",215NV,1718,2019-03-12T12:00:00.000+0000,"List(0.0, 0.0, 0.0)",0.0,0,2.2387293938314063,0.7612706359709158,0
0.0,"List(0.5348017080663319, 0.46519829193366824)",0.0,1.0,"List(0.3770611356742251, 0.6229388643257749)",1719,215NV,2019-03-31T22:55:00.000+0000,1.0,"List(0.4980977773666382, 0.5019022226333618)",215NV,1719,2019-03-31T22:55:00.000+0000,"List(1.0, 1.0, 0.0)",0.6666666666666666,1,1.4099606211071951,1.590039378892805,1


In [0]:
# Persist the joined predictions together with the hard/soft vote columns.
# The save mode is set explicitly: the DataFrameWriter default raises if the path
# already exists, which made this cell fail on every re-run of the notebook.
# ensemble_join is built only from the per-model prediction files, so overwriting
# this output path cannot clobber its own input.
ensemble_join.write.mode("overwrite").parquet(f"{blob_url}/ensemble_test_agg_0410")

In [0]:
# Score the hard-vote ensemble on the test set.
# getMetricsEnsemble is not defined in this notebook (presumably loaded by the
# %run cells above — confirm); it is unpacked here as (precision, recall, F-measure).
hard_scores = getMetricsEnsemble(ensemble_join, hard = 1)
hard_precision, hard_recall, hard_fmeasure = hard_scores

print("Final Ensemble Hard Test Scores:")
for template, score in (
    ("Precision is {:.3f}", hard_precision),
    ("Recall is {:.3f}", hard_recall),
    ("F beta(0.5) score is {:.3f}", hard_fmeasure),
):
    print(template.format(score))

In [0]:
# Score the soft-vote ensemble on the test set (hard = 0, as opposed to the
# hard-vote call made with hard = 1).
# getMetricsEnsemble is not defined in this notebook (presumably loaded by the
# %run cells above — confirm); it is unpacked here as (precision, recall, F-measure).
soft_scores = getMetricsEnsemble(ensemble_join, hard = 0)
precision, recall, fmeasure = soft_scores

print("Final Ensemble Soft Test Scores")
for template, score in (
    ("Precision is {:.3f}", precision),
    ("Recall is {:.3f}", recall),
    ("F beta(0.5) score is {:.3f}", fmeasure),
):
    print(template.format(score))

In [0]:
# Inspect the columns available from the random-forest test predictions.
# This used to read `rf_rejoin.columns`, but rf_rejoin is only created in the
# following cell, so the cell raised NameError on a fresh kernel. rf_rejoin is
# rf_test_pred with TAIL_NUM / OP_CARRIER_FL_NUM / TIMESTAMP_UTC renamed to their
# *_RF variants, so the column list is the same apart from those three names.
rf_test_pred.columns

In [0]:
# Reload the persisted ensemble votes and re-attach the full random-forest test
# frame (features, label, RF outputs) for error analysis.
# The file is read once; the earlier duplicate read whose result was discarded has been dropped.
post_ensemble = (
    spark.read.parquet(f"{blob_url}/ensemble_test_agg_0410")
    .select("hard_vote", "soft_vote", "TAIL_NUM", "OP_CARRIER_FL_NUM", "TIMESTAMP_UTC", "soft_sum0", "soft_sum1")
)

# Suffix the RF key columns with _RF so they do not clash with the ensemble keys after the join.
rf_rejoin = (
    rf_test_pred
    .withColumnRenamed("TAIL_NUM", "TAIL_NUM_RF")
    .withColumnRenamed("OP_CARRIER_FL_NUM", "OP_CARRIER_FL_NUM_RF")
    .withColumnRenamed("TIMESTAMP_UTC", "TIMESTAMP_UTC_RF")
)

# Left join: every ensemble row is kept, matched to its RF row on the flight identifier.
# NOTE(review): the un-suffixed keys in the saved frame come from the XGBoost side of a
# left join, so a row without an XGBoost match would have null keys and find no RF row
# here — confirm this cannot happen for the test set.
post_ensemble_join = post_ensemble.join(
    rf_rejoin,
    [
        post_ensemble.TAIL_NUM == rf_rejoin.TAIL_NUM_RF,
        post_ensemble.OP_CARRIER_FL_NUM == rf_rejoin.OP_CARRIER_FL_NUM_RF,
        post_ensemble.TIMESTAMP_UTC == rf_rejoin.TIMESTAMP_UTC_RF,
    ],
    how = "left",
)


In [0]:
# Preview the ensemble votes (hard_vote, soft_vote, soft_sum0/1) re-joined with the
# random-forest test frame's feature, label and prediction columns.
display(post_ensemble_join)

hard_vote,soft_vote,TAIL_NUM,OP_CARRIER_FL_NUM,TIMESTAMP_UTC,soft_sum0,soft_sum1,features,DEP_DEL15,CRS_DEP_TIME,OP_UNIQUE_CARRIER,DAY_OF_WEEK,DISTANCE,DISTANCE_GROUP,MONTH,CIG_CeilingHeightDim_median,VIS_Horizontal_median,AA_RainDepth,AA_RainDuration,AL_SnowAccumDuration_mean,AL_SnowAccumDepth,AJ1_SnowDepth_mean,AJ1_SnowEqWaterDepth,WND_Speed_mean,TMP_Value_mean,SLP_Value_mean,OP_CARRIER_FL_NUM_RF,TAIL_NUM_RF,TIMESTAMP_UTC_RF,PREV_DEP_DEL15,ORIGIN_mean_encoding,DEST_mean_encoding,ORIGIN_DEST_COMBO_mean_encoding,rawPrediction,probability,prediction
0,0,215NV,1206,2019-01-02T20:38:00.000+0000,2.1160743271506632,0.8839257026516594,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 23, 29, 47, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.8573269979298686, 2.523621473338592, 3.690632194401699, 0.9583137414198132, 0.1644717092153885, 0.10006861366460523, 0.12678228358932056, 0.5537856522541988, 14.827309457531822, 5.689775477532049, 6.5065572931316895, 4.912766537070736))",0.0,1538,G4,3,588.0,3,1,9202.17,5515.87,0.0,0.0,0.0,0,0.0,0.0,2.7300000190734863,56.459999084472656,10223.51953125,1206,215NV,2019-01-02T20:38:00.000+0000,0.0,0.1789372599231754,0.2059981255857544,0.2485875706214689,"Map(vectorType -> dense, length -> 2, values -> List(17.563398225946912, 12.436601774053088))","Map(vectorType -> dense, length -> 2, values -> List(0.5854466075315637, 0.41455339246843625))",0.0
0,0,215NV,1206,2019-01-21T12:30:00.000+0000,1.895490643375061,1.1045093268226167,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 22, 29, 47, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.7918735019844716, 2.523621473338592, 3.690632194401699, 0.9583137414198132, 0.393209170604109, 0.29195832488630546, 4.465708706616491, -0.6913983364750049, 14.783249100199145, 5.689775477532049, 6.5065572931316895, 4.912766537070736))",1.0,730,G4,1,588.0,3,1,22000.0,16093.0,0.0,0.0,0.0,0,0.0,0.0,96.16000366210938,-70.48999786376953,10193.1396484375,1206,215NV,2019-01-21T12:30:00.000+0000,0.0,0.1789372599231754,0.2059981255857544,0.2485875706214689,"Map(vectorType -> dense, length -> 2, values -> List(16.23991345038403, 13.760086549615968))","Map(vectorType -> dense, length -> 2, values -> List(0.5413304483461343, 0.4586695516538656))",0.0
0,0,215NV,1702,2019-03-07T11:00:00.000+0000,2.320071846006603,0.6799281390922358,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 21, 30, 45, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.7823407432920306, 2.7623958606219507, 3.5966520046550063, 1.4097642624628204, 0.393209170604109, 0.29195832488630546, 1.9718591968936534, 1.3237498504817427, 14.871573765215071, 7.193491918346579, 6.094279118716825, 4.314396605138784))",0.0,600,G4,4,865.0,4,3,22000.0,16093.0,0.0,0.0,0.0,0,0.0,0.0,42.459999084472656,134.9600067138672,10254.0400390625,1702,215NV,2019-03-07T11:00:00.000+0000,0.0,0.2262275090170626,0.1929453655279848,0.2183098591549295,"Map(vectorType -> dense, length -> 2, values -> List(18.24680465476076, 11.753195345239243))","Map(vectorType -> dense, length -> 2, values -> List(0.6082268218253586, 0.39177317817464147))",0.0
1,0,215NV,1704,2019-03-11T22:07:00.000+0000,1.7689922186250324,1.2310077813749678,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 22, 30, 45, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.7918735019844716, 2.7623958606219507, 3.5966520046550063, 1.3869472686194915, 0.18214610187665176, 0.29195832488630546, 1.8283583643297778, 2.595416502073394, 14.830588242681623, 7.193491918346579, 6.152764087719193, 6.294792275024473))",0.0,1807,G4,1,851.0,4,3,10191.05,16093.0,0.0,0.0,0.0,0,0.0,0.0,39.369998931884766,264.6099853515625,10225.7802734375,1704,215NV,2019-03-11T22:07:00.000+0000,0.0,0.2262275090170626,0.1947970043358297,0.3185185185185185,"Map(vectorType -> dense, length -> 2, values -> List(13.996937995477392, 16.003062004522608))","Map(vectorType -> dense, length -> 2, values -> List(0.4665645998492464, 0.5334354001507536))",1.0
0,0,215NV,1712,2019-03-22T17:53:00.000+0000,2.011938409960349,0.9880615900396515,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 20, 29, 45, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.765542968916176, 2.523621473338592, 3.5966520046550063, 1.010466870204565, 0.393209170604109, 0.29195832488630546, 1.2394942099144812, 1.4478268117741988, 14.750951296071259, 7.193491918346579, 6.19378993833864, 6.79837565702643))",0.0,1353,G4,5,620.0,3,3,22000.0,16093.0,0.0,0.0,0.0,0,0.0,0.0,26.690000534057617,147.61000061035156,10170.8701171875,1712,215NV,2019-03-22T17:53:00.000+0000,0.0,0.2262275090170626,0.1960958860558275,0.344,"Map(vectorType -> dense, length -> 2, values -> List(17.095972598454154, 12.904027401545848))","Map(vectorType -> dense, length -> 2, values -> List(0.5698657532818051, 0.43013424671819495))",0.0
0,0,215NV,1718,2019-02-10T21:24:00.000+0000,2.0875769060909457,0.9124230939090544,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 25, 30, 51, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.877626148557499, 2.7623958606219507, 3.865516153185545, 1.5662236488170755, 0.3338547978554755, 0.29195832488630546, 3.4054743673245063, 2.288706060296566, 14.849949362767054, 7.193491918346579, 5.5197990835396356, 4.14076036695962))",1.0,1624,G4,7,961.0,4,2,18679.13,16093.0,0.0,0.0,0.0,0,0.0,0.0,73.33000183105469,233.33999633789065,10239.1298828125,1718,215NV,2019-02-10T21:24:00.000+0000,0.0,0.2262275090170626,0.174757281553398,0.2095238095238095,"Map(vectorType -> dense, length -> 2, values -> List(18.538794775374193, 11.46120522462581))","Map(vectorType -> dense, length -> 2, values -> List(0.6179598258458064, 0.3820401741541936))",0.0
0,0,215NV,1718,2019-03-12T12:00:00.000+0000,2.2387293938314063,0.7612706359709158,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 24, 30, 45, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.870831253074558, 2.7623958606219507, 3.5966520046550063, 1.5662236488170755, 0.13619335818196865, 0.29195832488630546, 1.0058989831550336, 2.361288755358408, 14.812836064250693, 7.193491918346579, 5.5197990835396356, 4.14076036695962))",0.0,800,G4,2,961.0,4,3,7620.0,16093.0,0.0,0.0,0.0,0,0.0,0.0,21.65999984741211,240.7400054931641,10213.5400390625,1718,215NV,2019-03-12T12:00:00.000+0000,0.0,0.2262275090170626,0.174757281553398,0.2095238095238095,"Map(vectorType -> dense, length -> 2, values -> List(18.43322435768938, 11.566775642310619))","Map(vectorType -> dense, length -> 2, values -> List(0.6144408119229793, 0.3855591880770206))",0.0
1,1,215NV,1719,2019-03-31T22:55:00.000+0000,1.4099606211071951,1.590039378892805,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 25, 30, 45, 53, 54, 55, 62, 63, 66, 67, 68), values -> List(15.378478634872558, 2.877626148557499, 2.7623958606219507, 3.5966520046550063, 1.5662236488170755, 0.11837633817972566, 0.29195832488630546, 3.2276075633318864, -0.011672068164301514, 8.427907663761856, 6.903644428611925, 7.152222452021161))",0.0,1855,G4,7,961.0,4,3,6623.14,16093.0,0.0,0.0,0.0,0,0.0,0.0,69.5,-1.190000057220459,0.0,1719,215NV,2019-03-31T22:55:00.000+0000,0.0,0.2650485436893204,0.2185699361328608,0.3619047619047619,"Map(vectorType -> dense, length -> 2, values -> List(16.044051241989955, 13.955948758010047))","Map(vectorType -> dense, length -> 2, values -> List(0.5348017080663319, 0.46519829193366824))",0.0
0,0,215NV,1720,2019-03-31T11:40:00.000+0000,2.206337326912038,0.7936626879891231,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 25, 32, 45, 53, 54, 55, 62, 63, 64, 66, 67, 68), values -> List(15.378478634872558, 2.877626148557499, 3.211247675868455, 3.5966520046550063, 1.9508529736046196, 0.3455007570388845, 0.29195832488630546, 2.21056280704546, 2.2363289189050235, 14.747818392074992, 7.193491918346579, 6.290064139923015, 5.877705497379006))",1.0,740,G4,7,1197.0,5,3,19330.72,16093.0,0.0,0.0,0.0,0,0.0,0.0,47.59999847412109,228.0,10168.7099609375,1720,215NV,2019-03-31T11:40:00.000+0000,0.0,0.2262275090170626,0.1991439349970981,0.2974137931034483,"Map(vectorType -> dense, length -> 2, values -> List(16.374689145380824, 13.625310854619174))","Map(vectorType -> dense, length -> 2, values -> List(0.5458229715126941, 0.4541770284873058))",0.0
0,0,215NV,1754,2019-03-18T10:00:00.000+0000,1.5902874636234812,1.4097125214753576,"Map(vectorType -> sparse, length -> 69, indices -> List(18, 22, 30, 45, 53, 54, 55, 62, 63, 64, 65, 66, 67, 68), values -> List(15.378478634872558, 2.7918735019844716, 2.7623958606219507, 3.5966520046550063, 1.458657820698525, 0.393209170604109, 0.29195832488630546, 1.4628724927331571, 2.2333863509207874, 14.748383504504698, 2.431116614438075, 7.193491918346579, 5.991993265930256, 3.9245117597876473))",0.0,600,G4,1,895.0,4,3,22000.0,16093.0,0.0,0.0,0.0,0,0.0,0.0,31.5,227.6999969482422,10169.099609375,1754,215NV,2019-03-18T10:00:00.000+0000,1.0,0.2262275090170626,0.1897069872276484,0.1985815602836879,"Map(vectorType -> dense, length -> 2, values -> List(7.555014268601584, 22.44498573139841))","Map(vectorType -> dense, length -> 2, values -> List(0.2518338089533862, 0.7481661910466137))",1.0


In [0]:
# Write the ensemble test set, re-joined with the feature columns, to blob storage.
# The save mode is set explicitly: the DataFrameWriter default raises if the path
# already exists, which made this cell fail on every re-run of the notebook.
# The output path differs from both inputs of post_ensemble_join, so overwriting
# it cannot clobber data that is still being read.
post_ensemble_join.write.mode("overwrite").parquet(f"{blob_url}/ensemble_test_analysis_0410")

In [0]:
# Run the error-analysis helper on the re-joined ensemble frame.
# analyze_errors is not defined in this notebook (presumably it comes from the
# "../libs/error_analysis" %run above — confirm).
# NOTE(review): post_ensemble_join still carries the random-forest `prediction`
# column next to hard_vote / soft_vote — confirm which of these the helper uses
# to build its prediction groups, otherwise this may be analysing RF errors
# rather than ensemble errors.
analyze_errors(post_ensemble_join)

PRED_GROUP,avg(DISTANCE),avg(CIG_CeilingHeightDim_median),avg(CRS_DEP_TIME),avg(VIS_Horizontal_median),avg(WND_Speed_mean)
TP,830.7218315223305,10123.413784340224,1596.078734318892,14088.298961450526,39.142767598125765
TN,787.9254270185096,12959.603247632143,1291.8280778539477,15118.31616834268,33.18144007967393
FN,822.1466164557143,11830.9313571516,1413.4647835186584,14770.110837579236,35.09331844639156
FP,836.4063730569948,10580.49754238474,1293.8085456900612,14303.092388236091,38.20164735820261


PRED_GROUP,avg(PREV_DEP_DEL15)
TP,0.802069051283247
TN,0.0005926932299665697
FN,0.0007488930335862865
FP,0.5739166274140367


In [0]:
# Fine-tuning analysis (disabled scratch code; nothing in this cell runs)
# hard_ensemble = spark.read.parquet(f"{blob_url}/ensemble_test_agg_0410")

# ensemble_join = ensemble_join.withColumn("hard_prediction", F.array("pred_lr", "pred_xgb", "pred_rf"))
# ensemble_join = ensemble_join.withColumn('mean_pred', sum([F.col('hard_prediction').getItem(i) / 3 for i in range(3)]).cast(DoubleType()))
# ensemble_join = ensemble_join.withColumn('hard_vote', F.when(ensemble_join.mean_pred > .5, 1).otherwise(0))