In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pyspark.sql import functions as f
from pyspark.sql import SQLContext
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, avg, mean
from pyspark.sql.functions import min
from pyspark.sql.functions import col, max
from pyspark.sql.functions import format_string
from pyspark.sql.functions import substring
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import concat
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import lit
from pyspark.sql.functions import to_utc_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import instr
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer,OneHotEncoder
from pyspark.ml.classification import MultilayerPerceptronClassifier


from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [0]:
#Initializes blob storage credentials/location
blob_container = "w261-sec4-group2" # The name of your container created in https://portal.azure.com
storage_account = "kdevery" # The name of your Storage account created in https://portal.azure.com
secret_scope = "sec4-group2" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261-key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

#Points to SAS token
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

In [0]:
#feature processing of dfs

train_df=train_df.select("*", f.row_number().over(Window.partitionBy().orderBy("Date_Time_sched_dep_utc")).alias("Index"))
train_df = train_df.withColumn("holiday_period", train_df["holiday_period"].cast(IntegerType()))
test_df = test_df.withColumn("holiday_period", test_df["holiday_period"].cast(IntegerType()))

In [0]:
#string indexing of carrier
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
train_df = carrier_indexer.fit(train_df).transform(train_df)


#one hot encoding
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
train_df = onehotencoder_carrier_vector.fit(train_df).transform(train_df)

In [0]:
#splitting training dataframe into five folds contained in dictionary "d"

d = {}
folds = ['df1','df2','df3','df4','df5']

each_len = train_df.count()/5
start = 1
val_size = each_len/5
stop = each_len
precision_list = []

for fold in folds:
    d[fold] = train_df.filter(col('Index').between(start,stop))\
                                  .withColumn('cv', F.when(col('Index').between(start,(stop-val_size)), 'train')
                                         .otherwise('val'))
    start += each_len
    stop += each_len

                                  

In [0]:
d['df1'].count()

Out[16]: 7073330

In [0]:
train_df.createOrReplaceTempView('train_view')

In [0]:
%sql 

SELECT holiday_period,mean_carrier_delay,Pagerank_Score,
 PREV_FLIGHT_DELAYED,origin_percent_delayed,
 dest_percent_delayed,
 ORIGIN_Prophet_trend,
 ORIGIN_Prophet_pred,
 DEST_Prophet_trend,
 DEST_Prophet_pred
 FROM train_view
 LIMIT 10

holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred
0,0.1235820895522388,0.003849790625422,0,0.1651651651651651,0.0,0.1773706625404781,0.1587084590387079,0.1374696268965153,0.1363514225877349
0,0.1029009304871373,0.003849790625422,0,0.1651651651651651,0.0,0.1773706625404781,0.1587084590387079,0.1374696268965153,0.1363514225877349
0,0.1153415453527435,0.003849790625422,0,0.1651651651651651,0.0,0.1773706625404781,0.1587084590387079,0.1374696268965153,0.1363514225877349
0,0.1153415453527435,0.0377002894322355,0,0.115819209039548,0.0,0.171421578526557,0.1355859881424514,0.1374696268965153,0.1363514225877349
0,0.1202003338898163,0.0377002894322355,0,0.115819209039548,0.0,0.171421578526557,0.1355859881424514,0.1374696268965153,0.1363514225877349
0,0.1046464646464646,0.0377002894322355,0,0.115819209039548,0.0,0.171421578526557,0.1355859881424514,0.1374696268965153,0.1363514225877349
0,0.1202003338898163,0.0214447464923642,1,0.1463119709794437,0.0,0.2619641219031601,0.2276439832690767,0.1374696268965153,0.1363514225877349
0,0.094298245614035,0.0214447464923642,0,0.1463119709794437,0.0,0.2619641219031601,0.2276439832690767,0.1374696268965153,0.1363514225877349
0,0.1501196172248803,0.0377002894322355,0,0.2313364055299539,0.2857142857142857,0.1718056486588808,0.1825649314159583,0.1385854239009572,0.1464927517009997
0,0.1483483483483483,0.0377002894322355,0,0.2313364055299539,0.2857142857142857,0.1718056486588808,0.1825649314159583,0.1385854239009572,0.1464927517009997


In [0]:
%sql

SELECT COUNT(*)
FROM train_view
WHERE ORIGIN_Prophet_trend IS NULL

count(1)
1201


In [0]:
display(train_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,two_hrs_pre_flight_utc,Date_Time_sched_dep_utc,Date_Time_sched_arrival_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_flight_per_day,origin_delays_per_day,dest_flight_per_day,dest_delays_per_day,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred,Index,OP_CARRIER_Index,carrier_vec
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T04:00:00.000+0000,2015-01-01T06:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N633NK,PBG,FLL,155,523,-33.0,208.0,1334.0,6,,,,,,,,,,,,,0,PBG_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.00040524111846548696,0,,,,,,,0.2705312393116827,0.3382002742993087,0.1996033439108405,0.2575085307991152,1,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T04:00:00.000+0000,2015-01-01T06:00:00.000+0000,2015-01-01T10:00:00.000+0000,UA,N37293,BQN,EWR,259,559,6.0,240.0,1585.0,7,,,,,,,,,,,,,0,BQN_EWR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.00040524111846548696,0,,,,,,,0.2124145791730478,0.2739463846862088,0.2514565729306676,0.2753549089032862,2,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N528NK,MSP,FLL,115,542,25.0,207.0,1487.0,6,,,,,,,,,,,,,0,MSP_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0055625928677562,0,,,,,,,0.1478954440091772,0.2066691885195173,0.1996033439108405,0.2575085307991152,3,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N630NK,IAG,FLL,200,504,-11.0,184.0,1176.0,5,,,,,,,,,,,,,0,IAG_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.00040524111846548696,0,,,,,,,0.1467331782770404,0.1576017018143287,0.1996033439108405,0.2575085307991152,4,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,B6,N239JB,BQN,MCO,307,500,20.0,173.0,1129.0,5,,,,,,,,,,,,,0,BQN_MCO,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.00040524111846548696,0,,,,,,,0.2124145791730478,0.2739463846862088,0.2052754459222279,0.2446643896166549,5,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T11:00:00.000+0000,DL,N3751B,SLC,ATL,40,615,-22.0,215.0,1590.0,7,,,,,,,,,,,,,0,SLC_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0201903066158751,0,,,,,,,0.1157464901981351,0.194904214257851,0.1710663133068552,0.2046958300831791,6,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,DL,N958DN,DEN,ATL,30,523,-30.0,173.0,1199.0,5,,,,,,,,,,,,,0,DEN_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0669666824524958,0,,,,,,,0.2517042427700748,0.3348671456744319,0.1710663133068552,0.2046958300831791,7,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T11:00:00.000+0000,AA,N3LAAA,SFO,DFW,30,545,-13.0,195.0,1464.0,6,,,,,,,,,,,,,0,SFO_DFW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0162774050936167,0,,,,,,,0.1999094355673047,0.2354412147722896,0.2365338872876753,0.2637300320149568,8,2.0,"Map(vectorType -> sparse, length -> 18, indices -> List(2), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T11:00:00.000+0000,NK,N606NK,PHX,DFW,159,502,1.0,123.0,868.0,4,,,,,,,,,,,,,0,PHX_DFW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0032108266918817,0,,,,,,,0.1918739827556874,0.2443394254499782,0.2365338872876753,0.2637300320149568,9,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T12:00:00.000+0000,UA,N78448,SFO,IAH,48,626,-7.0,218.0,1635.0,7,,,,,,,,,,,,,0,SFO_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0162774050936167,0,,,,,,,0.1999094355673047,0.2354412147722896,0.2187611526117484,0.2515145246424415,10,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"


In [0]:
train_df.columns

Out[26]: ['DEP_DEL15',
 'YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'two_hrs_pre_flight_utc',
 'Date_Time_sched_dep_utc',
 'Date_Time_sched_arrival_utc',
 'OP_CARRIER',
 'TAIL_NUM',
 'ORIGIN',
 'DEST',
 'CRS_DEP_TIME',
 'CRS_ARR_TIME',
 'ARR_DELAY',
 'CRS_ELAPSED_TIME',
 'DISTANCE',
 'DISTANCE_GROUP',
 'ELEVATION',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyPrecipitation',
 'HourlyRelativeHumidity',
 'HourlySeaLevelPressure',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'HourlyWindGustSpeed',
 'Route',
 'Rain',
 'Snow',
 'Thunder',
 'Fog',
 'Mist',
 'Freezing',
 'Blowing',
 'Smoke',
 'Drizzle',
 'Overcast',
 'Broken',
 'Scattered',
 'CloudySkyCondition',
 'holiday_period',
 'mean_carrier_delay',
 'Pagerank_Score',
 'PREV_FLIGHT_DELAYED',
 'origin_flight_per_day',
 'origin_delays_per_day',
 'dest_flight_per_day',
 'dest_delays_per

In [0]:
def process_fold_df(fold_df):
    
    
    #imputation
    fold_df.createOrReplaceTempView("fold_view")
    
    imputation_columns = ['CRS_ELAPSED_TIME','HourlyAltimeterSetting','HourlyDewPointTemperature',
             'HourlyDryBulbTemperature','HourlyRelativeHumidity','HourlySeaLevelPressure',
             'HourlyStationPressure','HourlyVisibility','HourlyWetBulbTemperature',
             'HourlyWindDirection','mean_carrier_delay','ORIGIN_Prophet_trend',
             'ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred',]

    means = {}

    for impute_col in imputation_columns:
        mean = spark.sql(f"SELECT AVG({impute_col}) FROM fold_view").collect()[0][0]
        means[impute_col] = mean
    
    print(means)
    
    #fill Nones and Nans - Seems to error sometimes?
    fold_df = fold_df.fillna(0,["HourlyWindGustSpeed"]) \
         .fillna(means["CRS_ELAPSED_TIME"],["CRS_ELAPSED_TIME"]) \
         .fillna(means["HourlyAltimeterSetting"],["HourlyAltimeterSetting"]) \
         .fillna(means["HourlyDewPointTemperature"],["HourlyDewPointTemperature"]) \
         .fillna(means["HourlyDryBulbTemperature"],["HourlyDryBulbTemperature"]) \
         .fillna(0,["HourlyPrecipitation"]) \
         .fillna(means["HourlyRelativeHumidity"],["HourlyRelativeHumidity"]) \
         .fillna(means["HourlySeaLevelPressure"],["HourlySeaLevelPressure"]) \
         .fillna(means["HourlyStationPressure"],["HourlyStationPressure"]) \
         .fillna(means["HourlyVisibility"],["HourlyVisibility"]) \
         .fillna(means["HourlyWetBulbTemperature"],["HourlyWetBulbTemperature"]) \
         .fillna(means["HourlyWindDirection"],["HourlyWindDirection"]) \
         .fillna(0,["HourlyWindSpeed"]) \
         .fillna("",["TAIL_NUM"])\
         .fillna(0,['holiday_period'])\
         .fillna(means['mean_carrier_delay'],['mean_carrier_delay'])\
         .fillna(0,['PREV_FLIGHT_DELAYED'])\
         .fillna(0,['origin_percent_delayed'])\
         .fillna(0,['dest_percent_delayed'])\
         .fillna(means['ORIGIN_Prophet_trend'],['ORIGIN_Prophet_trend'])\
         .fillna(means['ORIGIN_Prophet_pred'],['ORIGIN_Prophet_pred'])\
         .fillna(means['DEST_Prophet_trend'],['DEST_Prophet_trend'])\
         .fillna(means['DEST_Prophet_pred'],['DEST_Prophet_pred'])
         

    
    #vector assembler
    feature_cols = ['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','DISTANCE','HourlyWindSpeed','Rain','Blowing','Snow','Thunder','CloudySkyCondition','carrier_vec',         'holiday_period','mean_carrier_delay','Pagerank_Score','PREV_FLIGHT_DELAYED','origin_percent_delayed','dest_percent_delayed','ORIGIN_Prophet_trend','ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred']
    #assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
    #outputCol = "features"
    df_va = VectorAssembler(inputCols = feature_cols, outputCol = 'feature_vector')
    model_input = df_va.transform(fold_df)
    
    #rename delay flag to label
    model_input = model_input.withColumnRenamed("DEP_DEL15","label")
    #model_input = assemble.transform(fold_df) \
    #               .withColumnRenamed('DEP_DEL15', 'label')
    
    #scaling
    #scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
    #model_input = scaler.fit(model_input).transform(model_input)
    #model_input = model_input.select('label', 'scaled_feature_vector','cv')
    
    
    return model_input

In [0]:
d_processed = {}
for key in d.keys():
    print(key)
    d_processed[key] = process_fold_df(d[key])

df1
{'CRS_ELAPSED_TIME': 142.1289347632494, 'HourlyAltimeterSetting': 30.03658166854818, 'HourlyDewPointTemperature': 45.00853174684653, 'HourlyDryBulbTemperature': 60.174234042800215, 'HourlyRelativeHumidity': 62.37595780260503, 'HourlySeaLevelPressure': 30.025573714624038, 'HourlyStationPressure': 29.139284746052244, 'HourlyVisibility': 9.306117444800673, 'HourlyWetBulbTemperature': 52.37836194809699, 'HourlyWindDirection': 171.99281838703556, 'mean_carrier_delay': 0.1791214051393481, 'ORIGIN_Prophet_trend': 0.18260345473578715, 'ORIGIN_Prophet_pred': 0.1807998984156776, 'DEST_Prophet_trend': 0.18257519835982852, 'DEST_Prophet_pred': 0.18077420854193052}
df2
{'CRS_ELAPSED_TIME': 145.44301015221967, 'HourlyAltimeterSetting': 30.01590338118016, 'HourlyDewPointTemperature': 49.01968432582812, 'HourlyDryBulbTemperature': 65.21155510423971, 'HourlyRelativeHumidity': 61.395272204113596, 'HourlySeaLevelPressure': 29.998395627789762, 'HourlyStationPressure': 29.109984516281322, 'HourlyVisibi

In [0]:
display(d_processed['df1'])

label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,two_hrs_pre_flight_utc,Date_Time_sched_dep_utc,Date_Time_sched_arrival_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_flight_per_day,origin_delays_per_day,dest_flight_per_day,dest_delays_per_day,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred,Index,OP_CARRIER_Index,carrier_vec,cv,feature_vector,scaled_feature_vector
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T04:00:00.000+0000,2015-01-01T06:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N633NK,PBG,FLL,155,523,-33.0,208.0,1334.0,6,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,PBG_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1791213240972651,0.00040524111846548696,0,,,,,0.0,0.0,0.2705312393116827,0.3382002742993087,0.1996033439108405,0.2575085307991152,1,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 29, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1334.0, 1.0, 0.17912132409726514, 4.0524111846548696E-4, 0.2705312393116827, 0.3382002742993087, 0.19960334391084059, 0.2575085307991152))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 29, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.196688815059387, 6.993234662116104, 1.9267133475511355, 0.013415414940534748, 7.269152411971436, 6.253473419473737, 5.362489900811254, 4.761065819241783))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T04:00:00.000+0000,2015-01-01T06:00:00.000+0000,2015-01-01T10:00:00.000+0000,UA,N37293,BQN,EWR,259,559,6.0,240.0,1585.0,7,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,BQN_EWR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1791213240972651,0.00040524111846548696,0,,,,,0.0,0.0,0.2124145791730478,0.2739463846862088,0.2514565729306676,0.2753549089032862,2,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 14, 29, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1585.0, 1.0, 0.17912132409726514, 4.0524111846548696E-4, 0.2124145791730478, 0.27394638468620885, 0.2514565729306676, 0.27535490890328623))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 14, 29, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.6100088244896016, 3.5068941508943214, 1.9267133475511355, 0.013415414940534748, 5.707562477672714, 5.065390436318862, 6.755564843821634, 5.091026852087401))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N528NK,MSP,FLL,115,542,25.0,207.0,1487.0,6,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,MSP_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0055625928677562,0,,,,,0.0,0.0,0.1478954440091772,0.2066691885195173,0.1996033439108405,0.2575085307991152,3,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1487.0, 1.0, 0.005562592867756299, 0.14789544400917728, 0.20666918851951732, 0.19960334391084059, 0.2575085307991152))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.448632884552705, 6.993234662116104, 0.18414837010811727, 3.9739385598285364, 3.821405170970466, 5.362489900811254, 4.761065819241783))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T11:00:00.000+0000,DL,N3751B,SLC,ATL,40,615,-22.0,215.0,1590.0,7,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,SLC_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1791213240972651,0.0201903066158751,0,,,,,0.0,0.0,0.1157464901981351,0.194904214257851,0.1710663133068552,0.2046958300831791,4,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 29, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1590.0, 1.0, 0.17912132409726514, 0.020190306615875182, 0.11574649019813513, 0.19490421425785107, 0.1710663133068552, 0.20469583008317915))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 29, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.6182422908129124, 2.765862917623, 1.9267133475511355, 0.6683955025448827, 3.1100987839398395, 3.603865566726943, 4.595821690676614, 3.784613724935659))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N630NK,IAG,FLL,200,504,-11.0,184.0,1176.0,5,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,IAG_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.00040524111846548696,0,,,,,0.0,0.0,0.1467331782770404,0.1576017018143287,0.1996033439108405,0.2575085307991152,5,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1176.0, 1.0, 4.0524111846548696E-4, 0.1467331782770404, 0.15760170181432878, 0.19960334391084059, 0.2575085307991152))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 1.936511279242758, 6.993234662116104, 0.013415414940534748, 3.9427085740730625, 2.9141255287318533, 5.362489900811254, 4.761065819241783))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,DL,N958DN,DEN,ATL,30,523,-30.0,173.0,1199.0,5,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,DEN_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1791213240972651,0.0669666824524958,0,,,,,0.0,0.0,0.2517042427700748,0.3348671456744319,0.1710663133068552,0.2046958300831791,6,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 29, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1199.0, 1.0, 0.17912132409726514, 0.06696668245249589, 0.25170424277007486, 0.3348671456744319, 0.1710663133068552, 0.20469583008317915))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 29, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 1.9743852243299886, 2.765862917623, 1.9267133475511355, 2.2169167721507312, 6.763272545125697, 6.191842389450066, 4.595821690676614, 3.784613724935659))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,B6,N239JB,BQN,MCO,307,500,20.0,173.0,1129.0,5,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,BQN_MCO,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1791213240972651,0.00040524111846548696,0,,,,,0.0,0.0,0.2124145791730478,0.2739463846862088,0.2052754459222279,0.2446643896166549,7,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 16, 29, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1129.0, 1.0, 0.17912132409726514, 4.0524111846548696E-4, 0.2124145791730478, 0.27394638468620885, 0.20527544592222793, 0.24466438961665493))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 16, 29, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 1.859116695803634, 4.930648252844006, 1.9267133475511355, 0.013415414940534748, 5.707562477672714, 5.065390436318862, 5.514875072103887, 4.523590962111585))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T11:00:00.000+0000,NK,N635NK,LAS,MSP,25,526,-17.0,181.0,1299.0,6,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,LAS_MSP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0248220783707261,0,,,,,0.0,0.0,0.2089042217667012,0.2619873703617896,0.1478954440091772,0.2066691885195173,8,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1299.0, 1.0, 0.024822078370726145, 0.20890422176670123, 0.26198737036178965, 0.14789544400917728, 0.20666918851951732))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 19, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.1390545507962098, 6.993234662116104, 0.8217292516877809, 5.6132394594802495, 4.844262945054099, 3.973319330909949, 3.821099076978972))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T12:00:00.000+0000,DL,N320US,SLC,ATL,140,715,10.0,215.0,1590.0,7,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,SLC_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0201903066158751,0,,,,,0.0,0.0,0.1157464901981351,0.194904214257851,0.1710663133068552,0.2046958300831791,9,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1590.0, 1.0, 0.020190306615875182, 0.11574649019813513, 0.19490421425785107, 0.1710663133068552, 0.20469583008317915))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.6182422908129124, 2.765862917623, 0.6683955025448827, 3.1100987839398395, 3.603865566726943, 4.595821690676614, 3.784613724935659))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T12:00:00.000+0000,DL,N3730B,SFO,MSP,25,602,8.0,217.0,1589.0,7,,30.03658066077159,45,60,0.0,62,30.025573795875868,29.139292838905504,9.306111209209847,52,171,0,0,SFO_MSP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0162774050936167,0,,,,,0.0,0.0,0.1999094355673047,0.2354412147722896,0.1478954440091772,0.2066691885195173,10,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 30, 34, 35, 36, 37), values -> List(1.0, 1.0, 4.0, 1589.0, 1.0, 0.016277405093616745, 0.19990943556730476, 0.2354412147722896, 0.14789544400917728, 0.20666918851951732))","Map(vectorType -> sparse, length -> 38, indices -> List(0, 1, 2, 3, 11, 30, 34, 35, 36, 37), values -> List(0.28291027323636736, 0.11328696931622385, 2.011378284527651, 2.61659559754825, 2.765862917623, 0.5388597887423908, 5.371550285383871, 4.3534127270520955, 3.973319330909949, 3.821099076978972))"


### Model Building

In [0]:
%run "/Shared/w261_Section4_Group2/Phase 3/custom_cv_module"

#### Logistic Regression

In [0]:
# set up grid search: estimator, set of params, and evaluator
logistic_model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
grid = ParamGridBuilder()\
            .addGrid(logistic_model.threshold, [0.3,0.5,0.8])\
            .build() 

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)

In [0]:
# run cross validation & return the crossvalidation F0.5 score for 'test' set
cv = CustomCrossValidator(estimator=logistic_model, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=10)

In [0]:
cvModel = cv.fit(d_processed)

fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
fold 3 start...
fold 3 end
fold 4 start...
fold 4 end
fold 5 start...
fold 5 end
Best Model:  {Param(parent='LogisticRegression_f3b199003b55', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.5} Detailed Score [0.9397630019749837, 0.9376867896401635, 0.9404750051115456, 0.9354116302191581, 0.9252390142585548] Avg Score 0.9357150882408812


In [0]:
#for individual testing

#test_train = d_processed['df1'].filter(col('cv')=='train')
#test_val = d_processed['df1'].filter(col('cv')=='val')

#test_logistic_model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
#evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
#lrModel = test_logistic_model.fit(test_train)
#predictions = lrModel.transform(test_val)
#evaluator.evaluate(predictions)

In [0]:
# make predictions
predictions = cvModel.transform(d_processed['df1'])

display(predictions.groupby('label', 'prediction').count())

label,prediction,count
0.0,0.0,5798203
1.0,0.0,308417
1.0,1.0,962546
0.0,1.0,4164


In [0]:
fbeta = cvModel.avgMetrics[0]
print(f"Logistic Regression F0.5 Score: {fbeta}")

Logistic Regression F0.5 Score: 0.9302701606788579


In [0]:
# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

In [0]:
#string indexing of carrier for train
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
indexer_transformer = carrier_indexer.setHandleInvalid("keep").fit(train_df)
train_df = indexer_transformer.transform(train_df)

#one hot encoding for train
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
onehotencoder_transformer = onehotencoder_carrier_vector.fit(train_df)
train_df = onehotencoder_transformer.transform(train_df)

In [0]:
display(train_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,two_hrs_pre_flight_utc,Date_Time_sched_dep_utc,Date_Time_sched_arrival_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_flight_per_day,origin_delays_per_day,dest_flight_per_day,dest_delays_per_day,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred,OP_CARRIER_Index,carrier_vec
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T15:00:00.000+0000,2015-03-16T17:00:00.000+0000,2015-03-16T19:00:00.000+0000,OO,N8968E,DTW,ABE,1355,1521,-5.0,86.0,425.0,2,192.3,29.92,37.0,57,0.0,47.0,29.92,29.21,10.0,47.0,250.0,8,0,DTW_ABE,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0.1235820895522388,0.003849790625422,0,333,55.0,8.0,0.0,0.1651651651651651,0.0,0.1773706625404781,0.1587084590387079,0.1374696268965153,0.1363514225877349,3.0,"Map(vectorType -> sparse, length -> 19, indices -> List(3), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T22:00:00.000+0000,2015-03-17T00:00:00.000+0000,2015-03-17T01:00:00.000+0000,EV,N981EV,DTW,ABE,2026,2150,-19.0,84.0,425.0,2,192.3,29.8,45.0,58,0.0,62.0,29.8,29.09,10.0,51.0,160.0,8,0,DTW_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1029009304871373,0.003849790625422,0,333,55.0,8.0,0.0,0.1651651651651651,0.0,0.1773706625404781,0.1587084590387079,0.1374696268965153,0.1363514225877349,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T12:00:00.000+0000,2015-03-16T14:00:00.000+0000,2015-03-16T15:00:00.000+0000,EV,N926EV,DTW,ABE,1005,1130,-4.0,85.0,425.0,2,192.3,29.92,37.0,46,0.0,71.0,29.92,29.21,10.0,42.0,220.0,7,0,DTW_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1153415453527435,0.003849790625422,0,333,55.0,8.0,0.0,0.1651651651651651,0.0,0.1773706625404781,0.1587084590387079,0.1374696268965153,0.1363514225877349,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T12:00:00.000+0000,2015-03-16T14:00:00.000+0000,2015-03-16T16:00:00.000+0000,EV,N837AS,ATL,ABE,1026,1222,13.0,116.0,692.0,3,307.8,30.2,40.0,51,0.0,66.0,30.19,29.1,10.0,46.0,300.0,7,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.1153415453527435,0.0377002894322355,0,1062,123.0,8.0,0.0,0.115819209039548,0.0,0.171421578526557,0.1355859881424514,0.1374696268965153,0.1363514225877349,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T16:00:00.000+0000,2015-03-16T18:00:00.000+0000,2015-03-16T20:00:00.000+0000,EV,N878AS,ATL,ABE,1457,1653,14.0,116.0,692.0,3,307.8,30.19,31.0,76,0.0,19.0,30.17,29.09,10.0,54.0,340.0,6,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1202003338898163,0.0377002894322355,0,1062,123.0,8.0,0.0,0.115819209039548,0.0,0.171421578526557,0.1355859881424514,0.1374696268965153,0.1363514225877349,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T22:00:00.000+0000,2015-03-17T00:00:00.000+0000,2015-03-17T02:00:00.000+0000,DL,N919AT,ATL,ABE,2046,2243,4.0,117.0,692.0,3,307.8,30.06,24.0,80,0.0,13.0,30.05,28.96,10.0,54.0,240.0,5,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.1046464646464646,0.0377002894322355,0,1062,123.0,8.0,0.0,0.115819209039548,0.0,0.171421578526557,0.1355859881424514,0.1374696268965153,0.1363514225877349,1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))"
1.0,2015,1,3,16,1,2015-03-16,2015-03-16T16:00:00.000+0000,2015-03-16T18:00:00.000+0000,2015-03-16T20:00:00.000+0000,EV,N16987,ORD,ABE,1326,1618,39.0,112.0,654.0,3,201.8,29.91,36.0,60,0.0,41.0,29.91,29.19,10.0,48.0,230.0,7,0,ORD_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1202003338898163,0.0214447464923642,1,827,121.0,8.0,0.0,0.1463119709794437,0.0,0.2619641219031601,0.2276439832690767,0.1374696268965153,0.1363514225877349,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-17T00:00:00.000+0000,2015-03-17T02:00:00.000+0000,2015-03-17T03:00:00.000+0000,EV,N17159,ORD,ABE,2100,2353,-12.0,113.0,654.0,3,201.8,29.77,47.0,68,0.0,47.0,29.76,29.05,10.0,57.0,250.0,11,0,ORD_ABE,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0.094298245614035,0.0214447464923642,0,827,121.0,8.0,0.0,0.1463119709794437,0.0,0.2619641219031601,0.2276439832690767,0.1374696268965153,0.1363514225877349,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,2,6,4,4,2015-06-04,2015-06-04T16:00:00.000+0000,2015-06-04T18:00:00.000+0000,2015-06-04T20:00:00.000+0000,EV,N881AS,ATL,ABE,1444,1652,-11.0,128.0,692.0,3,307.8,30.08,63.0,73,0.0,71.0,30.06,28.98,10.0,67.0,100.0,7,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.1501196172248803,0.0377002894322355,0,1085,251.0,7.0,2.0,0.2313364055299539,0.2857142857142857,0.1718056486588808,0.1825649314159583,0.1385854239009572,0.1464927517009997,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,2,6,4,4,2015-06-04,2015-06-04T12:00:00.000+0000,2015-06-04T14:00:00.000+0000,2015-06-04T16:00:00.000+0000,EV,N856AS,ATL,ABE,1031,1238,-1.0,127.0,692.0,3,307.8,30.08,60.0,65,0.0,84.0,30.06,28.98,10.0,62.0,70.0,13,0,ATL_ABE,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.1483483483483483,0.0377002894322355,0,1085,251.0,7.0,2.0,0.2313364055299539,0.2857142857142857,0.1718056486588808,0.1825649314159583,0.1385854239009572,0.1464927517009997,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"


In [0]:
#string indexing of carrier for test
#one hot encoding for test
test_df = indexer_transformer.transform(test_df)
test_df = onehotencoder_transformer.transform(test_df)

In [0]:
#cast holiday to integer
train_df = train_df.withColumn("holiday_period", train_df["holiday_period"].cast(IntegerType()))
test_df = test_df.withColumn("holiday_period", test_df["holiday_period"].cast(IntegerType()))

In [0]:
processed_train_df = process_fold_df(train_df)

#scale to train on train set
scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
scaler_transformer = scaler.fit(processed_train_df)
processed_train_df = scaler_transformer.transform(processed_train_df)

processed_test_df = process_fold_df(test_df)
#scale to train on test set
processed_test_df = scaler_transformer.transform(processed_test_df)

{'CRS_ELAPSED_TIME': 142.19873196344375, 'HourlyAltimeterSetting': 30.033807209267266, 'HourlyDewPointTemperature': 47.031581851427795, 'HourlyDryBulbTemperature': 62.17946108108702, 'HourlyRelativeHumidity': 62.826167693631106, 'HourlySeaLevelPressure': 30.020612570542625, 'HourlyStationPressure': 29.138857040193923, 'HourlyVisibility': 9.348308673776328, 'HourlyWetBulbTemperature': 54.20766773299253, 'HourlyWindDirection': 172.5585708721293, 'mean_carrier_delay': 0.16986784944051236, 'ORIGIN_Prophet_trend': 0.1700375311510371, 'ORIGIN_Prophet_pred': 0.16868667812184251, 'DEST_Prophet_trend': 0.17001660082758124, 'DEST_Prophet_pred': 0.16866696229568484}
{'CRS_ELAPSED_TIME': 142.7358402172544, 'HourlyAltimeterSetting': 30.028331599000225, 'HourlyDewPointTemperature': 48.10941381755681, 'HourlyDryBulbTemperature': 64.10941143119706, 'HourlyRelativeHumidity': 61.50895857992817, 'HourlySeaLevelPressure': 30.011542257892003, 'HourlyStationPressure': 29.048171030880972, 'HourlyVisibility':

In [0]:
final_logistic_model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)
lrModel = final_logistic_model.fit(processed_train_df)

In [0]:
predictions = lrModel.transform(processed_test_df)

In [0]:
display(predictions.groupby('label', 'prediction').count())

label,prediction,count
1.0,1.0,720042
0.0,1.0,575
1.0,0.0,298661
0.0,0.0,4840028


In [0]:
evaluator.evaluate(predictions)

Out[119]: 0.9228536764986718

In [0]:
feature_cols = ['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','DISTANCE','HourlyWindSpeed','Rain','Blowing','Snow','Thunder','CloudySkyCondition','carrier_vec',         'holiday_period','mean_carrier_delay','Pagerank_Score','PREV_FLIGHT_DELAYED','origin_percent_delayed','dest_percent_delayed','ORIGIN_Prophet_trend','ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred']

In [0]:
lrModel.coefficients

Out[127]: DenseVector([-0.0006, -0.0092, 0.0395, 0.1734, 0.0164, 0.0814, 0.0086, 0.0913, 0.107, 0.0472, -0.2329, -0.0026, 0.0887, 0.0624, 0.0885, 0.052, 0.0134, 0.0045, 0.0157, -0.0159, -0.0366, 0.0102, 0.0205, 0.0076, 0.0568, 0.0046, -0.0134, -0.0169, 0.004, 0.008, 0.2697, 0.0176, 3.6122, 0.0815, 0.0642, 0.1275, 0.0759, -0.0235, 0.0104])

#### Random Forest

In [0]:
# set up grid search: estimator, set of params, and evaluator
rf_model = RandomForestClassifier(labelCol="label", featuresCol="scaled_feature_vector")
grid = ParamGridBuilder()\
            .addGrid(rf_model.maxDepth, [5,10])\
            .addGrid(rf_model.numTrees, [32,64,128])\
            .build()

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5,metricLabel=1)

In [0]:
cv_rf = CustomCrossValidator(estimator=rf_model, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=10)

In [0]:
cvModel_rf1 = cv_rf.fit(d_processed)

fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
fold 3 start...
fold 3 end
fold 4 start...
fold 4 end
fold 5 start...
fold 5 end
Best Model:  {Param(parent='RandomForestClassifier_b175a3cfe482', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 10, Param(parent='RandomForestClassifier_b175a3cfe482', name='numTrees', doc='Number of trees to train (>= 1).'): 64} Detailed Score [0.011205522409182662, 0.015062820133580356, 0.05309732010543106, 0.051756398425788476, 6.787373313677099e-06] Avg Score 0.02622576968945925


In [0]:
# make predictions
predictions_rf = cvModel_rf1.transform(d_processed['df1'])

display(predictions_rf.groupby('label', 'prediction').count())

label,prediction,count
1.0,1.0,2
0.0,1.0,6
1.0,0.0,1275194
0.0,0.0,5817106


In [0]:
fbeta = cvModel_rf1.avgMetrics[0]
print(f"Random Forest F0.5 Score: {fbeta}")

Random Forest F0.5 Score: 0.02622576968945925


### Advanced Modeling - May Not Use Whats Below This!

In [0]:
print(processed_train_df)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975185>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mprint[0m[0;34m([0m[0mprocessed_train_df[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mNameError[0m: name 'processed_train_df' is not defined

In [0]:
#Multi Layer Perceptron Grid Search Hyperparameter selection

# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

# set up grid search: estimator, set of params, and evaluator
MLPC_model = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector")
grid = ParamGridBuilder()\
            .addGrid(MLPC_model.maxIter, [50,100,200])\
            .addGrid(MLPC_model.layers, [[38,26,2],[38,26,26,2]])\
            .addGrid(MLPC_model.blockSize, [32, 64])\
            .addGrid(MLPC_model.solver, ['gd', 'l-bfgs'] )
            .build() 

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)

# run cross validation & return the crossvalidation F0.5 score for 'validation' set
cv = CustomCrossValidator(estimator=logistic_model, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=10)

#run to select best model
cvModel = cv.fit(d_processed)


In [0]:
#Neural Network (MLPC) -

#train models with Multi Layer Neural Perceptron
def fit_MLPC_model(df, blockSize=128, seed=1234, layers = [4, 5, 4, 2], maxIter = 10):
    MLPC = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector", maxIter=maxIter, layers=layers, blockSize=blockSize, seed=seed)
    lrn = MLPC.fit(df)
    return lrn

#return model results of each fold
MLPC_models = {}
for key in d_processed.keys():
    print(key)
    MLPC_models[key] = fit_MLPC_model(d_processed[key])
    result = MLPC_models[key].transform(d_processed[key])
    result.show(10)

In [0]:
#XGBoost - Needs to use larger dataset to work

#train models with XGBoost
def fit_xgboost_model(df, maxIter=10):
    xg = GBTClassifier(labelCol="label", featuresCol="feature_vector", maxIter=maxIter)
    lrn = xg.fit(df)
    return lrn

#return model results of each fold
xgboost_models = {}
for key in d_processed.keys():
    print(key)
    xgboost_models[key] = fit_xgboost_model(d_processed[key])
    lrn_summary = xgboost_models[key].summary
    display(lrn_summary.predictions)