In [0]:
from datetime import datetime, timedelta

import airporttime
import numpy as np
import pyspark
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
from pyspark.ml.feature import Bucketizer
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.mllib.stat import Statistics
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, BooleanType, IntegerType
from sklearn.metrics import confusion_matrix
from sparkdl.xgboost import XgboostRegressor

#### Configure access to the Azure Blob Storage container used to store large datasets for quick access

In [0]:
# --- Azure Blob Storage / Databricks secret configuration ---

# Container name, as created in https://portal.azure.com
blob_container = "w261-scrr"
# Storage account name, as created in https://portal.azure.com
storage_account = "midsw261rv"
# Secret scope and secret key names, as created locally with the Databricks CLI
secret_scope = "w261scrr"
secret_key = "w261scrrkey"

# Root wasbs:// URL that the parquet reads in this notebook are prefixed with
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
# Mount path (not referenced anywhere else in the visible part of this notebook)
mount_path = "/mnt/mids-w261"

In [0]:
# Spark configuration key under which the container's SAS credential is registered
sas_conf_key = f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net"

# The secret is fetched inline so it is never bound to a notebook-level variable
spark.conf.set(sas_conf_key, dbutils.secrets.get(scope=secret_scope, key=secret_key))

#### Running Libs with Helper Functions

In [0]:
%run "../libs/weather_aggregation"

In [0]:
%run "../libs/time_based_features"

In [0]:
%run "../libs/transform"

In [0]:
%run "../libs/custom_cv"

In [0]:
%run "../libs/model_helper_functions"

In [0]:
%run "../libs/error_analysis"

#### Import joined data

In [0]:
# Parquet locations of the joined train and test datasets in blob storage
train_join_path = f"{blob_url}/join_full_0329"
test_join_path = f"{blob_url}/test_full_join_0404"

df_train = spark.read.parquet(train_join_path)
df_test = spark.read.parquet(test_join_path)

In [0]:
#df_train.count() #506,745,726

In [0]:
# display(df_test)

ACTUAL_ELAPSED_TIME,AIR_TIME,ARR_DEL15,ARR_DELAY,ARR_DELAY_GROUP,ARR_DELAY_NEW,ARR_TIME,ARR_TIME_BLK,CARRIER_DELAY,CRS_ARR_TIME,CRS_DEP_TIME,CRS_ELAPSED_TIME,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_DELAY,DEP_DELAY_GROUP,DEP_DELAY_NEW,DEP_TIME,DEP_TIME_BLK,DEST,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,DISTANCE,DISTANCE_GROUP,FL_DATE,MONTH,NAS_DELAY,OP_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,QUARTER,SECURITY_DELAY,TAIL_NUM,YEAR,TIMESTAMP,TIMESTAMP_UTC,WEATHER_WINDOW_START,WEATHER_WINDOW_END,iata_code,ident,elevation_ft,coordinates,station_id,lat,lon,neighbor_id,neighbor_name,neighbor_state,neighbor_call,distance_to_neighbor,dist_to_airport_rank,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND_DirectionAngle,WND_DirectionQuality,WND_Type,WND_Speed,WND_SpeedQuality,CIG_CeilingHeightDim,CIG_CeilingQuality,CIG_CeilingDetermination,CIG_CeilingAndVisibilityOK,VIS_Horizontal,VIS_DistanceQuality,VIS_Variability,VIS_VariabilityQuality,TMP_Value,TMP_Quality,DEW_Value,DEW_Quality,SLP_Value,SLP_Quality,AA1_RainCondition,AA1_RainQuality,AA2_RainCondition,AA2_RainQuality,AA3_RainCondition,AA3_RainQuality,AA4_RainCondition,AA4_RainQuality,AJ1_SnowDepth,AJ1_SnowDepthCondition,AJ1_SnowDepthQuality,AJ1_SnowEqWaterDepth,AJ1_SnowEqWaterDepthCondition,AJ1_SnowEqWaterDepthQuality,AL1_SnowAccumCondition,AL1_SnowAccumQuality,AL2_SnowAccumCondition,AL2_SnowAccumQuality,AL3_SnowAccumDuration,AL3_SnowAccumDepth,AL3_SnowAccumCondition,AL3_SnowAccumQuality,AW1_PresentWeatherCond,AW1_PresentWeatherQuality,AW2_PresentWeatherCond,AW2_PresentWeatherQuality,AW3_PresentWeatherCond,AW3_PresentWeatherQuality,AW4_PresentWeatherCond,AW4_PresentWeatherQuality,AA_RainDepth,AL_SnowAccumDepth,AA_RainDuration,AL_
SnowAccumDuration,DATE_UTC
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T09:58:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,320.0,1,N,46.0,1,1189.0,1,M,N,16093.0,1,N,1,189.0,1,139.0,1,,1,,,,,,,,,,,,,,,,,,,,,,,12.0,1.0,,,,,,,,,,,2019-05-17T13:58:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,320.0,1,N,26.0,1,732.0,1,M,N,4023.0,1,N,1,161.0,1,156.0,1,10133.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,63.0,1.0,90.0,1.0,95.0,1.0,,,,,2019-05-17T14:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T09:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,20.0,1,N,62.0,1,1829.0,1,M,N,16093.0,1,N,1,189.0,1,139.0,1,10107.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T13:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T09:00:00.000+0000,4,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-12,99999,V020,200.0,1,N,46.0,1,,1,,N,16000.0,1,,1,183.0,1,133.0,1,10091.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T13:00:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T11:17:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,240.0,1,N,15.0,1,1829.0,1,M,N,12875.0,1,N,1,167.0,1,156.0,1,,1,3.0,1.0,,,,,,,,,,,,,,,,,,,,,61.0,1.0,,,,,,,,,,,2019-05-17T15:17:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T11:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,,1,C,0.0,1,1981.0,1,M,N,16093.0,1,N,1,167.0,1,156.0,1,10129.0,1,9.0,1.0,9.0,1.0,9.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T15:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:40:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,330.0,1,N,51.0,1,335.0,1,M,N,2816.0,1,N,1,167.0,1,156.0,1,,1,3.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,63.0,1.0,90.0,1.0,95.0,1.0,,,,,2019-05-17T14:40:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T08:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,200.0,1,N,46.0,1,3048.0,1,M,N,16093.0,1,N,1,183.0,1,133.0,1,10091.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T12:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:08:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,10.0,1,N,93.0,1,1250.0,1,M,N,8047.0,1,N,1,189.0,1,144.0,1,,1,2.0,1.0,,,,,,,,,,,,,,,,,,,,,61.0,1.0,90.0,1.0,92.0,1.0,,,,,,,2019-05-17T14:08:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:16:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,360.0,1,N,41.0,1,1280.0,1,M,N,4023.0,1,N,1,167.0,1,150.0,1,,1,3.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,63.0,1.0,90.0,1.0,95.0,1.0,,,,,2019-05-17T14:16:00.000+0000


In [0]:
# After aggregation - DF_AGG.count() #23,977,618 

#NEED to check
# airlines train count = 23,792,538



### Cross Validation

In [0]:
# Transform the data and save it - run this once

# trainsplits, valsplits = Split4year5Fold(df_train)

# for i, val_train in enumerate(trainsplits):
  
#   df_train_split = aggregate_weather_reports(val_train)
#   df_val_split = aggregate_weather_reports(valsplits[i])
  
#   df_train_split = get_transformed_df(df_train_split)
#   df_val_split = get_transformed_df(df_val_split)
  
#   df_train_split = add_previous_flight_delay_indicator(df_train_split)
#   df_val_split = add_previous_flight_delay_indicator(df_val_split)
  
#   df_train_split.write.parquet(f"{blob_url}/cv_train_0402_split"+str(i))
#   df_val_split.write.parquet(f"{blob_url}/cv_val_0402_split"+str(i))
  
  
  

In [0]:
# Load the pre-computed cross-validation folds (train/validation pairs) from blob storage
df_train_split, df_val_split = [], []

# There are 5 folds, stored as <prefix>0 ... <prefix>4
for fold_idx in range(5):
  fold_suffix = str(fold_idx)
  train_path = f"{blob_url}/" + "cv_train_0407_split" + fold_suffix
  val_path = f"{blob_url}/" + "cv_val_0407_split" + fold_suffix

  df_train_split.append(spark.read.parquet(train_path))
  df_val_split.append(spark.read.parquet(val_path))

In [0]:
def preprocess(df):
  """
  Fill nulls and pre-process columns of a Spark DataFrame so it can be passed into model training and evaluation.

  Steps, in order:
  1. Nulls in 'CIG_CeilingHeightDim_median' and 'VIS_Horizontal_median' are replaced with the sentinel 999999.
  2. Nulls in the rain / snow / wind-speed / temperature / pressure columns listed below are replaced with 0.
  3. An 'ORIGIN_DEST_COMBO' column ("<ORIGIN>-<DEST>") is added.
  4. `target_mean_encoding` is applied to ORIGIN, DEST and ORIGIN_DEST_COMBO with target 'DEP_DEL15'.
  5. DAY_OF_WEEK and MONTH are cast to strings, and ORIGIN, DEST and ORIGIN_DEST_COMBO are dropped.

  Inputs:
  - `df`: Spark DataFrame of joined flight, weather station, and weather observation data. It must contain
    ORIGIN, DEST, DAY_OF_WEEK and MONTH; 'DEP_DEL15' is handed to `target_mean_encoding` as the target.
  Outputs:
  - `df`: Pre-processed Spark DataFrame, ready for use in our models.

  NOTE(review): `target_mean_encoding` is defined outside this notebook. If it derives the encoding from the
  frame it is given, calling this function on a validation/test frame uses that frame's own labels - confirm.
  """
  # Sentinel for missing ceiling height / horizontal visibility
  df = df.fillna(999999, subset=['CIG_CeilingHeightDim_median', 'VIS_Horizontal_median'])

  # Note from Ruth: For 'WND_Speed_mean','TMP_Value_mean','SLP_Value_mean', the nulls in this column have already been
  # filled with the group mean (doesn't make sense to fill these with 0) in Carolina's transformation step.
  # NOTE(review): despite the note above, 'WND_Speed_mean', 'TMP_Value' and 'SLP_Value_mean' are still part of the
  # zero-fill list below; confirm whether they should be removed from it.
  df = df.fillna(0, subset=['AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration_mean', 'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value', 'SLP_Value_mean'])

  # `F.col` is used instead of a bare `col`: this notebook imports `pyspark.sql.functions as F` but never imports `col`
  df = df.withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit('-'), F.col("DEST")))

  df = target_mean_encoding(df, col=['ORIGIN', 'DEST','ORIGIN_DEST_COMBO'], target='DEP_DEL15')

  # String-typed columns are the ones later picked up as categorical features
  df = df.withColumn("DAY_OF_WEEK", F.col("DAY_OF_WEEK").cast(StringType())) \
                          .withColumn("MONTH", F.col("MONTH").cast(StringType())) \
                          .drop('ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO')

  return df

In [0]:
# select the columns we'll be using for training. This is so that we can choose columns for model and record scores.

# All columns = ['DEP_DEL15', 'CRS_DEP_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'ARR_DEL15', 'ARR_DELAY', 'ARR_DELAY_GROUP', 'ARR_DELAY_NEW', 'ARR_TIME', 'CRS_ELAPSED_TIME', \
#                   'DEP_DELAY', 'DEP_DELAY_GROUP', 'DEP_DELAY_NEW', 'DEP_TIME', 'DEP_TIME_BLK', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_NAME', 'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_WAC', \
#                   'OP_CARRIER_AIRLINE_ID', 'OP_UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_STATE_FIPS', 'ORIGIN_STATE_NM', 'ORIGIN_WAC', 'ORIGIN_CITY_NAME', 'DAY_OF_WEEK',   \
#                   'DISTANCE', 'DISTANCE_GROUP', 'ORIGIN_STATE_ABR', 'FL_DATE', 'WEATHER_WINDOW_START', 'WEATHER_WINDOW_END', 'TIMESTAMP_UTC', 'NAS_DELAY', 'OP_CARRIER', 'SECURITY_DELAY', 'CARRIER_DELAY',  \
#                   'OP_CARRIER_FL_NUM', 'TAIL_NUM', 'TIMESTAMP', 'YEAR', 'MONTH', 'DAY_OF_MONTH', 'ORIGIN', 'DEST', 'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID', 'DEST_STATE_ABR', 'AA_RainDepth','AA_RainDuration', \ 
#                   'AL_SnowAccumDuration', 'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth', 'VIS_Variability', 'WND_Type', 'WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean', 'CIG_CeilingHeightDim_median', \ 
#                   'VIS_Horizontal_median', 'WND_DirectionAngle_median', 'DEW_Value_median', 'weather_condition', 'PREV_DEP_DEL15']

# flights only
# selected_cols = ['DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST']

# flights + weather
# selected_cols = ['DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST', \
#                   'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration', \
#                   'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean' ]

# flights + weather + time based attribute
selected_cols = [
  'DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST',
  'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth', 'AA_RainDuration', 'AL_SnowAccumDuration_mean',
  'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth', 'WND_Speed_mean', 'TMP_Value', 'SLP_Value_mean',
  'PREV_DEP_DEL15',
]

# Preprocess the first CV training fold only to learn the post-processing column names and types
df_temp = preprocess(df_train_split[0].select(*selected_cols))

# Label, categorical (string-typed) and numeric (every other non-label column) inputs for the pipeline
labelCol = ['DEP_DEL15']

categoricalColumns = [name for name, dtype in df_temp.dtypes if dtype == 'string']

numericCols = [name for name, dtype in df_temp.dtypes if dtype != 'string']

# The label is numeric, so it has to be taken back out of the feature list
numericCols.remove(labelCol[0])

In [0]:
# display(df_temp)

DEP_DEL15,OP_UNIQUE_CARRIER,DAY_OF_WEEK,DISTANCE,DISTANCE_GROUP,MONTH,CIG_CeilingHeightDim_median,VIS_Horizontal_median,AA_RainDepth,AA_RainDuration,AL_SnowAccumDuration,AL_SnowAccumDepth,AJ1_SnowDepth,AJ1_SnowEqWaterDepth,WND_Speed_mean,TMP_Value_mean,SLP_Value_mean,PREV_DEP_DEL15,ORIGIN_mean_encoding,DEST_mean_encoding,ORIGIN_DEST_COMBO_mean_encoding,classWeights
0.0,US,4,992.0,4,1,1263.92,10298.53,2.39,3.19,0.0,0,0.0,0.0,9.859999656677246,217.0,10205.1904296875,0.0,0.2089608241092899,0.2022562904926924,0.1948424068767908,0.1922394872779164
1.0,US,4,920.0,4,1,22000.0,16078.99,0.0,1.0,0.0,0,0.0,0.0,77.12999725341797,3.5799999237060547,10230.6796875,0.0,0.1932316258770857,0.2041274635071824,0.1746522411128284,0.8077605127220836
0.0,US,5,507.0,3,1,5439.91,16083.32,0.0,1.0,0.0,0,0.0,0.0,25.11000061035156,187.94000244140625,10231.7998046875,0.0,0.1943438256658595,0.1364126327266419,0.1243611584327086,0.1922394872779164
0.0,US,6,541.0,3,1,1324.45,13512.06,12.53,4.32,0.0,0,0.0,0.0,11.779999732971191,83.98999786376953,10275.919921875,0.0,0.1771598071931776,0.2292914007500493,0.251937984496124,0.1922394872779164
0.0,US,6,541.0,3,1,14816.61,16089.95,0.0,6.44,0.0,0,0.0,0.0,25.479999542236328,12.25,10323.2099609375,0.0,0.211280756434029,0.1364126327266419,0.202525497814473,0.1922394872779164
1.0,US,6,1475.0,6,1,362.0,13925.49,20.22,1.53,0.0,0,0.0,0.0,10.9399995803833,89.52999877929688,10272.009765625,0.0,0.1771598071931776,0.2244701836022457,0.1382252559726962,0.8077605127220836
1.0,US,6,1475.0,6,1,212.53,8580.53,0.0,0.0,0.0,0,0.0,0.0,36.2549991607666,193.70499801635745,10168.4150390625,1.0,0.1991600465516369,0.1364126327266419,0.1615646258503401,0.8077605127220836
1.0,US,6,930.0,4,1,212.53,8580.53,2.61,1.93,0.0,0,0.0,0.0,24.0,107.0999984741211,10247.169921875,1.0,0.1771598071931776,0.1543094247565943,0.1641113003975014,0.8077605127220836
0.0,US,7,930.0,4,1,765.76,4780.21,0.0,1.05,0.0,0,8.0,778.22,91.72000122070312,-65.44999694824219,10157.3798828125,1.0,0.151653351328994,0.1364126327266419,0.1047239612976664,0.1922394872779164
0.0,US,7,361.0,2,1,61.0,1205.38,20.27,5.57,0.0,0,0.0,0.0,8.239999771118164,96.01000213623048,10204.01953125,0.0,0.1771598071931776,0.2029393824973133,0.1887138145840967,0.1922394872779164


In [0]:
# Per-fold [precision, recall, F-beta] rows. They are collected in a plain list and stacked once after the
# loop, instead of growing an int-typed NumPy array with np.append on every iteration.
fold_metrics = []

# looping through each CV split and evaluating model performance
for i, cv_train in enumerate(df_train_split):

  cv_train = cv_train.select(*selected_cols)
  cv_val = df_val_split[i].select(*selected_cols)

  # NOTE(review): preprocess() runs target_mean_encoding on cv_val itself; if that helper derives the encoding
  # from the frame it is given, validation labels end up in the validation features - confirm.
  cv_train = preprocess(cv_train)
  cv_val = preprocess(cv_val)

  # oversampling/undersampling - applied to the training fold only
  cv_train = oversampling(cv_train)

  # the function is called getRegressionPipeline but it just preprocesses feature types (categorical, numeric, label), so it can be used for RF as well
  pipeline = getRegressionPipeline(categoricalColumns, numericCols, labelCol)

  # The feature pipeline is fitted on the training fold and then applied to both folds
  pipelineModel = pipeline.fit(cv_train)

  val_ml_train = pipelineModel.transform(cv_train)
  val_ml_test = pipelineModel.transform(cv_val)

  cols = cv_train.columns
  selectedCols = ['features'] + cols

  train = val_ml_train.select(selectedCols)
  test = val_ml_test.select(selectedCols)

  print("############################")
  print("Validation Set {:d}".format(i+1))
  # Each count() triggers a Spark job over the fold
  print("Training Dataset Count: " + str(train.count()))
  print("Test Dataset Count: " + str(test.count()))

  # creating predictions
  pred = execRFModel(train, test, maxDepth=15, numTrees=30)

  precision, recall, fmeasure = getMetrics(pred)

  print("Precision is {:.3f}".format(precision))
  print("Recall is {:.3f}".format(recall))
  print("F beta(0.5) score is {:.3f}".format(fmeasure))

  fold_metrics.append([precision, recall, fmeasure])


# Float matrix of shape (n_folds, 3); the reshape keeps three columns even if no fold was evaluated
metricsArray = np.array(fold_metrics, dtype=float).reshape(-1, 3)

avgArray = np.mean(metricsArray, axis=0)

print("############################")
print("Average of Cross validation")
print("Average Precision is {:.3f}".format(avgArray[0]))
print("Average Recall is {:.3f}".format(avgArray[1]))
print("Average F beta(0.5) score is {:.3f}".format(avgArray[2]))


# c_values = np.logspace(-2, 2, 10)
# logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values}, scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

## Custom Cross Validation & Grid Search

In [0]:
def preprocess_dos(df):
  """
  Fill nulls and pre-process columns of a Spark DataFrame so it can be passed into model training and evaluation.

  Compared with `preprocess`, this variant zero-fills 'TMP_Value_mean' (instead of 'TMP_Value') and also
  converts CRS_DEP_TIME to an integer.

  Steps, in order:
  1. Nulls in 'CIG_CeilingHeightDim_median' and 'VIS_Horizontal_median' are replaced with the sentinel 999999.
  2. Nulls in the rain / snow / wind-speed / temperature / pressure columns listed below are replaced with 0.
  3. An 'ORIGIN_DEST_COMBO' column ("<ORIGIN>-<DEST>") is added.
  4. `target_mean_encoding` is applied to ORIGIN, DEST and ORIGIN_DEST_COMBO with target 'DEP_DEL15'.
  5. CRS_DEP_TIME has its ':' characters removed and is cast to an integer (e.g. "14:33" -> 1433).
  6. DAY_OF_WEEK and MONTH are cast to strings, and ORIGIN, DEST and ORIGIN_DEST_COMBO are dropped.

  Inputs:
  - `df`: Spark DataFrame of joined flight, weather station, and weather observation data. It must contain
    ORIGIN, DEST, CRS_DEP_TIME, DAY_OF_WEEK and MONTH; 'DEP_DEL15' is handed to `target_mean_encoding` as the target.
  Outputs:
  - `df`: Pre-processed Spark DataFrame, ready for use in our models.

  NOTE(review): `target_mean_encoding` is defined outside this notebook. If it derives the encoding from the
  frame it is given, calling this function on a validation/test frame uses that frame's own labels - confirm.
  """
  # Sentinel for missing ceiling height / horizontal visibility
  df = df.fillna(999999, subset=['CIG_CeilingHeightDim_median', 'VIS_Horizontal_median'])

  # Note from Ruth: For 'WND_Speed_mean','TMP_Value_mean','SLP_Value_mean', the nulls in this column have already been
  # filled with the group mean (doesn't make sense to fill these with 0) in Carolina's transformation step.
  # NOTE(review): despite the note above, these three columns are still part of the zero-fill list below;
  # confirm whether they should be removed from it.
  df = df.fillna(0, subset=['AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration_mean', 'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean'])

  # `F.col` is used instead of a bare `col`: this notebook imports `pyspark.sql.functions as F` but never imports `col`
  df = df.withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit('-'), F.col("DEST")))

  df = target_mean_encoding(df, col=['ORIGIN', 'DEST','ORIGIN_DEST_COMBO'], target='DEP_DEL15')

  # String-typed columns are the ones later picked up as categorical features
  df = df.withColumn("CRS_DEP_TIME", (F.regexp_replace(F.col("CRS_DEP_TIME"), "[:]", "")).cast(IntegerType())) \
                          .withColumn("DAY_OF_WEEK", F.col("DAY_OF_WEEK").cast(StringType())) \
                          .withColumn("MONTH", F.col("MONTH").cast(StringType())) \
                          .drop('ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO')

  return df

In [0]:
# read the dataframes for inference

# Parquet locations of the aggregated + transformed train and test sets
train_agg_path = f"{blob_url}/train_agg_0404"
test_agg_path = f"{blob_url}/test_agg_0404"

df_train_main = spark.read.parquet(train_agg_path)
df_test_main = spark.read.parquet(test_agg_path)

In [0]:
# flights + weather + time based attribute
selected_cols = [
  'DEP_DEL15', 'CRS_DEP_TIME', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'YEAR', 'ORIGIN', 'DEST',
  'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth', 'AA_RainDuration', 'AL_SnowAccumDuration_mean',
  'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth', 'WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean',
  'PREV_DEP_DEL15',
]

# Preprocess the full training set to learn the post-processing column names and types
df_temp2 = preprocess_dos(df_train_main.select(*selected_cols))

# Label, categorical (string-typed) and numeric (every other non-label column) inputs for the pipeline
labelCol = ['DEP_DEL15']

categoricalColumns = [name for name, dtype in df_temp2.dtypes if dtype == 'string']

numericCols = [name for name, dtype in df_temp2.dtypes if dtype != 'string']

# The label is numeric, so it has to be taken back out of the feature list
numericCols.remove(labelCol[0])

In [0]:
# Select and preprocess the full aggregated training set
cv_train = preprocess_dos(df_train_main.select(*selected_cols))

# Feature pipeline built from the column lists prepared in the previous cell
pipeline = getRegressionPipeline(categoricalColumns, numericCols, labelCol)
# pipeline = getXGBPipeline(numericCols)

pipelineModel = pipeline.fit(cv_train)

val_ml_train = pipelineModel.transform(cv_train)

# MONTH (cast to string by preprocess_dos) and YEAR are cast to integers after the pipeline has run.
# NOTE(review): presumably customGridsearchCV uses these columns to build its time-based folds - confirm.
val_ml_train = val_ml_train.withColumn("MONTH", val_ml_train.MONTH.cast(IntegerType()))
val_ml_train = val_ml_train.withColumn("YEAR", val_ml_train.YEAR.cast(IntegerType()))

# Keep the assembled feature vector next to the preprocessed columns
cols = cv_train.columns
selectedCols = ['features'] + cols

# The classifier and the evaluator below both read the target from a column named "label"
train = val_ml_train.select(selectedCols).withColumnRenamed('DEP_DEL15', 'label')

rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# grid search for hyperparameters
# hyperparameters: number of trees, tree depth (3 x 3 = 9 combinations)
grid = (
  ParamGridBuilder()
    .addGrid(rf.maxDepth, [10, 15, 20])
    .addGrid(rf.numTrees, [20, 30, 40])
    .build()
)

# Default evaluator settings (label column "label", metric areaUnderROC)
evaluator = BinaryClassificationEvaluator()

predictions = customGridsearchCV(train, estimator=rf, grid=grid, evaluator=evaluator)

display(predictions)

features,DEP_DEL15,CRS_DEP_TIME,OP_UNIQUE_CARRIER,DAY_OF_WEEK,DISTANCE,DISTANCE_GROUP,MONTH,YEAR,CIG_CeilingHeightDim_median,VIS_Horizontal_median,AA_RainDepth,AA_RainDuration,AL_SnowAccumDuration_mean,AL_SnowAccumDepth,AJ1_SnowDepth_mean,AJ1_SnowEqWaterDepth,WND_Speed_mean,TMP_Value_mean,SLP_Value_mean,PREV_DEP_DEL15,ORIGIN_mean_encoding,DEST_mean_encoding,ORIGIN_DEST_COMBO_mean_encoding,rawPrediction,probability,prediction
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 22, 28, 50, 53, 58, 59, 60, 61, 62, 65, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.812648176119276, 2.353318638369035, 3.7138373600989674, 2.1907278517147235, 0.4371080034795757, 0.40485841039587744, 0.30039331627211896, 0.40635319171050377, 1.5927484682569886, 2.921140848204952, 1.1343479876627829, -1.5712732089561476, 15.290900053225467, 4.388244339316772, 5.94990262148718, 2.715793379243408))",0.0,825,YX,1,268.0,2,1,2018,22000.0,16070.81,15.0,4.02,0.0,0,10.0,0.0,24.049999237060547,-156.17999267578125,10326.6796875,0.0,0.1405376064947018,0.1878368724254814,0.1363547184618403,"Map(vectorType -> dense, length -> 2, values -> List(35.78959333502799, 4.210406664972002))","Map(vectorType -> dense, length -> 2, values -> List(0.8947398333757, 0.10526016662430009))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 22, 31, 50, 53, 58, 59, 60, 62, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.812648176119276, 3.034090167616668, 3.7138373600989674, 2.1907278517147235, 0.3229380025707313, 0.40485841039587744, 0.30055931604875313, 0.39620608851095024, 1.4480035422630062, -1.2570789593484866, 15.250846921681502, 5.831585417194086, 6.618825322115431, 3.2206407864817925))",0.0,1115,YX,1,198.0,1,1,2018,22000.0,16079.69,0.0,1.0,0.0,0,0.0,0.0,30.700000762939453,-124.9499969482422,10299.6298828125,0.0,0.1867619469725026,0.2089545874493639,0.1617021276595744,"Map(vectorType -> dense, length -> 2, values -> List(35.3115001284422, 4.688499871557807))","Map(vectorType -> dense, length -> 2, values -> List(0.8827875032110548, 0.11721249678894514))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 22, 31, 50, 53, 58, 59, 60, 62, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.812648176119276, 3.034090167616668, 3.7138373600989674, 2.1907278517147235, 0.3229380025707313, 0.40485841039587744, 0.3008080966684747, 0.39620608851095024, 1.2767900479821, -1.1243789564908324, 15.286191832845413, 5.800249398103465, 5.94990262148718, 3.494231693551344))",0.0,1310,YX,1,198.0,1,1,2018,22000.0,16093.0,0.0,1.0,0.0,0,0.0,0.0,27.06999969482422,-111.76000213623048,10323.5,0.0,0.1857583818153369,0.1878368724254814,0.175438596491228,"Map(vectorType -> dense, length -> 2, values -> List(36.63266806550946, 3.367331934490542))","Map(vectorType -> dense, length -> 2, values -> List(0.9158167016377364, 0.08418329836226353))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 22, 28, 50, 53, 58, 59, 60, 62, 65, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.812648176119276, 2.353318638369035, 3.7138373600989674, 2.1907278517147235, 0.5463850043494696, 0.40485841039587744, 0.30043856736583746, 0.39620608851095024, 5.258053526768913, 1.6310085960412908, -1.3483291846136778, 15.267179645635041, 5.831585417194086, 6.603889558519246, 3.774443635571682))",0.0,1544,YX,1,335.0,2,1,2018,22000.0,16073.23,0.0,1.0,0.0,0,18.0,0.0,34.58000183105469,-134.02000427246094,10310.66015625,0.0,0.1867619469725026,0.2084830692918965,0.1895074946466809,"Map(vectorType -> dense, length -> 2, values -> List(35.892000995536186, 4.107999004463816))","Map(vectorType -> dense, length -> 2, values -> List(0.8973000248884047, 0.10269997511159541))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 22, 28, 50, 53, 58, 59, 60, 62, 65, 66, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.812648176119276, 2.353318638369035, 3.7138373600989674, 2.1907278517147235, 0.5463850043494696, 0.40485841039587744, 0.300196686670854, 0.39620608851095024, 5.258053526768913, 9.346447047039527, 0.7640930914150803, -2.313649114658571, 15.229746690789067, 5.852440615523312, 5.94990262148718, 2.93649855784988))",0.0,1750,YX,1,335.0,2,1,2018,22000.0,16060.29,0.0,1.0,0.0,0,18.0,1792.9,16.200000762939453,-229.97000122070312,10285.3798828125,0.0,0.1874298540965207,0.1878368724254814,0.1474358974358974,"Map(vectorType -> dense, length -> 2, values -> List(34.802577664949865, 5.197422335050142))","Map(vectorType -> dense, length -> 2, values -> List(0.8700644416237465, 0.12993555837625353))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 22, 28, 50, 53, 58, 59, 60, 62, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.812648176119276, 2.353318638369035, 3.7138373600989674, 2.1907278517147235, 0.6621860052712975, 0.40485841039587744, 0.30032079404729295, 0.39620608851095024, 2.946946527993728, -0.8176294942099089, 15.262915351439469, 5.831585417194086, 6.11182806110833, 3.441301319968555))",0.0,2015,YX,1,406.0,2,1,2018,22000.0,16066.93,0.0,1.0,0.0,0,0.0,0.0,62.47999954223633,-81.2699966430664,10307.7802734375,0.0,0.1867619469725026,0.1929488162806156,0.1727810650887574,"Map(vectorType -> dense, length -> 2, values -> List(36.749650614839666, 3.250349385160334))","Map(vectorType -> dense, length -> 2, values -> List(0.9187412653709917, 0.08125873462900834))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 24, 28, 50, 53, 58, 59, 60, 62, 65, 66, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.8466783675516822, 2.353318638369035, 3.7138373600989674, 2.1907278517147235, 0.5659570045052715, 0.01835940922798571, 0.3005565049683043, 0.39620608851095024, 1.460570424102476, 2.630287932968808, 2.3191638355803503, -1.4468229410981885, 15.382260640397542, 5.114810562488905, 4.548893044634968, 3.50035963719302))",0.0,514,YX,2,347.0,2,1,2018,997.65,16079.54,0.0,1.0,0.0,0,5.0,504.56,49.16999816894531,-143.80999755859375,10388.3798828125,0.0,0.1638065655746798,0.1436073658443654,0.1757462686567164,"Map(vectorType -> dense, length -> 2, values -> List(31.031114938266157, 8.968885061733845))","Map(vectorType -> dense, length -> 2, values -> List(0.7757778734566539, 0.2242221265433461))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 24, 29, 50, 53, 58, 59, 60, 62, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.8466783675516822, 2.516163140755703, 3.7138373600989674, 2.1907278517147235, 1.2004160095558496, 0.40485841039587744, 0.3004204231193041, 2.4287433679143136, 1.5031880443092338, -0.7381503717705091, 15.32109281782484, 5.515217401242332, 4.00391470541571, 2.6125419192546793))",0.0,729,YX,2,736.0,3,1,2018,22000.0,16072.26,0.0,6.13,0.0,0,0.0,0.0,31.8700008392334,-73.37000274658203,10347.0703125,0.0,0.1766299670061684,0.1264025419521398,0.1311706629055007,"Map(vectorType -> dense, length -> 2, values -> List(36.72376680226907, 3.276233197730937))","Map(vectorType -> dense, length -> 2, values -> List(0.9180941700567266, 0.0819058299432734))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 24, 29, 50, 53, 58, 59, 60, 62, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.8466783675516822, 2.516163140755703, 3.7138373600989674, 2.1907278517147235, 1.2004160095558496, 0.017002396503463305, 0.3005824800818023, 1.6284070766792254, 3.2002291756184307, 1.873193599253048, 15.097385833145223, 4.342199143716099, 4.548893044634968, 2.946487265913566))",0.0,1014,YX,2,736.0,3,1,2018,923.91,16080.93,0.0,4.11,0.0,0,0.0,0.0,67.8499984741211,186.19000244140625,10195.990234375,0.0,0.1390629662787227,0.1436073658443654,0.1479374110953058,"Map(vectorType -> dense, length -> 2, values -> List(36.467378033608604, 3.5326219663913836))","Map(vectorType -> dense, length -> 2, values -> List(0.9116844508402154, 0.08831554915978462))",0.0
"Map(vectorType -> sparse, length -> 74, indices -> List(12, 24, 28, 50, 53, 58, 59, 60, 62, 65, 67, 68, 69, 71, 72, 73), values -> List(8.903909296774385, 2.8466783675516822, 2.353318638369035, 3.7138373600989674, 2.1907278517147235, 0.5659570045052715, 0.40485841039587744, 0.3005082785102146, 0.39620608851095024, 0.730285212051238, 0.9518146970223483, -1.1098915870253327, 15.33013907909561, 5.515217401242332, 6.11182806110833, 3.1213398038663875))",0.0,1315,YX,2,347.0,2,1,2018,22000.0,16076.96,0.0,1.0,0.0,0,2.5,0.0,20.18000030517578,-110.31999969482422,10353.1796875,0.0,0.1766299670061684,0.1929488162806156,0.1567164179104477,"Map(vectorType -> dense, length -> 2, values -> List(36.18740450467962, 3.8125954953203762))","Map(vectorType -> dense, length -> 2, values -> List(0.9046851126169907, 0.09531488738300942))",0.0


#### Run the model on test data

In [0]:
# Transform the training & test data and save it - run this once
  
# df_train_upd = aggregate_weather_reports(df_train)
# df_test_upd = aggregate_weather_reports(df_test)
  
# df_train_upd = get_transformed_df(df_train_upd)
# df_test_upd = get_transformed_df(df_test_upd)
  
# df_train_upd = add_previous_flight_delay_indicator(df_train_upd)
# df_test_upd = add_previous_flight_delay_indicator(df_test_upd)
  
# df_train_upd.write.parquet(f"{blob_url}/train_agg_0404")
# df_test_upd.write.parquet(f"{blob_url}/test_agg_0404")

In [0]:
# re-read the dataframes for inference

# Same aggregated parquet datasets as loaded for the grid search above, read again for the test run
agg_reader = spark.read
df_train_main = agg_reader.parquet(f"{blob_url}/train_agg_0404")
df_test_main = agg_reader.parquet(f"{blob_url}/test_agg_0404")

In [0]:
def preprocess_dos(df):
  """
  Fill nulls and pre-process the columns of a Spark DataFrame so it can be passed into model training and evaluation.

  Steps, in order:
  1. Null `CIG_CeilingHeightDim_median` / `VIS_Horizontal_median` values are replaced with the sentinel 999999.
  2. Null precipitation, snow, wind, temperature and pressure aggregates are replaced with 0.
  3. A route key `ORIGIN_DEST_COMBO` ("<ORIGIN>-<DEST>") is built.
  4. `ORIGIN`, `DEST` and `ORIGIN_DEST_COMBO` are handed to `target_mean_encoding` with `DEP_DEL15` as the target.
  5. Any ':' is stripped from `CRS_DEP_TIME` before casting it to integer; `DAY_OF_WEEK` and `MONTH` are cast to string.
  6. The raw `ORIGIN`, `DEST` and `ORIGIN_DEST_COMBO` columns are dropped.

  Inputs:
  - `df`: Spark DataFrame of joined flight, weather station, and weather observation data. It must contain the
    columns named above, including `ORIGIN`, `DEST` and `DEP_DEL15`.
  Outputs:
  - `df`: Pre-processed Spark DataFrame, ready for use in our models.

  NOTE(review): the target encoding is computed from whichever frame is passed in. When this is called on a
  test frame it presumably derives the encoding from that frame's own `DEP_DEL15` labels - confirm against
  `target_mean_encoding` that this does not leak test labels into the features.
  """
  # Column groups that receive a constant fill value
  sentinel_fill_cols = ['CIG_CeilingHeightDim_median', 'VIS_Horizontal_median']
  zero_fill_cols = ['AA_RainDepth', 'AA_RainDuration', 'AL_SnowAccumDuration_mean', 'AL_SnowAccumDepth',
                    'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth', 'WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean']

  df = df.fillna(999999, subset=sentinel_fill_cols)
  # Note from Ruth: the nulls in 'WND_Speed_mean', 'TMP_Value_mean' and 'SLP_Value_mean' have already been filled
  # with the group mean in Carolina's transformation step (0 is not a meaningful value for them), so for those
  # three columns the 0 fill below is only expected to act as a safety net.
  df = df.fillna(0, subset=zero_fill_cols)

  # `F.col` is used instead of a bare `col`: the notebook's import cell only brings in `F` and `udf`,
  # so a bare `col` would only resolve if one of the %run libs happened to define it.
  df = df.withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit('-'), F.col("DEST")))

  df = target_mean_encoding(df, col=['ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO'], target='DEP_DEL15')

  # Strip any ':' from the scheduled departure time before the integer cast
  crs_dep_time_int = F.regexp_replace(F.col("CRS_DEP_TIME"), "[:]", "").cast(IntegerType())

  df = (
    df.withColumn("CRS_DEP_TIME", crs_dep_time_int)
      .withColumn("DAY_OF_WEEK", F.col("DAY_OF_WEEK").cast(StringType()))
      .withColumn("MONTH", F.col("MONTH").cast(StringType()))
      .drop('ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO')
  )

  return df

In [0]:
# flights + weather + ID variables for ensemble + time based attribute
selected_cols = ['DEP_DEL15', 'CRS_DEP_TIME','OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST', \
                  'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration_mean', \
                  'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean', \
                  'OP_CARRIER_FL_NUM', 'TAIL_NUM', 'TIMESTAMP_UTC', \
                  'PREV_DEP_DEL15']

# Pre-process a projection of the training frame; it is only used here to read the
# post-processing column dtypes that decide which columns are categorical vs numeric.
df_temp2 = df_train_main.select(*selected_cols)

df_temp2 = preprocess_dos(df_temp2)

# Get numerical, categorical values and label ready for pipeline
labelCol = ['DEP_DEL15']

# ID / timestamp columns not needed for features, but needed for ensemble
ensembleIdCols = ['OP_CARRIER_FL_NUM', 'TAIL_NUM', 'TIMESTAMP_UTC']

# Exclude label and ID columns by name so the split does not depend on their dtype:
# list.remove() raises ValueError when a column lands in the other dtype bucket,
# and remove(*labelCol) only works for exactly one label.
nonFeatureCols = set(labelCol) | set(ensembleIdCols)

categoricalColumns = [name for name, dtype in df_temp2.dtypes
                      if dtype == 'string' and name not in nonFeatureCols]

numericCols = [name for name, dtype in df_temp2.dtypes
               if dtype != 'string' and name not in nonFeatureCols]

In [0]:
# Final evaluation: fit on the training frame, score the held-out test set (2019)

# Project each split to the modelling/ID columns and apply the shared pre-processing.
# NOTE(review): both names are rebound to their pre-processed versions, and preprocess_dos drops
# ORIGIN/DEST (which selected_cols still lists), so this cell cannot be re-run without first
# re-running the cell that reloads df_train_main / df_test_main from parquet.
# NOTE(review): preprocess_dos target-encodes each frame it receives against DEP_DEL15; for the
# test frame that presumably uses the test labels - confirm this is intended.
df_train_main = preprocess_dos(df_train_main.select(*selected_cols))
df_test_main = preprocess_dos(df_test_main.select(*selected_cols))

# oversampling/undersampling helper, applied to the training split only
df_train_main = oversampling(df_train_main)

# Feature pipeline is fitted on the training split and then applied to both splits
pipeline = getRegressionPipeline(categoricalColumns, numericCols, labelCol)
# pipeline = getXGBPipeline(numericCols)
pipelineModel = pipeline.fit(df_train_main)

ml_train, ml_test = (pipelineModel.transform(split) for split in (df_train_main, df_test_main))

# Keep the assembled feature vector alongside every pre-processed column
cols = df_train_main.columns
selectedCols = ['features', *cols]

train_all, test_all = ml_train.select(selectedCols), ml_test.select(selectedCols)

print("############################")

# Train the random forest, predict on the test set and score the predictions
pred = execRFModel(train_all, test_all, maxDepth=15, numTrees=30)
precision, recall, fmeasure = getMetrics(pred)

print("Final test scores")
print(
  "Precision is {:.3f}".format(precision),
  "Recall is {:.3f}".format(recall),
  "F beta(0.5) score is {:.3f}".format(fmeasure),
  sep="\n",
)

In [0]:
# writing predictions to cold storage for ensemble model
# Note: no save mode is set, so Spark's default applies and the write raises if
# rf_test_0410b already exists - re-running this cell needs a new path or an explicit mode.
pred.write.parquet(f"{blob_url}/rf_test_0410b")

#### Error Analysis on Test Set Predictions

In [0]:
# loading in saved predictions
# (same rf_test_0410b path the previous cell writes, so the analysis below can run
# without re-fitting the model)
pred = spark.read.parquet(f"{blob_url}/rf_test_0410b")

In [0]:
# calling error analysis function from libs
# (brought into scope by the %run of ../libs/error_analysis at the top of the notebook);
# the tables rendered below report feature averages per PRED_GROUP (TP / TN / FN / FP)
analyze_errors(pred)

PRED_GROUP,avg(DISTANCE),avg(CIG_CeilingHeightDim_median),avg(CRS_DEP_TIME),avg(VIS_Horizontal_median),avg(WND_Speed_mean)
TP,830.7218315223305,10123.413784340224,1596.078734318892,14088.298961450526,39.142767598125765
TN,787.9254270185096,12959.603247632143,1291.8280778539477,15118.316168342684,33.18144007967394
FN,822.1466164557143,11830.9313571516,1413.4647835186584,14770.110837579236,35.09331844639155
FP,836.4063730569948,10580.49754238474,1293.8085456900612,14303.092388236091,38.20164735820261


PRED_GROUP,avg(PREV_DEP_DEL15)
TP,0.802069051283247
TN,0.0005926932299665697
FN,0.0007488930335862865
FP,0.5739166274140367
