In [0]:
import pyspark
from pyspark.sql.types import StringType, BooleanType, IntegerType
import pyspark.sql.functions as F

import airporttime
from datetime import datetime, timedelta

import numpy as np

In [0]:
from pyspark.sql import SQLContext
from pyspark.mllib.stat import Statistics
from pyspark.sql.functions import udf
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
from pyspark.ml.feature import Bucketizer
from pyspark.ml import Pipeline
from sklearn.metrics import confusion_matrix

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from sparkdl.xgboost import XgboostRegressor

#### Configure access to the Azure Blob Storage container used to store large datasets for quick access

In [0]:
# Azure Blob Storage coordinates; both are created in https://portal.azure.com
storage_account = "midsw261rv"   # name of the Storage account
blob_container = "w261-scrr"     # name of the container

# Databricks secret identifiers; both are created on your local computer with the Databricks CLI
secret_scope = "w261scrr"        # name of the secret scope
secret_key = "w261scrrkey"       # name of the secret key

mount_path = "/mnt/mids-w261"

# wasbs:// root of the container; dataset names are appended to this when reading/writing
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

In [0]:
# Set the fs.azure.sas.<container>.<account>.blob.core.windows.net Spark setting to the
# value stored under secret_scope / secret_key, so the credential itself never appears
# in the notebook.
# NOTE(review): presumably this is what authorises the wasbs:// reads under blob_url - confirm.
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
%run "../libs/weather_aggregation"

In [0]:
%run "../libs/time_based_features"

In [0]:
%run "../libs/transform"

In [0]:
%run "../libs/model_helper_functions"

#### Import joined data

In [0]:
# Joined training dataset, read as Parquet from the blob container.
df_train = spark.read.parquet(f"{blob_url}/join_full_0329")

In [0]:
# Joined test dataset, read as Parquet from the blob container.
df_test = spark.read.parquet(f"{blob_url}/test_full_join_0404")

In [0]:
# Preview the joined test data. In the sample shown, the same flight appears on several
# rows that differ in the weather-observation columns.
display(df_test)

ACTUAL_ELAPSED_TIME,AIR_TIME,ARR_DEL15,ARR_DELAY,ARR_DELAY_GROUP,ARR_DELAY_NEW,ARR_TIME,ARR_TIME_BLK,CARRIER_DELAY,CRS_ARR_TIME,CRS_DEP_TIME,CRS_ELAPSED_TIME,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_DELAY,DEP_DELAY_GROUP,DEP_DELAY_NEW,DEP_TIME,DEP_TIME_BLK,DEST,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,DISTANCE,DISTANCE_GROUP,FL_DATE,MONTH,NAS_DELAY,OP_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,QUARTER,SECURITY_DELAY,TAIL_NUM,YEAR,TIMESTAMP,TIMESTAMP_UTC,WEATHER_WINDOW_START,WEATHER_WINDOW_END,iata_code,ident,elevation_ft,coordinates,station_id,lat,lon,neighbor_id,neighbor_name,neighbor_state,neighbor_call,distance_to_neighbor,dist_to_airport_rank,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND_DirectionAngle,WND_DirectionQuality,WND_Type,WND_Speed,WND_SpeedQuality,CIG_CeilingHeightDim,CIG_CeilingQuality,CIG_CeilingDetermination,CIG_CeilingAndVisibilityOK,VIS_Horizontal,VIS_DistanceQuality,VIS_Variability,VIS_VariabilityQuality,TMP_Value,TMP_Quality,DEW_Value,DEW_Quality,SLP_Value,SLP_Quality,AA1_RainCondition,AA1_RainQuality,AA2_RainCondition,AA2_RainQuality,AA3_RainCondition,AA3_RainQuality,AA4_RainCondition,AA4_RainQuality,AJ1_SnowDepth,AJ1_SnowDepthCondition,AJ1_SnowDepthQuality,AJ1_SnowEqWaterDepth,AJ1_SnowEqWaterDepthCondition,AJ1_SnowEqWaterDepthQuality,AL1_SnowAccumCondition,AL1_SnowAccumQuality,AL2_SnowAccumCondition,AL2_SnowAccumQuality,AL3_SnowAccumDuration,AL3_SnowAccumDepth,AL3_SnowAccumCondition,AL3_SnowAccumQuality,AW1_PresentWeatherCond,AW1_PresentWeatherQuality,AW2_PresentWeatherCond,AW2_PresentWeatherQuality,AW3_PresentWeatherCond,AW3_PresentWeatherQuality,AW4_PresentWeatherCond,AW4_PresentWeatherQuality,AA_RainDepth,AL_SnowAccumDepth,AA_RainDuration,AL_
SnowAccumDuration,DATE_UTC
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T09:58:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,320.0,1,N,46.0,1,1189.0,1,M,N,16093.0,1,N,1,189.0,1,139.0,1,,1,,,,,,,,,,,,,,,,,,,,,,,12.0,1.0,,,,,,,,,,,2019-05-17T13:58:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,320.0,1,N,26.0,1,732.0,1,M,N,4023.0,1,N,1,161.0,1,156.0,1,10133.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,63.0,1.0,90.0,1.0,95.0,1.0,,,,,2019-05-17T14:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T09:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,20.0,1,N,62.0,1,1829.0,1,M,N,16093.0,1,N,1,189.0,1,139.0,1,10107.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T13:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T09:00:00.000+0000,4,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-12,99999,V020,200.0,1,N,46.0,1,,1,,N,16000.0,1,,1,183.0,1,133.0,1,10091.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T13:00:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T11:17:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,240.0,1,N,15.0,1,1829.0,1,M,N,12875.0,1,N,1,167.0,1,156.0,1,,1,3.0,1.0,,,,,,,,,,,,,,,,,,,,,61.0,1.0,,,,,,,,,,,2019-05-17T15:17:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T11:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,,1,C,0.0,1,1981.0,1,M,N,16093.0,1,N,1,167.0,1,156.0,1,10129.0,1,9.0,1.0,9.0,1.0,9.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T15:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:40:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,330.0,1,N,51.0,1,335.0,1,M,N,2816.0,1,N,1,167.0,1,156.0,1,,1,3.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,63.0,1.0,90.0,1.0,95.0,1.0,,,,,2019-05-17T14:40:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T08:51:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-15,KHTS,V030,200.0,1,N,46.0,1,3048.0,1,M,N,16093.0,1,N,1,183.0,1,133.0,1,10091.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-05-17T12:51:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:08:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,10.0,1,N,93.0,1,1250.0,1,M,N,8047.0,1,N,1,189.0,1,144.0,1,,1,2.0,1.0,,,,,,,,,,,,,,,,,,,,,61.0,1.0,90.0,1.0,92.0,1.0,,,,,,,2019-05-17T14:08:00.000+0000
73.0,39.0,0.0,2.0,0,2.0,1555,1500-1559,0.0,1553,14:33,80.0,17,5,0.0,9.0,0,9.0,1442,1400-1459,CLT,11057,1105703,31057,"Charlotte, NC",NC,37,North Carolina,36,235.0,1,2019-05-17,5,0.0,OH,20397,5026,OH,HTS,12223,1222305,32223,"Ashland, WV",WV,54,West Virginia,39,2,0.0,N258PS,2019,2019-05-17T14:33:00.000+0000,2019-05-17T18:33:00.000+0000,2019-05-17T12:33:00.000+0000,2019-05-17T16:33:00.000+0000,HTS,KHTS,828,"-82.55799866, 38.36669922",72045800476,37.751,-82.637,72425003860,TRI-STATE/M.J.FERGUSON FIELD,WV,KHTS,42.62750781220094,3,72425003860,2019-05-17T10:16:00.000+0000,7,38.36532,-82.55485,251.2,"HUNTINGTON TRI STATE AIRPORT, WV US",FM-16,KHTS,V030,360.0,1,N,41.0,1,1280.0,1,M,N,4023.0,1,N,1,167.0,1,150.0,1,,1,3.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,63.0,1.0,90.0,1.0,95.0,1.0,,,,,2019-05-17T14:16:00.000+0000


### Cross Validation

In [0]:
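# One-time job (kept commented out): build the CV splits, transform each one and save it to blob storage. The next cell reads the saved splits back.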

# trainsplits, valsplits = Split4year5Fold(df_train)

# for i, val_train in enumerate(trainsplits):
  
#   df_train_split = aggregate_weather_reports(val_train)
#   df_val_split = aggregate_weather_reports(valsplits[i])
  
#   df_train_split = get_transformed_df(df_train_split)
#   df_val_split = get_transformed_df(df_val_split)
  
#   df_train_split = add_previous_flight_delay_indicator(df_train_split)
#   df_val_split = add_previous_flight_delay_indicator(df_val_split)
  
#   df_train_split.write.parquet(f"{blob_url}/cv_train_0402_split"+str(i))
#   df_val_split.write.parquet(f"{blob_url}/cv_val_0402_split"+str(i))
  
  
  

In [0]:
# Main flow: load the five pre-computed cross-validation folds from blob storage.
# df_train_split[k] and df_val_split[k] are the training and validation frames of fold k.

fold_ids = range(5)

train_names = ["cv_train_0402_split" + str(k) for k in fold_ids]
val_names = ["cv_val_0402_split" + str(k) for k in fold_ids]

df_train_split = [spark.read.parquet(f"{blob_url}/{name}") for name in train_names]
df_val_split = [spark.read.parquet(f"{blob_url}/{name}") for name in val_names]



In [0]:
def preprocess(df):
  """Prepare a selected-columns DataFrame for the cross-validation pipeline.

  Steps, in order:
    1. Fill missing ceiling-height / visibility medians with the sentinel 999999.
    2. Fill missing rain, snow, wind, temperature and pressure values with 0.
    3. Add an ORIGIN-DEST route key and pass ORIGIN, DEST and the route key to
       ``target_mean_encoding`` with DEP_DEL15 as the target.
    4. Cast DAY_OF_WEEK and MONTH to string and drop the raw ORIGIN, DEST and
       route-key columns.
    5. Add ``classWeights``: rows with DEP_DEL15 == 1 receive the share of rows with
       DEP_DEL15 == 0; every other row receives the complement.

  Args:
    df: Spark DataFrame containing DEP_DEL15, ORIGIN, DEST, DAY_OF_WEEK, MONTH and
      the weather columns listed in the fillna calls below.

  Returns:
    The transformed Spark DataFrame.
  """

  df = df.fillna(999999, subset=['CIG_CeilingHeightDim_median', 'VIS_Horizontal_median'])
  df = df.fillna(0, subset=['AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration', 'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean'])

  # Use F.col rather than a bare `col`: this notebook never imports `col`, so the bare
  # name only resolves if one of the %run notebooks happens to define it.
  df = df.withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit('-'), F.col("DEST")))

  # NOTE(review): the encoding is computed from whatever frame is passed in. When this
  # function is applied to a validation fold, the encoded values presumably come from
  # that fold's own DEP_DEL15 labels - confirm against target_mean_encoding.
  df = target_mean_encoding(df, col=['ORIGIN', 'DEST','ORIGIN_DEST_COMBO'], target='DEP_DEL15')

  df = (
    df.withColumn("DAY_OF_WEEK", F.col("DAY_OF_WEEK").cast(StringType()))
      .withColumn("MONTH", F.col("MONTH").cast(StringType()))
      .drop('ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO')
  )

  # One aggregation instead of two separate count() jobs over the same frame.
  # F.count over a when() without otherwise() counts only the rows where the condition holds.
  counts = df.agg(
    F.count(F.lit(1)).alias("total"),
    F.count(F.when(F.col("DEP_DEL15") == 0, True)).alias("not_delayed"),
  ).first()

  # An empty frame has no rows to weight; avoid dividing by zero.
  balancingRatio = counts["not_delayed"] / counts["total"] if counts["total"] else 0.0

  df = df.withColumn(
    "classWeights",
    F.when(F.col("DEP_DEL15") == 1, balancingRatio).otherwise(1 - balancingRatio),
  )

  return df

In [0]:
# Candidate feature sets; exactly one `selected_cols` assignment should be active.

# flights only
# selected_cols = ['DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST']

# flights + weather
# selected_cols = ['DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST', \
#                   'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration', \
#                   'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean' ]

# flights + weather + time based attribute
selected_cols = [
  'DEP_DEL15', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST',
  'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth', 'AA_RainDuration', 'AL_SnowAccumDuration',
  'AL_SnowAccumDepth', 'AJ1_SnowDepth', 'AJ1_SnowEqWaterDepth', 'WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean',
  'PREV_DEP_DEL15',
]

# Preprocess fold 0 once, only to learn the post-preprocessing column names and dtypes.
df_temp = preprocess(df_train_split[0].select(*selected_cols))

# Get numerical, categorical values and label ready for pipeline
labelCol = ['DEP_DEL15']

# Columns that must not be handed to the pipeline as features. classWeights is computed
# from DEP_DEL15 inside preprocess(), so using it as a numeric feature would give the
# model the label. It stays in the DataFrame, just not in the feature lists.
nonFeatureCols = set(labelCol) | {'classWeights'}

categoricalColumns = [name for name, dtype in df_temp.dtypes if dtype == 'string']

numericCols = [
  name for name, dtype in df_temp.dtypes
  if dtype != 'string' and name not in nonFeatureCols
]

In [0]:
# Inspect the preprocessed fold-0 sample; the output includes the *_mean_encoding
# columns and classWeights added by preprocess().
display(df_temp)

DEP_DEL15,OP_UNIQUE_CARRIER,DAY_OF_WEEK,DISTANCE,DISTANCE_GROUP,MONTH,CIG_CeilingHeightDim_median,VIS_Horizontal_median,AA_RainDepth,AA_RainDuration,AL_SnowAccumDuration,AL_SnowAccumDepth,AJ1_SnowDepth,AJ1_SnowEqWaterDepth,WND_Speed_mean,TMP_Value_mean,SLP_Value_mean,PREV_DEP_DEL15,ORIGIN_mean_encoding,DEST_mean_encoding,ORIGIN_DEST_COMBO_mean_encoding,classWeights
0.0,US,4,992.0,4,1,1263.92,10298.53,2.39,3.19,0.0,0,0.0,0.0,9.859999656677246,217.0,10205.1904296875,0.0,0.2089608241092899,0.2022562904926924,0.1948424068767908,0.1922394872779164
1.0,US,4,920.0,4,1,22000.0,16078.99,0.0,1.0,0.0,0,0.0,0.0,77.12999725341797,3.5799999237060547,10230.6796875,0.0,0.1932316258770857,0.2041274635071824,0.1746522411128284,0.8077605127220836
0.0,US,5,507.0,3,1,5439.91,16083.32,0.0,1.0,0.0,0,0.0,0.0,25.11000061035156,187.94000244140625,10231.7998046875,0.0,0.1943438256658595,0.1364126327266419,0.1243611584327086,0.1922394872779164
0.0,US,6,541.0,3,1,1324.45,13512.06,12.53,4.32,0.0,0,0.0,0.0,11.779999732971191,83.98999786376953,10275.919921875,0.0,0.1771598071931776,0.2292914007500493,0.251937984496124,0.1922394872779164
0.0,US,6,541.0,3,1,14816.61,16089.95,0.0,6.44,0.0,0,0.0,0.0,25.479999542236328,12.25,10323.2099609375,0.0,0.211280756434029,0.1364126327266419,0.202525497814473,0.1922394872779164
1.0,US,6,1475.0,6,1,362.0,13925.49,20.22,1.53,0.0,0,0.0,0.0,10.9399995803833,89.52999877929688,10272.009765625,0.0,0.1771598071931776,0.2244701836022457,0.1382252559726962,0.8077605127220836
1.0,US,6,1475.0,6,1,212.53,8580.53,0.0,0.0,0.0,0,0.0,0.0,36.2549991607666,193.70499801635745,10168.4150390625,1.0,0.1991600465516369,0.1364126327266419,0.1615646258503401,0.8077605127220836
1.0,US,6,930.0,4,1,212.53,8580.53,2.61,1.93,0.0,0,0.0,0.0,24.0,107.0999984741211,10247.169921875,1.0,0.1771598071931776,0.1543094247565943,0.1641113003975014,0.8077605127220836
0.0,US,7,930.0,4,1,765.76,4780.21,0.0,1.05,0.0,0,8.0,778.22,91.72000122070312,-65.44999694824219,10157.3798828125,1.0,0.151653351328994,0.1364126327266419,0.1047239612976664,0.1922394872779164
0.0,US,7,361.0,2,1,61.0,1205.38,20.27,5.57,0.0,0,0.0,0.0,8.239999771118164,96.01000213623048,10204.01953125,0.0,0.1771598071931776,0.2029393824973133,0.1887138145840967,0.1922394872779164


In [0]:


# Cross-validation: for each fold, preprocess the train/validation frames, fit the
# feature pipeline on the training frame, score the validation frame and record
# precision, recall and F-beta(0.5). Averages over all folds are printed at the end.

# One [precision, recall, fmeasure] row per fold. Rows are collected in a list and turned
# into a float array once, instead of growing an int-typed array with np.append.
foldMetrics = []

for i, cv_train in enumerate(df_train_split):

  cv_train = cv_train.select(*selected_cols)
  cv_val = df_val_split[i].select(*selected_cols)

  # NOTE(review): the validation fold goes through preprocess() on its own, so its
  # target encoding and class weights are presumably derived from validation labels -
  # confirm whether they should come from the training fold instead.
  cv_train = preprocess(cv_train)
  cv_val = preprocess(cv_val)

  #oversampling
  #cv_train = oversampling(cv_train)

  pipeline = getRegressionPipeline(categoricalColumns, numericCols, labelCol)

  # Fit the feature pipeline on the training fold only, then apply it to both frames.
  pipelineModel = pipeline.fit(cv_train)

  val_ml_train = pipelineModel.transform(cv_train)
  val_ml_test = pipelineModel.transform(cv_val)

  # Keep the assembled 'features' column plus every preprocessed input column.
  cols = cv_train.columns
  selectedCols = ['features'] + cols

  train = val_ml_train.select(selectedCols)
  test = val_ml_test.select(selectedCols)

  print("############################")
  print("Validation Set {:d}".format(i+1))
  print("Training Dataset Count: " + str(train.count()))
  print("Test Dataset Count: " + str(test.count()))

  # Exactly one model should be active; the others are kept as experiment toggles.
  pred = execLinearModel(train, test)
  #pred = execRFModel(train, test)
#   pred = execXGBModel(train, test)

  precision, recall, fmeasure = getMetrics(pred)

  print("Precision is {:.3f}".format(precision))
  print("Recall is {:.3f}".format(recall))
  print("F beta(0.5) score is {:.3f}".format(fmeasure))

  foldMetrics.append([precision, recall, fmeasure])


# Shape (n_folds, 3); reshape keeps the three columns even if no fold was processed.
metricsArray = np.array(foldMetrics, dtype=float).reshape(-1, 3)

avgArray = np.mean(metricsArray, axis=0)

print("############################")
print("Average of Cross validation")
print("Average Precision is {:.3f}".format(avgArray[0]))
print("Average Recall is {:.3f}".format(avgArray[1]))
print("Average F beta(0.5) score is {:.3f}".format(avgArray[2]))

  

#### Run the model on test data

In [0]:
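# One-time job (kept commented out): aggregate, transform and save the full training and test sets. The next cell reads the saved outputs (train_agg_0404 / test_agg_0404).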
  
# df_train_upd = aggregate_weather_reports(df_train)
# df_test_upd = aggregate_weather_reports(df_test)
  
# df_train_upd = get_transformed_df(df_train_upd)
# df_test_upd = get_transformed_df(df_test_upd)
  
# df_train_upd = add_previous_flight_delay_indicator(df_train_upd)
# df_test_upd = add_previous_flight_delay_indicator(df_test_upd)
  
# df_train_upd.write.parquet(f"{blob_url}/train_agg_0404")
# df_test_upd.write.parquet(f"{blob_url}/test_agg_0404")

In [0]:
# Main flow: read the training and test sets saved as train_agg_0404 / test_agg_0404;
# these are the frames used for the final fit and inference below.

df_train_main = spark.read.parquet(f"{blob_url}/train_agg_0404")
df_test_main = spark.read.parquet(f"{blob_url}/test_agg_0404")

In [0]:
def preprocess_dos(df):
  """Prepare a selected-columns DataFrame for the final train/test run.

  Steps, in order:
    1. Fill missing ceiling-height / visibility medians with the sentinel 999999.
    2. Fill missing rain, snow, wind, temperature and pressure values with 0.
    3. Add an ORIGIN-DEST route key and pass ORIGIN, DEST and the route key to
       ``target_mean_encoding`` with DEP_DEL15 as the target.
    4. Strip ':' from CRS_DEP_TIME and cast it to integer, cast DAY_OF_WEEK and MONTH
       to string, and drop the raw ORIGIN, DEST and route-key columns.

  Unlike ``preprocess``, no ``classWeights`` column is added.

  Args:
    df: Spark DataFrame containing DEP_DEL15, CRS_DEP_TIME, ORIGIN, DEST, DAY_OF_WEEK,
      MONTH and the weather columns listed in the fillna calls below.

  Returns:
    The transformed Spark DataFrame.
  """

  df = df.fillna(999999, subset=['CIG_CeilingHeightDim_median', 'VIS_Horizontal_median'])
  # Note from Ruth: For 'WND_Speed_mean','TMP_Value_mean','SLP_Value_mean', the nulls in this column have already been filled with the group mean (doesn't make sense to fill these with 0) in Carolina's transformation step
  df = df.fillna(0, subset=['AA_RainDepth','AA_RainDuration', 'AL_SnowAccumDuration_mean', 'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth','WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean'])
  #df = df.fillna("0", subset=['CRS_DEP_TIME'])

  # Use F.col rather than a bare `col`: this notebook never imports `col`, so the bare
  # name only resolves if one of the %run notebooks happens to define it.
  df = df.withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit('-'), F.col("DEST")))

  # NOTE(review): the encoding is computed from whatever frame is passed in. When this
  # function is applied to the test set, the encoded values presumably come from the
  # test set's own DEP_DEL15 labels - confirm against target_mean_encoding.
  df = target_mean_encoding(df, col=['ORIGIN', 'DEST','ORIGIN_DEST_COMBO'], target='DEP_DEL15')

  # Remove the ':' from CRS_DEP_TIME before the integer cast.
  dep_time_as_int = F.regexp_replace(F.col("CRS_DEP_TIME"), "[:]", "").cast(IntegerType())

  df = (
    df.withColumn("CRS_DEP_TIME", dep_time_as_int)
      .withColumn("DAY_OF_WEEK", F.col("DAY_OF_WEEK").cast(StringType()))
      .withColumn("MONTH", F.col("MONTH").cast(StringType()))
      .drop('ORIGIN', 'DEST', 'ORIGIN_DEST_COMBO')
  )

  return df

In [0]:
# Feature set for the final model: flights + weather + time based attribute.
# This rebinds selected_cols, labelCol, categoricalColumns and numericCols that the
# cross-validation section also defines.
selected_cols = [
  'DEP_DEL15', 'CRS_DEP_TIME', 'OP_UNIQUE_CARRIER', 'DAY_OF_WEEK', 'DISTANCE', 'DISTANCE_GROUP', 'MONTH', 'ORIGIN', 'DEST',
  'CIG_CeilingHeightDim_median', 'VIS_Horizontal_median', 'AA_RainDepth', 'AA_RainDuration', 'AL_SnowAccumDuration_mean',
  'AL_SnowAccumDepth', 'AJ1_SnowDepth_mean', 'AJ1_SnowEqWaterDepth', 'WND_Speed_mean', 'TMP_Value_mean', 'SLP_Value_mean',
  'PREV_DEP_DEL15',
]

# Preprocess a copy of the training columns, used here only for its column names and dtypes.
df_temp2 = preprocess_dos(df_train_main.select(*selected_cols))

# Label, then string-typed columns as categorical and everything else as numeric.
labelCol = ['DEP_DEL15']

categoricalColumns = [name for name, dtype in df_temp2.dtypes if dtype == 'string']
numericCols = [name for name, dtype in df_temp2.dtypes if dtype != 'string']

# The label is not a feature.
numericCols.remove(*labelCol)

In [0]:
# Final run: fit on the full training set and score the held-out test set.

# The prepared frames get their own names so df_train_main / df_test_main stay as read
# from storage. preprocess_dos() drops ORIGIN and DEST, so rebinding the *_main names
# would make select(*selected_cols) fail the next time this cell (or the feature-list
# cell above) is re-run.
train_prepared = preprocess_dos(df_train_main.select(*selected_cols))
# NOTE(review): the test set goes through preprocess_dos() on its own, so its target
# encoding is presumably derived from test labels - confirm whether it should come from
# the training set instead.
test_prepared = preprocess_dos(df_test_main.select(*selected_cols))

#oversampling
#train_prepared = oversampling(train_prepared)

pipeline = getRegressionPipeline(categoricalColumns, numericCols, labelCol)

# Fit the feature pipeline on the training data only, then apply it to both frames.
pipelineModel = pipeline.fit(train_prepared)

ml_train = pipelineModel.transform(train_prepared)
ml_test = pipelineModel.transform(test_prepared)

# Keep the assembled 'features' column plus every preprocessed input column.
cols = train_prepared.columns
selectedCols = ['features'] + cols

train_all = ml_train.select(selectedCols)
test_all = ml_test.select(selectedCols)

print("############################")

pred = execLinearModel(train_all, test_all)

precision, recall, fmeasure = getMetrics(pred)

print("Final test scores")
print("Precision is {:.3f}".format(precision))
print("Recall is {:.3f}".format(recall))
print("F beta(0.5) score is {:.3f}".format(fmeasure))