In [0]:
import pyspark
from pyspark.sql.functions import col, concat, lit, regexp_replace, when, length, lpad, to_timestamp, mean, stddev
from pyspark.sql.types import StringType, BooleanType, IntegerType
import pyspark.sql.functions as F
from itertools import chain

import airporttime
from datetime import datetime, timedelta

import numpy as np

In [0]:
from pyspark.sql import SQLContext
from pyspark.mllib.stat import Statistics
from pyspark.sql.functions import udf
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
from pyspark.ml.feature import Bucketizer
from pyspark.ml import Pipeline
from sklearn.metrics import confusion_matrix

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

In [0]:
%run "./libs/time_based_features"

#### Configure access to the Azure Blob storage container that holds our data, for quick access when datasets are huge

In [0]:
# Azure Blob Storage connection settings.
blob_container = "w261-scrr"    # container name, as created in https://portal.azure.com
storage_account = "midsw261rv"  # storage account name, as created in https://portal.azure.com
secret_scope = "w261scrr"       # secret scope name, created locally with the Databricks CLI
secret_key = "w261scrrkey"      # secret key name, created locally with the Databricks CLI

mount_path = "/mnt/mids-w261"

# wasbs:// URL of the container, built from the two names above.
blob_url = "wasbs://{}@{}.blob.core.windows.net".format(blob_container, storage_account)

In [0]:
# Register the secret fetched from the Databricks secret scope under this
# container's `fs.azure.sas.*` Spark setting.
sas_conf_key = "fs.azure.sas.{}.{}.blob.core.windows.net".format(blob_container, storage_account)
spark.conf.set(sas_conf_key, dbutils.secrets.get(scope=secret_scope, key=secret_key))

#### Import the joined flights + weather data from Blob storage. We add features to it and use it for our modeling.

In [0]:
# Read the joined flights + weather parquet dataset from the Blob container
# and preview it.
df_joined = spark.read.parquet(f"{blob_url}/join_6m_0329")
display(df_joined)

ACTUAL_ELAPSED_TIME,AIR_TIME,ARR_DEL15,ARR_DELAY,ARR_DELAY_GROUP,ARR_DELAY_NEW,ARR_TIME,ARR_TIME_BLK,CARRIER_DELAY,CRS_ARR_TIME,CRS_DEP_TIME,CRS_ELAPSED_TIME,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_DELAY,DEP_DELAY_GROUP,DEP_DELAY_NEW,DEP_TIME,DEP_TIME_BLK,DEST,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,DISTANCE,DISTANCE_GROUP,FL_DATE,MONTH,NAS_DELAY,OP_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,QUARTER,SECURITY_DELAY,TAIL_NUM,YEAR,TIMESTAMP,TIMESTAMP_UTC,WEATHER_WINDOW_START,WEATHER_WINDOW_END,iata_code,ident,elevation_ft,coordinates,station_id,lat,lon,neighbor_id,neighbor_name,neighbor_state,neighbor_call,distance_to_neighbor,dist_to_airport_rank,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND_DirectionAngle,WND_DirectionQuality,WND_Type,WND_Speed,WND_SpeedQuality,CIG_CeilingHeightDim,CIG_CeilingQuality,CIG_CeilingDetermination,CIG_CeilingAndVisibilityOK,VIS_Horizontal,VIS_DistanceQuality,VIS_Variability,VIS_VariabilityQuality,TMP_Value,TMP_Quality,DEW_Value,DEW_Quality,SLP_Value,SLP_Quality,AA1_RainCondition,AA1_RainQuality,AA2_RainCondition,AA2_RainQuality,AA3_RainCondition,AA3_RainQuality,AA4_RainCondition,AA4_RainQuality,AJ1_SnowDepth,AJ1_SnowDepthCondition,AJ1_SnowDepthQuality,AJ1_SnowEqWaterDepth,AJ1_SnowEqWaterDepthCondition,AJ1_SnowEqWaterDepthQuality,AL1_SnowAccumCondition,AL1_SnowAccumQuality,AL2_SnowAccumCondition,AL2_SnowAccumQuality,AL3_SnowAccumDuration,AL3_SnowAccumDepth,AL3_SnowAccumCondition,AL3_SnowAccumQuality,AW1_PresentWeatherCond,AW1_PresentWeatherQuality,AW2_PresentWeatherCond,AW2_PresentWeatherQuality,AW3_PresentWeatherCond,AW3_PresentWeatherQuality,AW4_PresentWeatherCond,AW4_PresentWeatherQuality,AA_RainDepth,AL_SnowAccumDepth,AA_RainDuration,AL_
SnowAccumDuration,DATE_UTC
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T11:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,340.0,1,N,21.0,1,22000.0,1,,N,4828.0,1,N,1,0.0,1,-6.0,1,10215.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T16:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T09:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,270.0,1,N,21.0,1,22000.0,1,,N,6437.0,1,N,1,17.0,1,11.0,1,10196.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T14:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T12:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,10.0,1,N,36.0,1,22000.0,1,,N,4828.0,1,N,1,33.0,1,17.0,1,10224.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T17:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T12:00:00.000+0000,4,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-12,99999,V020,340.0,1,N,21.0,1,,1,,N,4800.0,1,,1,0.0,1,-6.0,1,10215.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2015-03-11T17:00:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T10:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,,1,C,0.0,1,22000.0,1,,N,4828.0,1,N,1,6.0,1,-6.0,1,10204.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T15:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T11:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,340.0,1,N,21.0,1,22000.0,1,,N,4828.0,1,N,1,0.0,1,-6.0,1,10215.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T16:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T09:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,270.0,1,N,21.0,1,22000.0,1,,N,6437.0,1,N,1,17.0,1,11.0,1,10196.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T14:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T12:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,10.0,1,N,36.0,1,22000.0,1,,N,4828.0,1,N,1,33.0,1,17.0,1,10224.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T17:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T12:00:00.000+0000,4,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-12,99999,V020,340.0,1,N,21.0,1,,1,,N,4800.0,1,,1,0.0,1,-6.0,1,10215.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2015-03-11T17:00:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,15:04,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T10:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,,1,C,0.0,1,22000.0,1,,N,4828.0,1,N,1,6.0,1,-6.0,1,10204.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,,,1.0,,2015-03-11T15:51:00.000+0000


##### Aggregate to one row per flight
Since the weather join produces multiple rows per flight, we have to aggregate the data. Flight number, tail number and departure time are used to identify unique flights.

First, since we read from Blob storage, we need to change the column types of a few attributes to make them ready for further processing.

In [0]:
# Null handling ahead of the per-flight aggregation:
#   * cloud ceiling / visibility -> 99999 (large sentinel, since a minimum is meant to be taken)
#   * rain / snow depth          -> 0     (since a maximum is taken)
#
# DataFrame.fillna ignores a string fill value for every non-string column in
# `subset`, which would silently leave nulls in numeric columns. Each default
# is therefore rendered in the type the column actually has.
null_defaults = {
    "CIG_CeilingHeightDim": 99999,
    "VIS_Horizontal": 99999,
    "AA_RainDepth": 0,
    "AL_SnowAccumDepth": 0,
}
column_types = dict(df_joined.dtypes)
df_joined = df_joined.fillna({
    name: (str(default) if column_types[name] == "string" else default)
    for name, default in null_defaults.items()
})

# CRS_DEP_TIME is stored as "HH:MM": strip the colon and keep it as an integer
# (e.g. "15:04" -> 1504). Rain and snow depths are cast to integers as well.
df_joined = (
    df_joined
    .withColumn("CRS_DEP_TIME", F.regexp_replace(col("CRS_DEP_TIME"), "[:]", "").cast(IntegerType()))
    .withColumn("AA_RainDepth", col("AA_RainDepth").cast(IntegerType()))
    .withColumn("AL_SnowAccumDepth", col("AL_SnowAccumDepth").cast(IntegerType()))
)

In [0]:
#display(df_joined)

ACTUAL_ELAPSED_TIME,AIR_TIME,ARR_DEL15,ARR_DELAY,ARR_DELAY_GROUP,ARR_DELAY_NEW,ARR_TIME,ARR_TIME_BLK,CARRIER_DELAY,CRS_ARR_TIME,CRS_DEP_TIME,CRS_ELAPSED_TIME,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_DELAY,DEP_DELAY_GROUP,DEP_DELAY_NEW,DEP_TIME,DEP_TIME_BLK,DEST,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,DISTANCE,DISTANCE_GROUP,FL_DATE,MONTH,NAS_DELAY,OP_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,QUARTER,SECURITY_DELAY,TAIL_NUM,YEAR,TIMESTAMP,TIMESTAMP_UTC,WEATHER_WINDOW_START,WEATHER_WINDOW_END,iata_code,ident,elevation_ft,coordinates,station_id,lat,lon,neighbor_id,neighbor_name,neighbor_state,neighbor_call,distance_to_neighbor,dist_to_airport_rank,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND_DirectionAngle,WND_DirectionQuality,WND_Type,WND_Speed,WND_SpeedQuality,CIG_CeilingHeightDim,CIG_CeilingQuality,CIG_CeilingDetermination,CIG_CeilingAndVisibilityOK,VIS_Horizontal,VIS_DistanceQuality,VIS_Variability,VIS_VariabilityQuality,TMP_Value,TMP_Quality,DEW_Value,DEW_Quality,SLP_Value,SLP_Quality,AA1_RainCondition,AA1_RainQuality,AA2_RainCondition,AA2_RainQuality,AA3_RainCondition,AA3_RainQuality,AA4_RainCondition,AA4_RainQuality,AJ1_SnowDepth,AJ1_SnowDepthCondition,AJ1_SnowDepthQuality,AJ1_SnowEqWaterDepth,AJ1_SnowEqWaterDepthCondition,AJ1_SnowEqWaterDepthQuality,AL1_SnowAccumCondition,AL1_SnowAccumQuality,AL2_SnowAccumCondition,AL2_SnowAccumQuality,AL3_SnowAccumDuration,AL3_SnowAccumDepth,AL3_SnowAccumCondition,AL3_SnowAccumQuality,AW1_PresentWeatherCond,AW1_PresentWeatherQuality,AW2_PresentWeatherCond,AW2_PresentWeatherQuality,AW3_PresentWeatherCond,AW3_PresentWeatherQuality,AW4_PresentWeatherCond,AW4_PresentWeatherQuality,AA_RainDepth,AL_SnowAccumDepth,AA_RainDuration,AL_
SnowAccumDuration,DATE_UTC
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T11:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,340.0,1,N,21.0,1,22000.0,1,,N,4828.0,1,N,1,0.0,1,-6.0,1,10215.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T16:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T09:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,270.0,1,N,21.0,1,22000.0,1,,N,6437.0,1,N,1,17.0,1,11.0,1,10196.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T14:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T12:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,10.0,1,N,36.0,1,22000.0,1,,N,4828.0,1,N,1,33.0,1,17.0,1,10224.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T17:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T12:00:00.000+0000,4,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-12,99999,V020,340.0,1,N,21.0,1,,1,,N,4800.0,1,,1,0.0,1,-6.0,1,10215.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,2015-03-11T17:00:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72534014819,41.786,-87.752,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,17.197261125546635,3,72530094846,2015-03-11T10:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,,1,C,0.0,1,22000.0,1,,N,4828.0,1,N,1,6.0,1,-6.0,1,10204.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T15:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T11:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,340.0,1,N,21.0,1,22000.0,1,,N,4828.0,1,N,1,0.0,1,-6.0,1,10215.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T16:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T09:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,270.0,1,N,21.0,1,22000.0,1,,N,6437.0,1,N,1,17.0,1,11.0,1,10196.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T14:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T12:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,10.0,1,N,36.0,1,22000.0,1,,N,4828.0,1,N,1,33.0,1,17.0,1,10224.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T17:51:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T12:00:00.000+0000,4,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-12,99999,V020,340.0,1,N,21.0,1,,1,,N,4800.0,1,,1,0.0,1,-6.0,1,10215.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,,,2015-03-11T17:00:00.000+0000
83.0,64.0,1.0,16.0,1,16.0,1648,1600-1659,4.0,1632,1504,88.0,11,3,1.0,21.0,1,21.0,1525,1500-1559,MSP,13487,1348702,31650,"Minneapolis, MN",MN,27,Minnesota,63,334.0,2,2015-03-11,3,0.0,DL,19790,1232,DL,ORD,13930,1393003,30977,"Chicago, IL",IL,17,Illinois,41,1,0.0,N945DN,2015,2015-03-11T15:04:00.000+0000,2015-03-11T20:04:00.000+0000,2015-03-11T14:04:00.000+0000,2015-03-11T18:04:00.000+0000,ORD,KORD,672,"-87.9048, 41.9786",72530094846,41.995,-87.934,72530094846,CHICAGO O'HARE INTERNATIONAL,IL,KORD,0.0,1,72530094846,2015-03-11T10:51:00.000+0000,7,41.995,-87.9336,201.8,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",FM-15,KORD,V030,,1,C,0.0,1,22000.0,1,,N,4828.0,1,N,1,6.0,1,-6.0,1,10204.0,1,9.0,1.0,,,,,,,,,,,,,,,,,,,,,10.0,1.0,,,,,,,0,0,1.0,,2015-03-11T15:51:00.000+0000


In [0]:
# Collapse the weather-expanded rows into one row per flight. A flight is
# identified by its flight number, tail number and TIMESTAMP.
flight_key = ["OP_CARRIER_FL_NUM", "TAIL_NUM", "TIMESTAMP"]

# (aggregate function, column) pairs in output-column order. Every aggregate
# keeps the name of the column it was computed from.
# The minimum cloud ceiling (CIG_CeilingHeightDim) and minimum visibility
# (VIS_Horizontal) aggregates are currently disabled.
aggregations = [
    (F.max, "DEP_DEL15"),
    (F.max, "DAY_OF_WEEK"),
    (F.max, "TIMESTAMP_UTC"),
    (F.max, "WEATHER_WINDOW_END"),
    (F.max, "DEST_STATE_ABR"),
    (F.max, "DISTANCE_GROUP"),
    (F.max, "OP_UNIQUE_CARRIER"),
    (F.max, "DISTANCE"),
    (F.max, "MONTH"),
    (F.max, "ORIGIN"),
    (F.max, "DEST"),
    (F.max, "DAY_OF_MONTH"),
    (F.max, "CRS_DEP_TIME"),
    (F.max, "elevation_ft"),
    (F.mean, "WND_Speed"),
    (F.mean, "TMP_Value"),
    (F.max, "DEW_Value"),
    (F.max, "AA_RainDepth"),
    (F.max, "AL_SnowAccumDepth"),
]

df_join_agg = df_joined.groupBy(*flight_key).agg(
    *[agg_fn(name).alias(name) for agg_fn, name in aggregations]
)

#display(df_join_agg)

OP_CARRIER_FL_NUM,TAIL_NUM,TIMESTAMP,DEP_DEL15,DAY_OF_WEEK,TIMESTAMP_UTC,WEATHER_WINDOW_END,DEST_STATE_ABR,DISTANCE_GROUP,OP_UNIQUE_CARRIER,DISTANCE,MONTH,ORIGIN,DEST,DAY_OF_MONTH,CRS_DEP_TIME,elevation_ft,WND_Speed,TMP_Value,DEW_Value,AA_RainDepth,AL_SnowAccumDepth
10,N237WN,2015-01-04T06:45:00.000+0000,0.0,7,2015-01-04T11:45:00.000+0000,2015-01-04T09:45:00.000+0000,TX,3,WN,696.0,1,ATL,HOU,4,645,1026,31.0,129.57142857142858,133,23,0
10,N376SW,2015-03-26T13:35:00.000+0000,0.0,4,2015-03-26T17:35:00.000+0000,2015-03-26T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,26,1335,1026,24.11111111111111,169.77777777777777,160,0,0
10,N379SW,2015-03-18T13:35:00.000+0000,0.0,3,2015-03-18T17:35:00.000+0000,2015-03-18T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,18,1335,1026,31.0,145.75,100,0,0
10,N379SW,2015-03-29T13:35:00.000+0000,1.0,7,2015-03-29T17:35:00.000+0000,2015-03-29T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,29,1335,1026,24.75,15.5,-83,0,0
10,N385SW,2015-03-25T13:35:00.000+0000,0.0,3,2015-03-25T17:35:00.000+0000,2015-03-25T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,25,1335,1026,18.25,147.0,128,0,0
10,N388SW,2015-03-10T13:35:00.000+0000,0.0,2,2015-03-10T17:35:00.000+0000,2015-03-10T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,10,1335,1026,33.5,145.5,83,0,0
10,N395SW,2015-03-20T13:35:00.000+0000,0.0,5,2015-03-20T17:35:00.000+0000,2015-03-20T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,20,1335,1026,27.0,89.2,83,0,0
10,N396SW,2015-04-05T13:35:00.000+0000,0.0,7,2015-04-05T17:35:00.000+0000,2015-04-05T15:35:00.000+0000,MA,4,WN,946.0,4,ATL,BOS,5,1335,1026,37.25,85.75,-28,0,0
10,N396SW,2015-04-07T13:35:00.000+0000,0.0,2,2015-04-07T17:35:00.000+0000,2015-04-07T15:35:00.000+0000,MA,4,WN,946.0,4,ATL,BOS,7,1335,1026,10.142857142857142,178.85714285714286,170,0,0
10,N397SW,2015-03-15T13:35:00.000+0000,0.0,7,2015-03-15T17:35:00.000+0000,2015-03-15T15:35:00.000+0000,MA,4,WN,946.0,3,ATL,BOS,15,1335,1026,36.0,147.25,139,0,0


In [0]:
# Second name for the aggregated per-flight frame; the feature-engineering
# cells below reassign df_temp.
df_temp = df_join_agg

In [0]:
# Print the column names and types of the aggregated per-flight dataset
df_temp.printSchema()

#### Transformations

New features should be added in this section. If there's any encoding needed, getPipeline function needs to be updated

In [0]:
# function target encoding for features with too many indices
def target_mean_encoding(df, col, target):
    """
    Add a target-mean-encoded copy of each requested column.

    For every column ``c`` in ``col``, the mean of ``target`` is computed per
    distinct value of ``c`` and attached as a new column named
    ``c + "_mean_encoding"``.

    Note: the means are computed from every row of ``df``. Any rows that are
    later held out for evaluation therefore contribute to the encoding if they
    are present in ``df`` when this function is called.

    :param df: pyspark.sql.dataframe
        dataframe to apply target mean encoding
    :param col: str or list of str
        column name, or list of column names, to apply target encoding to
    :param target: str
        target column
    :return:
        dataframe with target encoded columns added
    """
    # A bare string would otherwise be iterated character by character.
    columns = [col] if isinstance(col, str) else list(col)

    for c in columns:
        means = df.groupby(F.col(c)).agg(F.mean(target).alias("mean_encoding"))

        # Bring the per-category means to the driver. Null categories are
        # skipped because a map literal cannot be built with a null key; rows
        # whose category is null receive a null encoding from the lookup.
        mean_dict = {
            row[c]: row["mean_encoding"]
            for row in means.collect()
            if row[c] is not None
        }

        if mean_dict:
            # Literal map of category -> mean, looked up by the row's category.
            mapping_expr = F.create_map([F.lit(x) for x in chain(*mean_dict.items())])
            encoded = mapping_expr[F.col(c)]
        else:
            # No non-null category (e.g. empty input): nothing to look up.
            encoded = F.lit(None).cast("double")

        df = df.withColumn(c + "_mean_encoding", encoded)

    return df

In [0]:
# Time-based indicators. Both helpers are defined outside this notebook's
# visible cells (presumably by the %run of ./libs/time_based_features).
df_temp = add_airline_airport_status_indicator(add_previous_flight_delay_indicator(df_temp))

# Missing airline/airport status becomes 0; the route is keyed as "ORIGIN-DEST".
df_temp = (
    df_temp
    .fillna(0, subset=["AIRLINE_AIRPORT_STATUS"])
    .withColumn("ORIGIN_DEST_COMBO", F.concat(F.col("ORIGIN"), F.lit("-"), F.col("DEST")))
)

# NOTE(review): the target means are computed over every row of df_temp,
# including rows that later land in the test split - confirm this is acceptable.
df_temp = target_mean_encoding(
    df_temp,
    col=["ORIGIN", "DEST", "ORIGIN_DEST_COMBO"],
    target="DEP_DEL15",
)

display(df_temp)


OP_CARRIER_FL_NUM,TAIL_NUM,TIMESTAMP,DEP_DEL15,DAY_OF_WEEK,TIMESTAMP_UTC,WEATHER_WINDOW_END,DEST_STATE_ABR,DISTANCE_GROUP,OP_UNIQUE_CARRIER,DISTANCE,MONTH,ORIGIN,DEST,DAY_OF_MONTH,CRS_DEP_TIME,elevation_ft,WND_Speed,TMP_Value,DEW_Value,AA_RainDepth,AL_SnowAccumDepth,PREV_DEP_DEL15,AIRLINE_AIRPORT_STATUS,ORIGIN_DEST_COMBO,ORIGIN_mean_encoding,DEST_mean_encoding,ORIGIN_DEST_COMBO_mean_encoding
745,N459AS,2015-01-01T18:10:00.000+0000,1.0,4,2015-01-01T23:10:00.000+0000,2015-01-01T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,1,1810,1026,5.25,43.0,-22,0,0,0.0,0.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N403AS,2015-01-02T18:10:00.000+0000,1.0,5,2015-01-02T23:10:00.000+0000,2015-01-02T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,2,1810,1026,20.6,78.2,61,10,0,0.0,0.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N467AS,2015-01-03T18:10:00.000+0000,1.0,6,2015-01-03T23:10:00.000+0000,2015-01-03T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,3,1810,1026,43.0,101.2,100,3,0,0.0,0.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N464AS,2015-01-04T18:10:00.000+0000,0.0,7,2015-01-04T23:10:00.000+0000,2015-01-04T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,4,1810,1026,35.54545454545455,159.45454545454547,150,28,0,0.0,0.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N408AS,2015-01-05T18:10:00.000+0000,1.0,1,2015-01-05T23:10:00.000+0000,2015-01-05T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,5,1810,1026,64.25,36.0,-22,0,0,0.0,0.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N467AS,2015-01-06T18:10:00.000+0000,1.0,2,2015-01-06T23:10:00.000+0000,2015-01-06T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,6,1810,1026,40.0,39.0,-33,0,0,0.0,1.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N419AS,2015-01-07T18:10:00.000+0000,0.0,3,2015-01-07T23:10:00.000+0000,2015-01-07T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,7,1810,1026,73.25,12.5,-56,0,0,0.0,1.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N467AS,2015-01-08T18:10:00.000+0000,0.0,4,2015-01-08T23:10:00.000+0000,2015-01-08T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,8,1810,1026,36.0,-89.0,-178,0,0,0.0,1.0,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N442AS,2015-01-09T18:10:00.000+0000,1.0,5,2015-01-09T23:10:00.000+0000,2015-01-09T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,9,1810,1026,43.75,-19.75,-133,0,0,0.0,0.75,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366
745,N467AS,2015-01-10T18:10:00.000+0000,0.0,6,2015-01-10T23:10:00.000+0000,2015-01-10T21:10:00.000+0000,WA,9,AS,2182.0,1,ATL,SEA,10,1810,1026,28.25,-30.5,-144,0,0,0.0,0.8,ATL-SEA,0.1845785513414912,0.2319875776397515,0.1673306772908366


In [0]:
# Keep only the label and the columns used for modelling / splitting.
df_main = df_temp.select([
    # flight and schedule attributes
    "DAY_OF_WEEK",
    "DEST_STATE_ABR",
    "DISTANCE_GROUP",
    "OP_UNIQUE_CARRIER",
    "DISTANCE",
    "MONTH",
    "DAY_OF_MONTH",
    "CRS_DEP_TIME",
    # aggregated weather
    "WND_Speed",
    "TMP_Value",
    "DEW_Value",
    "AA_RainDepth",
    "AL_SnowAccumDepth",
    # label
    "DEP_DEL15",
    # engineered features
    "PREV_DEP_DEL15",
    "AIRLINE_AIRPORT_STATUS",
    "ORIGIN_mean_encoding",
    "DEST_mean_encoding",
    "ORIGIN_DEST_COMBO_mean_encoding",
])

##### getPipeline function encodes, vectorizes and scales data for ML
To add a new column, figure out the encoding it requires and add it to the stages list so it becomes part of the pipeline.

In [0]:
# Preview the modelling dataset (label plus selected features).
display(df_main)

DAY_OF_WEEK,DEST_STATE_ABR,DISTANCE_GROUP,OP_UNIQUE_CARRIER,DISTANCE,MONTH,DAY_OF_MONTH,CRS_DEP_TIME,WND_Speed,TMP_Value,DEW_Value,AA_RainDepth,AL_SnowAccumDepth,DEP_DEL15,PREV_DEP_DEL15,AIRLINE_AIRPORT_STATUS,ORIGIN_mean_encoding,DEST_mean_encoding,ORIGIN_DEST_COMBO_mean_encoding
4,WA,9,AS,2182.0,1,1,1810,5.25,43.0,-22,0,0,1.0,0.0,0.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
5,WA,9,AS,2182.0,1,2,1810,20.6,78.2,61,10,0,1.0,0.0,0.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
6,WA,9,AS,2182.0,1,3,1810,43.0,101.2,100,3,0,1.0,0.0,0.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
7,WA,9,AS,2182.0,1,4,1810,35.54545454545455,159.45454545454547,150,28,0,0.0,0.0,0.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
1,WA,9,AS,2182.0,1,5,1810,64.25,36.0,-22,0,0,1.0,0.0,0.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
2,WA,9,AS,2182.0,1,6,1810,40.0,39.0,-33,0,0,1.0,0.0,1.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
3,WA,9,AS,2182.0,1,7,1810,73.25,12.5,-56,0,0,0.0,0.0,1.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
4,WA,9,AS,2182.0,1,8,1810,36.0,-89.0,-178,0,0,0.0,0.0,1.0,0.1845785513414912,0.2319875776397515,0.1673306772908366
5,WA,9,AS,2182.0,1,9,1810,43.75,-19.75,-133,0,0,1.0,0.0,0.75,0.1845785513414912,0.2319875776397515,0.1673306772908366
6,WA,9,AS,2182.0,1,10,1810,28.25,-30.5,-144,0,0,0.0,0.0,0.8,0.1845785513414912,0.2319875776397515,0.1673306772908366


#### Split into training and test. Test will be used only at the end. Training will be used for model training and cross validation

This notebook pulls in 6 months of data

Training - Jan 1 to May 15

Test - May 16 to June 30

In [0]:
# Chronological hold-out: months before May plus May 1-15 form the training
# set; the rest of May plus June form the test set.
in_train_period = (F.col("MONTH") < 5) | ((F.col("MONTH") == 5) & (F.col("DAY_OF_MONTH") <= 15))
in_test_period = (F.col("MONTH") == 6) | ((F.col("MONTH") == 5) & (F.col("DAY_OF_MONTH") > 15))

train_main = df_main.where(in_train_period)
test_main = df_main.where(in_test_period)

### Functions

In [0]:
def getPipeline(df_train):
  """Fit the feature-engineering pipeline on `df_train` and return the fitted model.

  Stages, in order:
    1. Bucketizer: CRS_DEP_TIME -> CRS_DEP_BUCKET using fixed split points.
    2. StringIndexer + OneHotEncoder for every categorical column.
    3. VectorAssembler: one-hot vectors + numeric columns -> "vectorized_features".
    4. StandardScaler: "vectorized_features" -> "features".

  DEP_DEL15 is the target column: it must never be added to either feature
  list below.

  Args:
    df_train: DataFrame the pipeline is fitted on. It must contain every
      column named in this function.

  Returns:
    The result of `Pipeline.fit(df_train)`; calling `.transform(df)` on it
    adds the "features" column used by the models.
  """
  # Bucket the scheduled departure time (presumably an HHMM clock value -- TODO
  # confirm); the bucket is then encoded like any other categorical column.
  bucketizer = Bucketizer(splits=[0, 900, 1200, 1600, 2000, 2359],
                          inputCol="CRS_DEP_TIME", outputCol="CRS_DEP_BUCKET")
  stages = [bucketizer]

  # Categorical columns: index the values, then one-hot encode the index.
  categoricalColumns = ["DEST_STATE_ABR", "OP_UNIQUE_CARRIER", "DAY_OF_WEEK",
                        "DISTANCE_GROUP", "CRS_DEP_BUCKET", "PREV_DEP_DEL15"]
  for categoricalCol in categoricalColumns:
    indexer = StringIndexer(inputCol=categoricalCol,
                            outputCol=categoricalCol + '_Index',
                            handleInvalid="keep")
    encoder = OneHotEncoder(inputCols=[indexer.getOutputCol()],
                            outputCols=[categoricalCol + "_classVec"])
    stages += [indexer, encoder]

  # Numeric columns go into the feature vector without any encoding.
  numericCols = ["DISTANCE", "WND_Speed", "TMP_Value", "DEW_Value",
                 "AA_RainDepth", "AL_SnowAccumDepth", "AIRLINE_AIRPORT_STATUS",
                 "ORIGIN_mean_encoding", "DEST_mean_encoding",
                 "ORIGIN_DEST_COMBO_mean_encoding"]

  assemblerInputs = [c + "_classVec" for c in categoricalColumns] + numericCols
  assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="vectorized_features")

  # Scale the assembled vector; the scaled column is what the models consume.
  scaler = StandardScaler(inputCol="vectorized_features", outputCol="features")
  stages += [assembler, scaler]

  return Pipeline(stages=stages).fit(df_train)


##### The below function creates 4 cross validation splits in time sequence.

It uses rolling windows (Blocking Time Series Split); each split's test window is also part of the next split's training month:

Split 1 - Jan for training, Feb 1 to 15 for test

Split 2 - Feb for training, Mar 1 to 15 for test

and so on...

In [0]:
def Split6month(df):
  """Build the 4 time-ordered cross-validation splits.

  Split k (k = 1..4) trains on all rows of month k and is tested on days
  1-15 of month k + 1.

  Args:
    df: DataFrame with MONTH and DAY_OF_MONTH columns.

  Returns:
    (trainsplits, testsplits): two lists of 4 filtered DataFrames each,
    aligned by position.
  """
  training_months = range(1, 5)

  train_folds = [df.filter(df.MONTH == month) for month in training_months]
  test_folds = [
      df.filter((df.MONTH == month + 1) & (df.DAY_OF_MONTH <= 15))
      for month in training_months
  ]

  return train_folds, test_folds
  

In [0]:
def execLinearModel(train, test):
  """Fit a logistic regression on `train` and return its predictions for `test`.

  The model reads the 'features' column, uses 'DEP_DEL15' as the label and is
  limited to 5 iterations.

  Args:
    train: DataFrame the model is fitted on.
    test: DataFrame the fitted model is applied to.

  Returns:
    The output of the fitted model's `transform(test)`.
  """
  estimator = LogisticRegression(maxIter=5, labelCol='DEP_DEL15', featuresCol='features')
  return estimator.fit(train).transform(test)

In [0]:
def execRFModel(train, test):
  """Fit a random forest classifier on `train` and return its predictions for `test`.

  The model reads the 'features' column and uses 'DEP_DEL15' as the label.

  Args:
    train: DataFrame the model is fitted on.
    test: DataFrame the fitted model is applied to.

  Returns:
    The output of the fitted model's `transform(test)`.
  """
  forest = RandomForestClassifier(labelCol='DEP_DEL15', featuresCol='features')
  fitted = forest.fit(train)
  return fitted.transform(test)

In [0]:
def getMetrics(predictions, beta=0.5):
  """Compute precision, recall and F-beta for the positive class (DEP_DEL15 = 1).

  Args:
    predictions: DataFrame with a 'prediction' column and the 'DEP_DEL15'
      label column.
    beta: Weight of recall relative to precision in the F-beta score.
      Defaults to 0.5.

  Returns:
    Tuple (precision, recall, fmeasure). Each value is 0 when its denominator
    would be zero.
  """
  # Every count() runs its own Spark job, so only the three confusion-matrix
  # cells the metrics actually need are counted (true negatives are not).
  TP = predictions.filter('prediction = 1 AND DEP_DEL15 = 1').count()
  FN = predictions.filter('prediction = 0 AND DEP_DEL15 <> 0').count()
  FP = predictions.filter('prediction = 1 AND DEP_DEL15 <> 1').count()

  precision = TP / (TP + FP) if (TP + FP) > 0 else 0
  recall = TP / (TP + FN) if (TP + FN) > 0 else 0

  # Guard on the real denominator so that no choice of beta can divide by zero.
  denominator = ((beta ** 2) * precision) + recall
  if denominator > 0:
    fmeasure = ((1 + (beta ** 2)) * precision * recall) / denominator
  else:
    fmeasure = 0

  return precision, recall, fmeasure

#### Cross-validation Loop

Loops through splits and calculates metrics for each iteration. Also outputs average over the various splits

In [0]:
trainsplits, testsplits = Split6month(train_main)

# Every split is a filtered view of train_main, so the column list is built once.
cols = train_main.columns
selectedCols = ['features'] + cols

# One [precision, recall, fmeasure] row per split; stacked into a float array after the loop.
metricRows = []

for i, (val_train, val_test) in enumerate(zip(trainsplits, testsplits)):

  # Fit the feature pipeline on this split's training rows only, then apply it to both parts.
  pipelineModel = getPipeline(val_train)

  val_ml_train = pipelineModel.transform(val_train)
  val_ml_test = pipelineModel.transform(val_test)

  train = val_ml_train.select(selectedCols)
  test = val_ml_test.select(selectedCols)

  print("############################")
  print("Validation Set {:d}".format(i+1))
  print("Training Dataset Count: " + str(train.count()))
  print("Test Dataset Count: " + str(test.count()))

  pred = execLinearModel(train, test)
  #pred = execRFModel(train, test)

  precision, recall, fmeasure = getMetrics(pred)

  print("Precision is {:.3f}".format(precision))
  print("Recall is {:.3f}".format(recall))
  print("F beta(0.5) score is {:.3f}".format(fmeasure))

  metricRows.append([precision, recall, fmeasure])

# reshape keeps the (n_splits, 3) layout even if there are no splits.
metricsArray = np.array(metricRows, dtype=float).reshape(-1, 3)
avgArray = np.mean(metricsArray, axis=0)

print("############################")
print("Average of Cross validation")
print("Average Precision is {:.3f}".format(avgArray[0]))
print("Average Recall is {:.3f}".format(avgArray[1]))
print("Average F beta(0.5) score is {:.3f}".format(avgArray[2]))

  

#### Final test section

Trains model over entire training data and predicts on unseen data

In [0]:
# Fit the feature pipeline on the whole training period.
pipelineModel = getPipeline(train_main)

# Keep the original columns next to the engineered 'features' vector.
cols = train_main.columns
selectedCols = ['features', *cols]

ml_train = pipelineModel.transform(train_main)
train = ml_train.select(selectedCols)

# The held-out rows go through the pipeline fitted on the training rows.
ml_test = pipelineModel.transform(test_main)
test = ml_test.select(selectedCols)

train.printSchema()

In [0]:
print("############################")
print("Test Set")
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

pred = execLinearModel(train, test)
#pred = execRFModel(train, test)

# The third value returned by getMetrics is an F-beta score with beta = 0.5
# (not F1), so it is named and reported the same way as in the cross-validation loop.
precision, recall, fmeasure = getMetrics(pred)

print("Precision is {:.3f}".format(precision))
print("Recall is {:.3f}".format(recall))
print("F beta(0.5) score is {:.3f}".format(fmeasure))