In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pyspark.sql import functions as f
from pyspark.sql import SQLContext
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, avg, mean
from pyspark.sql.functions import min
from pyspark.sql.functions import col, max
from pyspark.sql.functions import format_string
from pyspark.sql.functions import substring
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import concat
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import lit
from pyspark.sql.functions import to_utc_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import instr
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer,OneHotEncoder
from pyspark.ml.classification import MultilayerPerceptronClassifier


from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator



In [0]:
#Initializes blob storage credentials/location
blob_container = "w261-sec4-group2" # The name of your container created in https://portal.azure.com
storage_account = "kdevery" # The name of your Storage account created in https://portal.azure.com
secret_scope = "sec4-group2" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261-key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

#Points to SAS token
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

#Process Validation Folds

In [0]:
#feature processing of dfs

train_df=train_df.select("*", f.row_number().over(Window.partitionBy().orderBy("Date_Time_sched_dep_utc")).alias("Index"))
train_df = train_df.withColumn("holiday_period", train_df["holiday_period"].cast(IntegerType()))
test_df = test_df.withColumn("holiday_period", test_df["holiday_period"].cast(IntegerType()))

In [0]:
#string indexing of carrier
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
train_df = carrier_indexer.fit(train_df).transform(train_df)


#one hot encoding
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
train_df = onehotencoder_carrier_vector.fit(train_df).transform(train_df)

In [0]:
#splitting training dataframe into five folds contained in dictionary "d"

d = {}
folds = ['df1','df2','df3','df4','df5']

each_len = train_df.count()/5
start = 1
val_size = each_len/5
stop = each_len
precision_list = []

for fold in folds:
    d[fold] = train_df.filter(col('Index').between(start,stop))\
                                  .withColumn('cv', F.when(col('Index').between(start,(stop-val_size)), 'train')
                                         .otherwise('val'))
    start += each_len
    stop += each_len

                                  

In [0]:
train_df.createOrReplaceTempView('train_view')

In [0]:
%sql 

SELECT holiday_period,mean_carrier_delay,Pagerank_Score,
 PREV_FLIGHT_DELAYED,origin_percent_delayed,
 dest_percent_delayed,
 ORIGIN_Prophet_trend,
 ORIGIN_Prophet_pred,
 DEST_Prophet_trend,
 DEST_Prophet_pred
 FROM train_view
 LIMIT 10

holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred
0,0.1202003338898163,0.0162830899118796,1,0.1831187410586552,0.0,0.2607316376980994,0.1914847708400157,0.1369276377066992,0.0849973754869627
0,0.094298245614035,0.0162830899118796,0,0.1831187410586552,0.0,0.2607316376980994,0.1914847708400157,0.1369276377066992,0.0849973754869627
0,0.1235820895522388,0.0112674970607994,0,0.0798319327731092,0.0,0.1777465882992688,0.1378597053720036,0.1369276377066992,0.0849973754869627
0,0.1029009304871373,0.0112674970607994,0,0.0798319327731092,0.0,0.1777465882992688,0.1378597053720036,0.1369276377066992,0.0849973754869627
0,0.1153415453527435,0.0112674970607994,0,0.0798319327731092,0.0,0.1777465882992688,0.1378597053720036,0.1369276377066992,0.0849973754869627
0,0.1153415453527435,0.0405444716972231,0,0.1847826086956521,0.0,0.1705949989724562,0.1127466396817939,0.1369276377066992,0.0849973754869627
0,0.1046464646464646,0.0405444716972231,0,0.1847826086956521,0.0,0.1705949989724562,0.1127466396817939,0.1369276377066992,0.0849973754869627
0,0.1202003338898163,0.0405444716972231,0,0.1847826086956521,0.0,0.1705949989724562,0.1127466396817939,0.1369276377066992,0.0849973754869627
0,0.1321428571428571,0.0112674970607994,1,0.149584487534626,0.2857142857142857,0.1710573009548977,0.1595223142014603,0.1379635286565257,0.1361425511872535
0,0.1483483483483483,0.0112674970607994,0,0.149584487534626,0.2857142857142857,0.1710573009548977,0.1595223142014603,0.1379635286565257,0.1361425511872535


In [0]:
%sql

SELECT COUNT(*)
FROM train_view
WHERE ORIGIN_Prophet_trend IS NULL

count(1)
1201


In [0]:
display(train_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,two_hrs_pre_flight_utc,Date_Time_sched_dep_utc,Date_Time_sched_arrival_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_flight_per_day,origin_delays_per_day,dest_flight_per_day,dest_delays_per_day,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred,Index,OP_CARRIER_Index,carrier_vec
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T04:00:00.000+0000,2015-01-01T06:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N633NK,PBG,FLL,155,523,-33.0,208.0,1334.0,6,,,,,,,,,,,,,0,PBG_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.00040524111846548696,0,,,,,,,0.2644513839312122,0.3684335491388847,0.1972105961756701,0.2605400371440469,1,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T04:00:00.000+0000,2015-01-01T06:00:00.000+0000,2015-01-01T10:00:00.000+0000,UA,N37293,BQN,EWR,259,559,6.0,240.0,1585.0,7,,,,,,,,,,,,,0,BQN_EWR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.00040524111846548696,0,,,,,,,0.2136299479258512,0.3623799018602127,0.2529898832493815,0.3147981947730917,2,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T11:00:00.000+0000,DL,N3751B,SLC,ATL,40,615,-22.0,215.0,1590.0,7,,,,,,,,,,,,,0,SLC_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0044353640416047,0,,,,,,,0.1181421863802182,0.2124695928688214,0.1702601639651673,0.2348084846170554,3,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,DL,N958DN,DEN,ATL,30,523,-30.0,173.0,1199.0,5,,,,,,,,,,,,,0,DEN_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.021111536800748,0,,,,,,,0.2570537505206728,0.3629066295136991,0.1702601639651673,0.2348084846170554,4,1.0,"Map(vectorType -> sparse, length -> 18, indices -> List(1), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N528NK,MSP,FLL,115,542,25.0,207.0,1487.0,6,,,,,,,,,,,,,0,MSP_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0730270617383201,0,,,,,,,0.1474705839038135,0.2276140554121485,0.1972105961756701,0.2605400371440469,5,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,NK,N630NK,IAG,FLL,200,504,-11.0,184.0,1176.0,5,,,,,,,,,,,,,0,IAG_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0007496960691611508,0,,,,,,,0.1782062383491984,0.1749395545883275,0.1972105961756701,0.2605400371440469,6,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T05:00:00.000+0000,2015-01-01T07:00:00.000+0000,2015-01-01T10:00:00.000+0000,B6,N239JB,BQN,MCO,307,500,20.0,173.0,1129.0,5,,,,,,,,,,,,,0,BQN_MCO,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.00040524111846548696,0,,,,,,,0.2136299479258512,0.3623799018602127,0.2038688392930303,0.2470927404044414,7,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T11:00:00.000+0000,NK,N606NK,PHX,DFW,159,502,1.0,123.0,868.0,4,,,,,,,,,,,,,0,PHX_DFW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0034536674321219,0,,,,,,,0.1905710202687115,0.2617840238905722,0.2403348559489448,0.2602028318222336,8,9.0,"Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T11:00:00.000+0000,AA,N3LAAA,SFO,DFW,30,545,-13.0,195.0,1464.0,6,,,,,,,,,,,,,0,SFO_DFW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.016649587006533,0,,,,,,,0.1976197342531453,0.2543811782986895,0.2403348559489448,0.2602028318222336,9,2.0,"Map(vectorType -> sparse, length -> 18, indices -> List(2), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01,2015-01-01T06:00:00.000+0000,2015-01-01T08:00:00.000+0000,2015-01-01T12:00:00.000+0000,UA,N78448,SFO,IAH,48,626,-7.0,218.0,1635.0,7,,,,,,,,,,,,,0,SFO_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.016649587006533,0,,,,,,,0.1976197342531453,0.2543811782986895,0.2202336140315915,0.2518688063337834,10,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"


In [0]:
train_df.columns

Out[13]: ['DEP_DEL15',
 'YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'two_hrs_pre_flight_utc',
 'Date_Time_sched_dep_utc',
 'Date_Time_sched_arrival_utc',
 'OP_CARRIER',
 'TAIL_NUM',
 'ORIGIN',
 'DEST',
 'CRS_DEP_TIME',
 'CRS_ARR_TIME',
 'ARR_DELAY',
 'CRS_ELAPSED_TIME',
 'DISTANCE',
 'DISTANCE_GROUP',
 'ELEVATION',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyPrecipitation',
 'HourlyRelativeHumidity',
 'HourlySeaLevelPressure',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'HourlyWindGustSpeed',
 'Route',
 'Rain',
 'Snow',
 'Thunder',
 'Fog',
 'Mist',
 'Freezing',
 'Blowing',
 'Smoke',
 'Drizzle',
 'Overcast',
 'Broken',
 'Scattered',
 'CloudySkyCondition',
 'holiday_period',
 'mean_carrier_delay',
 'Pagerank_Score',
 'PREV_FLIGHT_DELAYED',
 'origin_flight_per_day',
 'origin_delays_per_day',
 'dest_flight_per_day',
 'dest_delays_per

In [0]:
def process_fold_df(fold_df):
    
    
    #imputation
    fold_df.createOrReplaceTempView("fold_view")
    
    imputation_columns = ['CRS_ELAPSED_TIME','HourlyAltimeterSetting','HourlyDewPointTemperature',
             'HourlyDryBulbTemperature','HourlyRelativeHumidity','HourlySeaLevelPressure',
             'HourlyStationPressure','HourlyVisibility','HourlyWetBulbTemperature',
             'HourlyWindDirection','mean_carrier_delay','ORIGIN_Prophet_trend',
             'ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred',]

    means = {}

    for impute_col in imputation_columns:
        mean = spark.sql(f"SELECT AVG({impute_col}) FROM fold_view").collect()[0][0]
        means[impute_col] = mean
    
    print(means)
    
    #fill Nones and Nans - Seems to error sometimes?
    fold_df = fold_df.fillna(0,["HourlyWindGustSpeed"]) \
         .fillna(means["CRS_ELAPSED_TIME"],["CRS_ELAPSED_TIME"]) \
         .fillna(means["HourlyAltimeterSetting"],["HourlyAltimeterSetting"]) \
         .fillna(means["HourlyDewPointTemperature"],["HourlyDewPointTemperature"]) \
         .fillna(means["HourlyDryBulbTemperature"],["HourlyDryBulbTemperature"]) \
         .fillna(0,["HourlyPrecipitation"]) \
         .fillna(means["HourlyRelativeHumidity"],["HourlyRelativeHumidity"]) \
         .fillna(means["HourlySeaLevelPressure"],["HourlySeaLevelPressure"]) \
         .fillna(means["HourlyStationPressure"],["HourlyStationPressure"]) \
         .fillna(means["HourlyVisibility"],["HourlyVisibility"]) \
         .fillna(means["HourlyWetBulbTemperature"],["HourlyWetBulbTemperature"]) \
         .fillna(means["HourlyWindDirection"],["HourlyWindDirection"]) \
         .fillna(0,["HourlyWindSpeed"]) \
         .fillna("",["TAIL_NUM"])\
         .fillna(0,['holiday_period'])\
         .fillna(means['mean_carrier_delay'],['mean_carrier_delay'])\
         .fillna(0,['PREV_FLIGHT_DELAYED'])\
         .fillna(0,['origin_percent_delayed'])\
         .fillna(0,['dest_percent_delayed'])\
         .fillna(means['ORIGIN_Prophet_trend'],['ORIGIN_Prophet_trend'])\
         .fillna(means['ORIGIN_Prophet_pred'],['ORIGIN_Prophet_pred'])\
         .fillna(means['DEST_Prophet_trend'],['DEST_Prophet_trend'])\
         .fillna(means['DEST_Prophet_pred'],['DEST_Prophet_pred'])
         

    
    #vector assembler
    feature_cols = ['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','DISTANCE','HourlyWindSpeed','Rain','Blowing','Snow','Thunder','CloudySkyCondition','carrier_vec',        'holiday_period','mean_carrier_delay','Pagerank_Score','PREV_FLIGHT_DELAYED','origin_percent_delayed','dest_percent_delayed','ORIGIN_Prophet_trend','ORIGIN_Prophet_pred','DEST_Prophet_trend','DEST_Prophet_pred']
    #assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
    #outputCol = "features"
    df_va = VectorAssembler(inputCols = feature_cols, outputCol = 'feature_vector')
    model_input = df_va.transform(fold_df)
    
    #rename delay flag to label
    model_input = model_input.withColumnRenamed("DEP_DEL15","label")
    #model_input = assemble.transform(fold_df) \
    #               .withColumnRenamed('DEP_DEL15', 'label')
    
    #scaling
    scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
    model_input = scaler.fit(model_input).transform(model_input)
    
    #check if cv exists, should only exist for cross fold validation not on full train, test
    if 'cv' in model_input.columns:
        model_input = model_input.select('label', 'scaled_feature_vector','cv')
    else:
        model_input = model_input.select('label', 'scaled_feature_vector')
    
    return model_input

In [0]:
d_processed = {}
for key in d.keys():
    print(key)
    d_processed[key] = process_fold_df(d[key])

df1
{'CRS_ELAPSED_TIME': 142.12890973967137, 'HourlyAltimeterSetting': 30.036581119366573, 'HourlyDewPointTemperature': 45.00851942735016, 'HourlyDryBulbTemperature': 60.17425457272137, 'HourlyRelativeHumidity': 62.37593175343212, 'HourlySeaLevelPressure': 30.025573114501174, 'HourlyStationPressure': 29.139284712018885, 'HourlyVisibility': 9.306111634363768, 'HourlyWetBulbTemperature': 52.378386344343085, 'HourlyWindDirection': 171.99315712143215, 'mean_carrier_delay': 0.1791214031726806, 'ORIGIN_Prophet_trend': 0.18222274440236552, 'ORIGIN_Prophet_pred': 0.18032809774467717, 'DEST_Prophet_trend': 0.18219374443411757, 'DEST_Prophet_pred': 0.18030503186638344}
df2
{'CRS_ELAPSED_TIME': 145.44314191477, 'HourlyAltimeterSetting': 30.015902788232133, 'HourlyDewPointTemperature': 49.01973232601843, 'HourlyDryBulbTemperature': 65.21154746068311, 'HourlyRelativeHumidity': 61.3952518212902, 'HourlySeaLevelPressure': 29.998396228843536, 'HourlyStationPressure': 29.10998980235495, 'HourlyVisibili

In [0]:
# commented out to ensure no overwrite if run all is pressed

# d_processed['df1'].write.mode("overwrite").parquet(f"{blob_url}/processed_fold_1")
# d_processed['df2'].write.mode("overwrite").parquet(f"{blob_url}/processed_fold_2")
# d_processed['df3'].write.mode("overwrite").parquet(f"{blob_url}/processed_fold_3")
# d_processed['df4'].write.mode("overwrite").parquet(f"{blob_url}/processed_fold_4")
# d_processed['df5'].write.mode("overwrite").parquet(f"{blob_url}/processed_fold_5")

#Process Full Train and Test Sets

In [0]:
# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/train_data_with_adv_features").cache()
test_df = spark.read.parquet(f"{blob_url}/test_data_with_adv_features")

In [0]:
#string indexing of carrier for train
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
indexer_transformer = carrier_indexer.setHandleInvalid("keep").fit(train_df)
train_df = indexer_transformer.transform(train_df)

#one hot encoding for train
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
onehotencoder_transformer = onehotencoder_carrier_vector.fit(train_df)
train_df = onehotencoder_transformer.transform(train_df)

In [0]:
display(train_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,two_hrs_pre_flight_utc,Date_Time_sched_dep_utc,Date_Time_sched_arrival_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,holiday_period,mean_carrier_delay,Pagerank_Score,PREV_FLIGHT_DELAYED,origin_flight_per_day,origin_delays_per_day,dest_flight_per_day,dest_delays_per_day,origin_percent_delayed,dest_percent_delayed,ORIGIN_Prophet_trend,ORIGIN_Prophet_pred,DEST_Prophet_trend,DEST_Prophet_pred,OP_CARRIER_Index,carrier_vec
1.0,2015,1,3,16,1,2015-03-16,2015-03-16T16:00:00.000+0000,2015-03-16T18:00:00.000+0000,2015-03-16T20:00:00.000+0000,EV,N16987,ORD,ABE,1326,1618,39.0,112.0,654.0,3,201.8,29.91,36.0,60,0.0,41.0,29.91,29.19,10.0,48.0,230.0,7,0,ORD_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1202003338898163,0.0162830899118796,1,699,128.0,3,0.0,0.1831187410586552,0.0,0.2607316376980994,0.1914847708400157,0.1369276377066992,0.0849973754869627,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-17T00:00:00.000+0000,2015-03-17T02:00:00.000+0000,2015-03-17T03:00:00.000+0000,EV,N17159,ORD,ABE,2100,2353,-12.0,113.0,654.0,3,201.8,29.77,47.0,68,0.0,47.0,29.76,29.05,10.0,57.0,250.0,11,0,ORD_ABE,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0.094298245614035,0.0162830899118796,0,699,128.0,3,0.0,0.1831187410586552,0.0,0.2607316376980994,0.1914847708400157,0.1369276377066992,0.0849973754869627,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T15:00:00.000+0000,2015-03-16T17:00:00.000+0000,2015-03-16T19:00:00.000+0000,OO,N8968E,DTW,ABE,1355,1521,-5.0,86.0,425.0,2,192.3,29.92,37.0,57,0.0,47.0,29.92,29.21,10.0,47.0,250.0,8,0,DTW_ABE,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0.1235820895522388,0.0112674970607994,0,238,19.0,3,0.0,0.0798319327731092,0.0,0.1777465882992688,0.1378597053720036,0.1369276377066992,0.0849973754869627,3.0,"Map(vectorType -> sparse, length -> 19, indices -> List(3), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T22:00:00.000+0000,2015-03-17T00:00:00.000+0000,2015-03-17T01:00:00.000+0000,EV,N981EV,DTW,ABE,2026,2150,-19.0,84.0,425.0,2,192.3,29.8,45.0,58,0.0,62.0,29.8,29.09,10.0,51.0,160.0,8,0,DTW_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1029009304871373,0.0112674970607994,0,238,19.0,3,0.0,0.0798319327731092,0.0,0.1777465882992688,0.1378597053720036,0.1369276377066992,0.0849973754869627,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T12:00:00.000+0000,2015-03-16T14:00:00.000+0000,2015-03-16T15:00:00.000+0000,EV,N926EV,DTW,ABE,1005,1130,-4.0,85.0,425.0,2,192.3,29.92,37.0,46,0.0,71.0,29.92,29.21,10.0,42.0,220.0,7,0,DTW_ABE,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0.1153415453527435,0.0112674970607994,0,238,19.0,3,0.0,0.0798319327731092,0.0,0.1777465882992688,0.1378597053720036,0.1369276377066992,0.0849973754869627,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T12:00:00.000+0000,2015-03-16T14:00:00.000+0000,2015-03-16T16:00:00.000+0000,EV,N837AS,ATL,ABE,1026,1222,13.0,116.0,692.0,3,307.8,30.2,40.0,51,0.0,66.0,30.19,29.1,10.0,46.0,300.0,7,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.1153415453527435,0.0405444716972231,0,920,170.0,3,0.0,0.1847826086956521,0.0,0.1705949989724562,0.1127466396817939,0.1369276377066992,0.0849973754869627,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T22:00:00.000+0000,2015-03-17T00:00:00.000+0000,2015-03-17T02:00:00.000+0000,DL,N919AT,ATL,ABE,2046,2243,4.0,117.0,692.0,3,307.8,30.06,24.0,80,0.0,13.0,30.05,28.96,10.0,54.0,240.0,5,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.1046464646464646,0.0405444716972231,0,920,170.0,3,0.0,0.1847826086956521,0.0,0.1705949989724562,0.1127466396817939,0.1369276377066992,0.0849973754869627,1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))"
0.0,2015,1,3,16,1,2015-03-16,2015-03-16T16:00:00.000+0000,2015-03-16T18:00:00.000+0000,2015-03-16T20:00:00.000+0000,EV,N878AS,ATL,ABE,1457,1653,14.0,116.0,692.0,3,307.8,30.19,31.0,76,0.0,19.0,30.17,29.09,10.0,54.0,340.0,6,0,ATL_ABE,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1202003338898163,0.0405444716972231,0,920,170.0,3,0.0,0.1847826086956521,0.0,0.1705949989724562,0.1127466396817939,0.1369276377066992,0.0849973754869627,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
1.0,2015,2,6,4,4,2015-06-04,2015-06-04T21:00:00.000+0000,2015-06-04T23:00:00.000+0000,2015-06-05T01:00:00.000+0000,EV,N917EV,DTW,ABE,1938,2104,50.0,86.0,425.0,2,192.3,30.01,62.0,78,0.0,58.0,30.0,29.3,10.0,68.0,130.0,8,0,DTW_ABE,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.1321428571428571,0.0112674970607994,1,361,54.0,7,2.0,0.149584487534626,0.2857142857142857,0.1710573009548977,0.1595223142014603,0.1379635286565257,0.1361425511872535,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"
0.0,2015,2,6,4,4,2015-06-04,2015-06-04T12:00:00.000+0000,2015-06-04T14:00:00.000+0000,2015-06-04T15:00:00.000+0000,EV,N871AS,DTW,ABE,1008,1135,-8.0,87.0,425.0,2,192.3,30.09,59.0,67,0.0,76.0,30.09,29.38,10.0,62.0,0.0,0,0,DTW_ABE,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0.1483483483483483,0.0112674970607994,0,361,54.0,7,2.0,0.149584487534626,0.2857142857142857,0.1710573009548977,0.1595223142014603,0.1379635286565257,0.1361425511872535,5.0,"Map(vectorType -> sparse, length -> 19, indices -> List(5), values -> List(1.0))"


In [0]:
#string indexing of carrier for test
#one hot encoding for test
test_df = indexer_transformer.transform(test_df)
test_df = onehotencoder_transformer.transform(test_df)

In [0]:
len(indexer_transformer.labels)

Out[62]: 19

In [0]:
#cast holiday to integer
train_df = train_df.withColumn("holiday_period", train_df["holiday_period"].cast(IntegerType()))
test_df = test_df.withColumn("holiday_period", test_df["holiday_period"].cast(IntegerType()))

In [0]:
processed_train_df = process_fold_df(train_df)

#scale to train on train set
# scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
# scaler_transformer = scaler.fit(processed_train_df)
# processed_train_df = scaler_transformer.transform(processed_train_df)

processed_test_df = process_fold_df(test_df)
# #scale to train on test set
# processed_test_df = scaler_transformer.transform(processed_test_df)


{'CRS_ELAPSED_TIME': 142.19873196344375, 'HourlyAltimeterSetting': 30.033807209268396, 'HourlyDewPointTemperature': 47.031581851427795, 'HourlyDryBulbTemperature': 62.17946108108702, 'HourlyRelativeHumidity': 62.826167693631106, 'HourlySeaLevelPressure': 30.020612570533856, 'HourlyStationPressure': 29.138857040175477, 'HourlyVisibility': 9.348308673776204, 'HourlyWetBulbTemperature': 54.20766773299253, 'HourlyWindDirection': 172.5585708721293, 'mean_carrier_delay': 0.16986784944061004, 'ORIGIN_Prophet_trend': 0.1720879271815201, 'ORIGIN_Prophet_pred': 0.1687493872510451, 'DEST_Prophet_trend': 0.17206664389874615, 'DEST_Prophet_pred': 0.16873035480876158}
{'CRS_ELAPSED_TIME': 142.7358402172544, 'HourlyAltimeterSetting': 30.028331599000225, 'HourlyDewPointTemperature': 48.10941381755681, 'HourlyDryBulbTemperature': 64.10941143119706, 'HourlyRelativeHumidity': 61.50895857992817, 'HourlySeaLevelPressure': 30.011542257892327, 'HourlyStationPressure': 29.04817103088108, 'HourlyVisibility': 9

In [0]:
processed_test_df1 = processed_test_df.withColumn("index", monotonically_increasing_id()) 

In [0]:
display(processed_train_df)

label,scaled_feature_vector
1.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 15, 30, 31, 32, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 1.0878368763601136, 1.1653266187892273, 2.000604697034528, 4.621254562829037, 1.3100239814177232, 0.9770776593882743, 3.3230428617034744, 1.563990621114016, 6.432441009112184, 1.0298412444801626, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 15, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 1.0878368763601136, 1.831227543811643, 2.000604697034528, 4.621254562829037, 1.0277256240672605, 0.9770776593882743, 1.563990621114016, 6.432441009112184, 1.0298412444801626, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 13, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 0.7069276337202574, 1.3318018500448312, 2.000604697034528, 3.148137925406252, 1.3468806262680577, 0.676113668530329, 0.6818329647785991, 4.385138888015855, 0.7414355194993069, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 15, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 0.7069276337202574, 1.3318018500448312, 2.000604697034528, 4.621254562829037, 1.121483462532785, 0.676113668530329, 0.6818329647785991, 4.385138888015855, 0.7414355194993069, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 15, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 0.7069276337202574, 1.1653266187892273, 2.000604697034528, 4.621254562829037, 1.257069640125813, 0.676113668530329, 0.6818329647785991, 4.385138888015855, 0.7414355194993069, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 15, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 1.1510445236103954, 1.1653266187892273, 4.621254562829037, 1.257069640125813, 2.4328980384831556, 1.5782014733948582, 4.208703926488924, 0.6063727115816706, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 11, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 1.1510445236103954, 0.8323761562780195, 2.000604697034528, 2.831373963298696, 1.1405074663362906, 2.4328980384831556, 1.5782014733948582, 4.208703926488924, 0.6063727115816706, 3.377768253652023, 0.4569697286346774))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 15, 30, 31, 33, 35, 36, 37, 38), values -> List(0.8697791361156608, 1.8241820408411074, 0.5017122833461674, 1.1510445236103954, 0.9988513875336233, 4.621254562829037, 1.3100239814177232, 2.4328980384831556, 1.5782014733948582, 4.208703926488924, 0.6063727115816706, 3.377768253652023, 0.4569697286346774))"
1.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 15, 30, 31, 32, 33, 34, 35, 36, 37, 38), values -> List(1.7395582722313216, 0.45604551021027684, 2.0068491333846694, 0.7069276337202574, 1.3318018500448312, 2.000604697034528, 4.621254562829037, 1.4401816220317956, 0.676113668530329, 3.3230428617034744, 1.2775794231425217, 2.5841953045416135, 4.22010925595605, 0.8579411190712629, 3.40332189368534, 0.7319405371665197))"
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 9, 15, 30, 31, 33, 34, 35, 36, 37, 38), values -> List(1.7395582722313216, 0.45604551021027684, 2.0068491333846694, 0.7069276337202574, 2.000604697034528, 4.621254562829037, 1.6167999509734434, 0.676113668530329, 1.2775794231425217, 2.5841953045416135, 4.22010925595605, 0.8579411190712629, 3.40332189368534, 0.7319405371665197))"


In [0]:
display(processed_test_df1)

label,scaled_feature_vector,index
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 1.1771165614071244, 1.1807240188784864, 2.001448286999518, 4.804541102597839, 0.5290014255667469, 2.5358333967474525, 0.3529775526800816, 2.2518971367325515, 0.2623600525700863, 2.453487744696793, 0.4321511055947388))",0
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.8181980723075531, 1.518073738558054, 2.001448286999518, 5.254269120180973, 0.34977829198919363, 0.9689112552858259, 0.433041426600483, 2.3371183222456193, 0.27964920300085777, 2.453487744696793, 0.4321511055947388))",1
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.8181980723075531, 1.1807240188784864, 2.001448286999518, 5.254269120180973, 0.4050167164495406, 0.9689112552858259, 0.433041426600483, 2.3371183222456193, 0.27964920300085777, 2.453487744696793, 0.4321511055947388))",2
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.8181980723075531, 2.001448286999518, 5.254269120180973, 0.34688490538266537, 0.9689112552858259, 0.433041426600483, 2.3371183222456193, 0.27964920300085777, 2.453487744696793, 0.4321511055947388))",3
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.722940084679231, 2.361448037756973, 2.001448286999518, 4.804541102597839, 0.38154845811321203, 0.7047198828462189, 0.5403663049134857, 1.8916360906620093, 0.4360534659923761, 2.453487744696793, 0.4321511055947388))",4
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.722940084679231, 0.8433742991989189, 2.001448286999518, 4.804541102597839, 0.3154302374467945, 0.7047198828462189, 0.5403663049134857, 1.8916360906620093, 0.4360534659923761, 2.453487744696793, 0.4321511055947388))",5
1.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 7, 9, 22, 30, 31, 33, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 0.722940084679231, 2.1927731779171893, 10.563083864358715, 2.001448286999518, 4.804541102597839, 0.608351639401759, 0.7047198828462189, 0.5403663049134857, 1.8916360906620093, 0.4360534659923761, 2.453487744696793, 0.4321511055947388))",6
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 26, 30, 31, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 1.5003133051460746, 0.5060245795193513, 7.296613042424572, 1.3850953356442304, 1.5473254271491774, 2.506155999639365, 0.1854260324295884, 2.453487744696793, 0.4321511055947388))",7
0.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 26, 30, 31, 35, 36, 37, 38), values -> List(0.6062867010278674, 0.4552178936348829, 1.9962981366261658, 1.6500044285620095, 1.0120491590387026, 7.296613042424572, 1.5574891757029543, 1.0216639197618846, 3.020166622012421, 0.35300129827896976, 2.453487744696793, 0.4321511055947388))",8
1.0,"Map(vectorType -> sparse, length -> 39, indices -> List(0, 1, 2, 3, 4, 9, 21, 30, 31, 33, 35, 36, 37, 38), values -> List(1.5157167525696686, 1.251849207495928, 0.9981490683130829, 0.8181980723075531, 1.0120491590387026, 2.001448286999518, 5.254269120180973, 1.5143994442576951, 0.9689112552858259, 0.756889388763837, 2.374154468913133, 0.4757177694132144, 2.364497822755491, 0.2879023802938084))",9


Output can only be rendered in Databricks

In [0]:
processed_test_df.count()

Out[56]: 5859306

In [0]:
# commented out to ensure no overwrite if run all is pressed

# processed_train_df.write.mode("overwrite").parquet(f"{blob_url}/processed_train")
# processed_test_df.write.mode("overwrite").parquet(f"{blob_url}/processed_test")

In [0]:
display(dbutils.fs.ls(f"{blob_url}"))

path,name,size,modificationTime
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/feature_engineered_data/,feature_engineered_data/,0,1668924639000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/feature_engineered_data_test/,feature_engineered_data_test/,0,1668924670000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/feature_engineered_train_data/,feature_engineered_train_data/,0,1668559613000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/merged_cleaned_data/,merged_cleaned_data/,0,1669494945000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/merged_cleaned_data_test/,merged_cleaned_data_test/,0,1669495012000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/merged_cleaned_data_train/,merged_cleaned_data_train/,0,1669495000000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/merged_data/,merged_data/,0,1669494746000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/number_flights_and_delay_rate/,number_flights_and_delay_rate/,0,1669961275000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/pagerank_scores/,pagerank_scores/,0,1669963319000
wasbs://w261-sec4-group2@kdevery.blob.core.windows.net/premerge_airline_data/,premerge_airline_data/,0,1669567605000
