In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pyspark.sql import functions as f
from pyspark.sql import SQLContext
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, avg, mean
from pyspark.sql.functions import min
from pyspark.sql.functions import col, max
from pyspark.sql.functions import format_string
from pyspark.sql.functions import substring
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import concat
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import lit
from pyspark.sql.functions import to_utc_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import instr
from pyspark.sql.functions import row_number

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer,OneHotEncoder
from pyspark.ml.classification import MultilayerPerceptronClassifier


from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator



In [0]:
#Initializes blob storage credentials/location
blob_container = "w261-sec4-group2" # The name of your container created in https://portal.azure.com
storage_account = "kdevery" # The name of your Storage account created in https://portal.azure.com
secret_scope = "sec4-group2" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261-key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

#Points to SAS token
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/feature_engineered_data")
test_df = spark.read.parquet(f"{blob_url}/feature_engineered_data_test")

In [0]:
practice_df = train_df.limit(200)

display(practice_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N17245,IAD,IAH,535,207.0,1190.0,5,88.4,30.22,12,19,0.0,74,30.23,29.87,10.0,17,190.0,5,0,IAD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,1
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N31412,TPA,EWR,555,156.0,997.0,4,5.8,30.22,55,60,0.0,84,30.22,30.21,9.0,57,20.0,7,0,TPA_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,2
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N337JB,BOS,BUF,549,96.0,395.0,2,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230.0,11,0,BOS_BUF,0,0,0,0,0,0,0,0,0,0,1,0,1,3
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N79521,PHL,IAH,530,230.0,1325.0,6,3.0,30.2,12,29,0.0,49,30.2,30.17,10.0,24,250.0,11,0,PHL_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,4
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N656JB,FLL,BDL,553,166.0,1173.0,5,3.4,30.12,70,71,0.0,96,30.12,30.11,6.0,70,310.0,6,0,FLL_BDL,0,0,0,0,1,0,0,0,0,0,1,0,1,5
1.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3LLAA,JFK,MIA,545,185.0,1089.0,5,3.4,30.16,11,29,0.0,47,30.16,30.14,10.0,24,250.0,14,0,JFK_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,6
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3FKAA,EWR,MIA,559,183.0,1085.0,5,2.1,30.15,11,23,0.0,60,30.15,30.12,10.0,20,240.0,9,0,EWR_MIA,0,0,0,0,0,0,0,0,0,0,1,1,1,7
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N284JB,SYR,JFK,555,83.0,209.0,1,125.9,30.0,10,22,0.0,60,30.01,29.55,10.0,19,260.0,16,0,SYR_JFK,0,0,0,0,0,0,0,0,0,0,1,0,1,8
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N623JB,BOS,PBI,545,205.0,1197.0,5,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230.0,11,0,BOS_PBI,0,0,0,0,0,0,0,0,0,0,1,0,1,9
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N37290,BOS,EWR,550,88.0,200.0,1,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230.0,11,0,BOS_EWR,0,0,0,0,0,0,0,0,0,0,1,0,1,10


In [0]:
#splitting training dataframe into five folds contained in dictionary "d"

d = {}
folds = ['df1','df2','df3','df4','df5']

each_len = practice_df.count()/5
start = 1
val_size = each_len/5
stop = each_len
precision_list = []

for fold in folds:
    d[fold] = practice_df.filter(col('Index').between(start,stop))\
                                  .withColumn('cv', F.when(col('Index').between(start,(stop-val_size)), 'train')
                                         .otherwise('val'))
    start += each_len
    stop += each_len
                                  

In [0]:
display(d['df2'])

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N14204,CHS,IAH,615,172.0,925.0,4,12.2,30.35,31,33,0.0,92,30.34,30.29,3.0,32,350,3,0,CHS_IAH,0,0,0,0,1,0,0,0,0,0,0,0,0,41,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N4WPAA,JAX,DFW,635,170.0,918.0,4,7.9,30.3,45,48,0.0,89,30.3,30.27,10.0,47,340,6,0,JAX_DFW,0,0,0,0,0,0,0,0,0,0,1,1,1,42,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N994AT,AVL,ATL,650,65.0,164.0,1,645.3,30.29,19,27,0.0,72,30.32,27.99,10.0,24,0,0,0,AVL_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,43,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N17560,MHT,ORD,600,173.0,843.0,4,67.4,29.98,6,16,0.0,65,30.04,29.73,10.0,14,180,6,0,MHT_ORD,0,0,0,0,0,0,0,0,0,0,0,1,0,44,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N937JB,JFK,LAX,630,377.0,2475.0,10,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_LAX,0,0,0,0,0,0,0,0,0,0,0,0,0,45,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3JNAA,DCA,MIA,659,165.0,919.0,4,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,46,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N368JB,EWR,BOS,625,68.0,200.0,1,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_BOS,0,0,0,0,0,0,0,0,0,0,0,0,0,47,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N789SW,PVD,MCO,620,190.0,1072.0,5,16.8,30.07,6,24,0.0,46,30.06,30.0,10.0,19,250,11,0,PVD_MCO,0,0,0,0,0,0,0,0,0,0,0,1,0,48,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3FXAA,MIA,DFW,600,194.0,1121.0,5,8.8,30.12,68,71,0.0,90,30.12,30.09,7.0,69,0,0,0,MIA_DFW,0,0,0,0,0,0,0,0,0,0,1,0,1,49,train
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,OO,N445SW,ABR,MSP,510,69.0,257.0,2,395.3,29.89,14,20,0.0,78,29.95,28.51,10.0,18,260,15,0,ABR_MSP,0,0,0,0,0,0,0,0,0,0,1,0,1,50,train


In [0]:
def process_fold_df(fold_df):
    
    
    #imputation
    fold_df.createOrReplaceTempView("fold_view")
    
    imputation_columns = ['CRS_ELAPSED_TIME','HourlyAltimeterSetting','HourlyDewPointTemperature',
             'HourlyDryBulbTemperature','HourlyRelativeHumidity','HourlySeaLevelPressure',
             'HourlyStationPressure','HourlyVisibility','HourlyWetBulbTemperature',
             'HourlyWindDirection']

    means = {}

    for impute_col in imputation_columns:
        mean = spark.sql(f"SELECT AVG({impute_col}) FROM fold_view").collect()[0][0]
        means[impute_col] = mean
    
    print(means)
    
    #fill Nones and Nans - Seems to error sometimes?
    fold_df = fold_df.fillna(0,["HourlyWindGustSpeed"]) \
         .fillna(means["CRS_ELAPSED_TIME"],["CRS_ELAPSED_TIME"]) \
         .fillna(means["HourlyAltimeterSetting"],["HourlyAltimeterSetting"]) \
         .fillna(means["HourlyDewPointTemperature"],["HourlyDewPointTemperature"]) \
         .fillna(means["HourlyDryBulbTemperature"],["HourlyDryBulbTemperature"]) \
         .fillna(0,["HourlyPrecipitation"]) \
         .fillna(means["HourlyRelativeHumidity"],["HourlyRelativeHumidity"]) \
         .fillna(means["HourlySeaLevelPressure"],["HourlySeaLevelPressure"]) \
         .fillna(means["HourlyStationPressure"],["HourlyStationPressure"]) \
         .fillna(means["HourlyVisibility"],["HourlyVisibility"]) \
         .fillna(means["HourlyWetBulbTemperature"],["HourlyWetBulbTemperature"]) \
         .fillna(means["HourlyWindDirection"],["HourlyWindDirection"]) \
         .fillna(0,["HourlyWindSpeed"]) \
         .fillna("",["TAIL_NUM"])
         
    
    #string indexing of carrier
    carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
    fold_df = carrier_indexer.fit(fold_df).transform(fold_df)
    
    #one hot encoding
    onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
    fold_df = onehotencoder_carrier_vector.fit(fold_df).transform(fold_df)
    
    #vector assembler
    feature_cols = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK','Snow','carrier_vec']
    #assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
    #outputCol = "features"
    df_va = VectorAssembler(inputCols = feature_cols, outputCol = 'feature_vector')
    model_input = df_va.transform(fold_df)
    
    #rename delay flag to label
    model_input = model_input.withColumnRenamed("DEP_DEL15","label")
    #model_input = assemble.transform(fold_df) \
    #               .withColumnRenamed('DEP_DEL15', 'label')
    
    #scaling
    scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
    model_input = scaler.fit(model_input).transform(model_input)
    #model_input = model_input.select('label', 'scaled_feature_vector','cv')
    
    
    return model_input
display(process_fold_df(d['df2']))

{'CRS_ELAPSED_TIME': 153.975, 'HourlyAltimeterSetting': 30.168749999999996, 'HourlyDewPointTemperature': 21.5, 'HourlyDryBulbTemperature': 32.425, 'HourlyRelativeHumidity': 65.625, 'HourlySeaLevelPressure': 30.176999999999992, 'HourlyStationPressure': 29.81849999999999, 'HourlyVisibility': 9.0375, 'HourlyWetBulbTemperature': 29.1, 'HourlyWindDirection': 177.75}


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N14204,CHS,IAH,615,172.0,925.0,4,12.2,30.35,31,33,0.0,92,30.34,30.29,3.0,32,350,3,0,CHS_IAH,0,0,0,0,1,0,0,0,0,0,0,0,0,41,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.765331593774861))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N4WPAA,JAX,DFW,635,170.0,918.0,4,7.9,30.3,45,48,0.0,89,30.3,30.27,10.0,47,340,6,0,JAX_DFW,0,0,0,0,0,0,0,0,0,0,1,1,1,42,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N994AT,AVL,ATL,650,65.0,164.0,1,645.3,30.29,19,27,0.0,72,30.32,27.99,10.0,24,0,0,0,AVL_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,43,train,2.0,"Map(vectorType -> sparse, length -> 9, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N17560,MHT,ORD,600,173.0,843.0,4,67.4,29.98,6,16,0.0,65,30.04,29.73,10.0,14,180,6,0,MHT_ORD,0,0,0,0,0,0,0,0,0,0,0,1,0,44,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.765331593774861))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N937JB,JFK,LAX,630,377.0,2475.0,10,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_LAX,0,0,0,0,0,0,0,0,0,0,0,0,0,45,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.3646136786930683))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3JNAA,DCA,MIA,659,165.0,919.0,4,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,46,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N368JB,EWR,BOS,625,68.0,200.0,1,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_BOS,0,0,0,0,0,0,0,0,0,0,0,0,0,47,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.3646136786930683))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N789SW,PVD,MCO,620,190.0,1072.0,5,16.8,30.07,6,24,0.0,46,30.06,30.0,10.0,19,250,11,0,PVD_MCO,0,0,0,0,0,0,0,0,0,0,0,1,0,48,train,9.0,"Map(vectorType -> sparse, length -> 9, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2), values -> List(1.0, 1.0, 4.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2), values -> List(0.0, 0.0, 0.0))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3FXAA,MIA,DFW,600,194.0,1121.0,5,8.8,30.12,68,71,0.0,90,30.12,30.09,7.0,69,0,0,0,MIA_DFW,0,0,0,0,0,0,0,0,0,0,1,0,1,49,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))"
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,OO,N445SW,ABR,MSP,510,69.0,257.0,2,395.3,29.89,14,20,0.0,78,29.95,28.51,10.0,18,260,15,0,ABR_MSP,0,0,0,0,0,0,0,0,0,0,1,0,1,50,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.748873704735071))"


In [0]:
d_processed = {}
for key in d.keys():
    print(key)
    d_processed[key] = process_fold_df(d[key])

df1
{'CRS_ELAPSED_TIME': None, 'HourlyAltimeterSetting': None, 'HourlyDewPointTemperature': None, 'HourlyDryBulbTemperature': None, 'HourlyRelativeHumidity': None, 'HourlySeaLevelPressure': None, 'HourlyStationPressure': None, 'HourlyVisibility': None, 'HourlyWetBulbTemperature': None, 'HourlyWindDirection': 238.25}


[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-3035635552863028>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      2[0m [0;32mfor[0m [0mkey[0m [0;32min[0m [0md[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m     [0mprint[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m     [0md_processed[0m[0;34m[[0m[0mkey[0m[0;34m][0m [0;34m=[0m [0mprocess_fold_df[0m[0;34m([0m[0md[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-3035635552858915>[0m in [0;36mprocess_fold_df[0;34m(fold_df)[0m
[1;32m     19[0m [0;34m[0m[0m
[1;32m     20[0m     [0;31m#fill Nones and Nans - Seems to error sometimes?[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 21[0;31m     [0mfold_df[0

In [0]:
# set up grid search: estimator, set of params, and evaluator
rf = RandomForestClassifier(labelCol="label", featuresCol="Scaled_features")
grid = ParamGridBuilder()\
            .addGrid(rf.maxDepth, [5, 10])\
            .addGrid(rf.numTrees, [10, 15])\
            .build()

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5)
#evaluator = MulticlassClassificationEvaluator(metricName='precisionByLabel')

In [0]:
#Logistic Regression

#train models with logistic regression
def fit_log_model(df):
    model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
    lrn = model.fit(df)
    return lrn

#return model results of each fold
logistic_models = {}
for key in d_processed.keys():
    print(key)
    logistic_models[key] = fit_log_model(d_processed[key])
    lrn_summary = logistic_models[key].summary
    display(lrn_summary.predictions)

#evaluator = MulticlassClassificationEvaluator(metricName='precisionByLabel')
#evaluator = BinaryClassificationEvaluator(metricName='Precision')


#grid = ParamGridBuilder()\
#        .addGrid(model.threshold,[0.5,0.8])\
#        .build()

df1


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N17245,IAD,IAH,535,207.0,1190.0,5,88.4,30.22,12,19,0.0,74,30.23,29.87,10.0,17,190,5,0,IAD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,1,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(1.7917589660964666, -1.7917589660964666))","Map(vectorType -> dense, length -> 2, values -> List(0.8571427955348965, 0.14285720446510353))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N31412,TPA,EWR,555,156.0,997.0,4,5.8,30.22,55,60,0.0,84,30.22,30.21,9.0,57,20,7,0,TPA_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,2,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(1.7917589660964666, -1.7917589660964666))","Map(vectorType -> dense, length -> 2, values -> List(0.8571427955348965, 0.14285720446510353))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N337JB,BOS,BUF,549,96.0,395.0,2,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_BUF,0,0,0,0,0,0,0,0,0,0,1,0,1,3,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(2.546903539945209, -2.546903539945209))","Map(vectorType -> dense, length -> 2, values -> List(0.9273652160840334, 0.07263478391596656))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N79521,PHL,IAH,530,230.0,1325.0,6,3.0,30.2,12,29,0.0,49,30.2,30.17,10.0,24,250,11,0,PHL_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,4,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(1.7917589660964666, -1.7917589660964666))","Map(vectorType -> dense, length -> 2, values -> List(0.8571427955348965, 0.14285720446510353))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N656JB,FLL,BDL,553,166.0,1173.0,5,3.4,30.12,70,71,0.0,96,30.12,30.11,6.0,70,310,6,0,FLL_BDL,0,0,0,0,1,0,0,0,0,0,1,0,1,5,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(2.546903539945209, -2.546903539945209))","Map(vectorType -> dense, length -> 2, values -> List(0.9273652160840334, 0.07263478391596656))",0.0
1.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3LLAA,JFK,MIA,545,185.0,1089.0,5,3.4,30.16,11,29,0.0,47,30.16,30.14,10.0,24,250,14,0,JFK_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,6,train,3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.9856801091687157))","Map(vectorType -> dense, length -> 2, values -> List(1.3862936493986595, -1.3862936493986595))","Map(vectorType -> dense, length -> 2, values -> List(0.7999998861245787, 0.20000011387542127))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3FKAA,EWR,MIA,559,183.0,1085.0,5,2.1,30.15,11,23,0.0,60,30.15,30.12,10.0,20,240,9,0,EWR_MIA,0,0,0,0,0,0,0,0,0,0,1,1,1,7,train,3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.9856801091687157))","Map(vectorType -> dense, length -> 2, values -> List(1.3862936493986595, -1.3862936493986595))","Map(vectorType -> dense, length -> 2, values -> List(0.7999998861245787, 0.20000011387542127))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N284JB,SYR,JFK,555,83.0,209.0,1,125.9,30.0,10,22,0.0,60,30.01,29.55,10.0,19,260,16,0,SYR_JFK,0,0,0,0,0,0,0,0,0,0,1,0,1,8,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(2.546903539945209, -2.546903539945209))","Map(vectorType -> dense, length -> 2, values -> List(0.9273652160840334, 0.07263478391596656))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N623JB,BOS,PBI,545,205.0,1197.0,5,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_PBI,0,0,0,0,0,0,0,0,0,0,1,0,1,9,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(2.546903539945209, -2.546903539945209))","Map(vectorType -> dense, length -> 2, values -> List(0.9273652160840334, 0.07263478391596656))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N37290,BOS,EWR,550,88.0,200.0,1,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_EWR,0,0,0,0,0,0,0,0,0,0,1,0,1,10,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(1.7917589660964666, -1.7917589660964666))","Map(vectorType -> dense, length -> 2, values -> List(0.8571427955348965, 0.14285720446510353))",0.0


df2


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N14204,CHS,IAH,615,172.0,925.0,4,12.2,30.35,31,33,0.0,92,30.34,30.29,3.0,32,350,3,0,CHS_IAH,0,0,0,0,1,0,0,0,0,0,0,0,0,41,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.765331593774861))","Map(vectorType -> dense, length -> 2, values -> List(1.609439689626111, -1.609439689626111))","Map(vectorType -> dense, length -> 2, values -> List(0.8333335801654108, 0.16666641983458919))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N4WPAA,JAX,DFW,635,170.0,918.0,4,7.9,30.3,45,48,0.0,89,30.3,30.27,10.0,47,340,6,0,JAX_DFW,0,0,0,0,0,0,0,0,0,0,1,1,1,42,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(0.5108246817629585, -0.5108246817629585))","Map(vectorType -> dense, length -> 2, values -> List(0.6249997792180133, 0.3750002207819867))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N994AT,AVL,ATL,650,65.0,164.0,1,645.3,30.29,19,27,0.0,72,30.32,27.99,10.0,24,0,0,0,AVL_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,43,train,2.0,"Map(vectorType -> sparse, length -> 9, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(0.9162915050236649, -0.9162915050236649))","Map(vectorType -> dense, length -> 2, values -> List(0.7142858720713025, 0.28571412792869755))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N17560,MHT,ORD,600,173.0,843.0,4,67.4,29.98,6,16,0.0,65,30.04,29.73,10.0,14,180,6,0,MHT_ORD,0,0,0,0,0,0,0,0,0,0,0,1,0,44,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.765331593774861))","Map(vectorType -> dense, length -> 2, values -> List(1.609439689626111, -1.609439689626111))","Map(vectorType -> dense, length -> 2, values -> List(0.8333335801654108, 0.16666641983458919))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N937JB,JFK,LAX,630,377.0,2475.0,10,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_LAX,0,0,0,0,0,0,0,0,0,0,0,0,0,45,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.3646136786930683))","Map(vectorType -> dense, length -> 2, values -> List(2.079438944274486, -2.079438944274486))","Map(vectorType -> dense, length -> 2, values -> List(0.888888632354768, 0.111111367645232))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3JNAA,DCA,MIA,659,165.0,919.0,4,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,46,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(0.5108246817629585, -0.5108246817629585))","Map(vectorType -> dense, length -> 2, values -> List(0.6249997792180133, 0.3750002207819867))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N368JB,EWR,BOS,625,68.0,200.0,1,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_BOS,0,0,0,0,0,0,0,0,0,0,0,0,0,47,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.3646136786930683))","Map(vectorType -> dense, length -> 2, values -> List(2.079438944274486, -2.079438944274486))","Map(vectorType -> dense, length -> 2, values -> List(0.888888632354768, 0.111111367645232))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N789SW,PVD,MCO,620,190.0,1072.0,5,16.8,30.07,6,24,0.0,46,30.06,30.0,10.0,19,250,11,0,PVD_MCO,0,0,0,0,0,0,0,0,0,0,0,1,0,48,train,9.0,"Map(vectorType -> sparse, length -> 9, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2), values -> List(1.0, 1.0, 4.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2), values -> List(0.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(20.018584279662683, -20.018584279662683))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999979767977, 2.0232022723831733E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3FXAA,MIA,DFW,600,194.0,1121.0,5,8.8,30.12,68,71,0.0,90,30.12,30.09,7.0,69,0,0,0,MIA_DFW,0,0,0,0,0,0,0,0,0,0,1,0,1,49,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(0.5108246817629585, -0.5108246817629585))","Map(vectorType -> dense, length -> 2, values -> List(0.6249997792180133, 0.3750002207819867))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,OO,N445SW,ABR,MSP,510,69.0,257.0,2,395.3,29.89,14,20,0.0,78,29.95,28.51,10.0,18,260,15,0,ABR_MSP,0,0,0,0,0,0,0,0,0,0,1,0,1,50,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.748873704735071))","Map(vectorType -> dense, length -> 2, values -> List(45.04322600362736, -45.04322600362736))","Map(vectorType -> dense, length -> 2, values -> List(1.0, 0.0))",0.0


df3


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N38451,BOS,ORD,600,173.0,867.0,4,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_ORD,0,0,0,0,0,0,0,0,0,0,1,1,1,81,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(26.236111903545236, -26.236111903545236))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999959654, 4.034550471487819E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N15986,CLE,IAD,603,82.0,288.0,2,238.0,30.15,6,19,0.0,57,30.19,29.28,10.0,16,230,13,0,CLE_IAD,0,0,0,0,0,0,0,0,0,0,0,0,0,82,train,2.0,"Map(vectorType -> sparse, length -> 9, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.9856801091687157))","Map(vectorType -> dense, length -> 2, values -> List(26.353357617038554, -26.353357617038554))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999964118, 3.588240815588506E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N692DL,PHL,ATL,610,155.0,666.0,3,3.0,30.19,11,30,0.0,45,30.18,30.16,10.0,24,230,14,0,PHL_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,83,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(26.390874847420918, -26.390874847420918))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999965439, 3.4561242756581123E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N69838,MCO,EWR,659,152.0,937.0,4,27.4,30.22,58,61,0.0,90,30.21,30.11,10.0,59,360,10,0,MCO_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,84,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(26.236111903545236, -26.236111903545236))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999959654, 4.034550471487819E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N723UW,BOS,CLT,605,147.0,728.0,3,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_CLT,0,0,0,0,0,0,0,0,0,0,1,1,1,85,train,5.0,"Map(vectorType -> sparse, length -> 9, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(26.390874847420918, -26.390874847420918))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999965439, 3.4561242756581123E-12))",0.0
1.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,NK,N605NK,ACY,RSW,600,171.0,982.0,4,18.3,30.19,17,28,0.0,63,30.19,30.12,10.0,25,250,7,0,ACY_RSW,0,0,0,0,0,0,0,0,0,0,0,0,0,86,train,7.0,"Map(vectorType -> sparse, length -> 9, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 11), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 11), values -> List(0.0, 0.0, 0.0, 3.748873704735071))","Map(vectorType -> dense, length -> 2, values -> List(-0.6931454065738762, 0.6931454065738762))","Map(vectorType -> dense, length -> 2, values -> List(0.33333372755257634, 0.6666662724474237))",1.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N909DE,MCO,LGA,640,153.0,950.0,4,27.4,30.22,58,61,0.0,90,30.21,30.11,10.0,59,360,10,0,MCO_LGA,0,0,0,0,0,0,0,0,0,1,0,0,1,87,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(26.390874847420918, -26.390874847420918))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999965439, 3.4561242756581123E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N30401,ORD,IAH,510,175.0,925.0,4,201.8,30.07,4,16,0.0,59,30.11,29.35,10.0,13,250,17,30,ORD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,88,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(26.236111903545236, -26.236111903545236))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999959654, 4.034550471487819E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N486AA,PIT,DFW,635,200.0,1067.0,5,366.7,30.14,4,21,0.0,47,30.22,28.85,10.0,17,240,10,23,PIT_DFW,0,0,0,0,0,0,0,0,0,0,0,0,0,89,train,6.0,"Map(vectorType -> sparse, length -> 9, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 10), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 10), values -> List(0.0, 0.0, 0.0, 3.748873704735071))","Map(vectorType -> dense, length -> 2, values -> List(0.693146519437839, -0.693146519437839))","Map(vectorType -> dense, length -> 2, values -> List(0.6666665197506269, 0.33333348024937315))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N329NB,PIT,MSP,645,154.0,726.0,3,366.7,30.14,4,21,0.0,47,30.22,28.85,10.0,17,240,10,23,PIT_MSP,0,0,0,0,0,0,0,0,0,0,0,0,0,90,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(26.390874847420918, -26.390874847420918))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999965439, 3.4561242756581123E-12))",0.0


df4


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N526JB,JFK,RSW,630,195.0,1074.0,5,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_RSW,0,0,0,0,0,0,0,0,0,0,0,0,0,121,train,0.0,"Map(vectorType -> sparse, length -> 10, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.2113982300032355))","Map(vectorType -> dense, length -> 2, values -> List(26.930377567491025, -26.930377567491025))","Map(vectorType -> dense, length -> 2, values -> List(0.999999999997985, 2.015054789694659E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N931DN,BOS,MSP,600,210.0,1124.0,5,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_MSP,0,0,0,0,0,0,0,0,0,0,1,1,1,122,train,1.0,"Map(vectorType -> sparse, length -> 10, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(27.14879880073176, -27.14879880073176))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999983804, 1.6195933483231784E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N626AW,PIT,CLT,650,99.0,366.0,2,366.7,30.14,4,21,0.0,47,30.22,28.85,10.0,17,240,10,23,PIT_CLT,0,0,0,0,0,0,0,0,0,0,0,0,0,123,train,5.0,"Map(vectorType -> sparse, length -> 10, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(27.292865042783802, -27.292865042783802))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999985976, 1.4024337247064977E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N799SW,MCO,ATL,655,95.0,404.0,2,27.4,30.22,58,61,0.0,90,30.21,30.11,10.0,59,360,10,0,MCO_ATL,0,0,0,0,0,0,0,0,0,1,0,0,1,124,train,10.0,"Map(vectorType -> sparse, length -> 10, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2), values -> List(1.0, 1.0, 4.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2), values -> List(0.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(18.24809345494331, -18.24809345494331))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999881162446, 1.1883755424157982E-8))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N732US,CLT,PHL,630,93.0,449.0,2,221.9,30.33,27,32,0.0,82,30.34,29.5,9.0,30,0,0,0,CLT_PHL,0,0,0,0,0,0,0,0,0,0,0,0,0,125,train,5.0,"Map(vectorType -> sparse, length -> 10, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(27.292865042783802, -27.292865042783802))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999985976, 1.4024337247064977E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N834JB,BOS,FLL,611,213.0,1237.0,5,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_FLL,0,0,0,0,0,0,0,0,0,0,1,1,1,126,train,0.0,"Map(vectorType -> sparse, length -> 10, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.2113982300032355))","Map(vectorType -> dense, length -> 2, values -> List(26.930377567491025, -26.930377567491025))","Map(vectorType -> dense, length -> 2, values -> List(0.999999999997985, 2.015054789694659E-12))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N293AY,PHL,CLT,630,104.0,449.0,2,3.0,30.19,11,30,0.0,45,30.18,30.16,10.0,24,230,14,0,PHL_CLT,0,0,0,0,0,0,0,0,0,0,0,0,0,127,train,5.0,"Map(vectorType -> sparse, length -> 10, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(27.292865042783802, -27.292865042783802))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999985976, 1.4024337247064977E-12))",0.0
1.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,F9,N949FR,ATL,TTN,600,110.0,701.0,3,307.8,30.32,26,35,0.0,70,30.34,29.21,10.0,32,0,0,0,ATL_TTN,0,0,0,0,0,0,0,0,0,0,0,0,0,128,train,6.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 10), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 10), values -> List(0.0, 0.0, 0.0, 4.530597729822599))","Map(vectorType -> dense, length -> 2, values -> List(-1.0821152010009882E-6, 1.0821152010009882E-6))","Map(vectorType -> dense, length -> 2, values -> List(0.4999997294711998, 0.5000002705288003))",1.0
1.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,NK,N522NK,DTW,LAS,652,273.0,1749.0,7,192.3,30.05,5,18,0.0,57,30.08,29.34,10.0,15,210,15,0,DTW_LAS,0,0,0,0,0,0,0,0,0,0,1,1,1,129,train,8.0,"Map(vectorType -> sparse, length -> 10, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 12), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 12), values -> List(0.0, 0.0, 0.0, 6.324555320336758))","Map(vectorType -> dense, length -> 2, values -> List(-26.759437657357736, 26.759437657357736))","Map(vectorType -> dense, length -> 2, values -> List(2.390693421947188E-12, 0.9999999999976094))",1.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N327NB,DCA,MSP,620,165.0,931.0,4,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_MSP,0,0,0,0,0,0,0,0,0,0,0,0,0,130,train,1.0,"Map(vectorType -> sparse, length -> 10, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(27.14879880073176, -27.14879880073176))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999999983804, 1.6195933483231784E-12))",0.0


df5


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N833UA,EWR,LAX,659,386.0,2454.0,10,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_LAX,0,0,0,0,0,0,0,0,0,0,0,0,0,161,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(18.504269042762182, -18.504269042762182))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908019014, 9.198098616103323E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N128UW,DCA,CLT,659,89.0,331.0,2,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_CLT,0,0,0,0,0,0,0,0,0,0,0,0,0,162,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(18.504269042762182, -18.504269042762182))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908019014, 9.198098616103323E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N763JB,JFK,SJU,614,223.0,1598.0,7,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_SJU,0,0,0,0,0,0,0,0,0,0,0,0,0,163,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(18.503615363747926, -18.503615363747926))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999907958869, 9.204113138316927E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N924DN,MIA,ATL,625,120.0,594.0,3,8.8,30.12,68,71,0.0,90,30.12,30.09,7.0,69,0,0,0,MIA_ATL,0,0,0,0,0,0,0,0,0,0,1,0,1,164,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(18.503709973774242, -18.503709973774242))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999907967577, 9.203242279376411E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N902WN,FLL,LAS,640,330.0,2173.0,9,3.4,30.12,70,71,0.0,96,30.11,30.11,5.0,70,340,7,0,FLL_LAS,0,0,0,0,1,0,0,0,0,0,1,1,1,165,train,5.0,"Map(vectorType -> sparse, length -> 9, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(18.504269042762182, -18.504269042762182))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908019014, 9.198098616103323E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N831AW,EWR,PHX,630,341.0,2133.0,9,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_PHX,0,0,0,0,0,0,0,0,0,0,0,0,0,166,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(18.504269042762182, -18.504269042762182))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908019014, 9.198098616103323E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,NK,N607NK,ORD,LGA,556,124.0,733.0,3,201.8,30.07,4,16,0.0,59,30.11,29.35,10.0,13,250,17,30,ORD_LGA,0,0,0,0,0,0,0,0,0,0,0,0,0,167,train,8.0,"Map(vectorType -> sparse, length -> 9, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 12), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 12), values -> List(0.0, 0.0, 0.0, 4.530597729822599))","Map(vectorType -> dense, length -> 2, values -> List(18.504745798355213, -18.504745798355213))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908062857, 9.193714345379078E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N651AW,MCI,PHX,550,178.0,1044.0,5,306.3,30.26,8,16,0.0,71,30.31,29.16,10.0,14,220,13,0,MCI_PHX,0,0,0,0,0,0,0,0,0,0,1,0,1,168,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(18.504269042762182, -18.504269042762182))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908019014, 9.198098616103323E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N3748Y,LGA,ATL,600,159.0,762.0,4,3.4,30.12,6,28,0.0,39,30.12,30.09,10.0,22,250,15,25,LGA_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,169,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(18.503709973774242, -18.503709973774242))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999907967577, 9.203242279376411E-9))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N268WN,PHL,FLL,655,170.0,992.0,4,3.0,30.19,11,30,0.0,45,30.18,30.16,10.0,24,230,14,0,PHL_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,170,train,5.0,"Map(vectorType -> sparse, length -> 9, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(18.504269042762182, -18.504269042762182))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999908019014, 9.198098616103323E-9))",0.0


In [0]:
#Random Forest

#train models with random forest
def fit_forest_model(df, numTrees=10):
    rf = RandomForestClassifier(labelCol="label", featuresCol="feature_vector", numTrees=numTrees)
    lrn = rf.fit(df)
    return lrn

#return model results of each fold
forest_models = {}
for key in d_processed.keys():
    print(key)
    forest_models[key] = fit_forest_model(d_processed[key])
    lrn_summary = forest_models[key].summary
    display(lrn_summary.predictions)

df1


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N17245,IAD,IAH,535,207.0,1190.0,5,88.4,30.22,12,19,0.0,74,30.23,29.87,10.0,17,190,5,0,IAD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,1,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.796491796909347, 1.2035082030906528))","Map(vectorType -> dense, length -> 2, values -> List(0.8796491796909347, 0.12035082030906527))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N31412,TPA,EWR,555,156.0,997.0,4,5.8,30.22,55,60,0.0,84,30.22,30.21,9.0,57,20,7,0,TPA_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,2,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.796491796909347, 1.2035082030906528))","Map(vectorType -> dense, length -> 2, values -> List(0.8796491796909347, 0.12035082030906527))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N337JB,BOS,BUF,549,96.0,395.0,2,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_BUF,0,0,0,0,0,0,0,0,0,0,1,0,1,3,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(8.926926579518042, 1.0730734204819572))","Map(vectorType -> dense, length -> 2, values -> List(0.8926926579518042, 0.10730734204819573))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N79521,PHL,IAH,530,230.0,1325.0,6,3.0,30.2,12,29,0.0,49,30.2,30.17,10.0,24,250,11,0,PHL_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,4,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.796491796909347, 1.2035082030906528))","Map(vectorType -> dense, length -> 2, values -> List(0.8796491796909347, 0.12035082030906527))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N656JB,FLL,BDL,553,166.0,1173.0,5,3.4,30.12,70,71,0.0,96,30.12,30.11,6.0,70,310,6,0,FLL_BDL,0,0,0,0,1,0,0,0,0,0,1,0,1,5,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(8.926926579518042, 1.0730734204819572))","Map(vectorType -> dense, length -> 2, values -> List(0.8926926579518042, 0.10730734204819573))",0.0
1.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3LLAA,JFK,MIA,545,185.0,1089.0,5,3.4,30.16,11,29,0.0,47,30.16,30.14,10.0,24,250,14,0,JFK_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,6,train,3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.9856801091687157))","Map(vectorType -> dense, length -> 2, values -> List(8.326926579518043, 1.6730734204819573))","Map(vectorType -> dense, length -> 2, values -> List(0.8326926579518042, 0.16730734204819572))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3FKAA,EWR,MIA,559,183.0,1085.0,5,2.1,30.15,11,23,0.0,60,30.15,30.12,10.0,20,240,9,0,EWR_MIA,0,0,0,0,0,0,0,0,0,0,1,1,1,7,train,3.0,"Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.9856801091687157))","Map(vectorType -> dense, length -> 2, values -> List(8.326926579518043, 1.6730734204819573))","Map(vectorType -> dense, length -> 2, values -> List(0.8326926579518042, 0.16730734204819572))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N284JB,SYR,JFK,555,83.0,209.0,1,125.9,30.0,10,22,0.0,60,30.01,29.55,10.0,19,260,16,0,SYR_JFK,0,0,0,0,0,0,0,0,0,0,1,0,1,8,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(8.926926579518042, 1.0730734204819572))","Map(vectorType -> dense, length -> 2, values -> List(0.8926926579518042, 0.10730734204819573))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N623JB,BOS,PBI,545,205.0,1197.0,5,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_PBI,0,0,0,0,0,0,0,0,0,0,1,0,1,9,train,0.0,"Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.1547290184283368))","Map(vectorType -> dense, length -> 2, values -> List(8.926926579518042, 1.0730734204819572))","Map(vectorType -> dense, length -> 2, values -> List(0.8926926579518042, 0.10730734204819573))",0.0
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N37290,BOS,EWR,550,88.0,200.0,1,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_EWR,0,0,0,0,0,0,0,0,0,0,1,0,1,10,train,2.0,"Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.796491796909347, 1.2035082030906528))","Map(vectorType -> dense, length -> 2, values -> List(0.8796491796909347, 0.12035082030906527))",0.0


df2


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N14204,CHS,IAH,615,172.0,925.0,4,12.2,30.35,31,33,0.0,92,30.34,30.29,3.0,32,350,3,0,CHS_IAH,0,0,0,0,1,0,0,0,0,0,0,0,0,41,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.765331593774861))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N4WPAA,JAX,DFW,635,170.0,918.0,4,7.9,30.3,45,48,0.0,89,30.3,30.27,10.0,47,340,6,0,JAX_DFW,0,0,0,0,0,0,0,0,0,0,1,1,1,42,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N994AT,AVL,ATL,650,65.0,164.0,1,645.3,30.29,19,27,0.0,72,30.32,27.99,10.0,24,0,0,0,AVL_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,43,train,2.0,"Map(vectorType -> sparse, length -> 9, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N17560,MHT,ORD,600,173.0,843.0,4,67.4,29.98,6,16,0.0,65,30.04,29.73,10.0,14,180,6,0,MHT_ORD,0,0,0,0,0,0,0,0,0,0,0,1,0,44,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 2.765331593774861))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N937JB,JFK,LAX,630,377.0,2475.0,10,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_LAX,0,0,0,0,0,0,0,0,0,0,0,0,0,45,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.3646136786930683))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3JNAA,DCA,MIA,659,165.0,919.0,4,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,46,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N368JB,EWR,BOS,625,68.0,200.0,1,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_BOS,0,0,0,0,0,0,0,0,0,0,0,0,0,47,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.3646136786930683))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N789SW,PVD,MCO,620,190.0,1072.0,5,16.8,30.07,6,24,0.0,46,30.06,30.0,10.0,19,250,11,0,PVD_MCO,0,0,0,0,0,0,0,0,0,0,0,1,0,48,train,9.0,"Map(vectorType -> sparse, length -> 9, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2), values -> List(1.0, 1.0, 4.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2), values -> List(0.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N3FXAA,MIA,DFW,600,194.0,1121.0,5,8.8,30.12,68,71,0.0,90,30.12,30.09,7.0,69,0,0,0,MIA_DFW,0,0,0,0,0,0,0,0,0,0,1,0,1,49,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,OO,N445SW,ABR,MSP,510,69.0,257.0,2,395.3,29.89,14,20,0.0,78,29.95,28.51,10.0,18,260,15,0,ABR_MSP,0,0,0,0,0,0,0,0,0,0,1,0,1,50,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.748873704735071))","Map(vectorType -> dense, length -> 2, values -> List(8.582067170634243, 1.417932829365756))","Map(vectorType -> dense, length -> 2, values -> List(0.8582067170634243, 0.14179328293657562))",0.0


df3


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N38451,BOS,ORD,600,173.0,867.0,4,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_ORD,0,0,0,0,0,0,0,0,0,0,1,1,1,81,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(9.593821773994867, 0.406178226005134))","Map(vectorType -> dense, length -> 2, values -> List(0.9593821773994866, 0.0406178226005134))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,EV,N15986,CLE,IAD,603,82.0,288.0,2,238.0,30.15,6,19,0.0,57,30.19,29.28,10.0,16,230,13,0,CLE_IAD,0,0,0,0,0,0,0,0,0,0,0,0,0,82,train,2.0,"Map(vectorType -> sparse, length -> 9, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 6), values -> List(0.0, 0.0, 0.0, 2.9856801091687157))","Map(vectorType -> dense, length -> 2, values -> List(9.135488440661533, 0.8645115593384672))","Map(vectorType -> dense, length -> 2, values -> List(0.9135488440661532, 0.08645115593384672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N692DL,PHL,ATL,610,155.0,666.0,3,3.0,30.19,11,30,0.0,45,30.18,30.16,10.0,24,230,14,0,PHL_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,83,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.343821773994867, 0.6561782260051338))","Map(vectorType -> dense, length -> 2, values -> List(0.9343821773994867, 0.06561782260051338))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N69838,MCO,EWR,659,152.0,937.0,4,27.4,30.22,58,61,0.0,90,30.21,30.11,10.0,59,360,10,0,MCO_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,84,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(9.593821773994867, 0.406178226005134))","Map(vectorType -> dense, length -> 2, values -> List(0.9593821773994866, 0.0406178226005134))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N723UW,BOS,CLT,605,147.0,728.0,3,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_CLT,0,0,0,0,0,0,0,0,0,0,1,1,1,85,train,5.0,"Map(vectorType -> sparse, length -> 9, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.209562514735607, 0.7904374852643931))","Map(vectorType -> dense, length -> 2, values -> List(0.9209562514735607, 0.0790437485264393))",0.0
1.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,NK,N605NK,ACY,RSW,600,171.0,982.0,4,18.3,30.19,17,28,0.0,63,30.19,30.12,10.0,25,250,7,0,ACY_RSW,0,0,0,0,0,0,0,0,0,0,0,0,0,86,train,7.0,"Map(vectorType -> sparse, length -> 9, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 11), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 11), values -> List(0.0, 0.0, 0.0, 3.748873704735071))","Map(vectorType -> dense, length -> 2, values -> List(7.159562514735606, 2.840437485264393))","Map(vectorType -> dense, length -> 2, values -> List(0.7159562514735607, 0.2840437485264393))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N909DE,MCO,LGA,640,153.0,950.0,4,27.4,30.22,58,61,0.0,90,30.21,30.11,10.0,59,360,10,0,MCO_LGA,0,0,0,0,0,0,0,0,0,1,0,0,1,87,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.343821773994867, 0.6561782260051338))","Map(vectorType -> dense, length -> 2, values -> List(0.9343821773994867, 0.06561782260051338))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N30401,ORD,IAH,510,175.0,925.0,4,201.8,30.07,4,16,0.0,59,30.11,29.35,10.0,13,250,17,30,ORD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,88,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(9.593821773994867, 0.406178226005134))","Map(vectorType -> dense, length -> 2, values -> List(0.9593821773994866, 0.0406178226005134))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,AA,N486AA,PIT,DFW,635,200.0,1067.0,5,366.7,30.14,4,21,0.0,47,30.22,28.85,10.0,17,240,10,23,PIT_DFW,0,0,0,0,0,0,0,0,0,0,0,0,0,89,train,6.0,"Map(vectorType -> sparse, length -> 9, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 10), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 10), values -> List(0.0, 0.0, 0.0, 3.748873704735071))","Map(vectorType -> dense, length -> 2, values -> List(8.185488440661533, 1.8145115593384669))","Map(vectorType -> dense, length -> 2, values -> List(0.8185488440661534, 0.1814511559338467))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N329NB,PIT,MSP,645,154.0,726.0,3,366.7,30.14,4,21,0.0,47,30.22,28.85,10.0,17,240,10,23,PIT_MSP,0,0,0,0,0,0,0,0,0,0,0,0,0,90,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.343821773994867, 0.6561782260051338))","Map(vectorType -> dense, length -> 2, values -> List(0.9343821773994867, 0.06561782260051338))",0.0


df4


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N526JB,JFK,RSW,630,195.0,1074.0,5,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_RSW,0,0,0,0,0,0,0,0,0,0,0,0,0,121,train,0.0,"Map(vectorType -> sparse, length -> 10, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.2113982300032355))","Map(vectorType -> dense, length -> 2, values -> List(9.21267911976211, 0.787320880237893))","Map(vectorType -> dense, length -> 2, values -> List(0.9212679119762107, 0.0787320880237893))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N931DN,BOS,MSP,600,210.0,1124.0,5,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_MSP,0,0,0,0,0,0,0,0,0,0,1,1,1,122,train,1.0,"Map(vectorType -> sparse, length -> 10, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.919345786428774, 1.0806542135712265))","Map(vectorType -> dense, length -> 2, values -> List(0.8919345786428774, 0.10806542135712265))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N626AW,PIT,CLT,650,99.0,366.0,2,366.7,30.14,4,21,0.0,47,30.22,28.85,10.0,17,240,10,23,PIT_CLT,0,0,0,0,0,0,0,0,0,0,0,0,0,123,train,5.0,"Map(vectorType -> sparse, length -> 10, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(8.919345786428774, 1.0806542135712265))","Map(vectorType -> dense, length -> 2, values -> List(0.8919345786428774, 0.10806542135712265))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N799SW,MCO,ATL,655,95.0,404.0,2,27.4,30.22,58,61,0.0,90,30.21,30.11,10.0,59,360,10,0,MCO_ATL,0,0,0,0,0,0,0,0,0,1,0,0,1,124,train,10.0,"Map(vectorType -> sparse, length -> 10, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2), values -> List(1.0, 1.0, 4.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2), values -> List(0.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(8.919345786428774, 1.0806542135712265))","Map(vectorType -> dense, length -> 2, values -> List(0.8919345786428774, 0.10806542135712265))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N732US,CLT,PHL,630,93.0,449.0,2,221.9,30.33,27,32,0.0,82,30.34,29.5,9.0,30,0,0,0,CLT_PHL,0,0,0,0,0,0,0,0,0,0,0,0,0,125,train,5.0,"Map(vectorType -> sparse, length -> 10, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(8.919345786428774, 1.0806542135712265))","Map(vectorType -> dense, length -> 2, values -> List(0.8919345786428774, 0.10806542135712265))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N834JB,BOS,FLL,611,213.0,1237.0,5,3.7,30.04,6,24,0.0,46,30.03,30.01,10.0,19,240,13,0,BOS_FLL,0,0,0,0,0,0,0,0,0,0,1,1,1,126,train,0.0,"Map(vectorType -> sparse, length -> 10, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.2113982300032355))","Map(vectorType -> dense, length -> 2, values -> List(9.21267911976211, 0.787320880237893))","Map(vectorType -> dense, length -> 2, values -> List(0.9212679119762107, 0.0787320880237893))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N293AY,PHL,CLT,630,104.0,449.0,2,3.0,30.19,11,30,0.0,45,30.18,30.16,10.0,24,230,14,0,PHL_CLT,0,0,0,0,0,0,0,0,0,0,0,0,0,127,train,5.0,"Map(vectorType -> sparse, length -> 10, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(8.919345786428774, 1.0806542135712265))","Map(vectorType -> dense, length -> 2, values -> List(0.8919345786428774, 0.10806542135712265))",0.0
1.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,F9,N949FR,ATL,TTN,600,110.0,701.0,3,307.8,30.32,26,35,0.0,70,30.34,29.21,10.0,32,0,0,0,ATL_TTN,0,0,0,0,0,0,0,0,0,0,0,0,0,128,train,6.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 10), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 10), values -> List(0.0, 0.0, 0.0, 4.530597729822599))","Map(vectorType -> dense, length -> 2, values -> List(6.9797746363313085, 3.0202253636686924))","Map(vectorType -> dense, length -> 2, values -> List(0.6979774636331308, 0.30202253636686927))",0.0
1.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,NK,N522NK,DTW,LAS,652,273.0,1749.0,7,192.3,30.05,5,18,0.0,57,30.08,29.34,10.0,15,210,15,0,DTW_LAS,0,0,0,0,0,0,0,0,0,0,1,1,1,129,train,8.0,"Map(vectorType -> sparse, length -> 10, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 12), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 12), values -> List(0.0, 0.0, 0.0, 6.324555320336758))","Map(vectorType -> dense, length -> 2, values -> List(4.4633913704743575, 5.536608629525642))","Map(vectorType -> dense, length -> 2, values -> List(0.4463391370474358, 0.5536608629525641))",1.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N327NB,DCA,MSP,620,165.0,931.0,4,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_MSP,0,0,0,0,0,0,0,0,0,0,0,0,0,130,train,1.0,"Map(vectorType -> sparse, length -> 10, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 14, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(8.919345786428774, 1.0806542135712265))","Map(vectorType -> dense, length -> 2, values -> List(0.8919345786428774, 0.10806542135712265))",0.0


df5


label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,cv,OP_CARRIER_Index,carrier_vec,feature_vector,scaled_feature_vector,rawPrediction,probability,prediction
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,UA,N833UA,EWR,LAX,659,386.0,2454.0,10,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_LAX,0,0,0,0,0,0,0,0,0,0,0,0,0,161,train,3.0,"Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 7), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N128UW,DCA,CLT,659,89.0,331.0,2,3.0,30.23,13,29,0.0,51,30.22,30.16,10.0,24,200,5,0,DCA_CLT,0,0,0,0,0,0,0,0,0,0,0,0,0,162,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,B6,N763JB,JFK,SJU,614,223.0,1598.0,7,3.4,30.15,8,29,0.0,41,30.14,30.12,10.0,23,260,17,0,JFK_SJU,0,0,0,0,0,0,0,0,0,0,0,0,0,163,train,0.0,"Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 4), values -> List(0.0, 0.0, 0.0, 2.468552207266437))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N924DN,MIA,ATL,625,120.0,594.0,3,8.8,30.12,68,71,0.0,90,30.12,30.09,7.0,69,0,0,0,MIA_ATL,0,0,0,0,0,0,0,0,0,0,1,0,1,164,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N902WN,FLL,LAS,640,330.0,2173.0,9,3.4,30.12,70,71,0.0,96,30.11,30.11,5.0,70,340,7,0,FLL_LAS,0,0,0,0,1,0,0,0,0,0,1,1,1,165,train,5.0,"Map(vectorType -> sparse, length -> 9, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.926829268292682, 0.07317073170731707))","Map(vectorType -> dense, length -> 2, values -> List(0.9926829268292684, 0.007317073170731708))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N831AW,EWR,PHX,630,341.0,2133.0,9,2.1,30.14,9,24,0.0,52,30.14,30.11,10.0,20,240,11,0,EWR_PHX,0,0,0,0,0,0,0,0,0,0,0,0,0,166,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,NK,N607NK,ORD,LGA,556,124.0,733.0,3,201.8,30.07,4,16,0.0,59,30.11,29.35,10.0,13,250,17,30,ORD_LGA,0,0,0,0,0,0,0,0,0,0,0,0,0,167,train,8.0,"Map(vectorType -> sparse, length -> 9, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 12), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 12), values -> List(0.0, 0.0, 0.0, 4.530597729822599))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,US,N651AW,MCI,PHX,550,178.0,1044.0,5,306.3,30.26,8,16,0.0,71,30.31,29.16,10.0,14,220,13,0,MCI_PHX,0,0,0,0,0,0,0,0,0,0,1,0,1,168,train,4.0,"Map(vectorType -> sparse, length -> 9, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 8), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,DL,N3748Y,LGA,ATL,600,159.0,762.0,4,3.4,30.12,6,28,0.0,39,30.12,30.09,10.0,22,250,15,25,LGA_ATL,0,0,0,0,0,0,0,0,0,0,0,0,0,169,train,1.0,"Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 5), values -> List(0.0, 0.0, 0.0, 2.59870097418821))","Map(vectorType -> dense, length -> 2, values -> List(9.892346509671993, 0.10765349032800672))","Map(vectorType -> dense, length -> 2, values -> List(0.9892346509671993, 0.010765349032800672))",0.0
0.0,2015,1,1,1,4,2015-01-01T09:00:00.000+0000,2015-01-01T09:00:00.000+0000,WN,N268WN,PHL,FLL,655,170.0,992.0,4,3.0,30.19,11,30,0.0,45,30.18,30.16,10.0,24,230,14,0,PHL_FLL,0,0,0,0,0,0,0,0,0,0,0,0,0,170,train,5.0,"Map(vectorType -> sparse, length -> 9, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(0, 1, 2, 9), values -> List(0.0, 0.0, 0.0, 3.2914029430219163))","Map(vectorType -> dense, length -> 2, values -> List(9.926829268292682, 0.07317073170731707))","Map(vectorType -> dense, length -> 2, values -> List(0.9926829268292684, 0.007317073170731708))",0.0


In [0]:
#XGBoost - Needs to use larger dataset to work

#train models with XGBoost
def fit_xgboost_model(df, maxIter=10):
    xg = GBTClassifier(labelCol="label", featuresCol="feature_vector", maxIter=maxIter)
    lrn = xg.fit(df)
    return lrn

#return model results of each fold
xgboost_models = {}
for key in d_processed.keys():
    print(key)
    xgboost_models[key] = fit_xgboost_model(d_processed[key])
    lrn_summary = xgboost_models[key].summary
    display(lrn_summary.predictions)

df1


[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
[0;32m<command-2647101326228189>[0m in [0;36m<cell line: 11>[0;34m()[0m
[1;32m     11[0m [0;32mfor[0m [0mkey[0m [0;32min[0m [0md_processed[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m     12[0m     [0mprint[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 13[0;31m     [0mxgboost_models[0m[0;34m[[0m[0mkey[0m[0;34m][0m [0;34m=[0m [0mfit_xgboost_model[0m[0;34m([0m[0md_processed[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     14[0m     [0mlrn_summary[0m [0;34m=[0m [0mxgboost_models[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m.[0m[0msummary[0m[0;34m[0m[0;34m[0m[0m
[1;32m     15[0m     [0mdisplay[0m[0;34m([0m[0mlrn_summary[0m[0;34m.[0m[0mpr

In [0]:
#Neural Network (MLPC) - still troubleshooting

#train models with Multi Layer Neural Perceptron
def fit_MLPC_model(df, blockSize=128, seed=1234, layers = [4, 5, 4, 3], maxIter = 10):
    MLPC = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector", maxIter=maxIter, layers=layers, blockSize=blockSize, seed=seed)
    lrn = MLPC.fit(df)
    return lrn

#return model results of each fold
MLPC_models = {}
for key in d_processed.keys():
    print(key)
    MLPC_models[key] = fit_MLPC_model(d_processed[key])
    result = MLPC_models[key].transform(d_processed[key])
    result.show(10)

df1


[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
[0;32m<command-2647101326228184>[0m in [0;36m<cell line: 11>[0;34m()[0m
[1;32m     11[0m [0;32mfor[0m [0mkey[0m [0;32min[0m [0md_processed[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m     12[0m     [0mprint[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 13[0;31m     [0mMLPC_models[0m[0;34m[[0m[0mkey[0m[0;34m][0m [0;34m=[0m [0mfit_MLPC_model[0m[0;34m([0m[0md_processed[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     14[0m     [0mresult[0m [0;34m=[0m [0mMLPC_models[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0md_processed[0m[0;34m[[0m[0mkey[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     15[0m   

In [0]:
%run "/Shared/w261_Section4_Group2/Phase 3/custom_cv_module"

In [0]:
display(d_processed['df1'])

In [0]:
# run cross validation & return the crossvalidation F0.5 score for 'test' set
cv = CustomCrossValidator(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=4)

In [0]:
cvModel = cv.fit(d_processed)

In [0]:
cvModel.bestModel

In [0]:
#precision by label 

cvModel.avgMetrics

In [0]:
# make predictions
predictions = cvModel.transform(d_processed['df1'])

display(predictions.groupby('label', 'prediction').count())

In [0]:
display(predictions)

In [0]:
predictions.show()