In [0]:
# import libraries
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pyspark.sql import functions as f
from pyspark.sql import SQLContext
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, avg, mean
from pyspark.sql.functions import min
from pyspark.sql.functions import col, max
from pyspark.sql.functions import format_string
from pyspark.sql.functions import substring
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import concat
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import lit
from pyspark.sql.functions import to_utc_timestamp
from pyspark.sql.functions import expr
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import instr
from pyspark.sql.functions import row_number

from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer,OneHotEncoder
from pyspark.ml.classification import MultilayerPerceptronClassifier


from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator



In [0]:
#Initializes blob storage credentials/location
blob_container = "w261-sec4-group2" # The name of your container created in https://portal.azure.com
storage_account = "kdevery" # The name of your Storage account created in https://portal.azure.com
secret_scope = "sec4-group2" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261-key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

#Points to SAS token
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# Read in training and test data

train_df = spark.read.parquet(f"{blob_url}/feature_engineered_data").cache()
test_df = spark.read.parquet(f"{blob_url}/feature_engineered_data_test")

In [0]:
#practice_df = train_df.limit(10000).cache()

#display(practice_df)

In [0]:
#string indexing of carrier
carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
train_df = carrier_indexer.fit(train_df).transform(train_df)

#one hot encoding
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="carrier_vec")
train_df = onehotencoder_carrier_vector.fit(train_df).transform(train_df)

In [0]:
display(train_df)

DEP_DEL15,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,OP_CARRIER_Index,carrier_vec
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N17245,IAD,IAH,535,207.0,1190.0,5,88.4,30.22,12,19,0.0,74,30.23,29.87,10.0,17,190.0,5,0,IAD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N31412,TPA,EWR,555,156.0,997.0,4,5.8,30.22,55,60,0.0,84,30.22,30.21,9.0,57,20.0,7,0,TPA_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,2,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N337JB,BOS,BUF,549,96.0,395.0,2,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230.0,11,0,BOS_BUF,0,0,0,0,0,0,0,0,0,0,1,0,1,3,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N79521,PHL,IAH,530,230.0,1325.0,6,3.0,30.2,12,29,0.0,49,30.2,30.17,10.0,24,250.0,11,0,PHL_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N656JB,FLL,BDL,553,166.0,1173.0,5,3.4,30.12,70,71,0.0,96,30.12,30.11,6.0,70,310.0,6,0,FLL_BDL,0,0,0,0,1,0,0,0,0,0,1,0,1,5,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))"
1.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3LLAA,JFK,MIA,545,185.0,1089.0,5,3.4,30.16,11,29,0.0,47,30.16,30.14,10.0,24,250.0,14,0,JFK_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,6,2.0,"Map(vectorType -> sparse, length -> 18, indices -> List(2), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3FKAA,EWR,MIA,559,183.0,1085.0,5,2.1,30.15,11,23,0.0,60,30.15,30.12,10.0,20,240.0,9,0,EWR_MIA,0,0,0,0,0,0,0,0,0,0,1,1,1,7,2.0,"Map(vectorType -> sparse, length -> 18, indices -> List(2), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N284JB,SYR,JFK,555,83.0,209.0,1,125.9,30.0,10,22,0.0,60,30.01,29.55,10.0,19,260.0,16,0,SYR_JFK,0,0,0,0,0,0,0,0,0,0,1,0,1,8,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N623JB,BOS,PBI,545,205.0,1197.0,5,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230.0,11,0,BOS_PBI,0,0,0,0,0,0,0,0,0,0,1,0,1,9,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N37290,BOS,EWR,550,88.0,200.0,1,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230.0,11,0,BOS_EWR,0,0,0,0,0,0,0,0,0,0,1,0,1,10,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))"


In [0]:
#splitting training dataframe into five folds contained in dictionary "d"

d = {}
folds = ['df1','df2','df3','df4','df5']

each_len = train_df.count()/5
start = 1
val_size = each_len/5
stop = each_len
precision_list = []

for fold in folds:
    d[fold] = train_df.filter(col('Index').between(start,stop))\
                                  .withColumn('cv', F.when(col('Index').between(start,(stop-val_size)), 'train')
                                         .otherwise('val'))
    start += each_len
    stop += each_len

                                  

In [0]:
d['df1'].count()

Out[7]: 7092308

In [0]:
train_df.columns

In [0]:
def process_fold_df(fold_df):
    
    
    #imputation
    fold_df.createOrReplaceTempView("fold_view")
    
    imputation_columns = ['CRS_ELAPSED_TIME','HourlyAltimeterSetting','HourlyDewPointTemperature',
             'HourlyDryBulbTemperature','HourlyRelativeHumidity','HourlySeaLevelPressure',
             'HourlyStationPressure','HourlyVisibility','HourlyWetBulbTemperature',
             'HourlyWindDirection']

    means = {}

    for impute_col in imputation_columns:
        mean = spark.sql(f"SELECT AVG({impute_col}) FROM fold_view").collect()[0][0]
        means[impute_col] = mean
    
    print(means)
    
    #fill Nones and Nans - Seems to error sometimes?
    fold_df = fold_df.fillna(0,["HourlyWindGustSpeed"]) \
         .fillna(means["CRS_ELAPSED_TIME"],["CRS_ELAPSED_TIME"]) \
         .fillna(means["HourlyAltimeterSetting"],["HourlyAltimeterSetting"]) \
         .fillna(means["HourlyDewPointTemperature"],["HourlyDewPointTemperature"]) \
         .fillna(means["HourlyDryBulbTemperature"],["HourlyDryBulbTemperature"]) \
         .fillna(0,["HourlyPrecipitation"]) \
         .fillna(means["HourlyRelativeHumidity"],["HourlyRelativeHumidity"]) \
         .fillna(means["HourlySeaLevelPressure"],["HourlySeaLevelPressure"]) \
         .fillna(means["HourlyStationPressure"],["HourlyStationPressure"]) \
         .fillna(means["HourlyVisibility"],["HourlyVisibility"]) \
         .fillna(means["HourlyWetBulbTemperature"],["HourlyWetBulbTemperature"]) \
         .fillna(means["HourlyWindDirection"],["HourlyWindDirection"]) \
         .fillna(0,["HourlyWindSpeed"]) \
         .fillna("",["TAIL_NUM"])
         
    
    #vector assembler
    feature_cols = ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK','Snow','Thunder','CloudySkyCondition','carrier_vec']
    #assemble = VectorAssembler(inputCols=feature_cols, outputCol='features')
    #outputCol = "features"
    df_va = VectorAssembler(inputCols = feature_cols, outputCol = 'feature_vector')
    model_input = df_va.transform(fold_df)
    
    #rename delay flag to label
    model_input = model_input.withColumnRenamed("DEP_DEL15","label")
    #model_input = assemble.transform(fold_df) \
    #               .withColumnRenamed('DEP_DEL15', 'label')
    
    #scaling
    scaler=StandardScaler().setInputCol("feature_vector").setOutputCol("scaled_feature_vector")
    model_input = scaler.fit(model_input).transform(model_input)
    #model_input = model_input.select('label', 'scaled_feature_vector','cv')
    
    
    return model_input

In [0]:
d_processed = {}
for key in d.keys():
    print(key)
    d_processed[key] = process_fold_df(d[key])

df1
{'CRS_ELAPSED_TIME': 142.44063335103797, 'HourlyAltimeterSetting': 30.036623524997363, 'HourlyDewPointTemperature': 45.04962921085635, 'HourlyDryBulbTemperature': 60.196596939883925, 'HourlyRelativeHumidity': 62.408511124495895, 'HourlySeaLevelPressure': 30.025656140365168, 'HourlyStationPressure': 29.14308683282565, 'HourlyVisibility': 9.306679411101904, 'HourlyWetBulbTemperature': 52.40839550851939, 'HourlyWindDirection': 171.98819901580603}
df2
{'CRS_ELAPSED_TIME': 145.7431274478107, 'HourlyAltimeterSetting': 30.016099642615004, 'HourlyDewPointTemperature': 49.02220257590917, 'HourlyDryBulbTemperature': 65.19480606686781, 'HourlyRelativeHumidity': 61.42273726168876, 'HourlySeaLevelPressure': 29.998699273505157, 'HourlyStationPressure': 29.114083127363998, 'HourlyVisibility': 9.46247965227108, 'HourlyWetBulbTemperature': 56.45884165832188, 'HourlyWindDirection': 174.31167995906648}
df3
{'CRS_ELAPSED_TIME': 143.64426185521455, 'HourlyAltimeterSetting': 30.042330170210207, 'HourlyD

In [0]:
display(d_processed['df1'])

label,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,two_hrs_pre_flight_utc,Date_Time_utc,OP_CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,ELEVATION,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,HourlyWindGustSpeed,Route,Rain,Snow,Thunder,Fog,Mist,Freezing,Blowing,Smoke,Drizzle,Overcast,Broken,Scattered,CloudySkyCondition,Index,OP_CARRIER_Index,carrier_vec,cv,feature_vector,scaled_feature_vector
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N17245,IAD,IAH,535,207.0,1190.0,5,88.4,30.22,12,19,0.0,74,30.23,29.87,10.0,17,190,5,0,IAD_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 10), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 10), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 3.5064207761284507))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N31412,TPA,EWR,555,156.0,997.0,4,5.8,30.22,55,60,0.0,84,30.22,30.21,9.0,57,20,7,0,TPA_EWR,0,0,0,0,0,0,0,0,0,1,0,0,1,2,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 10), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 10), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 3.5064207761284507))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N337JB,BOS,BUF,549,96.0,395.0,2,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_BUF,0,0,0,0,0,0,0,0,0,0,1,0,1,3,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 4.832763598086092))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N79521,PHL,IAH,530,230.0,1325.0,6,3.0,30.2,12,29,0.0,49,30.2,30.17,10.0,24,250,11,0,PHL_IAH,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 10), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 10), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 3.5064207761284507))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N656JB,FLL,BDL,553,166.0,1173.0,5,3.4,30.12,70,71,0.0,96,30.12,30.11,6.0,70,310,6,0,FLL_BDL,0,0,0,0,1,0,0,0,0,0,1,0,1,5,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 4.832763598086092))"
1.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3LLAA,JFK,MIA,545,185.0,1089.0,5,3.4,30.16,11,29,0.0,47,30.16,30.14,10.0,24,250,14,0,JFK_MIA,0,0,0,0,0,0,0,0,0,0,0,0,0,6,2.0,"Map(vectorType -> sparse, length -> 18, indices -> List(2), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 8), values -> List(1.0, 1.0, 4.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 8), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.946297433592378))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,AA,N3FKAA,EWR,MIA,559,183.0,1085.0,5,2.1,30.15,11,23,0.0,60,30.15,30.12,10.0,20,240,9,0,EWR_MIA,0,0,0,0,0,0,0,0,0,0,1,1,1,7,2.0,"Map(vectorType -> sparse, length -> 18, indices -> List(2), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 8), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 8), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 2.946297433592378))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N284JB,SYR,JFK,555,83.0,209.0,1,125.9,30.0,10,22,0.0,60,30.01,29.55,10.0,19,260,16,0,SYR_JFK,0,0,0,0,0,0,0,0,0,0,1,0,1,8,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 4.832763598086092))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,B6,N623JB,BOS,PBI,545,205.0,1197.0,5,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_PBI,0,0,0,0,0,0,0,0,0,0,1,0,1,9,6.0,"Map(vectorType -> sparse, length -> 18, indices -> List(6), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 12), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 4.832763598086092))"
0.0,2015,1,1,1,4,2015-01-01T08:00:00.000+0000,2015-01-01T08:00:00.000+0000,UA,N37290,BOS,EWR,550,88.0,200.0,1,3.7,30.06,6,22,0.0,50,30.06,30.03,10.0,18,230,11,0,BOS_EWR,0,0,0,0,0,0,0,0,0,0,1,0,1,10,4.0,"Map(vectorType -> sparse, length -> 18, indices -> List(4), values -> List(1.0))",train,"Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 10), values -> List(1.0, 1.0, 4.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(0, 1, 2, 5, 10), values -> List(0.2828427921931523, 0.11329088575942964, 2.0103218253419253, 2.001637044646038, 3.5064207761284507))"


### Model Building

In [0]:
%run "/Shared/w261_Section4_Group2/Phase 3/custom_cv_module"

#### Logistic Regression

In [0]:
# set up grid search: estimator, set of params, and evaluator
logistic_model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
grid = ParamGridBuilder()\
            .addGrid(logistic_model.threshold, [0.5])\
            .build() 

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5, metricLabel=1)

In [0]:
# run cross validation & return the crossvalidation F0.5 score for 'test' set
cv = CustomCrossValidator(estimator=logistic_model, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=8)

In [0]:
cvModel = cv.fit(d_processed)

fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
fold 3 start...
fold 3 end
fold 4 start...
fold 4 end
fold 5 start...
fold 5 end
Best Model:  {Param(parent='LogisticRegression_d7645db3811a', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.5} Detailed Score [0.07351303740880462, 0.08322128392227596, 0.07725616090740332, 0.06855200628987705, 7.46517833632393e-05] Avg Score 0.060523428062344835


In [0]:
#for individual testing

#test_train = d_processed['df1'].filter(col('cv')=='train')
#test_val = d_processed['df1'].filter(col('cv')=='val')

#test_logistic_model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
#evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
#lrModel = test_logistic_model.fit(test_train)
#predictions = lrModel.transform(test_val)
#evaluator.evaluate(predictions)

In [0]:
# make predictions
predictions = cvModel.transform(d_processed['df1'])

display(predictions.groupby('label', 'prediction').count())

label,prediction,count
1.0,0.0,1275196
0.0,0.0,5817112


In [0]:
fbeta = cvModel.avgMetrics[0]
print(f"Logistic Regression F0.5 Score: {fbeta}")

Logistic Regression F0.5 Score: 0.060523428062344835


#### Random Forest

In [0]:
# set up grid search: estimator, set of params, and evaluator
rf_model = RandomForestClassifier(labelCol="label", featuresCol="scaled_feature_vector")
grid = ParamGridBuilder()\
            .addGrid(rf_model.maxDepth, [10])\
            .addGrid(rf_model.numTrees, [64])\
            .build()

# Example using F0.5 score for evaluator
evaluator = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', beta=0.5,metricLabel=1)

In [0]:
cv_rf = CustomCrossValidator(estimator=rf_model, estimatorParamMaps=grid, evaluator=evaluator,splitWord =('train','val'), cvCol = 'cv',parallelism=8)

In [0]:
cvModel_rf1 = cv_rf.fit(d_processed)

fold 1 start...
fold 1 end
fold 2 start...
fold 2 end
fold 3 start...
fold 3 end
fold 4 start...
fold 4 end
fold 5 start...
fold 5 end
Best Model:  {Param(parent='RandomForestClassifier_b175a3cfe482', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 10, Param(parent='RandomForestClassifier_b175a3cfe482', name='numTrees', doc='Number of trees to train (>= 1).'): 64} Detailed Score [0.011205522409182662, 0.015062820133580356, 0.05309732010543106, 0.051756398425788476, 6.787373313677099e-06] Avg Score 0.02622576968945925


In [0]:
# make predictions
predictions_rf = cvModel_rf1.transform(d_processed['df1'])

display(predictions_rf.groupby('label', 'prediction').count())

label,prediction,count
1.0,1.0,2
0.0,1.0,6
1.0,0.0,1275194
0.0,0.0,5817106


In [0]:
fbeta = cvModel_rf1.avgMetrics[0]
print(f"Random Forest F0.5 Score: {fbeta}")

Random Forest F0.5 Score: 0.02622576968945925


### Advanced Modeling

#### May not use whats below this!

In [0]:
#Logistic Regression

#train models with logistic regression
def fit_log_model(df):
    model = LogisticRegression(labelCol="label", featuresCol="scaled_feature_vector")
    lrn = model.fit(df)
    return lrn

#return model results of each fold
logistic_models = {}
for key in d_processed.keys():
    print(key)
    logistic_models[key] = fit_log_model(d_processed[key])
    lrn_summary = logistic_models[key].summary
    display(lrn_summary.predictions)

#evaluator = MulticlassClassificationEvaluator(metricName='precisionByLabel')
#evaluator = BinaryClassificationEvaluator(metricName='Precision')


#grid = ParamGridBuilder()\
#        .addGrid(model.threshold,[0.5,0.8])\
#        .build()

In [0]:
#Random Forest

#train models with random forest
def fit_forest_model(df, numTrees=10):
    rf = RandomForestClassifier(labelCol="label", featuresCol="feature_vector", numTrees=numTrees)
    lrn = rf.fit(df)
    return lrn

#return model results of each fold
forest_models = {}
for key in d_processed.keys():
    print(key)
    forest_models[key] = fit_forest_model(d_processed[key])
    lrn_summary = forest_models[key].summary
    display(lrn_summary.predictions)

In [0]:
#XGBoost - Needs to use larger dataset to work

#train models with XGBoost
def fit_xgboost_model(df, maxIter=10):
    xg = GBTClassifier(labelCol="label", featuresCol="feature_vector", maxIter=maxIter)
    lrn = xg.fit(df)
    return lrn

#return model results of each fold
xgboost_models = {}
for key in d_processed.keys():
    print(key)
    xgboost_models[key] = fit_xgboost_model(d_processed[key])
    lrn_summary = xgboost_models[key].summary
    display(lrn_summary.predictions)

In [0]:
#Neural Network (MLPC) - still troubleshooting

#train models with Multi Layer Neural Perceptron
def fit_MLPC_model(df, blockSize=128, seed=1234, layers = [4, 5, 4, 3], maxIter = 10):
    MLPC = MultilayerPerceptronClassifier(labelCol="label", featuresCol="scaled_feature_vector", maxIter=maxIter, layers=layers, blockSize=blockSize, seed=seed)
    lrn = MLPC.fit(df)
    return lrn

#return model results of each fold
MLPC_models = {}
for key in d_processed.keys():
    print(key)
    MLPC_models[key] = fit_MLPC_model(d_processed[key])
    result = MLPC_models[key].transform(d_processed[key])
    result.show(10)