# Model Experimentation

## Notebook Setup

In [0]:
%pip install timezonefinder
%pip install tzfpy

Python interpreter will be restarted.
Collecting timezonefinder
  Downloading timezonefinder-6.1.8.tar.gz (45.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting h3<4,>=3.7.6
  Downloading h3-3.7.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Collecting cffi<2,>=1.15.1
  Using cached cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (441 kB)
Collecting setuptools>=65.5
  Using cached setuptools-65.6.3-py3-none-any.whl (1.2 MB)
Building wheels for collected packages: timezonefinder
  Building wheel for timezonefinder (PEP 517): started
  Building wheel for timezonefinder (PEP 517): finished with status 'done'
  Created wheel for timezonefinder: filename=timezonefinder-6.1.8-c

In [0]:
# General 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sys
from statistics import mean
import itertools
import mlflow.spark
from sklearn import neighbors

# PySpark 
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import regexp_replace

# SQL Functions
from pyspark.sql import functions as f
from pyspark.sql.functions import monotonically_increasing_id, to_timestamp, to_utc_timestamp, to_date
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, first, dense_rank
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType, FloatType, DecimalType
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from functools import reduce
from pyspark.sql.functions import rand,col,when,concat,substring,lit,udf,lower,sum as ps_sum,count as ps_count,row_number
from pyspark.sql.window import *
from pyspark.sql import DataFrame
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, BucketedRandomProjectionLSH
from pyspark.ml.linalg import DenseMatrix, Vectors, VectorUDT
from pyspark.sql.functions import row_number

# ML
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Misc 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from timezonefinder import TimezoneFinder
from tzfpy import get_tz



In [0]:
# Display and define where mids-w261 is located
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
# display(dbutils.fs.ls(f"{data_BASE_DIR}"))

# Inspect the Mount's Final Project folder 
data_BASE_DIR = "dbfs:/mnt/mids-w261/datasets_final_project_2022/"
# display(dbutils.fs.ls(f"{data_BASE_DIR}"))

In [0]:
blob_container = "housestark" # The name of your container created in https://portal.azure.com
storage_account = "neilp" # The name of your Storage account created in https://portal.azure.com
secret_scope = "w261_s1g4" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261_s1g4_key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

In [0]:
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# df = spark.read.parquet(f"{blob_url}/df_main_3m")
df_full = spark.read.parquet(f"{blob_url}/df_main_fullClean")

## Helper Functions

In [0]:
def preModeling_dataEdit(df):
  '''
  Input: df that has already gone through the final join, cleaning, and feature engineering
  Output: df that includes null imputing and # and % of flights (by tail number) that were delayed and cancelled in the past 90 days --> these depend on window functions, as such they need to be done right after the data is split for modelling and not during feature engineering phase
  '''
  
  ### FINAL CLEANING 
  # Remove rows with null scheduled_departure_UTC because these are rows without a proper timezone (timezonefinder could not find)
  df = df.na.drop(subset=["scheduled_departure_UTC"])
  dropCols = ['TAXI_IN', 'TAXI_OUT']
  df = df.drop(*dropCols) 

  
  ### FINAL FEATURE ADDITIONS
  ## GET NUMBER & PERCENTAGE OF TIMES A PLANE (BY TAIL NUMBER) HAS BEEN DELAYED OR CANCELLED IN THE PAST 3 MONTHS (2 COLUMNS)
  # Make window function
  df = df.withColumn('roundedMonth', f.date_trunc('month', df.scheduled_departure_UTC))
  window_3m = Window().partitionBy('TAIL_NUM').orderBy(f.col('roundedMonth').cast('long')).rangeBetween(-(86400*89), 0) 

  # Add in Columns
  # Number of flights delayed/cancelled
  df = df.withColumn('no_delays_last3m', when(df.TAIL_NUM.isNotNull(), f.sum('dep_delay_15').over(window_3m)).otherwise(-1)) \
         .withColumn('no_cancellation_last3m', when(df.TAIL_NUM.isNotNull(), f.sum('CANCELLED').over(window_3m)).otherwise(-1)) 
  # Percentage of flights delayed/cancelled
  df = df.withColumn('count_flights_last3m', when(df.TAIL_NUM.isNotNull(), f.count('TAIL_NUM').over(window_3m)).otherwise(-1)) 
  df = df.withColumn('perc_delays_last3m', when(df.count_flights_last3m != -1, (df.no_delays_last3m/ df.count_flights_last3m)).otherwise(-1.0)) \
         .withColumn('perc_cancellation_last3m', when(df.count_flights_last3m != -1, (df.no_cancellation_last3m/ df.count_flights_last3m)).otherwise(-1.0))     
  
  ### HANDLING NULLS
  ## Imputing Hourly Weather Data to the best of our ability (up to 3 hours back)
  window = Window.partitionBy(col("ORIGIN_AIRPORT_ID"))\
                     .orderBy(col("rounded_depTimestamp"))\
                     .rowsBetween(0,3)
  
  cols_to_fill  = ['origin_HourlyAltimeterSetting', 'origin_HourlyDewPointTemperature', 'origin_HourlyDryBulbTemperature', 'origin_HourlyPrecipitation', 'origin_HourlyPressureChange', 'origin_HourlyPressureTendency', 'origin_HourlyRelativeHumidity', 'origin_HourlySeaLevelPressure', 'origin_HourlyStationPressure', 'origin_HourlyVisibility', 'origin_HourlyWetBulbTemperature', 'origin_HourlyWindDirection', 'origin_HourlyWindGustSpeed', 'origin_HourlyWindSpeed', 'origin_HourlySkyConditions_SCT_cnt', 'origin_HourlySkyConditions_OVC_cnt', 'origin_HourlySkyConditions_FEW_cnt', 'origin_HourlySkyConditions_BKN_cnt', 'origin_HourlySkyConditions_VV_cnt', 'origin_HourlySkyConditions_SKC_cnt', 'origin_HourlySkyConditions_CLR_cnt', 'dest_HourlyAltimeterSetting', 'dest_HourlyDewPointTemperature', 'dest_HourlyDryBulbTemperature', 'dest_HourlyPrecipitation', 'dest_HourlyPressureChange', 'dest_HourlyPressureTendency', 'dest_HourlyRelativeHumidity', 'dest_HourlySeaLevelPressure', 'dest_HourlyStationPressure', 'dest_HourlyVisibility', 'dest_HourlyWetBulbTemperature', 'dest_HourlyWindDirection','dest_HourlyWindGustSpeed', 'dest_HourlyWindSpeed', 'dest_HourlySkyConditions_SCT_cnt', 'dest_HourlySkyConditions_OVC_cnt', 'dest_HourlySkyConditions_FEW_cnt', 'dest_HourlySkyConditions_BKN_cnt', 'dest_HourlySkyConditions_VV_cnt', 'dest_HourlySkyConditions_SKC_cnt', 'dest_HourlySkyConditions_CLR_cnt']

  
  for field in cols_to_fill:
      filled_column_start = first(df[field], ignorenulls=True).over(window)
      df = df.withColumn(field, filled_column_start)
  
  ## We are still left with some null values --> will deal with them now in accordance to the table in section VII of this notebook
  impute_minus1int = ['DEP_DELAY_NEW', 'holiday' ,'holiday_in2DayRange']
  df = df.na.fill(value = -1,subset = impute_minus1int)
  
  impute_minus9999int = ['DEP_DELAY']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_minus1fl = ['perc_delays_last3m', 'perc_cancellation_last3m']
  df = df.na.fill(value = -1.0,subset = impute_minus1fl)
  
  impute_minus9999int = ['elevation_ft']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_99int = [ 'origin_HourlyRelativeHumidity', 'dest_HourlyRelativeHumidity']
  df = df.na.fill(value = 99 ,subset = impute_99int)
  
  impute_99fl = ['origin_HourlyPrecipitation', 'dest_HourlyPrecipitation']
  df = df.na.fill(value = 99.0 ,subset = impute_99fl)
  
  impute_999int = ['origin_HourlyPressureTendency', 'dest_HourlyPressureTendency']
  df = df.na.fill(value = 999 ,subset = impute_999int)
  
  impute_999fl = ['origin_HourlyPressureChange', 'dest_HourlyPressureChange']
  df = df.na.fill(value = 999.0 ,subset = impute_999fl)
  
  impute_9999int = ['origin_HourlyDewPointTemperature', 'origin_HourlyDryBulbTemperature', 'origin_HourlyWetBulbTemperature', 'origin_HourlyWindGustSpeed', 'dest_HourlyDewPointTemperature', 'dest_HourlyDryBulbTemperature', 'dest_HourlyWetBulbTemperature', 'dest_HourlyWindGustSpeed']
  df = df.na.fill(value = 9999 ,subset = impute_9999int)
    
  impute_99999int = ['origin_HourlyWindDirection', 'origin_HourlyWindSpeed', 'dest_HourlyWindDirection', 'dest_HourlyWindSpeed']
  df = df.na.fill(value = 99999 ,subset = impute_99999int)
  
  impute_99999fl = ['origin_HourlyAltimeterSetting',  'dest_HourlyAltimeterSetting', 'origin_HourlySeaLevelPressure','dest_HourlySeaLevelPressure', 'origin_HourlyStationPressure', 'dest_HourlyStationPressure']
  df = df.na.fill(value = 99999.0 ,subset = impute_99999fl)
  
  impute_999999fl = ['origin_HourlyVisibility', 'dest_HourlyVisibility']
  df = df.na.fill(value = 999999.0 ,subset = impute_999999fl)
  
  impute_str = ['TAIL_NUM', 'type', 'origin_HourlySkyConditions', 'dest_HourlySkyConditions', 'local_timestamp', 'timezone']
  df = df.na.fill(value = 'no_data',subset = impute_str)
  
  imputed_cols  = cols_to_fill + ['no_delays_last3m', 'no_cancellation_last3m', 'count_flights_last3m', 'perc_delays_last3m', 'perc_cancellation_last3m', 'elevation_ft']
  
  return df,imputed_cols

In [0]:
# Function to create pipeline
def create_pipeline(df, inputCols_cat, inputCols_cont):
  """Creates a feature engineering pipeline for modeling 
  Args:
    inputCols_cat (list): list of categorical input cols
    inputCols_cont (list): list of continuous input cols 
  
  Returns: 
    pipeline (Pipeline): MLlib pipeline with stages  
  """
  
  # String Indexer
  inputCols_categorical_indexed = [f'{i}_index' for i in inputCols_cat]
  string_indexer = StringIndexer(inputCols = inputCols_cat, 
                                 outputCols = inputCols_categorical_indexed).setHandleInvalid('keep')

  # One Hot Encoder  
  inputCols_categorical_encoded = [f'{i}_encoded' for i in inputCols_categorical_indexed]
  one_hot_encoder = OneHotEncoder(inputCols = inputCols_categorical_indexed, 
                                  outputCols = inputCols_categorical_encoded)

  # Vector Assembler (Categorical)
  assembler_cat = VectorAssembler(inputCols = inputCols_categorical_encoded, 
                              outputCol = 'features_cat').setHandleInvalid('keep')
  
  # Vector Assembler (Continuous)
  assembler_cont = VectorAssembler(inputCols = inputCols_cont, 
                              outputCol = 'features_cont').setHandleInvalid('keep')

  # Pipeline
  return Pipeline().setStages([string_indexer, one_hot_encoder, assembler_cat, assembler_cont])

In [0]:
def impute_and_scale_features(df):
  
  # Impute data 
  imputed_df, imputed_cols = preModeling_dataEdit(df)

  # Vector Assembler (Continuous) 
  assembler_cont = VectorAssembler(inputCols = ['features_cont'] + imputed_cols, 
                              outputCol = 'features_cont_all').setHandleInvalid('keep')

  # Standard Scaler 
  scaler = StandardScaler(inputCol = 'features_cont_all',
                          outputCol = 'features_scaled',
                          withMean = True, withStd = True)
  
  # Vector Assembler (Continuous + Categorical) 
  assembler_all = VectorAssembler(inputCols = ['features_scaled', 'features_cat'], 
                              outputCol = 'features_all').setHandleInvalid('keep')

  pipeline = Pipeline().setStages([assembler_cont, scaler, assembler_all])

  # Create features_scaled for all dfs
  pipeline_df = pipeline.fit(imputed_df).transform(imputed_df) 
  
  return  pipeline_df

In [0]:
def SmoteSampling(vectorized, k = 5, minorityClass = 1, majorityClass = 0, percentageOver = 200, percentageUnder = 100):
    if(percentageUnder > 100|percentageUnder < 10):
        raise ValueError("Percentage Under must be in range 10 - 100");
    if(percentageOver < 100):
        raise ValueError("Percentage Over must be in at least 100");
    dataInput_min = vectorized[vectorized['label'] == minorityClass]
    dataInput_maj = vectorized[vectorized['label'] == majorityClass]
    feature = dataInput_min.select('features_cont')
    feature = feature.rdd
    feature = feature.map(lambda x: x[0])
    feature = feature.collect()
    feature = np.asarray(feature)
    nbrs = neighbors.NearestNeighbors(n_neighbors=k, algorithm='auto').fit(feature)
    neighbours =  nbrs.kneighbors(feature)
    gap = neighbours[0]
    neighbours = neighbours[1]
    min_rdd = dataInput_min.drop('label').rdd
    pos_rddArray = min_rdd.map(lambda x : list(x))
    pos_ListArray = pos_rddArray.collect()
    min_Array = list(pos_ListArray)
    newRows = []
    nt = len(min_Array)
    nexs = percentageOver/100
    for i in range(nt):
        for j in range(nexs):
            neigh = random.randint(1,k)
            difs = min_Array[neigh][0] - min_Array[i][0]
            newRec = (min_Array[i][0]+random.random()*difs)
            newRows.insert(0,(newRec))
    newData_rdd = sc.parallelize(newRows)
    newData_rdd_new = newData_rdd.map(lambda x: Row(features = x, label = 1))
    new_data = newData_rdd_new.toDF()
    new_data_minor = dataInput_min.unionAll(new_data)
    new_data_major = dataInput_maj.sample(False, (float(percentageUnder)/float(100)))
    return new_data_major.unionAll(new_data_minor)

In [0]:
def get_sampling(train_df, sampling):
  """Modifies the training data to under/over sample 
  Args:
    train_df (df): training data
    sampling (string): if none, no sampling is performed; if under, undersampling is performed; if over, oversampling is performed 
  Returns:
    train_df_sampled (df): modified training data 
  """
  print("train_df: ", train_df.count())
  
  # No sampling 
  if sampling == 'none':
    return train_df
  
  # Undersampling
  elif sampling == 'under':
    no_delay = train_df.filter(col('label') == 0)
    delay = train_df.filter(col('label') == 1)
    
    class_ratio =  delay.count() / no_delay.count()
    no_delay_sample = no_delay.sample(withReplacement=True, fraction=class_ratio)
    train_df_sampled = delay.unionAll(no_delay_sample)
    
    return train_df_sampled
    
  # Oversampling
  elif sampling == 'over':
    train_df_sampled = SmoteSampling(train_df)
    
    print("train_df_sampled:" , train_df_sampled.count())
    return train_df_sampled

In [0]:
def grid_search_test_train_split(pipeline_df, sample_size, sampling='none'):
  """Splits the dataframe in train and test splits for grid search 
  Args:
    df (dataframe): dataframe to model on; requirements:
      - Has gone through create_pipeline function 
      - Has 'Year' column from 2015 - 2021
      - Has 'features' column (not scaled)
      - Has 'label' column
    sample_size (float): optional parameter to specify if you would like a subset of the data 
    sampling (string): if none, no sampling is performed; if under, undersampling is performed; if over, oversampling is performed 
  
  Returns: 
    results_df (df):  dataframe of parameters tested and the results from that iteration   
  """
  # ---------- Split Data ---------- #
  train = pipeline_df.filter(col('Year') <= 2019)
  train = get_sampling(train, sampling)
  val   = pipeline_df.filter(col('Year') == 2020)

  # ---------- Get Subset of Train & Val Data ---------- #
  if sample_size:
    train = train.sample(sample_size)
    val = val.sample(sample_size)

  # ---------- Impute and Scale Features ---------- #
  train_df_full = impute_and_scale_features(train)
  val_df_full   = impute_and_scale_features(val)
  
  return train_df_full, val_df_full

In [0]:
inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df = pipeline.fit(df_full).transform(df_full)

In [0]:
# pipeline_df
train_001_none, val_001_none = grid_search_test_train_split(pipeline_df, 0.00001, sampling='over')


train_df:  31730153


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-632558266976412>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0;31m# pipeline_df[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mtrain_001_none[0m[0;34m,[0m [0mval_001_none[0m [0;34m=[0m [0mgrid_search_test_train_split[0m[0;34m([0m[0mpipeline_df[0m[0;34m,[0m [0;36m0.00001[0m[0;34m,[0m [0msampling[0m[0;34m=[0m[0;34m'over'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-632558266976332>[0m in [0;36mgrid_search_test_train_split[0;34m(pipeline_df, sample_size, sampling)[0m
[1;32m     15[0m   [0;31m# ---------- Split Data ---------- #[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m     16[0m   [0mtrain[0m [0;34m=[0m [0mpipeline_df[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m'Year'[0m[0;3

In [0]:
def get_model(model_type, params):
  """Builds a model based on the given parameters
  Args:
    model_type (string): type of model to be built 
    params (dict): dictionary of parameters specific to the model_type
  Returns:
    model: MLlib model ready to be trained 
    ml_type (string): type of model (classification or regression)
  """
  # Logistic Regression
  if model_type == 'LogisticRegression':
    ml_type = 'c'
    model = LogisticRegression(featuresCol = 'features_all',
                               labelCol = 'label',
                               maxIter = params['maxIter'],
                               regParam = params['regParam'],
                               elasticNetParam = params['elasticNetParam'])

  # Linear Regression
  elif model_type == 'LinearRegression':
    ml_type = 'r'
    model = LinearRegression(featuresCol = 'features_all',
                             labelCol = 'DEP_DELAY_NEW',
                             maxIter = params['maxIter'],
                             regParam = params['regParam'],
                             elasticNetParam = params['elasticNetParam'])

  # Decision Tree Classifier
  elif model_type == 'DecisionTreeClassifier':
    ml_type = 'c'
    model = DecisionTreeClassifier(featuresCol = 'features_all',
                                   labelCol = 'label',
                                   maxDepth = params['maxDepth'],
                                   impurity = params['impurity'],
                                   maxBins = params['maxBins'])
    
  # Decision Tree Regressor
  elif model_type == 'DecisionTreeRegressor':
    ml_type = 'r'
    model = DecisionTreeRegressor(featuresCol = 'features_all',
                                  labelCol = 'DEP_DELAY_NEW',
                                  maxDepth = params['maxDepth'])

  # Random Forest Classifier
  elif model_type == 'RandomForestClassifier':
    ml_type = 'c'
    model = RandomForestClassifier(featuresCol = 'features_all',
                                   labelCol='label',
                                   numTrees= params['numTrees'], 
                                   maxDepth=params['maxDepth'], 
                                   impurity = params['impurity'],
                                   maxBins = params['maxBins'])
  
  # Random Forest Regressor
  elif model_type == 'RandomForestRegressor':
    ml_type = 'r'
    model = RandomForestRegressor(featuresCol = 'features_all',
                                   labelCol='DEP_DELAY_NEW',
                                   numTrees= params['numTrees'], 
                                   maxDepth=params['maxDepth'])

  # Gradient Boosted Tree Regressor 
  elif model_type == 'GBTRegressor':
    ml_type = 'r'
    model = GBTRegressor(featuresCol = 'features_all',
                         labelCol='DEP_DELAY_NEW',
                         maxIter= params['maxIter'], 
                         maxDepth=params['maxDepth'],
                         stepSize = params['stepSize'])

  # MLP NN Classifier 
  elif model_type == 'MultilayerPerceptronClassifier':
    ml_type = 'c'
    model = MultilayerPerceptronClassifier(featuresCol = 'features_all',
                         labelCol='label',
                         layers = params['layers'],
                         maxIter= params['maxIter'], 
                         blockSize=params['blockSize'],
                         stepSize = params['stepSize'])

  
  return model, ml_type

In [0]:
def get_param_permutations(params):
  """Given a dictionary of parameters to test in a grid search, returns all possible permutations
  Args:
    params (dict): dictionary of parameters inputted by user
  Returns:
    param_list (list): list of dictionaries to pass to the model
  """
  param_list = []
  vals = params.values()

  # Loop through all permutations 
  for param_vals in list(itertools.product(*vals)):
    # Create a dictionary to hold each permutation of parameters 
    param_dict = {}
    # Loop over the different parameters 
    for i, key in enumerate(params.keys()):
      param_dict[key] = param_vals[i]
    # Add each dictionary to the parameter list 
    param_list.append(param_dict)
  return param_list 

In [0]:
def evaluate_model(predictions, ml_type):
  """Provides evaluation metrics for classification/regression models
  Args:
    predictions (df): dataframe of predicated and actual values 
    ml_type (string): type of model 
  Returns:
    classification: accuracy, precision, recall, f1score
    regression: r2, rmse, mse, mae
  """
  if ml_type == 'c':
    eval_accuracy = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction')
    eval_precision = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='precisionByLabel')
    eval_recall = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='recallByLabel')
    eval_f1 = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')

    accuracy = eval_accuracy.evaluate(predictions)
    precision = eval_precision.evaluate(predictions)
    recall = eval_recall.evaluate(predictions)
    f1score = eval_f1.evaluate(predictions)
    
    return accuracy, precision, recall, f1score
    
  elif ml_type == 'r':
    eval_r2 = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='r2')
    eval_rmse = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='rmse')
    eval_mse = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='mse')
    eval_mae = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='mae')
    
    r2 = eval_r2.evaluate(predictions)
    rmse = eval_rmse.evaluate(predictions)
    mse = eval_mse.evaluate(predictions)
    mae = eval_mae.evaluate(predictions)
    
    return r2, rmse, mse, mae

## Modeling Functions

In [0]:
def train_model_no_CV(train_df, val_df, model_type, params, train_metrics=False):
  """Splits the df into time series cross validation splits, trains a model, and provides evaluation metrics. Should be used for experimentation to determine best model parameters.
  Args:
    train_df (df): training data that has been through grid_search_test_train_split
    val_df (df): validation data that has been through grid_search_test_train_split
    model_type (string): indicates the type of model that will be trained 
    params (dict): a dictionary of parameters as keys and list of parameter values as values 
        - LogisticRegression: { 'maxIter': [10,20,30], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8,0.9] }
        - LinearRegression: { 'maxIter': [10,20,30], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8,0.9] }
        - DecisionTreeClassifier: { 'numClasses': [3], 'maxDepth': [2], 'impurity': ['gini'], 'maxBins': [32] }
        - DecisionTreeRegressor: { 'maxDepth': [1,2,3] }
  
  Returns: 
    results_df (df):  dataframe of parameters tested and the results from that iteration   
  """

  # ---------- Train Model ---------- #
  param_permutations = get_param_permutations(params)

  results_df = pd.DataFrame()
  for param in param_permutations:
    model, ml_type = get_model(model_type, param)
    trained_model  = model.fit(train_df)
    
    if train_metrics == True:
      training_predictions = trained_model.transform(train_df)
    predictions          = trained_model.transform(val_df)

    # ---------- Evaluate Model ---------- #
    iter_params = pd.DataFrame(param, index=[0])
    
    # Classification 
    if ml_type == 'c':
      if train_metrics == True:
        train_accuracy, train_precision, train_recall, train_f1score = evaluate_model(training_predictions, ml_type)
        train_iter_results = pd.DataFrame({'Train Accuracy': [train_accuracy], 'Train Precision': [train_precision], 'Train Recall': [train_recall], 'Train F1 Score': [train_f1score]})
      val_accuracy, val_precision, val_recall, val_f1score = evaluate_model(predictions, ml_type)
      val_iter_results = pd.DataFrame({'Val Accuracy': [val_accuracy], 'Val Precision': [val_precision], 'Val Recall': [val_recall], 'Val F1 Score': [val_f1score]})

    # Regression
    elif ml_type == 'r':
      if train_metrics == True:
        train_r2, train_rmse, train_mse, train_mae = evaluate_model(training_predictions, ml_type)
        train_iter_results = pd.DataFrame({'Train R2': [train_r2], 'Train RMSE': [train_rmse], 'Train MSE': [train_mse], 'Train MAE': [train_mae]})
      val_r2, val_rmse, val_mse, val_mae = evaluate_model(predictions, ml_type)
      val_iter_results = pd.DataFrame({'Val R2': [val_r2], 'Val RMSE': [val_rmse], 'Val MSE': [val_mse], 'Val MAE': [val_mae]})
      
    if train_metrics == True:
      iter_df = pd.concat([iter_params, train_iter_results, val_iter_results], axis=1)  
    else:
      iter_df = pd.concat([iter_params, val_iter_results], axis=1)
    results_df = pd.concat([results_df,iter_df], axis=0)
  
  return results_df

## Experimentation

For phase 3, we ran experiments on basic models (logisic regression, linear regression, decision tree classifier, and decision tree regressor) to determine the best hyperparameters for each model. We went through various levels of sampling to iterate on models and hyperparameter tune. We started with a sample size of 1%, then ran experiments on a sample size of 10%. The experimentation was done with an experimentation function that included a grid search of hyperparameters and no/under/oversampling. Below is a table summarizing the experiments that were run. Please note that not all of the experiments are shown on this notebook, as some were run on separate notebooks. This notebook contains all of the experiments that have been slightly optimized so that we didn't have to rerun everything (for example, in some cases we would run ~6 experients as opposed to the full 36). 

#### Sample Size = 0.1%
| Model Type     | Model                    | Evaluation Metric | Training Time (None/Under/Over Sampling) | Number of Experiments | Sampling        | Hyperparameters | 
| -------------- | ------------------------ | ----------------- | ---------------------------------------- | --------------------- | --------------- | --------------- |
| Classification | Logistic Regression      |  F1 Score         | 21/23/26 mins                            |           36          | None/Under/Over | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.3, 0.4 <br/> Elastic Net Param: 0, 0.8 | 
| Regression     | Linear Regression        |  MAE              | 22/24/28 mins                            |           36          | None/Under/Over | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.3, 0.4 <br/> Elastic Net Param: 0, 0.8 |
| Classification | Decision Tree Classifier |  F1 Score         | 12/31/4 mins                             |           72          | None/Under/Over | Max Depth: 1, 2, 3, 4 <br/> Impurity: Gini, Entropy <br/> Max Bins: 28,32,40 |
| Regression     | Decision Tree Regressor  |  MAE              | 5/5/10 mins                              |           12          | None/Under/Over | Max Depth: 1, 2, 3, 4 |
| Classification     | Random Forest Classification  |  F1 Score              | 19.40/11.66/10.99 mins                              |           108          | None/Over/Under | Number of Trees: 10, 20, 50 <br/> Max Depth: 5, 10, 15 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32,40 |
| Regression     | Random Forest Regression  |  MAE              | 3.97/5.09/4.90 mins                              |           27          | None/Over/Under | Number of Trees: 10, 20, 50 <br/> Max Depth: 5, 10, 15 |
| Regression     | Gradient Boosted Trees Regression |  MAE              |  38.20 mins                              |           81          | None/Over/Under | Max Depth: 5, 10, 15 <br/> Max Iterations: 10, 20, 50 <br/> Step Size (Learning Rate): 0.01, 0.1, 1.0 |


#### Sample Size = 1%
| Model Type     | Model                    | Evaluation Metric | Training Time (None/Under/Over Sampling) | Number of Experiments | Sampling        | Hyperparameters | 
| -------------- | ------------------------ | ----------------- | ---------------------------------------- | --------------------- | --------------- | --------------- |
| Classification | Logistic Regression      |  F1 Score         | 21/23/26 mins                            |           36          | None/Under/Over | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.3, 0.4 <br/> Elastic Net Param: 0, 0.8 | 
| Regression     | Linear Regression        |  MAE              | 22/24/28 mins                            |           36          | None/Under/Over | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.3, 0.4 <br/> Elastic Net Param: 0, 0.8 |
| Classification | Decision Tree Classifier |  F1 Score         | 12/31/4 mins                             |           72          | None/Under/Over | Max Depth: 1, 2, 3, 4 <br/> Impurity: Gini, Entropy <br/> Max Bins: 28,32,40 |
| Regression     | Decision Tree Regressor  |  MAE              | 5/5/10 mins                              |           12          | None/Under/Over | Max Depth: 1, 2, 3, 4 |
| Classification     | Random Forest Classification  |  F1 Score              | 5/5/10 mins                              |           20          | None/Under | Number of Trees: 10, 20, 50 <br/> Max Depth: 5, 10, 15 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32,40 |
| Regression     | Random Forest Regression  |  MAE              | 5/5/10 mins                              |           3          | None/Over/Under | Number of Trees: 10, 20, 50 <br/> Max Depth: 5, 10, 15 |
| Regression     | Gradient Boosted Trees Regression |  MAE              |  mins                              |           81          | None/Over/Under | Max Depth: 5, 10, 15 <br/> Max Iterations: 10, 20, 50 <br/> Step Size (Learning Rate): 0.01, 0.1, 1.0 |

#### Sample Size = 10%
| Model Type     | Model                    | Evaluation Metric | Training Time | Number of Experiments | Sampling        | Hyperparameters | 
| -------------- | ------------------------ | ----------------- | ------------- | --------------------- | --------------- | --------------- |
| Classification | Logistic Regression      |  F1 Score         |     4 mins    |           4           | None            | Max Iterations: 10, 15 <br/> Regularization Param: 0.1, 0.2 <br/> Elastic Net Param: 0 | 
| Regression     | Linear Regression        |  MAE              |     8 mins    |           4           | None            | Max Iterations: 10, 15 <br/> Regularization Param: 0.1, 0.2 <br/> Elastic Net Param: 0 |
| Classification | Decision Tree Classification |  F1 Score         |     8 mins    |           6           | Under           | Max Depth: 2, 3, 4 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32 |
| Regression     | Decision Tree Regression  |  MAE              |     3 mins    |           2           | None            | Max Depth: 1, 2 |
| Classification     | Random Forest Classification  |  F1 Score              | 5/5/10 mins                              |           12          | None/Under/Over | Number of Trees: 10, 20, 50 <br/> Max Depth: 5, 10, 15 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32,40 |
| Regression     | Random Forest Regression  |  MAE              | 5/5/10 mins                              |           12          | None/Under/Over | Number of Trees: 10, 20, 50 <br/> Max Depth: 5, 10, 15 |
| Regression     | Gradient Boosted Trees Regression |  MAE              |  mins                              |           81          | None/Under/Over | Max Depth: 5, 10, 15 <br/> Max Iterations: 10, 20, 50 <br/> Step Size (Learning Rate): 0.01, 0.1, 1.0 |

#### Experimentation Results 
| Sample Size    | Model                    | Best Parameters  | Performance | 
| -------------- | ------------------------ | ---------------- | ----------- |
| 1%             | Logistic Regression      | Sampling: None <br/> Max Iterations: 10 <br/> Regularization Param: 0.2 <br/> Elastic Net Param: 0 | F1: 0.788 |
| 1%             | Linear Regression        | Sampling: None <br/> Max Iterations: 10 <br/> Regularization Param: 0.2 <br/> Elastic Net Param: 0 | MAE: 15.487 |
| 1%             | Decision Tree Classifier | Sampling: Over <br/> Max Depth: 2 <br/> Impurity: Gini <br/> Max Bins: 40 | F1: 0.898 |
| 1%             | Decision Tree Regressor  | Sampling: None <br/> Max Depth: 1     | MAE: 5.532 |
| 1%             | Random Forest Classifier | Sampling:  <br/> Number of Trees:  <br/> Max Depth:  <br/> Impurity: <br/> Max Bins: | F1: |
| 1%             | Random Forest Regressor  | Sampling:  <br/> Number of Trees:  <br/> Max Depth:      | MAE:  |
| 1%             | Gradient Boosted Trees Regressor  | Sampling:  <br/> Max Depth: <br/> Max Iterations: <br/> Step Size (Learning Rate):     | MAE:  |
| 10%            | Logistic Regression      | Sampling: None <br/> Max Iterations: 15 <br/> Regularization Param: 0.1 <br/> Elastic Net Param: 0 | F1: 0.788 |
| 10%            | Linear Regression        | Sampling: None <br/> Max Iterations: 15 <br/> Regularization Param: 0.1 <br/> Elastic Net Param: 0 | MAE: 15.270 |
| 10%            | Decision Tree Classifier | Sampling: Over <br/> Max Depth: 2 <br/> Impurity: Gini <br/> Max Bins: 32 | F1: 0.898 |
| 10%            | Decision Tree Regressor  | Sampling: None <br/> Max Depth: 1     | MAE: 6.312 |
| 10%             | Random Forest Classifier | Sampling:  <br/> Number of Trees:  <br/> Max Depth:  <br/> Impurity: <br/> Max Bins: | F1: |
| 10%             | Random Forest Regressor  | Sampling:  <br/> Number of Trees:  <br/> Max Depth:      | MAE:  |
| 10%             | Gradient Boosted Trees Regressor  | Sampling:  <br/> Max Depth: <br/> Max Iterations: <br/> Step Size (Learning Rate):     | MAE:  |

In [0]:
# Pipeline Set Up

inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df = pipeline.fit(df_full).transform(df_full)

train_001_none, val_001_none = grid_search_test_train_split(pipeline_df, 0.001, sampling='none')
train_001_none = train_001_none.cache()
val_001_none  = val_001_none.cache()

train_001_over, val_001_over = grid_search_test_train_split(pipeline_df, 0.001, sampling='over')
train_001_over = train_001_over.cache()
val_001_over  = val_001_over.cache()

train_001_under, val_001_under = grid_search_test_train_split(pipeline_df, 0.001, sampling='under')
train_001_under = train_001_under.cache()
val_001_under  = val_001_under.cache()


train_10_none, val_10_none = grid_search_test_train_split(pipeline_df, 0.1, sampling='none')
train_10_none = train_10_none.cache()
val_10_none  = val_10_none.cache()

train_10_over, val_10_over = grid_search_test_train_split(pipeline_df, 0.1, sampling='over')
train_10_over = train_10_over.cache()
val_10_over  = val_10_over.cache()

train_10_under, val_10_under = grid_search_test_train_split(pipeline_df, 0.1, sampling='under')
train_10_under = train_10_under.cache()
val_10_under  = val_10_under.cache()

In [0]:
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8]}
log_reg_c_001_none = train_model_no_CV(train_001_none, val_001_none, model_type='LogisticRegression', params=log_reg_c_params)
display(log_reg_c_001_none)

maxIter,regParam,elasticNetParam,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.8980181236823903,0.9362780593772628,0.9807838179519596,0.8980181236823903
10,0.2,0.8,0.8970502219265744,0.919748778785764,0.999747155499368,0.8970502219265744
10,0.3,0.0,0.8987869823574566,0.9357091259330604,0.9825537294563844,0.8987869823574566
10,0.3,0.8,0.7841786175758201,0.8521870286576169,1.0,0.7841786175758201
10,0.4,0.0,0.8997211271465301,0.934068568688564,0.9850821744627054,0.8997211271465301
10,0.4,0.8,0.7841786175758201,0.8521870286576169,1.0,0.7841786175758201
20,0.2,0.0,0.8980181236823903,0.9362780593772628,0.9807838179519596,0.8980181236823903
20,0.2,0.8,0.8970502219265744,0.919748778785764,0.999747155499368,0.8970502219265744
20,0.3,0.0,0.8987869823574566,0.9357091259330604,0.9825537294563844,0.8987869823574566
20,0.3,0.8,0.7841786175758201,0.8521870286576169,1.0,0.7841786175758201


### 0.1% Sample Size

#### Logistic Regression

In [0]:
# Logistic Regression No sampling
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8]}
log_reg_c_no_sampling01 = train_model_no_CV(train_001_none, val_001_none,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_no_sampling01)

In [0]:
# Logistic Regression Over sampling 
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8]}
log_reg_c_over_sampling01 = train_model_no_CV(train_001_over, val_001_over,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_over_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975321>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Logistic Regression Over sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlog_reg_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m[0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.2[0m[0;34m,[0m[0;36m0.3[0m[0;34m,[0m[0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0[0m[0;34m,[0m[0;36m0.8[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlog_reg_c_under_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'Logis

In [0]:
# Logistic Regression Under sampling 
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8]}
log_reg_c_under_sampling01 = train_model_no_CV(train_001_under, val_001_under,  model_type='LogisticRegression', params=log_reg_c_params,  train_metrics = True)
display(log_reg_c_under_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975322>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Logistic Regression Under sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlog_reg_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m[0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.2[0m[0;34m,[0m[0;36m0.3[0m[0;34m,[0m[0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0[0m[0;34m,[0m[0;36m0.8[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlog_reg_c_over_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'Logis

#### Linear Regression

In [0]:
# Linear Regression, no sampling
lin_reg_r_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.3, 0.4], 'elasticNetParam': [0.0, 0.8] }
lin_reg_r_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='LinearRegression', params=lin_reg_r_params,  train_metrics = True)
display(lin_reg_r_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975324>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Linear Regression, no sample[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlin_reg_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.2[0m[0;34m,[0m [0;36m0.3[0m[0;34m,[0m [0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.0[0m[0;34m,[0m [0;36m0.8[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlin_reg_r_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'Linear

#### Decision Tree Classification

In [0]:
# Decision Tree Classifier No sampling 
dt_c_params = { 'maxDepth': [2,3,4], 'impurity': ['gini','entropy'], 'maxBins': [28,32,40] }
dt_c_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='DecisionTreeClassifier', params=dt_c_params,  train_metrics = True)
display(dt_c_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975330>[0m in [0;36m<cell line: 4>[0;34m()[0m
[1;32m      2[0m [0;31m# tested maxDepth of 2,3,4 and 2 always returns the best results[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0mdt_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m,[0m[0;36m4[0m[0;34m][0m[0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m28[0m[0;34m,[0m[0;36m32[0m[0;34m,[0m[0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m [0mdt_c_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0

In [0]:
# Decision Tree Classifier Over sampling 
dt_c_params = { 'maxDepth': [2,3,4], 'impurity': ['gini','entropy'], 'maxBins': [32, 40] }
dt_c_over_sampling01 = train_model_no_CV(train_001_over, val_001_over, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_over_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975331>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Decision Tree Classifier Over sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mdt_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m,[0m[0;36m4[0m[0;34m][0m[0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mdt_c_over_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'DecisionTre

In [0]:
# Decision Tree Classifier Under sampling 
dt_c_params = { 'maxDepth': [2,3,4], 'impurity': ['gini','entropy'], 'maxBins': [32, 40] }
dt_c_under_sampling01 = train_model_no_CV(train_001_under, val_001_under, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_under_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975332>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Decision Tree Classifier Under sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mdt_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m1[0m[0;34m,[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m,[0m[0;36m4[0m[0;34m][0m[0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mdt_c_under_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;3

#### Decision Tree Regression

In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [2,3,4] }
dt_r_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='DecisionTreeRegressor', params=dt_r_params,  train_metrics = True)
display(dt_r_no_sampling01)

#### Random Forest Classification

In [0]:
# Random Forest Classifier - No Sampling
rf_c_params = { 'maxDepth': [5, 10, 15], 'numTrees': [10, 20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [32, 40] }
rf_c_no_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='RandomForestClassifier', params=rf_c_params, sampling='none')
display(rf_c_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975337>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Classifier - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m [0;36m10[0m[0;34m,[0m [0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_c_no_sampling01[0m 

In [0]:
# Random Forest Classifier - Over Sampling
rf_c_params = { 'maxDepth': [5,10,15], 'numTrees': [10, 20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [32, 40] }
rf_c_over_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='RandomForestClassifier', params=rf_c_params, sampling='over')
display(rf_c_over_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975338>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Classifier - Over Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_c_over_sampling01[0m

In [0]:
# Random Forest Classifier - Under Sampling
rf_c_params = { 'maxDepth': [5,10,15], 'numTrees': [10, 20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [32, 40] }
rf_c_under_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='RandomForestClassifier', params=rf_c_params, sampling='under')
display(rf_c_under_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975339>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Classifier - Under Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_c_under_sampling01[

#### Random Forest Regression

In [0]:
# Random Forest Regression - No Sampling
rf_r_params = { 'maxDepth': [5,10,15], 'numTrees': [10, 20, 50]}
rf_r_no_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='RandomForestRegressor', params=rf_r_params, sampling='none')
display(rf_r_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975341>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Regression - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_r_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'RandomForestRegressor'[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0mrf_r_params[0m[0;34m,[0m [0msamp

In [0]:
# Random Forest Regression - No Sampling - FILTERED
rf_r_params = { 'maxDepth': [5,10,15], 'numTrees': [10, 20, 50]}
rf_r_no_sampling01 = train_model_no_CV(pipeline_df_sample01_r, model_type='RandomForestRegressor', params=rf_r_params, sampling='none')
display(rf_r_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975342>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Regression - No Sampling - FILTERED[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_r_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01_r[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'RandomForestRegressor'[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0mrf_r_params[0m[0;34m,

#### Gradient Boosted Trees Regression

In [0]:
# Gradient Boosted Trees Regression - No Sampling
gbt_r_params = { 'maxDepth': [5,10,15], 'maxIter': [10, 20, 50] , 'stepSize': [0.01, 0.1, 1.0]}
gbt_r_no_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='GBTRegressor', params=gbt_r_params, sampling='none')
display(gbt_r_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975344>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Gradient Boosted Trees Regression - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mgbt_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'stepSize'[0m[0;34m:[0m [0;34m[[0m[0;36m0.01[0m[0;34m,[0m [0;36m0.1[0m[0;34m,[0m [0;36m1.0[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mgbt_r_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;3

In [0]:
# Gradient Boosted Trees Regression - No Sampling - FILTERED
gbt_r_params = { 'maxDepth': [5,10,15], 'maxIter': [10, 20, 50] , 'stepSize': [0.01, 0.1, 1.0]}
gbt_r_no_sampling01 = train_model_no_CV(pipeline_df_sample01_r, model_type='GBTRegressor', params=gbt_r_params, sampling='none')
display(gbt_r_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975345>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Gradient Boosted Trees Regression - No Sampling - FILTERED[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mgbt_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'stepSize'[0m[0;34m:[0m [0;34m[[0m[0;36m0.01[0m[0;34m,[0m [0;36m0.1[0m[0;34m,[0m [0;36m1.0[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mgbt_r_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample

#### MultiLayer Perceptron (Neural Network)

https://towardsdatascience.com/spark-multilayer-perceptron-classifier-for-poi-classification-99e5c68b4a77

need to define at least 2 layer structures for layers

Classifier trainer based on the Multilayer Perceptron. Each layer has sigmoid activation function, output layer has softmax. Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. --> https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.MultilayerPerceptronClassifier.html#pyspark.ml.classification.MultilayerPerceptronClassifier.blockSize

In [0]:
# MultiLayer Perceptrion Classification - No Sampling
mlp_r_params = { 'maxIter': [], 'layers': [] , 'stepSize': [], 'blockSize': [] }
mlp_r_no_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='MultilayerPerceptronClassifier', params=mlp_r_params, sampling='none')
display(mlp_r_no_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975347>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# MultiLayer Perceptrion Classification - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mmlp_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m'layers'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m [0;34m,[0m [0;34m'stepSize'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m'blockSize'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mmlp_r_no_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'MultilayerPerceptronClassifier'[0m[0;34m,[0m [0mparams[0m[0;34m=[0m

In [0]:
# MultiLayer Perceptrion Classification - Over Sampling
mlp_r_params = { 'maxIter': [], 'layers': [] , 'stepSize': [], 'blockSize': [] }
mlp_r_over_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='MultilayerPerceptronClassifier', params=mlp_r_params, sampling='over')
display(mlp_r_over_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975348>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# MultiLayer Perceptrion Classification - Over Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mmlp_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m'layers'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m [0;34m,[0m [0;34m'stepSize'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m'blockSize'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mmlp_r_over_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'MultilayerPerceptronClassifier'[0m[0;34m,[0m [0mparams[0m[0;34m=

In [0]:
# MultiLayer Perceptrion Classification - Under Sampling
mlp_r_params = { 'maxIter': [], 'layers': [] , 'stepSize': [], 'blockSize': [] }
mlp_r_under_sampling01 = train_model_no_CV(pipeline_df_sample01, model_type='MultilayerPerceptronClassifier', params=mlp_r_params, sampling='under')
display(mlp_r_under_sampling01)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975349>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# MultiLayer Perceptrion Classification - Under Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mmlp_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m'layers'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m [0;34m,[0m [0;34m'stepSize'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m'blockSize'[0m[0;34m:[0m [0;34m[[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mmlp_r_under_sampling01[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample01[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'MultilayerPerceptronClassifier'[0m[0;34m,[0m [0mparams[0m[0;34m

In [0]:
pipeline_df_sample01.unpersist()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975350>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mpipeline_df_sample01[0m[0;34m.[0m[0munpersist[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mNameError[0m: name 'pipeline_df_sample01' is not defined

### 10% Sample Size

In [0]:
# 10% Sample Pipeline

df_sample10 = df.sample(0.1)

inputCols_categorical = ['QUARTER', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['Year', 'DAY_OF_MONTH', 'elevation_ft', 'DISTANCE']

pipeline10 = create_pipeline(df_sample10, inputCols_categorical, inputCols_continuous)
pipeline_df_sample10 = pipeline10.fit(df_sample10).transform(df_sample10).persist()

pipeline_df_sample10_r = pipeline_df_sample10.filter(pipeline_df_sample10.label == 1)

df_sample10.unpersist()

# display(pipeline_df_sample10)



[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975378>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# 10% Sample Pipeline[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;34m[0m[0m
[0;32m----> 3[0;31m [0mdf_sample10[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0msample[0m[0;34m([0m[0;36m0.1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      4[0m [0;34m[0m[0m
[1;32m      5[0m [0minputCols_categorical[0m [0;34m=[0m [0;34m[[0m[0;34m'QUARTER'[0m[0;34m,[0m [0;34m'DAY_OF_WEEK'[0m[0;34m,[0m [0;34m'DISTANCE_GROUP'[0m[0;34m,[0m [0;34m'holiday'[0m[0;34m,[0m [0;34m'holiday_in2DayRange'[0m[0;34m,[0m [0;34m'C19'[0m[0;34m,[0m [0;34m'OP_UNIQUE_CARRIER'[0m[0;34m,[0m [0;34m'type'[0m[0;34m,[0m [0;34m'DEP_TIME_BLK'[0m[0;34m,[0m [0;34m'ORIGIN_AIRPORT_ID'[0m

In [0]:
# 10% Sample Pipeline

df_sample10 = df.sample(0.1)

inputCols_categorical = ['QUARTER', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['Year', 'DAY_OF_MONTH', 'elevation_ft', 'DISTANCE']

pipeline10 = create_pipeline(df_sample10, inputCols_categorical, inputCols_continuous)
pipeline_df_sample10 = pipeline10.fit(df_sample10).transform(df_sample10).persist()

pipeline_df_sample10_r = pipeline_df_sample10.filter(pipeline_df_sample10.label == 1)

df_sample10.unpersist()

# display(pipeline_df_sample10)



[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975379>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# 10% Sample Pipeline[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;34m[0m[0m
[0;32m----> 3[0;31m [0mdf_sample10[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0msample[0m[0;34m([0m[0;36m0.1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      4[0m [0;34m[0m[0m
[1;32m      5[0m [0minputCols_categorical[0m [0;34m=[0m [0;34m[[0m[0;34m'QUARTER'[0m[0;34m,[0m [0;34m'DAY_OF_WEEK'[0m[0;34m,[0m [0;34m'DISTANCE_GROUP'[0m[0;34m,[0m [0;34m'holiday'[0m[0;34m,[0m [0;34m'holiday_in2DayRange'[0m[0;34m,[0m [0;34m'C19'[0m[0;34m,[0m [0;34m'OP_UNIQUE_CARRIER'[0m[0;34m,[0m [0;34m'type'[0m[0;34m,[0m [0;34m'DEP_TIME_BLK'[0m[0;34m,[0m [0;34m'ORIGIN_AIRPORT_ID'[0m

#### Logistic Regression

**No Sampling:**
- maxIter: keep 10 and 20
- regParam: keep 0.3 and 0.4
- elasticNetParam: keep 0

**Over Sampling:**
- maxIter: 20
- regParam: keep experimenting with 0.4, 0.3, 0.2
- elasticNetParam: 0.8

**Under Sampling:**
- maxIter: 20
- regParam: keep experimenting with 0.4, 0.3, 0.2
- elasticNetParam: 0.8

In [0]:
# Logistic Regression No sampling
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.3, 0.4], 'elasticNetParam': [0]}
log_reg_c_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='LogisticRegression', params=log_reg_c_params, sampling='none')
display(log_reg_c_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975381>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Logistic Regression No sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlog_reg_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m[0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.3[0m[0;34m,[0m [0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlog_reg_c_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'LogisticRegression'[0m[0;34m,[0m [0mparams[0m[0;34m=[0

In [0]:
# Logistic Regression Over sampling 
log_reg_c_params = { 'maxIter': [20], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0.8]}
log_reg_c_under_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='LogisticRegression', params=log_reg_c_params, sampling='over')
display(log_reg_c_under_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975382>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Logistic Regression Over sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlog_reg_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.2[0m[0;34m,[0m[0;36m0.3[0m[0;34m,[0m[0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.8[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlog_reg_c_under_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'LogisticRegression'[0m[0;34m,[0m [0mparams[0m[0;

In [0]:
# Logistic Regression Under sampling 
log_reg_c_params = { 'maxIter': [20], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0.8]}
log_reg_c_under_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='LogisticRegression', params=log_reg_c_params, sampling='under')
display(log_reg_c_under_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975383>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Logistic Regression Under sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlog_reg_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.2[0m[0;34m,[0m[0;36m0.3[0m[0;34m,[0m[0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.8[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlog_reg_c_under_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'LogisticRegression'[0m[0;34m,[0m [0mparams[0m[0

#### Linear Regression

**No Sampling:**
- maxIter: keep experimenting with 10 and 20
- regParam: keep 0.4, 0.3, 0.2
- elasticNetParam: keep 0 and 0.8

In [0]:
# Linear Regression, no sample
lin_reg_r_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.3, 0.4], 'elasticNetParam': [0.0, 0.8] }
lin_reg_r_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='LinearRegression', params=lin_reg_r_params, sampling='none')
display(lin_reg_r_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975385>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Linear Regression, no sample[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mlin_reg_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m][0m[0;34m,[0m [0;34m'regParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.2[0m[0;34m,[0m [0;36m0.3[0m[0;34m,[0m [0;36m0.4[0m[0;34m][0m[0;34m,[0m [0;34m'elasticNetParam'[0m[0;34m:[0m [0;34m[[0m[0;36m0.0[0m[0;34m,[0m [0;36m0.8[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mlin_reg_r_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'Linear

#### Decision Tree Classification

**No Sampling:**
- maxDepth: keep 2
- impurity: keep gini and entropy
- maxBins: keep 28, 32, 40

**Under Sampling:**
- maxDepth: keep 1, 2, 3, 4
- impurity: keep entropy and gini
- maxBins: keep 32 and 40

In [0]:
# Decision Tree Classifier No sampling 
# tested maxDepth of 2,3,4 and 2 always returns the best results 
dt_c_params = { 'maxDepth': [2], 'impurity': ['gini','entropy'], 'maxBins': [28,32,40] }
dt_c_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='DecisionTreeClassifier', params=dt_c_params, sampling='none')
display(dt_c_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975387>[0m in [0;36m<cell line: 4>[0;34m()[0m
[1;32m      2[0m [0;31m# tested maxDepth of 2,3,4 and 2 always returns the best results[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0mdt_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m2[0m[0;34m][0m[0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m28[0m[0;34m,[0m[0;36m32[0m[0;34m,[0m[0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m [0mdt_c_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'DecisionTreeC

In [0]:
# Decision Tree Classifier Under sampling 
dt_c_params = { 'maxDepth': [1,2,3,4], 'impurity': ['gini','entropy'], 'maxBins': [32, 40] }
dt_c_under_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='DecisionTreeClassifier', params=dt_c_params, sampling='under')
display(dt_c_under_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975388>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Decision Tree Classifier Under sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mdt_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m1[0m[0;34m,[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m,[0m[0;36m4[0m[0;34m][0m[0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mdt_c_under_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;3

#### Decision Tree Regression

**No Sampling:**
- try again

**Filtered:**
- try again

In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [1,2,3] }
dt_r_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='DecisionTreeRegressor', params=dt_r_params, sampling='none')
display(dt_r_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975390>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Decision Tree Regressor No sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mdt_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m1[0m[0;34m,[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mdt_r_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'DecisionTreeRegressor'[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0mdt_r_params[0m[0;34m,[0m [0msampling[0m[0;34m=[0m[0;34m'none'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      4[0m [0mdisplay[0m[0;34m([0m[0mdt_r_no_sa

In [0]:
# Decision Tree Regressor No sampling - FILTERED
dt_r_params = { 'maxDepth': [1,2,3] }
dt_r_no_sampling10 = train_model_no_CV(pipeline_df_sample10_r, model_type='DecisionTreeRegressor', params=dt_r_params, sampling='none')
display(dt_r_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975391>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Decision Tree Regressor No sampling - FILTERED[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mdt_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m1[0m[0;34m,[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mdt_r_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10_r[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'DecisionTreeRegressor'[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0mdt_r_params[0m[0;34m,[0m [0msampling[0m[0;34m=[0m[0;34m'none'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      4[0m [0mdisplay[0m[0;34m([0m

#### Random Forest Classification

**No Sampling:**
- maxDepth: keep 15; 10 underperforming
- numTrees: keep 10, 20, 50
- impurity: keep gini
- maxBins: keep 32 and 40 

**Under Sampling:**
- maxDepth: keep 5 and 10; 15 underperforming
- numTrees: keep 20 and 50
- impurity: keep gini and entropy 
- maxBins: keep 40

In [0]:
# Random Forest Classifier - No Sampling
rf_c_params = { 'maxDepth': [15], 'numTrees': [10, 20, 50] , 'impurity': ['gini'], 'maxBins': [32, 40] }
rf_c_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='RandomForestClassifier', params=rf_c_params, sampling='none')
display(rf_c_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975393>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Classifier - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m32[0m[0;34m,[0m [0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_c_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34

In [0]:
# Random Forest Classifier - Under Sampling
rf_c_params = { 'maxDepth': [5, 10], 'numTrees': [20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [40] }
rf_c_under_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='RandomForestClassifier', params=rf_c_params, sampling='under')
display(rf_c_under_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975394>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Classifier - Under Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_c_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m [0;36m10[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'impurity'[0m[0;34m:[0m [0;34m[[0m[0;34m'gini'[0m[0;34m,[0m[0;34m'entropy'[0m[0;34m][0m[0;34m,[0m [0;34m'maxBins'[0m[0;34m:[0m [0;34m[[0m[0;36m40[0m[0;34m][0m [0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_c_under_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample1

#### Random Forest Regression

In [0]:
# Random Forest Regression - No Sampling
rf_r_params = { 'maxDepth': [5,10], 'numTrees': [20, 50]}
rf_r_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='RandomForestRegressor', params=rf_r_params, sampling='none')
display(rf_r_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975396>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Random Forest Regression - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mrf_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m][0m[0;34m,[0m [0;34m'numTrees'[0m[0;34m:[0m [0;34m[[0m[0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mrf_r_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;34m,[0m [0mmodel_type[0m[0;34m=[0m[0;34m'RandomForestRegressor'[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0mrf_r_params[0m[0;34m,[0m [0msampling[0m[0;34m=[0m[0;34m'none'[0m[0;34m)[0m[

#### Gradient Boosted Trees Regression

In [0]:
# Gradient Boosted Trees Regression - No Sampling
gbt_r_params = { 'maxDepth': [5,10,15], 'maxIter': [10, 20, 50] , 'stepSize': [0.01, 0.1, 1.0]}
gbt_r_no_sampling10 = train_model_no_CV(pipeline_df_sample10, model_type='GBTRegressor', params=gbt_r_params, sampling='none')
display(gbt_r_no_sampling10)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-632558266975398>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Gradient Boosted Trees Regression - No Sampling[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mgbt_r_params[0m [0;34m=[0m [0;34m{[0m [0;34m'maxDepth'[0m[0;34m:[0m [0;34m[[0m[0;36m5[0m[0;34m,[0m[0;36m10[0m[0;34m,[0m[0;36m15[0m[0;34m][0m[0;34m,[0m [0;34m'maxIter'[0m[0;34m:[0m [0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m20[0m[0;34m,[0m [0;36m50[0m[0;34m][0m [0;34m,[0m [0;34m'stepSize'[0m[0;34m:[0m [0;34m[[0m[0;36m0.01[0m[0;34m,[0m [0;36m0.1[0m[0;34m,[0m [0;36m1.0[0m[0;34m][0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mgbt_r_no_sampling10[0m [0;34m=[0m [0mtrain_model_no_CV[0m[0;34m([0m[0mpipeline_df_sample10[0m[0;3