## Sandbox Decision Tree pipeline
Danielle Yoseloff

In [0]:
# imports
# imports
import pandas as pd
import numpy as np
import pytz
from datetime import datetime, timedelta, time
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql.functions import to_timestamp
from prophet.plot import plot_forecast_component
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, StructType, DoubleType, LongType
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when, to_timestamp, lit, udf
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeRegressor

from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier

In [0]:
# Root DBFS folder shared by the team.
team_BASE_DIR = f"dbfs:/student-groups/Group_4_1"

# Dataset window to load; one of the following values: "", "3m", "6m", "1y".
period = ""

# Read the joined checkpoint for the chosen window and attach the binary label:
# outcome is 1.0 when DEP_DELAY >= 15 or CANCELLED == 1, otherwise 0.0.
df = (
    spark.read
    .parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr.parquet")
    .withColumn(
        "outcome",
        when((col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1), 1)
        .otherwise(0)
        .cast("double"),
    )
)



In [0]:
# display(df)
# Report the size of the loaded frame (the row count triggers a Spark job).
row_count = df.count()
column_count = len(df.columns)
print("Number of rows:", row_count, "\nNumber of columns:", column_count)

In [0]:
# Name of the scheduled-departure timestamp column.
dep_utc_varname = "sched_depart_utc"

# Condition behind the label: delayed by 15 or more, or cancelled.
delayed_or_cancelled = (col("DEP_DELAY") >= 15) | (col("CANCELLED") == 1)

# Add the departure hour and (re)compute the binary label in one pass.
df_handled = df.withColumns({
    "dep_hour_utc": F.hour(col(dep_utc_varname)),
    "outcome": F.when(delayed_or_cancelled, 1).otherwise(0).cast("double"),
})

In [0]:
# Chronological train/test split: flights scheduled before min_test_dt are
# used for training and everything from that date on is held out for testing.
# Cells further down read df_train, so this split has to run for a
# top-to-bottom execution of the notebook to work.
min_test_dt = "2019-10-01"

df_train = df_handled.filter(F.col(dep_utc_varname) < min_test_dt)
df_train.cache()
df_test = df_handled.filter(F.col(dep_utc_varname) >= min_test_dt)
df_test.cache()

In [0]:
# display(df)
# print("DF Number of rows:", df.count(),"   Number of columns:", len(df.columns))
# print("TRAIN Number of rows:", df_train.count(),"   Number of columns:", len(df_train.columns))
# print("TEST Number of rows:", df_test.count(),"   Number of columns:", len(df_test.columns))

## Breakdown of MLP

In [0]:
# Draw roughly 1000 rows for fast experiments. The fraction is capped at 1.0
# because sampling without replacement rejects fractions above 1, which is
# what 1000 / count becomes whenever the frame has fewer than 1000 rows.
total_rows = df.count()
sample_fraction = min(1.0, 1000 / total_rows) if total_rows else 0.0
sampledata = df.sample(fraction=sample_fraction, seed=42)

# Replace nulls with 0 so the assembler and scaler below see complete rows.
sampledata = sampledata.fillna(0)

# Seeded 80/20 split used by the MLP walkthrough cells.
(train_data, test_data) = sampledata.randomSplit([0.8,0.2], seed=42)

In [0]:
# Render the sampled rows for a visual check.
display(sampledata)

Reference: [Build a Multilayer Perceptron with PySpark](https://dev.to/ruizleandro/build-a-multilayer-perceptron-with-pyspark-5bkj)

In [0]:
# Inputs for this walkthrough: one categorical column and two numeric ones.
categorical_cols = ["OP_UNIQUE_CARRIER"]
numeric_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']

# One StringIndexer / OneHotEncoder pair per categorical column.
indexers = [
    StringIndexer(inputCol=name, outputCol='{0}_index'.format(name))
    for name in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol='{0}_index'.format(name), outputCol='{0}_ohe'.format(name))
    for name in categorical_cols
]

# Concatenate the one-hot vectors and the numeric columns into 'features'.
assembler_inputs = [stage.getOutputCol() for stage in encoders] + numeric_cols
featuresCreator = VectorAssembler(inputCols=assembler_inputs, outputCol='features')

# NOTE: this counts the assembler's input columns, not the width of the
# assembled vector; a one-hot column occupies several vector slots, so the two
# numbers can differ.
n_assembler_inputs = len(featuresCreator.getInputCols())
layers = [n_assembler_inputs, 4, 2, 2]

# Classifier definition only; it is not part of the pipeline fitted below.
classifier = MultilayerPerceptronClassifier(
    labelCol='outcome',
    featuresCol='features',
    maxIter=10,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Fit the preprocessing stages alone and push a single row through them.
pipeline = Pipeline(stages=indexers + encoders + [featuresCreator])
model = pipeline.fit(train_data)
model.transform(train_data.limit(1))

In [0]:
# Push one training row through the fitted `model` and render all output columns.
display(model.transform(train_data.limit(1)))

In [0]:
# Pull the 'features' column of a single transformed training row into pandas.
tmp = model.transform(train_data.limit(1)).select('features').toPandas()

In [0]:
# Show the one-row pandas frame holding the feature vector.
display(tmp)

In [0]:
# Size of the feature vector stored in the first row/column of `tmp`; used
# further down as the width of the network's input layer.
L = tmp.iloc[0,0].size

In [0]:
# SOLUTION 1

### set handleInvalid hyperparam to "keep"
# With the default handleInvalid ("error") the fitted StringIndexer fails at
# transform time on any category it did not see while fitting, which can
# happen when the test split contains a value missing from the train split.
# "keep" routes such values to an extra index instead; the encoder needs the
# same setting so that the extra index is accepted.

categorical_cols = ["OP_UNIQUE_CARRIER"]
numeric_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']

indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(
    column), handleInvalid="keep") for column in categorical_cols]

encoders = [OneHotEncoder(inputCol='{0}_index'.format(
    column), outputCol='{0}_ohe'.format(
    column), handleInvalid="keep") for column in categorical_cols]

# Concatenate the one-hot vectors and the numeric columns into 'features'.
featuresCreator = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols,
    outputCol='features')

# Rescale every feature with MinMaxScaler before it reaches the network.
scaler = MinMaxScaler(inputCol='features',outputCol='features_scaled')


In [0]:
# SOLUTION 2

# Fit preprocessing (index -> one-hot -> assemble -> scale) without the
# classifier, then read the assembled vector's size off one transformed row.
pipeline0 = Pipeline(stages=indexers + encoders + [featuresCreator, scaler])
model0 = pipeline0.fit(train_data)

one_row = model0.transform(train_data.limit(1))
tmp = one_row.select('features').toPandas()

# Width that the MLP input layer must match.
first_vector = tmp.iloc[0, 0]
L = first_vector.size

In [0]:
# SOLUTION 3

# Layer sizes: input width L (measured from the assembled vector), then 4, 2,
# and a final layer of 2.
layers = [L, 4, 2, 2]

classifier = MultilayerPerceptronClassifier(
    labelCol='outcome',
    featuresCol='features_scaled',
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Preprocessing and classifier in one pipeline, fitted on the training split.
pipeline = Pipeline(stages=indexers + encoders + [featuresCreator, scaler, classifier])
model = pipeline.fit(train_data)

train_output_df = model.transform(train_data)
test_output_df = model.transform(test_data)

# The evaluator below is created without a labelCol argument, so the label
# column is renamed to 'label' before scoring.
train_predictionAndLabels = (
    train_output_df
    .withColumnRenamed('outcome', 'label')
    .select('prediction', 'label')
)
test_predictionAndLabels = (
    test_output_df
    .withColumnRenamed('outcome', 'label')
    .select('prediction', 'label')
)

# Print each requested metric for the train split, then the test split.
metrics = ['accuracy']
for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    for split_label, scored in (('Train ', train_predictionAndLabels),
                                ('Test ', test_predictionAndLabels)):
        print(split_label + metric + ' = ' + str(evaluator.evaluate(scored)))

# SEE ABOVE

In [0]:
categorical_cols = ["OP_UNIQUE_CARRIER"]
numeric_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']

## pipeline stages
stages = []

# Index and one-hot encode every categorical column; "keep" lets categories
# that were not seen during fitting pass through transform instead of failing.
for column in categorical_cols:
    indexer = StringIndexer(inputCol=column, outputCol=column + "_index",handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=column + "_index", outputCol=column + "_vec",handleInvalid="keep")
    stages += [indexer, encoder]
# define encoded categorical feature names
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# assemble features
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')

# scale features
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# The MLP input layer has to match the width of the assembled vector, which is
# larger than len(assembler.getInputCols()) as soon as a one-hot column spans
# more than one slot. Fit the preprocessing stages on the training split first
# and measure the width from a transformed row.
prep_model = Pipeline(stages=stages + [assembler]).fit(train_data)
mlp_input_size = prep_model.transform(train_data).select("features").first()[0].size

mlp = MultilayerPerceptronClassifier(layers=[mlp_input_size, 2, 2], featuresCol="features_scaled" , labelCol="outcome", seed=1234)

# Full pipeline; the preprocessing stages are refit on the same training split.
pipeline = Pipeline(stages=stages+[assembler, scaler, mlp])

model = pipeline.fit(train_data)
train_output_df = model.transform(train_data)

test_output_df = model.transform(test_data)

In [0]:
# Feature columns for this experiment.
categorical_cols = ["OP_UNIQUE_CARRIER"]
numeric_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']

## pipeline stages
stages = []

# One indexer + encoder per categorical column, both with handleInvalid="keep".
for column in categorical_cols:
    indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    encoder = OneHotEncoder(
        inputCol=column + "_index",
        outputCol=column + "_vec",
        handleInvalid="keep",
    )
    stages.extend([indexer, encoder])

# Names of the encoded categorical columns.
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# Assemble numeric and encoded columns into one vector, then rescale it.
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Classifier definition only (not fitted in this cell). Its first layer is the
# number of assembler input columns, not the width of the assembled vector.
n_assembler_inputs = len(assembler.getInputCols())
mlp = MultilayerPerceptronClassifier(
    layers=[n_assembler_inputs, 2, 2],
    featuresCol="features_scaled",
    labelCol="outcome",
    seed=1234,
)

# Fit only the preprocessing + scaling stages on the training split.
# `datas` holds the fitted pipeline model, not a DataFrame.
pipeline1 = Pipeline(stages=stages + [assembler, scaler])
datas = pipeline1.fit(train_data)

# train_output_df = model.transform(train_data)

# test_output_df = model.transform(test_data)


# partial_pipeline = Pipeline(stages=stages+[assembler])

# partial_model = partial_pipeline.fit(data)

# transformed_data = partial_model.transform(data)

# vector_size = transformed_data.select("features").first()[0].size

# Break out [assembler, scaler], fit them on train to get the input-layer size, then use a separate pipeline to train the model on the output of the fitted stages.

## Using spark RF classifier & MLP

In [0]:
# List the column names of the training split.
df_train.columns

In [0]:
# Small slice of the training split for quick pipeline experiments.
data = df_train.limit(300)

## pipeline column definitions
# This variant uses no categorical features; the alternative would be
# ["OP_UNIQUE_CARRIER","ORIGIN_ICAO","DEST_ICAO"].
categorical_cols = []

# Weather features: every "origin_Hourly" column except the three listed here.
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [
    name for name in df_train.columns
    if "origin_Hourly" in name and name not in remove_me
]
prior_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']
numeric_cols = weather_cols + prior_cols

## pipeline stages
stages = []

# With an empty categorical list this loop adds nothing.
for column in categorical_cols:
    indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    encoder = OneHotEncoder(
        inputCol=column + "_index",
        outputCol=column + "_vec",
        handleInvalid="keep",
    )
    stages.extend([indexer, encoder])

# Names of the encoded categorical columns.
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# Assemble and rescale the features.
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Random forest alternative (suggested full-size setting: numTrees=30, maxDepth=10):
# rf = RandomForestClassifier(numTrees=3, maxDepth=2, featuresCol="features_scaled" , labelCol="outcome", seed=42)
# pipeline = Pipeline(stages=stages+[assembler,scaler,rf])

# Multilayer perceptron model.
## TO DO: GET CORRECT LAYER VALUES
# The first layer is the number of assembler input columns.
n_assembler_inputs = len(assembler.getInputCols())
mlp = MultilayerPerceptronClassifier(
    layers=[n_assembler_inputs, 2, 2],
    featuresCol="features_scaled",
    labelCol="outcome",
    seed=1234,
)

pipeline = Pipeline(stages=stages + [assembler, scaler, mlp])



In [0]:
# Small slice of the training split for quick pipeline experiments.
data = df_train.limit(300)

## pipeline column definitions
categorical_cols = ["OP_UNIQUE_CARRIER","ORIGIN_ICAO","DEST_ICAO"]
weather_cols = [c for c in df_train.columns if "origin_Hourly" in c]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [c for c in weather_cols if c not in remove_me]
prior_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']
numeric_cols = weather_cols + prior_cols

## pipeline stages
stages = []

# Index and one-hot encode each categorical column. dropLast=False keeps one
# slot per category instead of dropping the last one.
for column in categorical_cols:
    indexer = StringIndexer(inputCol=column, outputCol=column + "_index",handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=column + "_index", outputCol=column + "_vec",handleInvalid="keep",dropLast=False)
    stages += [indexer, encoder]
# define encoded categorical feature names
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# assemble features
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')

# scale features
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# random forest model
# ### numTrees=30, maxDepth=10
# rf = RandomForestClassifier(numTrees=3, maxDepth=2, featuresCol="features_scaled" , labelCol="outcome", seed=42)
# pipeline = Pipeline(stages=stages+[assembler,scaler,rf])

# Replace nulls with 0, then make a seeded 80/20 split.
data1=data.fillna(0)
(train_df, dev_df) = data1.randomSplit([0.8,0.2], seed=42)

# multilayer perceptron model
# The input layer must equal the width of the assembled vector, which depends
# on how many categories the indexers find in the training rows. Measure it
# from the fitted preprocessing stages instead of hard-coding a number.
prep_model = Pipeline(stages=stages + [assembler]).fit(train_df)
mlp_input_size = prep_model.transform(train_df).select("features").first()[0].size

mlp = MultilayerPerceptronClassifier(layers=[mlp_input_size, 2, 2], featuresCol="features_scaled" , labelCol="outcome", seed=1234)

# Full pipeline; the preprocessing stages are refit on the same training split.
pipeline = Pipeline(stages=stages+[assembler,scaler,mlp])

model = pipeline.fit(train_df)
dev_pred = model.transform(dev_df)


In [0]:
# Inspect the assembled (unscaled) feature vectors of the dev predictions.
display(dev_pred.select('features'))

In [0]:
# Number of columns handed to the assembler (a count of columns, not vector slots).
len(assembler.getInputCols())

In [0]:
# Input columns of the third-from-last stage of `pipeline`.
pipeline.getStages()[-3].getInputCols()

In [0]:
# Dump every parameter currently set on the assembler.
assembler.extractParamMap()

In [0]:
# Apply the fitted `model` to the training split.
traintransformed=model.transform(train_df)

In [0]:
# Inspect the assembled feature vectors of the transformed training split.
display(traintransformed.select('features'))

In [0]:
# Show the layer sizes configured on `mlp`.
mlp.getLayers()

In [0]:
# Inspect the assembled feature vectors of the dev predictions once more.
display(dev_pred.select('features'))

In [0]:
# Copy of `data` with nulls replaced by 0.
data1=data.fillna(0)

In [0]:
# handleInvalid setting of whichever object is currently bound to `encoder`.
encoder.getHandleInvalid()

In [0]:
# Inspect the 'probability' column of the dev predictions.
display(dev_pred.select('probability'))

In [0]:
from pyspark.ml.feature import VectorSizeHint


In [0]:
from pyspark.ml.feature import OneHotEncoder

# Stand-alone OneHotEncoder example on a toy frame. Every name is prefixed
# with ohe_demo_ so the example does not overwrite the notebook-wide `df`,
# `encoder` and `model` that other cells rely on.
ohe_demo_df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

# Encode both index columns with one multi-column encoder.
ohe_demo_encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"])
ohe_demo_model = ohe_demo_encoder.fit(ohe_demo_df)
ohe_demo_encoded = ohe_demo_model.transform(ohe_demo_df)
ohe_demo_encoded.show()

In [0]:
# Small slice of the training split for quick pipeline experiments.
data = df_train.limit(300)

## pipeline column definitions
categorical_cols = ['OP_UNIQUE_CARRIER','YEAR','QUARTER','MONTH','DAY_OF_MONTH','origin_region','origin_type']
weather_cols = [c for c in df_train.columns if "origin_Hourly" in c]
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [c for c in weather_cols if c not in remove_me]
prior_cols = ['turnaround_time_calc', 'priorflight_isdelayed_calc']
numeric_cols = weather_cols + prior_cols

## pipeline stages
stages = []

# Index and one-hot encode each categorical column.
for column in categorical_cols:
    indexer = StringIndexer(inputCol=column,
                            outputCol=column + "_index",
                            handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=column + "_index",
                            outputCol=column + "_vec",
                            handleInvalid="keep")
    stages += [indexer, encoder]
# define encoded categorical feature names
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# assemble features
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')

# scale features
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Phase 1: fit preprocessing on all of `data` and read the assembled vector
# width off the first transformed row.
partial_pipeline = Pipeline(stages=stages+[assembler])

partial_model = partial_pipeline.fit(data)

transformed_data = partial_model.transform(data)

vector_size = transformed_data.select("features").first()[0].size

# Seeded so the train/dev membership is the same on every run.
(train_df, dev_df) = transformed_data.randomSplit([0.8,0.2], seed=42)

# Phase 2: scale and train the MLP on the already-assembled features.
mlp1 = MultilayerPerceptronClassifier(layers=[vector_size, 2, 2], featuresCol="features_scaled" , labelCol="outcome", seed=1234)

pipeline = Pipeline(stages=[scaler, mlp1])
model = pipeline.fit(train_df)
dev_pred1 = model.transform(dev_df)


In [0]:
## pipeline stages
stages = []

# Index and one-hot encode each categorical column.
for column in categorical_cols:
    indexer = StringIndexer(inputCol=column,
                            outputCol=column + "_index",
                            handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=column + "_index",
                            outputCol=column + "_vec",
                            handleInvalid="keep")
    stages += [indexer, encoder]

# define encoded categorical feature names
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# assemble features; 'skip' drops rows that contain invalid (null/NaN) inputs
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='skip')

# Fit preprocessing on ENTIRE dataset first, so the category vocabularies are
# learned from all rows before the split.
partial_pipeline = Pipeline(stages=stages+[assembler])
partial_model = partial_pipeline.fit(data)  # Use full dataset before splitting

# Transform the entire dataset
transformed_data = partial_model.transform(data)

# Get the feature vector size. first() returns None when the assembler skipped
# every row, so fail with a clear message instead of a None-subscript error.
first_row = transformed_data.select("features").first()
if first_row is None:
    raise ValueError("No rows left after feature assembly; every row had invalid feature values.")
vector_size = first_row[0].size
print(f"Feature vector size: {vector_size}")

# Now split the transformed data (seeded so the split is reproducible)
(train_transformed, dev_transformed) = transformed_data.randomSplit([0.8, 0.2], seed=42)

# Define MLPs and remaining pipeline steps
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")
mlp1 = MultilayerPerceptronClassifier(
    layers=[vector_size, 2, 2],
    featuresCol="features_scaled",
    labelCol="outcome",
    seed=42
)

# Create a new pipeline for the remaining steps
final_pipeline = Pipeline(stages=[scaler, mlp1])
final_model = final_pipeline.fit(train_transformed)

# Transform validation data
dev_pred1 = final_model.transform(dev_transformed)


In [0]:
# Render the dev-set predictions.
display(dev_pred1) #sanity check 

In [0]:
# Inspect the assembled feature vectors of the preprocessed data.
display(transformed_data.select('features'))

In [0]:
# Inspect the scaled feature vectors of the dev predictions.
display(dev_pred1.select('features_scaled'))

In [0]:
# Echo the measured feature-vector width.
vector_size

In [0]:
# Fit the complete pipeline (indexers, encoders, assembler, scaler, MLP) on raw
# rows. The indexers create the *_index columns themselves, so the input must
# not already contain them; a split of the already-transformed data would
# collide on those output column names. Split the raw `data` here instead.
(train_df, dev_df) = data.randomSplit([0.8, 0.2], seed=42)

# The category vocabularies are learned from train_df only, so measure the
# assembled vector width from a preprocessing fit on exactly those rows.
train_vector_size = (
    Pipeline(stages=stages + [assembler])
    .fit(train_df)
    .transform(train_df)
    .select("features")
    .first()[0]
    .size
)

mlp1 = MultilayerPerceptronClassifier(layers=[train_vector_size, 2, 2], featuresCol="features_scaled" , labelCol="outcome", seed=1234)

pipeline = Pipeline(stages=stages+[assembler, scaler, mlp1])
model = pipeline.fit(train_df)
dev_pred1 = model.transform(dev_df)


In [0]:
# Inspect the scaled feature vectors of the transformed training data.
# NOTE(review): `transformed_train` is not assigned in any cell visible in this
# notebook (only `train_transformed` and `traintransformed` are) - confirm
# which frame was intended.
display(transformed_train.select('features_scaled'))

In [0]:
# Echo the dev prediction frame (shows its repr, not its rows).
dev_pred

In [0]:
# Render the dev-set predictions.
display(dev_pred1)

In [0]:
# Inspect the assembled feature vectors of the dev predictions.
display(dev_pred1.select('features'))

In [0]:
# Class balance: number of rows per 'outcome' value in `data`.
data.groupBy('outcome').count().show()

In [0]:
## troubleshooting

# train_df.selectExpr("size(features_scaled)").show()
# display(train_df)
# len(assembler.getInputCols())

#  assembler.selectExpr("size(features)").first()[0]

In [0]:
# Split the raw rows, refit the current `pipeline`, and render the dev
# predictions. The split is seeded so repeated runs train on the same rows.
(train_df, dev_df) = data.randomSplit([0.8,0.2], seed=42)
model = pipeline.fit(train_df)
dev_pred1 = model.transform(dev_df)
display(dev_pred1)

In [0]:
# f = [c for c in data.columns if c not in ["sched_depart_utc",'two_hours_prior_depart_UTC']]
# data.select([F.count(when(F.isnan(c)|col(c).isNull(), c)).alias(c) for c in f]).show()
# Replace nulls with 0 and print the result.
# The filled frame is only shown, not assigned, so `data` itself is unchanged.
data.na.fill(value=0).show()

In [0]:
# Seeded 80/20 split of the raw rows, so reruns give the same train/dev sets.
(train_df, dev_df) = data.randomSplit([0.8,0.2], seed=42)

In [0]:
# Fit the current `pipeline` on the training split.
model = pipeline.fit(train_df)

In [0]:
# Render the training split.
display(train_df)

In [0]:
# Score the dev split with the fitted model.
dev_pred = model.transform(dev_df)

In [0]:
# Score the dev split again (same statement as the previous cell).
dev_pred = model.transform(dev_df)

In [0]:
# Print predictions next to the true labels.
dev_pred.select("prediction","outcome").show()

In [0]:
# Render the full dev prediction frame.
display(dev_pred)
# Optional checks of the MLP's training settings:
# mlp.getMaxIter()
# mlp.getBlockSize()

In [0]:


# (train_df, dev_df) = data.randomSplit([0.8,0.2])
# train_df.cache()
# dev_df.cache()

# model = pipeline.fit(train_df)
# dev_pred = model.transform(dev_df)



In [0]:
# Dev rows where the prediction differs from the label.
display(dev_pred.where(F.col("outcome")!=F.col("prediction")))

In [0]:
# Dev rows predicted as class 1.
display(dev_pred.where(F.col("prediction")==1))


In [0]:
# Training rows whose label is 1.
display(train_df.where(F.col("outcome")==1))

In [0]:
# Categorical inputs for the random-forest / logistic-regression pipeline.
categorical_cols = ["OP_UNIQUE_CARRIER","ORIGIN_ICAO","DEST_ICAO"]

# Numeric inputs: every "origin_Hourly" column except the three listed here.
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
weather_cols = [name for name in df_train.columns if "origin_Hourly" in name]
numeric_cols = [name for name in weather_cols if name not in remove_me]

In [0]:
# Stages of the pipeline, in execution order.
stages = []

# Index and one-hot encode each categorical column.
for column in categorical_cols:
    indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    encoder = OneHotEncoder(
        inputCol=column + "_index",
        outputCol=column + "_vec",
        handleInvalid="keep",
    )
    stages.extend([indexer, encoder])

# Names of the encoded categorical columns.
categorical_vec_columns = [name + "_vec" for name in categorical_cols]

# Assemble numeric and encoded columns into a single vector.
features = numeric_cols + categorical_vec_columns
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid='keep')

# Rescale the assembled features.
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Logistic regression model (defined here but not part of the pipeline below).
lr = LogisticRegression(featuresCol='features_scaled', labelCol='outcome', maxIter=50)

# Small random forest model.
rf = RandomForestClassifier(
    numTrees=3,
    maxDepth=2,
    featuresCol="features_scaled",
    labelCol="outcome",
    seed=42,
)

# Pipeline: preprocessing, assembly, scaling, then the random forest.
pipeline = Pipeline(stages=stages + [assembler, scaler, rf])

In [0]:
# Echo the pipeline object.
pipeline

## Abandoned attempt to adapt gradient boosting function for an ensemble mllib tree

In [0]:
# Weighted-sum combinator used to accumulate an ensemble (function currying).
def wt_sum_fcn(f1, f2, wt):
    """Return a function computing ``f1(x) + wt * f2(x)``.

    Args:
        f1: callable giving the current ensemble prediction for ``x``.
        f2: callable giving the new member's prediction for ``x``.
        wt: multiplier applied to ``f2``'s output.

    Returns:
        A one-argument callable that evaluates both functions on its input and
        combines the results.
    """
    def combined(x):
        base = f1(x)
        update = f2(x)
        return (base + wt * update)
    return combined

In [0]:
# original

def GradientBoosting(nTrees, nDepth, gamma, bagFrac, X, Y):
    """Build a boosted ensemble of regression trees fitted on residuals.

    Args:
        nTrees: number of trees to add to the ensemble.
        nDepth: max_depth passed to each DecisionTreeRegressor.
        gamma: learning rate; weight given to each new tree's prediction.
        bagFrac: fraction of the data points drawn for each tree.
        X: indexable feature array; len(X) is the number of data points.
        Y: label array indexed in parallel with X, combined with a
            column-shaped estimate.

    Returns:
        A callable mapping features to the ensemble prediction. With
        nTrees == 0 it is the constant function returning 0.0.
    """
    n_points = len(X)
    bag_size = int(bagFrac * n_points)

    # The ensemble starts as the function that maps every input to zero.
    def ensemble(x):
        return 0.0

    for _ in range(nTrees):
        # Random draw of row indices for this tree.
        bag_idx = np.random.choice(n_points, bag_size)
        bag_x = X[bag_idx]

        # Residuals: labels minus what the current ensemble already predicts.
        current_estimate = np.array(ensemble(bag_x)).reshape([-1, 1])
        residuals = Y[bag_idx] - current_estimate

        # Fit a new tree to the residuals of the sampled points.
        member = DecisionTreeRegressor(max_depth=nDepth)
        member.fit(bag_x, residuals)

        # Fold the new tree into the ensemble, scaled by the learning rate.
        ensemble = wt_sum_fcn(ensemble, member.predict, gamma)
    return ensemble

In [0]:
# with spark mllib
#
# NOTE: abandoned draft, kept only as a commented-out reference. As written it
# could not run: the `xTrain =` assignment had no right-hand side (a syntax
# error), `sampIdx` was never assigned, and the DecisionTreeClassifier calls
# still followed the scikit-learn style of the version above (`max_depth=`,
# `fit(X, y)`, `tree.predict`). Leaving it as an active `def` would also
# replace the working GradientBoosting defined in the previous cell.
#
# def GradientBoosting(nTrees, nDepth, gamma, bagFrac, X, Y):
#     nDataPts = X.count()
#     nSamp = int(bagFrac * nDataPts)
#
#     # Define function T to accumulate average prediction functions from trained trees.
#     # initialize T to fcn mapping all x to zero to start
#     T = lambda x: 0.0
#
#     # loop to generate individual trees in ensemble
#     for i in range(nTrees):
#
#         # take a random sample from the data
#         # sampIdx = np.random.choice(nDataPts, nSamp)
#
#         # xTrain = X[sampIdx]
#
#         xTrain =   # <- never completed
#
#         # estimate the regression values with the current trees.
#         yEst = T(xTrain)
#
#         # subtract the estimate based on current ensemble from the labels
#         yTrain = Y[sampIdx] - np.array(yEst).reshape([-1,1])
#
#         # build a tree on the sampled data using residuals for labels
#         tree = DecisionTreeClassifier(featuresCol='features', labelCol='label', max_depth=nDepth)
#         tree.fit(xTrain, yTrain)
#
#         # add the new tree with a learning rate parameter (gamma)
#         T = wt_sum_fcn(T, tree.predict, gamma)
#     return T

In [0]:
# Scratch cell: boosting hyperparameters plus a trial of DataFrame.sample.
# NOTE(review): `X` is assigned in the next cell, so in a top-to-bottom run it
# is not yet defined when this cell executes.
# X.count()
nTrees = 20  # try changing the number of trees being built
nDepth = 3   # fairly shallow for 100 data points
gamma = 0.1
bagFrac = 1  

# Number of rows in X and the per-tree sample size derived from it.
nDataPts = X.count()
nSamp = int(bagFrac * nDataPts)


# Seeded sample of X without replacement; the result is only echoed, not stored.
X.sample(withReplacement=False, fraction=bagFrac,seed=42)

In [0]:
# First 1000 training rows, with 'outcome' duplicated under the name 'label'.
data = df_train.limit(1000).withColumn("label", F.col("outcome"))
features = ['turnaround_time_calc', 'priorflight_isdelayed_calc']

# Assemble the two numeric columns into a 'features' vector and keep only the
# vector and the label.
va = VectorAssembler(inputCols=features, outputCol='features')
va_df = va.transform(data).select(['features', 'label'])

# Separate single-column frames for features and labels.
X = va_df.select('features')
Y = va_df.select("label")

In [0]:
def EnsembleDataGen(npts, stdDev):
    """Generate a noisy 1-D regression data set.

    X is sampled regularly in [-10, 10]; Y is sin(X) plus Gaussian noise with
    mean 0 and standard deviation ``stdDev``. (Swap np.sin(X) for X to get the
    linear variant.)

    Args:
        npts: number of points to generate.
        stdDev: standard deviation of the noise added to Y.

    Returns:
        Tuple (X, Y) of arrays, each reshaped to a single column.
    """
    grid = np.linspace(-10.0, 10.0, npts)
    noise = np.random.normal(0.0, stdDev, npts)
    targets = np.sin(grid) + noise  ### Non-Linear
    return grid.reshape([-1, 1]), targets.reshape([-1, 1])

# 100 synthetic points with noise standard deviation 1.0: u holds X, v holds Y.
u,v = EnsembleDataGen(100, 1.0)

In [0]:
nTrees = 20  # try changing the number of trees being built
nDepth = 3   # fairly shallow for 100 data points
gamma = 0.1
bagFrac = 1   # Bag fraction - how many points in each of the random subsamples.

# Train and plot on the synthetic NumPy data (u, v) generated by
# EnsembleDataGen. The notebook-level X and Y are single-column Spark frames,
# which the len()/array-indexing based GradientBoosting and matplotlib cannot
# consume.
gbst = GradientBoosting(nTrees, nDepth, gamma, bagFrac, u, v)

result = gbst(u)

# Ensemble prediction (red line) over the noisy training points.
plt.plot(u, result, 'r')
plt.scatter(u, v)
display(plt.show())

## Simple Decision Tree with Spark MLLib Example

In [0]:
# Preview the three candidate feature columns of the training split.
features = ['MONTH','turnaround_time_calc', 'priorflight_isdelayed_calc']
display(df_train.select(*features))

In [0]:
# First 1000 training rows and the two numeric features for the simple tree.
data = df_train.limit(1000)
features = ['turnaround_time_calc', 'priorflight_isdelayed_calc']

# Assemble the features into a vector and keep only the vector and the label.
va = VectorAssembler(inputCols=features, outputCol='features')
va_df = va.transform(data).select(['features', 'outcome'])

In [0]:
# Render the assembled features next to the label.
display(va_df)

In [0]:
## Train pipeline
# Seeded 80/20 split so the reported errors are reproducible between runs.
(train, test) = va_df.randomSplit([0.8, 0.2], seed=42)

# Keep the unfitted estimator under its own name; `dtc` is the fitted model
# that the prediction cell calls transform() on.
dtc_estimator = DecisionTreeClassifier(featuresCol='features', labelCol='outcome')
dtc = dtc_estimator.fit(train)

In [0]:
## Predict on Test
# Score the held-out rows with the fitted tree.
pred = dtc.transform(test)  # pred dataframe
# display(pred)
# Show only the rows where the prediction differs from the label.
display(pred.where(F.col('outcome')!=F.col('prediction')))