## Gradient-Boosted Tree Classifier

#### Gradient-boosted trees are a popular classification and regression method using ensembles of decision trees.

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [None]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("GBT Classifier")
    .getOrCreate()
)

In [None]:
# Load the iris CSV; the first row is the header and column types are inferred.
data = spark.read.csv(
    "Data/iris.csv",
    header=True,
    inferSchema=True,
)
# Peek at the first three rows as a quick sanity check on the load.
data.head(3)

In [None]:
# Display the column names so the feature and label columns can be picked out.
data.columns

In [None]:
from pyspark.ml.feature import VectorAssembler

# Spark ML estimators expect all predictors packed into one vector column,
# so merge the four measurement columns into a single "features" column.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
featureassembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = featureassembler.transform(data)
output.show()

In [None]:
# Keep only the assembled feature vector and the label column for modelling.
finalized_data = output.select("features", "class")
finalized_data.show()

In [None]:
# Map the string labels in "class" to numeric indices in "indexedLabel".
# The indexer is fitted on the full dataset (not just the training split)
# so that every label value receives an index.
labelIndexer = (
    StringIndexer(inputCol="class", outputCol="indexedLabel")
    .fit(finalized_data)
)

# Automatically detect categorical features inside the vector and index them.
# maxCategories=4 means a feature with more than 4 distinct values is
# treated as continuous rather than categorical.
featureIndexer = (
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
    .fit(finalized_data)
)


In [None]:
# 70/30 train/test split; the fixed seed keeps the split reproducible.
train, test = finalized_data.randomSplit([0.7, 0.3], seed=12345)
print(f"The number of Train Dataset: {train.count()}")
print(f"The number of Test Dataset: {test.count()}")


In [None]:
# Gradient-boosted tree classifier trained on the indexed label/feature columns,
# limited to 10 boosting iterations.
# NOTE(review): Spark's GBTClassifier is documented as supporting binary labels
# only. If "class" holds more than two distinct values (the standard iris data
# has three), fitting is expected to fail -- confirm against the CSV contents.
gbt = GBTClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxIter=10,
)

# Chain the two already-fitted indexers and the classifier into one pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])


In [None]:
# Preview rows of the training split before fitting.
train.show()

In [None]:
# Fit the pipeline (indexers + GBT classifier) on the training split.
model = pipeline.fit(train)

In [None]:
# Run the fitted pipeline on the held-out test split to obtain predictions.
predictions = model.transform(test)

In [None]:
# Show the first 5 predictions next to the indexed true label and the features.
predictions.select("prediction", "indexedLabel", "features").show(5)

In [None]:
# Score the test-set predictions by accuracy against the indexed true labels.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
# The scoring method is `evaluate`; the evaluator object has no `evaluator`
# method, so the previous call raised AttributeError.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

# The pipeline stages are [labelIndexer, featureIndexer, gbt], so index 2 is
# the fitted GBT model; printing it shows a summary of the trained ensemble.
gbtModel = model.stages[2]
print(gbtModel)