In [None]:
# Code Snippet 33
# Step 1 - Bring in Spark and read the input dataset
from pyspark.sql import SparkSession

# getOrCreate() hands back the active session when one exists, so running
# this cell again does not spin up a second session.
spark = (
    SparkSession.builder
    .appName('SparkTreeComparisions')
    .getOrCreate()
)

# The file is parsed by Spark's built-in 'libsvm' data source.
data = spark.read.load('libsvm-breast-cancer.txt', format='libsvm')
print("Libsvm format Data - Fully formatted and ready to use data")
data.show(n=3)
# Step 2 - Fit the three tree-based classifiers and score the held-out rows
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    GBTClassifier,
    RandomForestClassifier,
)

# Random 70% / 30% partition; no seed is passed, so the rows landing in
# each part differ from run to run.
train_data, test_data = data.randomSplit([0.7, 0.3])

# Estimators: everything is left at its default except the forest size.
gbt = GBTClassifier()                       # Gradient Boosted Trees
rf = RandomForestClassifier(numTrees=150)   # Random Forest with 150 trees
dt = DecisionTreeClassifier()               # single Decision Tree

# Fit every estimator on the same training partition.
gbt_model = gbt.fit(train_data)
rf_model = rf.fit(train_data)
dt_model = dt.fit(train_data)

# Apply each fitted model to the same test partition.
gbt_predictions = gbt_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
dt_predictions = dt_model.transform(test_data)

print("Gradient Boosted Tree Predictions")
gbt_predictions.show(n=3)

#Step 3 - Evaluating our Trained Models
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator is shared by all three models so that the accuracy figures
# are computed the same way and are directly comparable.
mul_eval_obj = MulticlassClassificationEvaluator(metricName='accuracy')

# Each entry pairs a display label with the fitted model AND the predictions
# produced by that same model. Keeping them together in one tuple prevents
# the copy-paste slip where one model's numbers get printed under another
# model's heading (previously the GBT lines reported the Random Forest's
# accuracy and feature importances).
for model_label, fitted_model, model_predictions in (
    ("Decision Tree", dt_model, dt_predictions),
    ("Random Forest", rf_model, rf_predictions),
    ("GBT", gbt_model, gbt_predictions),
):
    print("Accuracy of {} is {}".format(
        model_label, mul_eval_obj.evaluate(model_predictions)))
    print("Feature Importances of {} {}\n".format(
        model_label, fitted_model.featureImportances))