In [1]:
#Code Snippet 30
#Step 1 - Importing the data and essential libraries 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkTrees').getOrCreate()
data = spark.read.csv('breast-cancer.csv',header=True,inferSchema=True)
print("Initial Data")
data.show(3)
#Step 2 - Data pre-processing and conerting data to spark accepted format
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['thickness','cell_size','cell_shape'],outputCol='features')
assembler_data = assembler.transform(data)
final_data = assembler_data.select('features','label')
print("Consolidated Data with features and labels")
final_data.show(3)
#Step 3 - Training our Decision model 
# Splitting the data into 80 and 20 percent
train_data,test_data = final_data.randomSplit([0.8,0.2])
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol='label',featuresCol='features')
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)
#Step 4 - Evaluating our Trained Model
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
eval_obj = BinaryClassificationEvaluator(labelCol='label')
print("Area Under the Curve value is {}".format(eval_obj.evaluate(dt_predictions)))
mul_eval_obj = MulticlassClassificationEvaluator(labelCol='label',metricName='accuracy')
print("\nAccuracy of Decision Tree is {}".format(mul_eval_obj.evaluate(dt_predictions)))
print("\nPrediction Data")
dt_predictions.show(3)
print("Detemining which feature played a major role in Decision Making\n")
print(dt_model.featureImportances)

Initial Data
+---------+---------+----------+-----+
|thickness|cell_size|cell_shape|label|
+---------+---------+----------+-----+
|        5|        1|         1|    2|
|        5|        4|         4|    2|
|        3|        1|         1|    2|
+---------+---------+----------+-----+
only showing top 3 rows

Consolidated Data with features and labels
+-------------+-----+
|     features|label|
+-------------+-----+
|[5.0,1.0,1.0]|    2|
|[5.0,4.0,4.0]|    2|
|[3.0,1.0,1.0]|    2|
+-------------+-----+
only showing top 3 rows

Area Under the Curve value is 1.0

Accuracy of Decision Tree is 0.9285714285714286

Prediction Data
+-------------+-----+--------------------+--------------------+----------+
|     features|label|       rawPrediction|         probability|prediction|
+-------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,1.0]|    2|[0.0,0.0,346.0,0....|[0.0,0.0,0.980169...|       2.0|
|[1.0,1.0,1.0]|    2|[0.0,0.0,346.0,0....|[0.0,0.0,0.980169...|    