## Decision Tree Classifier

##### Decision trees are a popular family of classification and regression methods

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Start (or reuse) a Spark session named after this notebook.
spark = (
    SparkSession.builder
    .appName("Decision Tree Classifier")
    .getOrCreate()
)

# Load the iris CSV: the first line holds the column names and the column
# types are inferred from the values.
data = spark.read.csv(
    "Data/iris.csv",
    header=True,
    inferSchema=True,
)

# Peek at the first 10 rows (returned as a list of Row objects).
data.head(10)


[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=5.4, sepal_width=3.9, petal_length=1.7, petal_width=0.4, class='Iris-setosa'),
 Row(sepal_length=4.6, sepal_width=3.4, petal_length=1.4, petal_width=0.3, class='Iris-setosa'),
 Row(sepal_length=5.0, sepal_width=3.4, petal_length=1.5, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.4, sepal_width=2.9, petal_length=1.4, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.9, sepal_width=3.1, petal_length=1.5, petal_width=0.1, class='Iris-setosa')]

In [3]:
# List the column names of the loaded DataFrame.
data.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

#### Assemble the feature columns into a single vector column using the VectorAssembler class

In [4]:
# Combine the four numeric measurement columns into a single vector column
# named 'features'. VectorAssembler is imported in the notebook's import cell
# at the top, together with the other pyspark.ml.feature classes.
featureassembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    outputCol='features',
)

# transform() returns a new DataFrame with the extra 'features' column;
# the original `data` DataFrame is left unchanged.
output = featureassembler.transform(data)
output.show()

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|      class|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|         4.9|  

In [5]:
# Keep only what the model needs: the assembled feature vector and the
# string label column.
finalized_data = output.select(
    "features",
    "class",
)
finalized_data.show()

+-----------------+-----------+
|         features|      class|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
|[5.4,3.9,1.7,0.4]|Iris-setosa|
|[4.6,3.4,1.4,0.3]|Iris-setosa|
|[5.0,3.4,1.5,0.2]|Iris-setosa|
|[4.4,2.9,1.4,0.2]|Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|
|[5.4,3.7,1.5,0.2]|Iris-setosa|
|[4.8,3.4,1.6,0.2]|Iris-setosa|
|[4.8,3.0,1.4,0.1]|Iris-setosa|
|[4.3,3.0,1.1,0.1]|Iris-setosa|
|[5.8,4.0,1.2,0.2]|Iris-setosa|
|[5.7,4.4,1.5,0.4]|Iris-setosa|
|[5.4,3.9,1.3,0.4]|Iris-setosa|
|[5.1,3.5,1.4,0.3]|Iris-setosa|
|[5.7,3.8,1.7,0.3]|Iris-setosa|
|[5.1,3.8,1.5,0.3]|Iris-setosa|
+-----------------+-----------+
only showing top 20 rows



#### Feature Encoding

In [6]:
# Map each distinct string in 'class' to a numeric index in 'indexedLabel'.
# The indexer is fitted on the full dataset (before the train/test split) so
# that every label value is included in the index.
labelIndexer = (
    StringIndexer(inputCol="class", outputCol="indexedLabel")
    .fit(finalized_data)
)

# Detect categorical features inside the 'features' vector and index them.
# With maxCategories=4, a feature that has more than 4 distinct values is
# treated as continuous.
featureIndexer = (
    VectorIndexer(
        inputCol="features",
        outputCol="IndexedFeatures",
        maxCategories=4,
    )
    .fit(finalized_data)
)

In [7]:
# Randomly split the rows into training and test sets using 0.7 / 0.3 weights.
# A fixed seed is passed so the split can be reproduced.
trainingData, testData = finalized_data.randomSplit(
    weights=[0.7, 0.3],
    seed=12345,
)

In [8]:
# Configure the decision tree estimator. Nothing is trained in this cell;
# the estimator is only told which columns hold the indexed label and the
# indexed feature vector.
dt = DecisionTreeClassifier(
    featuresCol="IndexedFeatures",
    labelCol="indexedLabel",
)


In [9]:
# Confirm the training split still carries only the raw 'features' and 'class'
# columns; the indexed columns are produced later by the pipeline stages.
trainingData.columns

['features', 'class']

In [10]:
# Chain the two indexers (already fitted above) and the decision tree
# estimator into a single pipeline, in the order they must be applied.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        dt,
    ]
)

# Fit the pipeline on the training split. The indexers are applied to the
# training rows and the decision tree is trained on their output.
model = pipeline.fit(trainingData)

In [11]:
# Apply the fitted pipeline to the held-out test split; the resulting
# DataFrame includes a 'prediction' column alongside the indexed columns.
predictions = model.transform(testData)

In [12]:
# Display the first 20 test rows: predicted class index, true label index,
# and the original feature vector.
(
    predictions
    .select("prediction", "indexedLabel", "features")
    .show(20)
)

+----------+------------+-----------------+
|prediction|indexedLabel|         features|
+----------+------------+-----------------+
|       0.0|         0.0|[4.6,3.2,1.4,0.2]|
|       0.0|         0.0|[4.8,3.1,1.6,0.2]|
|       1.0|         2.0|[4.9,2.5,4.5,1.7]|
|       0.0|         0.0|[5.0,3.0,1.6,0.2]|
|       0.0|         0.0|[5.0,3.2,1.2,0.2]|
|       0.0|         0.0|[5.0,3.5,1.3,0.3]|
|       0.0|         0.0|[5.1,3.5,1.4,0.3]|
|       0.0|         0.0|[5.4,3.4,1.5,0.4]|
|       0.0|         0.0|[5.4,3.9,1.3,0.4]|
|       1.0|         1.0|[5.7,2.8,4.1,1.3]|
|       0.0|         0.0|[5.7,4.4,1.5,0.4]|
|       0.0|         0.0|[5.8,4.0,1.2,0.2]|
|       1.0|         1.0|[6.0,2.9,4.5,1.5]|
|       2.0|         2.0|[6.1,2.6,5.6,1.4]|
|       1.0|         1.0|[6.1,2.9,4.7,1.4]|
|       1.0|         1.0|[6.2,2.2,4.5,1.5]|
|       1.0|         1.0|[6.2,2.9,4.3,1.3]|
|       2.0|         2.0|[6.2,3.4,5.4,2.3]|
|       2.0|         2.0|[6.4,2.8,5.6,2.2]|
|       2.0|         2.0|[6.4,3.

In [13]:
# Compare each prediction with its true (indexed) label and report the
# accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g " % accuracy)

Test Accuracy = 0.941176 


In [14]:
# The pipeline was built as [labelIndexer, featureIndexer, dt], so the stage
# at index 2 of the fitted pipeline is the trained decision tree model.
treeModel = model.stages[2]
# Printing the model gives a one-line summary only (uid, depth, node count,
# class count, feature count), not the full tree structure.
print(treeModel)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3e278203c93b, depth=4, numNodes=13, numClasses=3, numFeatures=4


In [15]:
# Total number of nodes in the trained tree.
treeModel.numNodes

13

In [16]:
# Number of distinct label classes the tree was trained to predict.
treeModel.numClasses

3

In [17]:
# Number of input features the tree was trained on.
treeModel.numFeatures

4

In [18]:
# Per-feature importance scores, returned as a vector. Positions should follow
# the order of the assembled input columns (sepal_length, sepal_width,
# petal_length, petal_width); an index missing from a sparse result has
# importance 0.
treeModel.featureImportances

SparseVector(4, {0: 0.0173, 2: 0.5436, 3: 0.4391})