## PACKAGES

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("IrisClassification")
    .getOrCreate()
)

## DATA

In [11]:
# Path to the Iris CSV, resolved relative to the notebook's working directory.
data_path = "Iris.csv"

# The first row holds the column names; column types are inferred from the values.
df = spark.read.csv(
    path=data_path,
    header=True,
    inferSchema=True,
)

## DATA PROCESSING

In [12]:
# Preview the first 20 rows (the default for show()) to check the parsed columns.
df.show(n=20)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [13]:
# Remove the row-identifier column; it is not a feature.
# The name is spelled exactly as in the CSV header ("Id") so the drop does not
# rely on Spark's case-insensitive column resolution: drop() silently ignores
# names it cannot resolve, so a mismatch would go unnoticed.
df = df.drop("Id")

In [14]:
# Map the string "Species" column to a numeric "label" column.
# The indexer is fitted here, on the full DataFrame, so the resulting model
# (not the unfitted estimator) is what later goes into the pipeline.
labelIndexer = StringIndexer(
    inputCol="Species",
    outputCol="label",
).fit(df)

In [15]:
# Pack the four measurement columns into a single "features" vector column.
featureAssembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "SepalLengthCm",
        "SepalWidthCm",
        "PetalLengthCm",
        "PetalWidthCm",
    ],
)

## MODEL

In [16]:
# Decision-tree classifier reading the assembled "features" vector and the
# indexed "label" column; all other hyperparameters are left at their defaults.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

## PIPELINE

In [17]:
Pipeline = Pipeline(stages=[labelIndexer, featureAssembler, dt])

## TRAIN, TEST SPLIT

In [18]:
(trainingData, testData) = df.randomSplit([0.7, 0.3])

## TRAINING

In [19]:
model = Pipeline.fit(trainingData)

## PREDICTION

In [23]:
# Run the fitted pipeline over the held-out rows.
predictions = model.transform(testData)

# Show a few rows: class probabilities, predicted class, true label, input vector.
predictions.select(
    "probability",
    "prediction",
    "label",
    "features",
).show(n=5)

+-------------+----------+-----+-----------------+
|  probability|prediction|label|         features|
+-------------+----------+-----+-----------------+
|[1.0,0.0,0.0]|       0.0|  0.0|[4.5,2.3,1.3,0.3]|
|[1.0,0.0,0.0]|       0.0|  0.0|[4.6,3.2,1.4,0.2]|
|[1.0,0.0,0.0]|       0.0|  0.0|[4.7,3.2,1.3,0.2]|
|[0.0,1.0,0.0]|       1.0|  1.0|[4.9,2.4,3.3,1.0]|
|[0.0,1.0,0.0]|       1.0|  2.0|[4.9,2.5,4.5,1.7]|
+-------------+----------+-----+-----------------+
only showing top 5 rows



## EVALUATION

In [25]:
# Score the test-set predictions by accuracy (fraction of correctly classified rows).
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions)

# Report the complement of accuracy as the test error.
error = 1.0 - accuracy
print(f"Test Error: {error}")

Test Error: 0.04878048780487809
