# **IRIS Naive Bayes Classification**

In [None]:
# install
!pip install Pyspark

In [2]:
# start spark session
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.getOrCreate()

In [4]:
spark

In [5]:
# read sklearn inbuilt data
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris(as_frame=True)
iris = iris.frame
iris = spark.createDataFrame(iris)

In [6]:
iris.show()

+-----------------+----------------+-----------------+----------------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
+-----------------+----------------+-----------------+----------------+------+
|              5.1|             3.5|              1.4|             0.2|     0|
|              4.9|             3.0|              1.4|             0.2|     0|
|              4.7|             3.2|              1.3|             0.2|     0|
|              4.6|             3.1|              1.5|             0.2|     0|
|              5.0|             3.6|              1.4|             0.2|     0|
|              5.4|             3.9|              1.7|             0.4|     0|
|              4.6|             3.4|              1.4|             0.3|     0|
|              5.0|             3.4|              1.5|             0.2|     0|
|              4.4|             2.9|              1.4|             0.2|     0|
|              4.9|             3.1|              1.

In [7]:
iris.printSchema()

root
 |-- sepal length (cm): double (nullable = true)
 |-- sepal width (cm): double (nullable = true)
 |-- petal length (cm): double (nullable = true)
 |-- petal width (cm): double (nullable = true)
 |-- target: long (nullable = true)



In [8]:
iris.columns

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)',
 'target']

In [9]:
from pyspark.ml.feature import VectorAssembler 

In [10]:
featureassembler= VectorAssembler(inputCols=['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)'], outputCol='Features')

In [11]:
output = featureassembler.transform(iris)

In [12]:
output.show()

+-----------------+----------------+-----------------+----------------+------+-----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|         Features|
+-----------------+----------------+-----------------+----------------+------+-----------------+
|              5.1|             3.5|              1.4|             0.2|     0|[5.1,3.5,1.4,0.2]|
|              4.9|             3.0|              1.4|             0.2|     0|[4.9,3.0,1.4,0.2]|
|              4.7|             3.2|              1.3|             0.2|     0|[4.7,3.2,1.3,0.2]|
|              4.6|             3.1|              1.5|             0.2|     0|[4.6,3.1,1.5,0.2]|
|              5.0|             3.6|              1.4|             0.2|     0|[5.0,3.6,1.4,0.2]|
|              5.4|             3.9|              1.7|             0.4|     0|[5.4,3.9,1.7,0.4]|
|              4.6|             3.4|              1.4|             0.3|     0|[4.6,3.4,1.4,0.3]|
|              5.0|           

In [13]:
modeldata=output.select('Features','target')

In [14]:
modeldata.show()

+-----------------+------+
|         Features|target|
+-----------------+------+
|[5.1,3.5,1.4,0.2]|     0|
|[4.9,3.0,1.4,0.2]|     0|
|[4.7,3.2,1.3,0.2]|     0|
|[4.6,3.1,1.5,0.2]|     0|
|[5.0,3.6,1.4,0.2]|     0|
|[5.4,3.9,1.7,0.4]|     0|
|[4.6,3.4,1.4,0.3]|     0|
|[5.0,3.4,1.5,0.2]|     0|
|[4.4,2.9,1.4,0.2]|     0|
|[4.9,3.1,1.5,0.1]|     0|
|[5.4,3.7,1.5,0.2]|     0|
|[4.8,3.4,1.6,0.2]|     0|
|[4.8,3.0,1.4,0.1]|     0|
|[4.3,3.0,1.1,0.1]|     0|
|[5.8,4.0,1.2,0.2]|     0|
|[5.7,4.4,1.5,0.4]|     0|
|[5.4,3.9,1.3,0.4]|     0|
|[5.1,3.5,1.4,0.3]|     0|
|[5.7,3.8,1.7,0.3]|     0|
|[5.1,3.8,1.5,0.3]|     0|
+-----------------+------+
only showing top 20 rows



In [15]:
# split data
train_data,test_data=modeldata.randomSplit([0.8,0.2])

In [16]:
train_data.show()

+-----------------+------+
|         Features|target|
+-----------------+------+
|[4.3,3.0,1.1,0.1]|     0|
|[4.4,2.9,1.4,0.2]|     0|
|[4.4,3.0,1.3,0.2]|     0|
|[4.5,2.3,1.3,0.3]|     0|
|[4.6,3.1,1.5,0.2]|     0|
|[4.6,3.2,1.4,0.2]|     0|
|[4.6,3.4,1.4,0.3]|     0|
|[4.6,3.6,1.0,0.2]|     0|
|[4.7,3.2,1.3,0.2]|     0|
|[4.7,3.2,1.6,0.2]|     0|
|[4.8,3.0,1.4,0.1]|     0|
|[4.8,3.0,1.4,0.3]|     0|
|[4.8,3.4,1.9,0.2]|     0|
|[4.9,2.4,3.3,1.0]|     1|
|[4.9,3.0,1.4,0.2]|     0|
|[4.9,3.1,1.5,0.1]|     0|
|[4.9,3.1,1.5,0.2]|     0|
|[4.9,3.6,1.4,0.1]|     0|
|[5.0,2.0,3.5,1.0]|     1|
|[5.0,3.0,1.6,0.2]|     0|
+-----------------+------+
only showing top 20 rows



In [17]:
# logistic regression model
from pyspark.ml.classification import NaiveBayes

In [18]:
nb = NaiveBayes(featuresCol='Features', labelCol='target')

In [19]:
nb=nb.fit(train_data)

In [20]:
# prediction
y_pred = nb.transform(test_data)

In [21]:
y_pred.show()

+-----------------+------+--------------------+--------------------+----------+
|         Features|target|       rawPrediction|         probability|prediction|
+-----------------+------+--------------------+--------------------+----------+
|[4.4,3.2,1.3,0.2]|     0|[-10.924546619963...|[0.70879033909987...|       0.0|
|[4.8,3.1,1.6,0.2]|     0|[-11.681472524179...|[0.66391980310474...|       0.0|
|[4.8,3.4,1.6,0.2]|     0|[-12.006726936101...|[0.70268869066115...|       0.0|
|[5.0,3.4,1.6,0.4]|     0|[-12.877721944643...|[0.65224071752808...|       0.0|
|[5.0,3.6,1.4,0.2]|     0|[-11.977937873515...|[0.76315549974619...|       0.0|
|[5.2,2.7,3.9,1.4]|     1|[-20.363260051982...|[0.05313757173720...|       1.0|
|[5.4,3.4,1.5,0.4]|     0|[-12.967869005639...|[0.68638356704092...|       0.0|
|[5.4,3.4,1.7,0.2]|     0|[-12.626445640424...|[0.71132465639945...|       0.0|
|[5.5,2.3,4.0,1.3]|     1|[-19.971835549742...|[0.04735914399959...|       1.0|
|[5.9,3.0,4.2,1.5]|     1|[-22.131329162

In [22]:
# confusion matrix
y_pred.groupBy('target', 'prediction').count().show()

+------+----------+-----+
|target|prediction|count|
+------+----------+-----+
|     0|       0.0|    7|
|     1|       1.0|    7|
|     2|       2.0|    6|
|     2|       1.0|    3|
+------+----------+-----+



In [23]:
from sklearn.metrics import confusion_matrix
pred=y_pred.select("prediction").collect()
orig=y_pred.select("target").collect()
print(confusion_matrix(orig, pred))

[[7 0 0]
 [0 7 0]
 [0 3 6]]


(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| logLoss|hammingLoss)

In [24]:
# evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
evaluator = MulticlassClassificationEvaluator(
    labelCol='target', predictionCol='prediction')

In [26]:
accuracy = evaluator.evaluate(y_pred)
accuracy

0.8680306905370845

In [27]:
# close connection to spark
spark.stop()