In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Reuse the active SparkContext/SparkSession when one already exists, so this
# cell can be re-run in the same kernel. A bare `SparkContext()` raises if a
# context is already running.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Load data

In [2]:
# Read iris.csv with the first line as the header and let Spark infer the
# column types, then preview the first five rows.
iris = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("iris.csv")
)
iris.show(n=5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [3]:
# Display each column name paired with its inferred Spark SQL type.
iris.dtypes

[('sepal_length', 'double'),
 ('sepal_width', 'double'),
 ('petal_length', 'double'),
 ('petal_width', 'double'),
 ('species', 'string')]

In [4]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
iris.describe().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|     null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|     null|
|    min|               4.3|                2.0|               1.0|               0.1|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



## Merge features to create a features-column

In [5]:
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

In [6]:
def to_feature_row(record):
    """Pack every field except the last into a dense vector; keep the last as `species`."""
    *measurements, species_name = record
    return Row(features=Vectors.dense(measurements), species=species_name)


# Two-column frame: a `features` vector plus the original `species` string.
iris2 = iris.rdd.map(to_feature_row).toDF()
iris2.show()

+-----------------+-------+
|         features|species|
+-----------------+-------+
|[5.1,3.5,1.4,0.2]| setosa|
|[4.9,3.0,1.4,0.2]| setosa|
|[4.7,3.2,1.3,0.2]| setosa|
|[4.6,3.1,1.5,0.2]| setosa|
|[5.0,3.6,1.4,0.2]| setosa|
|[5.4,3.9,1.7,0.4]| setosa|
|[4.6,3.4,1.4,0.3]| setosa|
|[5.0,3.4,1.5,0.2]| setosa|
|[4.4,2.9,1.4,0.2]| setosa|
|[4.9,3.1,1.5,0.1]| setosa|
|[5.4,3.7,1.5,0.2]| setosa|
|[4.8,3.4,1.6,0.2]| setosa|
|[4.8,3.0,1.4,0.1]| setosa|
|[4.3,3.0,1.1,0.1]| setosa|
|[5.8,4.0,1.2,0.2]| setosa|
|[5.7,4.4,1.5,0.4]| setosa|
|[5.4,3.9,1.3,0.4]| setosa|
|[5.1,3.5,1.4,0.3]| setosa|
|[5.7,3.8,1.7,0.3]| setosa|
|[5.1,3.8,1.5,0.3]| setosa|
+-----------------+-------+
only showing top 20 rows



# Index label column with StringIndexer
## Import packages

In [7]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

## Build pipeline

In [8]:
# Single-stage pipeline: index the `species` strings into a numeric `label` column.
stringindexer = StringIndexer().setInputCol("species").setOutputCol("label")
stages = [stringindexer]
pipeline = Pipeline().setStages(stages)

## Transform data

In [9]:
# Fit the indexing pipeline on iris2, then apply it to the same frame to add `label`.
indexer_model = pipeline.fit(iris2)
iris_df = indexer_model.transform(iris2)
iris_df.show(n=5)

+-----------------+-------+-----+
|         features|species|label|
+-----------------+-------+-----+
|[5.1,3.5,1.4,0.2]| setosa|  2.0|
|[4.9,3.0,1.4,0.2]| setosa|  2.0|
|[4.7,3.2,1.3,0.2]| setosa|  2.0|
|[4.6,3.1,1.5,0.2]| setosa|  2.0|
|[5.0,3.6,1.4,0.2]| setosa|  2.0|
+-----------------+-------+-----+
only showing top 5 rows



In [10]:
# Summary statistics for the indexed frame. In the recorded output only
# `species` and `label` are summarised; the vector `features` column is absent.
iris_df.describe().show()

+-------+---------+------------------+
|summary|  species|             label|
+-------+---------+------------------+
|  count|      150|               150|
|   mean|     null|               1.0|
| stddev|     null|0.8192319205190403|
|    min|   setosa|               0.0|
|    max|virginica|               2.0|
+-------+---------+------------------+



In [11]:
# Display the column types after indexing (the new `label` column is included).
iris_df.dtypes

[('features', 'vector'), ('species', 'string'), ('label', 'double')]

# Naive Bayes classification
## Split data into training and test sets

In [12]:
# 80/20 random split into training and test sets; the fixed seed keeps it repeatable.
train, test = iris_df.randomSplit(weights=[0.8, 0.2], seed=1234)

## Build cross-validation model
### Estimator

In [13]:
from pyspark.ml.classification import NaiveBayes
# Naive Bayes estimator reading the `features` vector and the indexed `label`.
naivebayes = NaiveBayes().setFeaturesCol('features').setLabelCol('label')

### Parameter Grid

In [14]:
from pyspark.ml.tuning import ParamGridBuilder
# Candidate values for the NaiveBayes `smoothing` parameter.
# They are written as floats to match the parameter's floating-point type:
# grid values are handed to the estimator as-is, and integer literals are not
# guaranteed to be coerced on every PySpark version.
param_grid = (
    ParamGridBuilder()
    .addGrid(naivebayes.smoothing, [0.0, 1.0, 2.0, 4.0, 8.0])
    .build()
)

### Evaluator
There are three categories in the label column, so a multiclass evaluator is used rather than a binary one.

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Multiclass evaluator with its default settings spelled out: it compares the
# `prediction` column against `label` and scores with the F1 metric.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)

### Build cross-validation model

In [16]:
from pyspark.ml.tuning import CrossValidator
# Cross-validate the NaiveBayes estimator over the smoothing grid.
# `numFolds=3` is the library default, written out for readers.
# `seed` fixes the fold assignment so the selected model is reproducible
# across runs; it matches the seed used for the train/test split.
crossvalidator = CrossValidator(
    estimator=naivebayes,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=1234,
)

### Fit cross-validation model

In [18]:
# Run the grid search with cross-validation on the training split.
# NOTE(review): the variable name looks like a typo for `crossvalidation_model`;
# it is kept because later cells refer to it by this name.
crossvalidation_mode = crossvalidator.fit(dataset=train)

### Prediction on training and test sets

In [19]:
# Score the training split with the fitted cross-validation model and preview five rows.
pred_train = crossvalidation_mode.transform(dataset=train)
pred_train.show(n=5)

+-----------------+-------+-----+--------------------+--------------------+----------+
|         features|species|label|       rawPrediction|         probability|prediction|
+-----------------+-------+-----+--------------------+--------------------+----------+
|[4.4,3.2,1.3,0.2]| setosa|  2.0|[-12.291171378989...|[0.19555929123370...|       2.0|
|[4.5,2.3,1.3,0.3]| setosa|  2.0|[-11.142786320680...|[0.26949095780384...|       2.0|
|[4.6,3.1,1.5,0.2]| setosa|  2.0|[-12.550742246764...|[0.21656037289160...|       2.0|
|[4.6,3.2,1.4,0.2]| setosa|  2.0|[-12.592365024690...|[0.20052392100539...|       2.0|
|[4.6,3.4,1.4,0.3]| setosa|  2.0|[-13.148463542368...|[0.19824085251800...|       2.0|
+-----------------+-------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [20]:
# Score the held-out test split with the fitted cross-validation model and preview five rows.
pred_test = crossvalidation_mode.transform(dataset=test)
pred_test.show(n=5)

+-----------------+-------+-----+--------------------+--------------------+----------+
|         features|species|label|       rawPrediction|         probability|prediction|
+-----------------+-------+-----+--------------------+--------------------+----------+
|[4.3,3.0,1.1,0.1]| setosa|  2.0|[-11.402228667810...|[0.18240945200010...|       2.0|
|[4.4,2.9,1.4,0.2]| setosa|  2.0|[-11.923306551010...|[0.22553075812617...|       2.0|
|[4.4,3.0,1.3,0.2]| setosa|  2.0|[-11.964929328937...|[0.20930932928504...|       2.0|
|[4.8,3.1,1.6,0.2]| setosa|  2.0|[-12.851935892465...|[0.22180002940740...|       2.0|
|[5.0,3.3,1.4,0.2]| setosa|  2.0|[-13.114876846919...|[0.18601727933390...|       2.0|
+-----------------+-------+-----+--------------------+--------------------+----------+
only showing top 5 rows



## Best model from cross-validation

In [22]:
# Report the smoothing value of the model selected by cross-validation.
# NOTE(review): `_java_obj` is a private handle to the underlying JVM model;
# prefer a public getter if the installed PySpark version provides one.
best_nb_model = crossvalidation_mode.bestModel
best_smoothing = best_nb_model._java_obj.getSmoothing()
print("The parameter smoothing has best value:", best_smoothing)

The parameter smoothing has best value: 8.0


## Prediction Accuracy
* f1
* weightedPrecision
* weightedRecall
* accuracy

### Prediction accuracy on training data

In [25]:
# Report each metric on the training predictions.
# The metric is passed as a per-call override instead of via `setMetricName`,
# because `evaluator` is the same object held by `crossvalidator`: mutating it
# here would silently change the model-selection metric if the fit is re-run.
for metric in ("f1", "weightedPrecision", "weightedRecall", "accuracy"):
    score = evaluator.evaluate(pred_train, {evaluator.metricName: metric})
    print("training data({}):".format(metric), score)


training data(f1): 0.968236789665361
training data(weightedPrecision): 0.9689250225835592
training data(weightedRecall): 0.9682539682539681
training data(accuracy): 0.9682539682539683


### Prediction accuracy on test data

In [26]:
# Report each metric on the test predictions.
# The printed labels say "test data": these scores come from `pred_test`.
# The metric is passed as a per-call override instead of via `setMetricName`,
# because `evaluator` is the same object held by `crossvalidator`: mutating it
# here would silently change the model-selection metric if the fit is re-run.
for metric in ("f1", "weightedPrecision", "weightedRecall", "accuracy"):
    score = evaluator.evaluate(pred_test, {evaluator.metricName: metric})
    print("test data({}):".format(metric), score)


training data(f1): 0.958119658119658
training data(weightedPrecision): 0.9635416666666667
training data(weightedRecall): 0.9583333333333334
training data(accuracy): 0.9583333333333334


## Confusion Matrix

In [27]:
## Training data
# Count how often each (label, prediction) pair occurs. `countByValue` tallies
# the rows directly, so no artificial index has to be attached just to create
# keys for `countByKey`.
train_conf_mat = pred_train.select("label","prediction")
train_conf_mat.rdd.countByValue()

defaultdict(int,
            {Row(label=2.0, prediction=2.0): 40,
             Row(label=0.0, prediction=0.0): 42,
             Row(label=1.0, prediction=1.0): 40,
             Row(label=0.0, prediction=1.0): 1,
             Row(label=1.0, prediction=0.0): 3})

In [28]:
## Testing data
# Count how often each (label, prediction) pair occurs. `countByValue` tallies
# the rows directly, so no artificial index has to be attached just to create
# keys for `countByKey`.
test_conf_mat = pred_test.select("label",'prediction')
test_conf_mat.rdd.countByValue()

defaultdict(int,
            {Row(label=2.0, prediction=2.0): 10,
             Row(label=0.0, prediction=0.0): 7,
             Row(label=1.0, prediction=1.0): 6,
             Row(label=1.0, prediction=0.0): 1})