### Naive Bayes
###### Naive Bayes classifiers are a family of simple probabilistic, multiclass classifiers based on applying Bayes' theorem with strong (naive) independence assumptions between every pair of features.

###### Naive Bayes can be trained very efficiently. With a single pass over the training data, it computes the conditional probability distribution of each feature given the label. For prediction, it applies Bayes' theorem to compute the conditional probability distribution of each label given an observation.

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Start (or reuse) a Spark session registered under the app name "Naive Bayes".
spark = SparkSession.builder.appName("Naive Bayes").getOrCreate()

# Load the diabetes CSV: the first row supplies the column names and the
# column types are inferred from the values.
# The path uses a forward slash because "\d" is an invalid escape sequence in
# a normal string literal (a warning on current Python versions) and a
# backslash separator only resolves on Windows.
data = spark.read.csv("Data/diabetes.csv", header=True, inferSchema=True)

# Print the first rows as a quick sanity check of the load.
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [5]:
# Inspect the column names of the loaded DataFrame
data.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [6]:
# Feature assembler: combine the eight predictor columns into a single
# vector column named "features" (the "Outcome" label column is left out).
from pyspark.ml.feature import VectorAssembler

_predictor_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
featureassembler = VectorAssembler(
    inputCols=_predictor_columns,
    outputCol="features",
)

# Append the assembled vector column to every row and preview the result.
output = featureassembler.transform(data)
output.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|[0.0,137.0,40.0,3...|
|          5|    116|           

In [7]:
# Split the assembled dataset into training and test sets using weights
# 0.6 / 0.4; the fixed seed (12345) keeps the split repeatable between runs.
train, test = output.randomSplit(weights=[0.6, 0.4], seed=12345)

In [8]:
# Create the trainer: a multinomial Naive Bayes estimator that reads the
# "features" vector column, uses "Outcome" as the label, and applies
# additive smoothing of 1.0.
# NOTE(review): the multinomial model treats feature values as counts, while
# several inputs here (e.g. BMI, DiabetesPedigreeFunction) are continuous;
# a different modelType may suit this data better - confirm before reuse.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="Outcome",
    smoothing=1.0,
    modelType="multinomial",
)

In [9]:
# Train the model: fit the Naive Bayes estimator on the training split
model = nb.fit(train)

In [11]:
# Run the trained model over the test split; the result carries a
# "prediction" column alongside the original columns.
predictions = model.transform(test)

# Display example rows: predicted class, feature vector, and true label.
(
    predictions
    .select("prediction", "features", "Outcome")
    .show()
)

+----------+--------------------+-------+
|prediction|            features|Outcome|
+----------+--------------------+-------+
|       0.0|[0.0,84.0,64.0,22...|      0|
|       1.0|[0.0,84.0,82.0,31...|      0|
|       0.0|(8,[1,6,7],[94.0,...|      0|
|       1.0|[0.0,95.0,64.0,39...|      0|
|       0.0|[0.0,95.0,85.0,25...|      1|
|       0.0|[0.0,100.0,70.0,2...|      0|
|       0.0|[0.0,101.0,64.0,1...|      0|
|       0.0|[0.0,101.0,65.0,2...|      0|
|       0.0|[0.0,102.0,75.0,2...|      0|
|       0.0|[0.0,104.0,76.0,0...|      0|
|       0.0|[0.0,105.0,68.0,2...|      0|
|       1.0|[0.0,106.0,70.0,3...|      0|
|       0.0|[0.0,107.0,60.0,2...|      0|
|       0.0|[0.0,111.0,65.0,0...|      0|
|       0.0|[0.0,113.0,80.0,1...|      0|
|       1.0|[0.0,117.0,66.0,3...|      0|
|       1.0|[0.0,118.0,64.0,2...|      0|
|       0.0|[0.0,125.0,68.0,0...|      0|
|       1.0|[0.0,127.0,80.0,3...|      0|
|       0.0|[0.0,131.0,66.0,4...|      1|
+----------+--------------------+-

In [13]:
# Show the raw input columns next to the true label and the predicted class
# ('Age' is not part of this view).
(
    predictions
    .select(
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Outcome',
        'prediction',
    )
    .show()
)

+-----------+-------+-------------+-------------+-------+----+------------------------+-------+----------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Outcome|prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+-------+----------+
|          0|     84|           64|           22|     66|35.8|                   0.545|      0|       0.0|
|          0|     84|           82|           31|    125|38.2|                   0.233|      0|       1.0|
|          0|     94|            0|            0|      0| 0.0|                   0.256|      0|       0.0|
|          0|     95|           64|           39|    105|44.6|                   0.366|      0|       1.0|
|          0|     95|           85|           25|     36|37.4|                   0.247|      1|       0.0|
|          0|    100|           70|           26|     50|30.8|                   0.597|      0|       0.0|
|          0|    101|           64|  

In [14]:
# Compute accuracy on the test set: compare the "prediction" column against
# the "Outcome" label column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Outcome",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy = {accuracy}")

Test set accuracy = 0.5896551724137931
