# Naive Bayes Classifier
The Naive Bayes algorithm is used for classification tasks. 
The naive assumption is that all the features variables are independent. That is the reason it is called naive. The probability that the observation $x=[x_1,...x_N]$ belongs to the class $C_k$ is given by:

$$P(C_k,x)=\frac{P(C_k)P(x_1,C_k)....P(x_N,C_k)}{P(x)}$$

where $P(x)=\sum_j{P(x_1,C_j)....P(x_N,C_j)P(C_j)}$.

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.{MinMaxScaler,StandardScaler}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

## Load the dataset

In [2]:
val df = spark.read.
  format("csv").
  option("header", "true").
  option("inferschema", "true").
  option("delimiter",",").
  load("../Datasets/Breast_cancer_wisconsin.csv")

df = [clump_thickness: int, size_uniformity: int ... 8 more fields]


[clump_thickness: int, size_uniformity: int ... 8 more fields]

In [3]:
df.printSchema

root
 |-- clump_thickness: integer (nullable = true)
 |-- size_uniformity: integer (nullable = true)
 |-- shape_uniformity: integer (nullable = true)
 |-- marginal_adhesion: integer (nullable = true)
 |-- epithelial_size: integer (nullable = true)
 |-- bare_nucleoli: string (nullable = true)
 |-- bland_chromatin: integer (nullable = true)
 |-- normal_nucleoli: integer (nullable = true)
 |-- mitoses: integer (nullable = true)
 |-- class: integer (nullable = true)



## Cast columns to double

In [4]:
val Columns_cast = df.columns.map(name => df(name).cast("double"))
val df_d = df.select(Columns_cast : _*)

Columns_cast = Array(CAST(clump_thickness AS DOUBLE), CAST(size_uniformity AS DOUBLE), CAST(shape_uniformity AS DOUBLE), CAST(marginal_adhesion AS DOUBLE), CAST(epithelial_size AS DOUBLE), CAST(bare_nucleoli AS DOUBLE), CAST(bland_chromatin AS DOUBLE), CAST(normal_nucleoli AS DOUBLE), CAST(mitoses AS DOUBLE), CAST(class AS DOUBLE))
df_d = [clump_thickness: double, size_uniformity: double ... 8 more fields]


[clump_thickness: double, size_uniformity: double ... 8 more fields]

In [5]:
df_d.printSchema

root
 |-- clump_thickness: double (nullable = true)
 |-- size_uniformity: double (nullable = true)
 |-- shape_uniformity: double (nullable = true)
 |-- marginal_adhesion: double (nullable = true)
 |-- epithelial_size: double (nullable = true)
 |-- bare_nucleoli: double (nullable = true)
 |-- bland_chromatin: double (nullable = true)
 |-- normal_nucleoli: double (nullable = true)
 |-- mitoses: double (nullable = true)
 |-- class: double (nullable = true)



## Replace NaNs with the mean

In [6]:
val M = df_d.select(mean("bare_nucleoli")).as[Double].collect
M

M = Array(3.5446559297218156)


[3.5446559297218156]

In [7]:
val df_m = df_d.na.fill(M(0))

df_m = [clump_thickness: double, size_uniformity: double ... 8 more fields]


[clump_thickness: double, size_uniformity: double ... 8 more fields]

In [8]:
df_m.show(5)

+---------------+---------------+----------------+-----------------+---------------+-------------+---------------+---------------+-------+-----+
|clump_thickness|size_uniformity|shape_uniformity|marginal_adhesion|epithelial_size|bare_nucleoli|bland_chromatin|normal_nucleoli|mitoses|class|
+---------------+---------------+----------------+-----------------+---------------+-------------+---------------+---------------+-------+-----+
|            5.0|            1.0|             1.0|              1.0|            2.0|          1.0|            3.0|            1.0|    1.0|  0.0|
|            5.0|            4.0|             4.0|              5.0|            7.0|         10.0|            3.0|            2.0|    1.0|  0.0|
|            3.0|            1.0|             1.0|              1.0|            2.0|          2.0|            3.0|            1.0|    1.0|  0.0|
|            6.0|            8.0|             8.0|              1.0|            3.0|          4.0|            3.0|            7.0|

## Correlation

In [9]:
var L = List.empty[(String,Double)]

for(cn <- df_m.columns) L = L :+ (cn,df_m.stat.corr("class", cn))

val df_corr = L.toDF("colname","correlation")
df_corr.show()

+-----------------+-------------------+
|          colname|        correlation|
+-----------------+-------------------+
|  clump_thickness| 0.7160013621134151|
|  size_uniformity| 0.8179037353075587|
| shape_uniformity| 0.8189337394205247|
|marginal_adhesion| 0.6968002062857461|
|  epithelial_size| 0.6827845300938645|
|    bare_nucleoli| 0.8160499264435984|
|  bland_chromatin| 0.7566161463789044|
|  normal_nucleoli| 0.7122436220251227|
|          mitoses|0.42317025679524317|
|            class|                1.0|
+-----------------+-------------------+



L = List((clump_thickness,0.7160013621134151), (size_uniformity,0.8179037353075587), (shape_uniformity,0.8189337394205247), (marginal_adhesion,0.6968002062857461), (epithelial_size,0.6827845300938645), (bare_nucleoli,0.8160499264435984), (bland_chromatin,0.7566161463789044), (normal_nucleoli,0.7122436220251227), (mitoses,0.42317025679524317), (class,1.0))
df_corr = [colname: string, correlation: double]


[colname: string, correlation: double]

In [10]:
df_corr.select(col("colname"),abs(col("correlation"))).sort(col("abs(correlation)").desc).show()

+-----------------+-------------------+
|          colname|   abs(correlation)|
+-----------------+-------------------+
|            class|                1.0|
| shape_uniformity| 0.8189337394205247|
|  size_uniformity| 0.8179037353075587|
|    bare_nucleoli| 0.8160499264435984|
|  bland_chromatin| 0.7566161463789044|
|  clump_thickness| 0.7160013621134151|
|  normal_nucleoli| 0.7122436220251227|
|marginal_adhesion| 0.6968002062857461|
|  epithelial_size| 0.6827845300938645|
|          mitoses|0.42317025679524317|
+-----------------+-------------------+



## Apply Vector Assembler

In [11]:
val features = df_m.columns.slice(0,df_m.columns.length -1)

features = Array(clump_thickness, size_uniformity, shape_uniformity, marginal_adhesion, epithelial_size, bare_nucleoli, bland_chromatin, normal_nucleoli, mitoses)


[clump_thickness, size_uniformity, shape_uniformity, marginal_adhesion, epithelial_size, bare_nucleoli, bland_chromatin, normal_nucleoli, mitoses]

In [12]:
val assembler = new VectorAssembler().
  setInputCols(features).//setInputCols(Array("_c1", "_c2")).
  setOutputCol("features")

assembler = vecAssembler_81b4e0a368da


vecAssembler_81b4e0a368da

In [13]:
val df_v = assembler.transform(df_m).
    select(col("features"), col("class").as("label"))

df_v.show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[5.0,1.0,1.0,1.0,...|  0.0|
|[5.0,4.0,4.0,5.0,...|  0.0|
|[3.0,1.0,1.0,1.0,...|  0.0|
|[6.0,8.0,8.0,1.0,...|  0.0|
|[4.0,1.0,1.0,3.0,...|  0.0|
|[8.0,10.0,10.0,8....|  1.0|
|[1.0,1.0,1.0,1.0,...|  0.0|
|[2.0,1.0,2.0,1.0,...|  0.0|
|[2.0,1.0,1.0,1.0,...|  0.0|
|[4.0,2.0,1.0,1.0,...|  0.0|
+--------------------+-----+
only showing top 10 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [14]:
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3))

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Scale the features

In [15]:
val scaler = new MinMaxScaler().//StandardScaler().
  setInputCol("features").
  setOutputCol("scaledFeatures")

scaler = minMaxScal_891fc14afbe4


minMaxScal_891fc14afbe4

In [16]:
val s = scaler.fit(trainingData)

s = minMaxScal_891fc14afbe4


minMaxScal_891fc14afbe4

In [17]:
val trainingData_s = s.transform(trainingData)
val testData_s = s.transform(testData)

trainingData_s = [features: vector, label: double ... 1 more field]
testData_s = [features: vector, label: double ... 1 more field]


[features: vector, label: double ... 1 more field]

## Train the model

In [18]:
val nb = new NaiveBayes().
    setFeaturesCol("scaledFeatures")
    //setModelType("bernoulli")//Supported options: "multinomial" and "bernoulli". Default is "multinomial"

nb = nb_f97d9a90ac6e


nb_f97d9a90ac6e

In [19]:
val model = nb.fit(trainingData_s)

model = NaiveBayesModel (uid=nb_f97d9a90ac6e) with 2 classes


NaiveBayesModel (uid=nb_f97d9a90ac6e) with 2 classes

## Make predictions

In [20]:
val predictions = model.transform(testData_s)
predictions.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|label|      scaledFeatures|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-0.4054651081081...|[0.66666666666666...|       0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-0.4054651081081...|[0.66666666666666...|       0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-1.0638558113443...|[0.63099210446949...|       0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-0.6005858084712...|[0.67658035802037...|       0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-0.7957065088342...|[0.68633792753489...|       0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-0.7957065088342...|[0.68633792753489...|       0.0|
|[1.0,1.0,1.0,1.0,...|  0.0|[0.0,0.0,0.0,0.0,...|[-0.5985089048032...|[0.

predictions = [features: vector, label: double ... 4 more fields]


[features: vector, label: double ... 4 more fields]

In [21]:
predictions.stat.crosstab("label", "prediction").sort("label_prediction").show()

+----------------+---+---+
|label_prediction|0.0|1.0|
+----------------+---+---+
|             0.0|132|  1|
|             1.0| 30| 49|
+----------------+---+---+



## Evaluate the model

In [22]:
val evaluator = new MulticlassClassificationEvaluator().
    setLabelCol("label").
    setPredictionCol("prediction").
    setMetricName("f1")

evaluator = mcEval_18aa8e83c4b1


mcEval_18aa8e83c4b1

In [23]:
val f1= evaluator.evaluate(predictions)

f1 = 0.8445246824977751


0.8445246824977751