In [24]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.{MinMaxScaler,StandardScaler}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

lastException: Throwable = null


## Load the dataset

In [2]:
val df = spark.read.
  format("csv").
  option("header", "true").
  option("inferschema", "true").
  option("delimiter",";").
  load("../Datasets/Winequality_red.csv")

df = [fixed acidity: double, volatile acidity: double ... 10 more fields]


[fixed acidity: double, volatile acidity: double ... 10 more fields]

## Explore the dataset

In [3]:
df.show(10)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [4]:
df.columns

[fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]

In [5]:
df.columns.length

12

In [6]:
df.count

1599

In [7]:
df.printSchema

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [8]:
df.select("quality").describe().show()

+-------+------------------+
|summary|           quality|
+-------+------------------+
|  count|              1599|
|   mean|5.6360225140712945|
| stddev|0.8075694397347051|
|    min|                 3|
|    max|                 8|
+-------+------------------+



In [9]:
df.select("quality").distinct.sort("quality").show()

+-------+                                                                       
|quality|
+-------+
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
+-------+



In [10]:
df.groupBy("quality").count.sort("quality").show()

+-------+-----+
|quality|count|
+-------+-----+
|      3|   10|
|      4|   53|
|      5|  681|
|      6|  638|
|      7|  199|
|      8|   18|
+-------+-----+



## Correlation

In [11]:
var L = List.empty[(String,Double)]

for(cn <- df.columns) L = L :+ (cn,df.stat.corr("quality", cn))

val df_corr = L.toDF("colname","correlation")
df_corr.show()

+--------------------+--------------------+
|             colname|         correlation|
+--------------------+--------------------+
|       fixed acidity| 0.12405164911322263|
|    volatile acidity| -0.3905577802640061|
|         citric acid| 0.22637251431804048|
|      residual sugar|0.013731637340065798|
|           chlorides|-0.12890655993005293|
| free sulfur dioxide|-0.05065605724427597|
|total sulfur dioxide|-0.18510028892653774|
|             density|-0.17491922778336474|
|                  pH| -0.0577313912053826|
|           sulphates| 0.25139707906925995|
|             alcohol|  0.4761663240011364|
|             quality|                 1.0|
+--------------------+--------------------+



L = List((fixed acidity,0.12405164911322263), (volatile acidity,-0.3905577802640061), (citric acid,0.22637251431804048), (residual sugar,0.013731637340065798), (chlorides,-0.12890655993005293), (free sulfur dioxide,-0.05065605724427597), (total sulfur dioxide,-0.18510028892653774), (density,-0.17491922778336474), (pH,-0.0577313912053826), (sulphates,0.25139707906925995), (alcohol,0.4761663240011364), (quality,1.0))
df_corr = [colname: string, correlation: double]


[colname: string, correlation: double]

In [12]:
df_corr.
    select(col("colname"),abs(col("correlation"))).
    sort(col("abs(correlation)").desc).
    show()

+--------------------+--------------------+
|             colname|    abs(correlation)|
+--------------------+--------------------+
|             quality|                 1.0|
|             alcohol|  0.4761663240011364|
|    volatile acidity|  0.3905577802640061|
|           sulphates| 0.25139707906925995|
|         citric acid| 0.22637251431804048|
|total sulfur dioxide| 0.18510028892653774|
|             density| 0.17491922778336474|
|           chlorides| 0.12890655993005293|
|       fixed acidity| 0.12405164911322263|
|                  pH|  0.0577313912053826|
| free sulfur dioxide| 0.05065605724427597|
|      residual sugar|0.013731637340065798|
+--------------------+--------------------+



## Apply Vector Assembler

In [13]:
val features = df.columns.slice(0,df.columns.length -1)

features = Array(fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol)


[fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol]

In [14]:
val assembler = new VectorAssembler().
  setInputCols(features).//setInputCols(Array("_c1", "_c2")).
  setOutputCol("features")

assembler = vecAssembler_2e2f9c69b92d


vecAssembler_2e2f9c69b92d

In [15]:
val df_v = assembler.transform(df).
    select(col("features"), col("quality").cast("double").as("label"))

df_v.show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.4,0.7,0.0,1.9,...|  5.0|
|[7.8,0.88,0.0,2.6...|  5.0|
|[7.8,0.76,0.04,2....|  5.0|
|[11.2,0.28,0.56,1...|  6.0|
|[7.4,0.7,0.0,1.9,...|  5.0|
|[7.4,0.66,0.0,1.8...|  5.0|
|[7.9,0.6,0.06,1.6...|  5.0|
|[7.3,0.65,0.0,1.2...|  7.0|
|[7.8,0.58,0.02,2....|  7.0|
|[7.5,0.5,0.36,6.1...|  5.0|
+--------------------+-----+
only showing top 10 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [16]:
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3))

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Scale the features

In [17]:
val scaler = new StandardScaler().//MinMaxScaler().
  setInputCol("features").
  setOutputCol("scaledFeatures")

scaler = stdScal_d4c7de9d4b91


stdScal_d4c7de9d4b91

In [18]:
val s = scaler.fit(trainingData)

s = stdScal_d4c7de9d4b91


stdScal_d4c7de9d4b91

In [19]:
val trainingData_s = s.transform(trainingData)
val testData_s = s.transform(testData)

trainingData_s = [features: vector, label: double ... 1 more field]
testData_s = [features: vector, label: double ... 1 more field]


[features: vector, label: double ... 1 more field]

## Train the model

In [20]:
val nb = new NaiveBayes().
    setFeaturesCol("scaledFeatures")
    //setModelType("bernoulli")//Supported options: "multinomial" and "bernoulli". Default is "multinomial"

nb = nb_662fda7e1c46


nb_662fda7e1c46

In [26]:
val model = nb.fit(trainingData_s)

model = NaiveBayesModel (uid=nb_662fda7e1c46) with 6 classes


lastException: Throwable = null


NaiveBayesModel (uid=nb_662fda7e1c46) with 6 classes

## Make predictions

In [27]:
val predictions = model.transform(testData_s)
predictions.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|label|      scaledFeatures|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|[4.7,0.6,0.17,2.3...|  6.0|[2.71381398986969...|[-285.72606892756...|[0.00196126086374...|       2.0|
|[5.1,0.42,0.0,1.8...|  7.0|[2.94477688262456...|[-272.76430929131...|[0.00147485915066...|       2.0|
|[5.2,0.34,0.0,1.8...|  6.0|[3.00251760581327...|[-275.24939464936...|[0.00131457389691...|       3.0|
|[5.2,0.48,0.04,1....|  7.0|[3.00251760581327...|[-268.86130393288...|[0.00142202304493...|       2.0|
|[5.2,0.645,0.0,2....|  6.0|[3.00251760581327...|[-265.33217028752...|[0.00970683400433...|       2.0|
|[5.3,0.47,0.11,2....|  7.0|[3.06025832900199...|[-280.67213946002...|[0.00131376999555...|       2.0|
|[5.3,0.47,0.11,2....|  7.0|[3.06025832900199...|[-280.80061449914...|[0.

predictions = [features: vector, label: double ... 4 more fields]


[features: vector, label: double ... 4 more fields]

In [28]:
predictions.stat.crosstab("label", "prediction").sort("label_prediction").show()

+----------------+---+---+
|label_prediction|2.0|3.0|
+----------------+---+---+
|             3.0|  1|  1|
|             4.0| 10|  7|
|             5.0|141| 57|
|             6.0| 80|108|
|             7.0| 12| 52|
|             8.0|  0|  5|
+----------------+---+---+



## Evaluate the model

In [29]:
val evaluator = new MulticlassClassificationEvaluator().
    setLabelCol("label").
    setPredictionCol("prediction").
    setMetricName("accuracy")

evaluator = mcEval_6a344a4fac5c


mcEval_6a344a4fac5c

In [30]:
val accuracy = evaluator.evaluate(predictions)

accuracy = 0.002109704641350211


0.002109704641350211