In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

## Load the dataset

In [2]:
// Read the tab-separated, header-less SMS file and name its two columns.
// NOTE(review): the reader's default quote handling is left enabled; if any message
// contains an unbalanced double quote, adjacent rows could be merged -- confirm the
// row count against the raw file.
val df = spark.read.
  format("csv").
  options(Map(
    "header" -> "false",
    "inferschema" -> "true",
    "delimiter" -> "\t"
  )).
  load("../Datasets/Sms_spam.csv").
  toDF("label_s", "message")

df = [label_s: string, message: string]


[label_s: string, message: string]

## Explore the dataset

In [3]:
// Peek at the first few rows (long messages are truncated in the printed table).
df.show(numRows = 5)

+-------+--------------------+
|label_s|             message|
+-------+--------------------+
|    ham|Go until jurong p...|
|    ham|Ok lar... Joking ...|
|   spam|Free entry in 2 a...|
|    ham|U dun say so earl...|
|    ham|Nah I don't think...|
+-------+--------------------+
only showing top 5 rows



In [4]:
// Print the column names and types; parentheses mark the call as side-effecting.
df.printSchema()

root
 |-- label_s: string (nullable = true)
 |-- message: string (nullable = true)



In [5]:
// Number of columns in the DataFrame.
df.columns.size

2

In [6]:
// Total number of rows; this triggers a Spark job.
df.count()

5572

In [7]:
// Class balance: number of messages per label.
df.groupBy("label_s").
  count().
  show()

+-------+-----+
|label_s|count|
+-------+-----+
|    ham| 4825|
|   spam|  747|
+-------+-----+



## Split into train and test sets

In [8]:
// Hold out roughly 30% of the messages for testing.
// A fixed seed keeps the split -- and every metric computed from it -- repeatable
// across runs; without one, randomSplit picks a new random seed on each call.
// Outputs recorded from earlier, unseeded runs will not match exactly.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [label_s: string, message: string]
testData = [label_s: string, message: string]


[label_s: string, message: string]

## Vectorize the messages

In [9]:
// Tokenizes the raw message text into the "raw_words" column.
val tokenizer = new Tokenizer().
  setOutputCol("raw_words").
  setInputCol("message")

tokenizer = tok_afc40bacd24b


tok_afc40bacd24b

In [10]:
// Filters stop words out of "raw_words", writing the result to "filtered_words".
val remover = new StopWordsRemover().
  setOutputCol("filtered_words").
  setInputCol("raw_words")

remover = stopWords_6857421eebef


stopWords_6857421eebef

In [11]:
// Turns each filtered word list into a term-count vector ("rawFeatures").
val cv = new CountVectorizer().
  setOutputCol("rawFeatures").
  setInputCol("filtered_words")

cv = cntVec_466d36a7bc02


cntVec_466d36a7bc02

In [12]:
// Re-weights the term counts by inverse document frequency, producing "features".
val idf = new IDF().
  setOutputCol("features").
  setInputCol("rawFeatures")

idf = idf_25ac84dc3289


idf_25ac84dc3289

In [13]:
// Encodes the string label column "label_s" as a numeric "label" column.
val indexer = new StringIndexer().
  setOutputCol("label").
  setInputCol("label_s")

indexer = strIdx_405f2ef291de


strIdx_405f2ef291de

In [14]:
// Feature-engineering pipeline: tokenize -> drop stop words -> term counts -> IDF,
// followed by label indexing. No classifier stage is part of this pipeline.
val pipeline = new Pipeline().setStages(
  Array(tokenizer, remover, cv, idf, indexer)
)

pipeline = pipeline_ed09cd86a6c8


pipeline_ed09cd86a6c8

In [15]:
// Fit the pipeline stages on the training split only; the fitted pipeline T is
// what later cells use to transform data.
val T = pipeline.fit(trainingData)

T = pipeline_ed09cd86a6c8


pipeline_ed09cd86a6c8

In [16]:
// Vectorize the training split with the fitted pipeline and keep only the
// two columns selected below.
val trainingData_v = T.
  transform(trainingData).
  select(col("features"), col("label"))
trainingData_v.show(numRows = 5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(10809,[3,7,3611,...|  0.0|
|(10809,[3,7,42,19...|  0.0|
|(10809,[3,92,137,...|  0.0|
|(10809,[3,10,186,...|  0.0|
|(10809,[3,7,241,2...|  0.0|
+--------------------+-----+
only showing top 5 rows



trainingData_v = [features: vector, label: double]


[features: vector, label: double]

In [17]:
// Vectorize the test split with the same fitted pipeline T (nothing is re-fitted here).
val testData_v = T.
  transform(testData).
  select(col("features"), col("label"))

testData_v = [features: vector, label: double]


[features: vector, label: double]

## Train the model

In [18]:
// Naive Bayes estimator created with its default parameters.
val nb = new NaiveBayes()
    // The two setters below are disabled and kept for reference only:
    //   setFeaturesCol("scaledFeatures") -- NOTE(review): no column of that name is
    //     produced by the cells visible in this notebook; confirm before enabling.
    //   setModelType("bernoulli") -- supported options: "multinomial" and "bernoulli";
    //     default is "multinomial".

nb = nb_6d735760042f


nb_6d735760042f

In [19]:
// Train the Naive Bayes classifier on the vectorized training data.
val model = nb.fit(trainingData_v)

model = NaiveBayesModel (uid=nb_6d735760042f) with 2 classes


NaiveBayesModel (uid=nb_6d735760042f) with 2 classes

## Make predictions on the test set

In [20]:
// Score the vectorized test data; transform appends the rawPrediction,
// probability and prediction columns to the input columns.
val predictions = model.transform(testData_v)
predictions.show(numRows = 5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10809,[3,906,318...|  0.0|[-243.10993809934...|[1.0,5.2867135173...|       0.0|
|(10809,[3,13,100,...|  0.0|[-455.47924859972...|[1.0,3.8665499149...|       0.0|
|(10809,[3,98,114,...|  0.0|[-534.28737477799...|[1.0,8.8167830578...|       0.0|
|(10809,[0,3,14,19...|  0.0|[-786.15480064080...|[1.0,9.1941472834...|       0.0|
|(10809,[0,3,86,18...|  0.0|[-174.60474985062...|[0.99999998669087...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



predictions = [features: vector, label: double ... 3 more fields]


[features: vector, label: double ... 3 more fields]

In [21]:
// Confusion matrix: rows are the true label, columns are the predicted label.
predictions.stat.
  crosstab("label", "prediction").
  orderBy("label_prediction").
  show()

+----------------+----+---+
|label_prediction| 0.0|1.0|
+----------------+----+---+
|             0.0|1376| 27|
|             1.0|   9|209|
+----------------+----+---+



## Evaluate the model

In [22]:
// Evaluator that compares the "prediction" column against "label" using the F1 metric.
val evaluator = new MulticlassClassificationEvaluator().
  setMetricName("f1").
  setPredictionCol("prediction").
  setLabelCol("label")

evaluator = mcEval_f28a94fdbf2c


mcEval_f28a94fdbf2c

In [23]:
// Compute the evaluator's configured metric over the predictions DataFrame.
val f1 = evaluator.evaluate(predictions)

f1 = 0.9781600518501365


0.9781600518501365