In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

## Load the dataset

In [2]:
// Read the tab-delimited SMS file (no header row) and name its two columns.
val df = {
  val csvOptions = Map(
    "header" -> "false",
    "inferschema" -> "true",
    "delimiter" -> "\t"
  )
  spark.read.format("csv").options(csvOptions).load("../Datasets/Sms_spam.csv").toDF("label_s", "message")
}

df = [label_s: string, message: string]


[label_s: string, message: string]

## Explore the dataset

In [3]:
// Preview the first five rows of the raw data.
df.show(numRows = 5)

+-------+--------------------+
|label_s|             message|
+-------+--------------------+
|    ham|Go until jurong p...|
|    ham|Ok lar... Joking ...|
|   spam|Free entry in 2 a...|
|    ham|U dun say so earl...|
|    ham|Nah I don't think...|
+-------+--------------------+
only showing top 5 rows



In [4]:
// Print the column names and types. `printSchema()` is side-effecting and is
// declared with an empty parameter list, so it is called with explicit parens.
df.printSchema()

root
 |-- label_s: string (nullable = true)
 |-- message: string (nullable = true)



In [5]:
// Number of columns in the DataFrame.
df.columns.size

2

In [6]:
// Number of rows. `count()` is declared with an empty parameter list and runs a
// Spark job, so it is called with explicit parens.
df.count()

5572

In [7]:
// Class balance: number of messages per label value.
// `count()` is declared with an empty parameter list, so call it with parens.
df.groupBy(col("label_s")).count().show()

+-------+-----+
|label_s|count|
+-------+-----+
|    ham| 4825|
|   spam|  747|
+-------+-----+



## Split into train and test sets

In [8]:
// 70/30 train/test split. A fixed seed keeps the split, and therefore every
// metric computed below, repeatable across runs on the same input.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [label_s: string, message: string]
testData = [label_s: string, message: string]


[label_s: string, message: string]

## Vectorize the messages

In [9]:
// Stage 1: tokenize the `message` column into `raw_words`.
val tokenizer = new Tokenizer().setInputCol("message").setOutputCol("raw_words")

tokenizer = tok_7ea501010ef3


tok_7ea501010ef3

In [10]:
// Stage 2: drop stop words from `raw_words`, producing `filtered_words`.
val remover = new StopWordsRemover().setInputCol("raw_words").setOutputCol("filtered_words")

remover = stopWords_c1d34395ab00


stopWords_c1d34395ab00

In [11]:
// Stage 3: turn `filtered_words` into term-count vectors in `rawFeatures`.
val cv = new CountVectorizer().setInputCol("filtered_words").setOutputCol("rawFeatures")

cv = cntVec_2b46de1a7351


cntVec_2b46de1a7351

In [12]:
// Stage 4: apply IDF weighting to `rawFeatures`, producing the final `features` column.
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")

idf = idf_ea201f77685b


idf_ea201f77685b

In [13]:
// Stage 5: encode the string label `label_s` as a numeric `label` column.
val indexer = new StringIndexer().setInputCol("label_s").setOutputCol("label")

indexer = strIdx_6990de57cdae


strIdx_6990de57cdae

In [14]:
// Chain the feature-engineering stages in the order they consume each other's output.
val pipeline = new Pipeline().setStages(
  Array(tokenizer, remover, cv, idf, indexer)
)

pipeline = pipeline_87345f9d0f7e


pipeline_87345f9d0f7e

In [15]:
// Fit the feature pipeline on the training split only; the fitted model is
// reused below to transform both the training and the test data.
val T: org.apache.spark.ml.PipelineModel = pipeline.fit(trainingData)

T = pipeline_87345f9d0f7e


pipeline_87345f9d0f7e

In [16]:
// Vectorize the training split and keep only the columns the classifier needs.
val trainingData_v = T.
  transform(trainingData).
  select(col("features"), col("label"))
trainingData_v.show(numRows = 5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(10687,[3,7,4292,...|  0.0|
|(10687,[3,7,48,23...|  0.0|
|(10687,[3,9,163,2...|  0.0|
|(10687,[3,13,83,8...|  0.0|
|(10687,[3,94,154,...|  0.0|
+--------------------+-----+
only showing top 5 rows



trainingData_v = [features: vector, label: double]


[features: vector, label: double]

In [17]:
// Vectorize the test split with the pipeline that was fitted on the training data.
val testData_v = T.
  transform(testData).
  select(col("features"), col("label"))

testData_v = [features: vector, label: double]


[features: vector, label: double]

## Train the model

In [18]:
// Fit a logistic regression classifier (at most 100 iterations) on the vectorized training data.
val model = {
  val estimator = new LogisticRegression().setMaxIter(100)
  estimator.fit(trainingData_v)
}

model = logreg_c3a868b7daa2


logreg_c3a868b7daa2

## Make predictions

In [19]:
// Score the held-out test data and preview the first five predictions.
val predictions: org.apache.spark.sql.DataFrame = model.transform(testData_v)
predictions.show(numRows = 5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10687,[3,77,98,2...|  0.0|[20.3066051243375...|[0.99999999848311...|       0.0|
|(10687,[3,882,198...|  0.0|[6.68864368953824...|[0.99875657782907...|       0.0|
|(10687,[0,3,14,20...|  0.0|[45.2817848404292...|[1.0,2.1595870426...|       0.0|
|(10687,[0,3,82,21...|  0.0|[18.4893439718609...|[0.99999999066358...|       0.0|
|(10687,[0,1,5,9,5...|  0.0|[41.7406510541376...|[1.0,7.4518752546...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



predictions = [features: vector, label: double ... 3 more fields]


[features: vector, label: double ... 3 more fields]

In [20]:
// Confusion matrix: one row per true `label`, one column per `prediction` value.
predictions.stat.
  crosstab("label", "prediction").
  show()

+----------------+----+---+
|label_prediction| 0.0|1.0|
+----------------+----+---+
|             1.0|  44|179|
|             0.0|1508|  1|
+----------------+----+---+



## Evaluate the model

In [21]:
// Evaluator comparing `prediction` against `label` using the F1 metric.
// To report accuracy instead, pass "accuracy" to setMetricName.
val evaluator = {
  val ev = new MulticlassClassificationEvaluator()
  ev.setLabelCol("label")
  ev.setPredictionCol("prediction")
  ev.setMetricName("f1")
}

evaluator = mcEval_4e5629b58a81


mcEval_4e5629b58a81

In [22]:
// Compute the evaluator's configured metric over the test-set predictions.
val f1: Double = evaluator.evaluate(predictions)

f1 = 0.9728148549433178


0.9728148549433178