In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

## Load the dataset

In [2]:
// Read the tab-separated SMS file. It has no header row, so the two columns
// are named explicitly: the string label ("label_s") and the message text.
val df = (
  spark.read
    .format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .option("delimiter", "\t")
    .load("../Datasets/Sms_spam.csv")
    .toDF("label_s", "message")
)

df = [label_s: string, message: string]


[label_s: string, message: string]

## Explore the dataset

In [3]:
// Preview the first five rows.
df.show(numRows = 5)

+-------+--------------------+
|label_s|             message|
+-------+--------------------+
|    ham|Go until jurong p...|
|    ham|Ok lar... Joking ...|
|   spam|Free entry in 2 a...|
|    ham|U dun say so earl...|
|    ham|Nah I don't think...|
+-------+--------------------+
only showing top 5 rows



In [4]:
// Print column names, types and nullability.
df.printSchema()

root
 |-- label_s: string (nullable = true)
 |-- message: string (nullable = true)



In [5]:
// Number of columns in the dataset.
df.columns.size

2

In [6]:
// Number of rows in the dataset.
df.count()

5572

In [7]:
// Class distribution: how many rows carry each label.
df.groupBy("label_s").count().show()

+-------+-----+                                                                 
|label_s|count|
+-------+-----+
|    ham| 4825|
|   spam|  747|
+-------+-----+



## Split into train and test sets

In [8]:
// Hold out roughly 30% of the rows for testing. The fixed seed makes the
// split, and therefore every downstream metric, reproducible across runs;
// without it each execution of the notebook trains and tests on different rows.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [label_s: string, message: string]
testData = [label_s: string, message: string]


[label_s: string, message: string]

## Vectorize the messages

In [9]:
// Tokenizer stage: reads the "message" column and writes "raw_words".
val tokenizer = (
  new Tokenizer()
    .setOutputCol("raw_words")
    .setInputCol("message")
)

tokenizer = tok_4c64ec021171


tok_4c64ec021171

In [10]:
// Stop-word removal stage: reads "raw_words" and writes "filtered_words".
val remover = (
  new StopWordsRemover()
    .setOutputCol("filtered_words")
    .setInputCol("raw_words")
)

remover = stopWords_74027186f116


stopWords_74027186f116

In [11]:
// Count-vectorizer stage: reads "filtered_words" and writes "rawFeatures".
val cv = (
  new CountVectorizer()
    .setOutputCol("rawFeatures")
    .setInputCol("filtered_words")
)

cv = cntVec_7adcee02e789


cntVec_7adcee02e789

In [12]:
// IDF stage: reads "rawFeatures" and writes the final "features" vector column.
val idf = (
  new IDF()
    .setOutputCol("features")
    .setInputCol("rawFeatures")
)

idf = idf_bff0a2d2aad4


idf_bff0a2d2aad4

In [13]:
// Label-indexing stage: reads the string column "label_s" and writes "label".
val indexer = (
  new StringIndexer()
    .setOutputCol("label")
    .setInputCol("label_s")
)

indexer = strIdx_dde28190f7ce


strIdx_dde28190f7ce

In [14]:
// Chain the text-processing stages and the label indexer, in execution order.
val pipeline = new Pipeline().setStages(
  Array(tokenizer, remover, cv, idf, indexer)
)

pipeline = pipeline_7a1bf8f1315c


pipeline_7a1bf8f1315c

In [15]:
// Fit every pipeline stage on the training split only.
val T: org.apache.spark.ml.PipelineModel = pipeline.fit(trainingData)

T = pipeline_7a1bf8f1315c


pipeline_7a1bf8f1315c

In [16]:
// Apply the fitted pipeline to the training split, keeping only the two
// columns the classifier needs, then preview the result.
val trainingData_v = T.transform(trainingData).select(col("features"), col("label"))
trainingData_v.show(numRows = 5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(10826,[3,8,3603,...|  0.0|
|(10826,[3,8,44,22...|  0.0|
|(10826,[3,104,135...|  0.0|
|(10826,[3,14,79,8...|  0.0|
|(10826,[0,3,108,2...|  0.0|
+--------------------+-----+
only showing top 5 rows



trainingData_v = [features: vector, label: double]


[features: vector, label: double]

In [17]:
// Apply the pipeline fitted on the training split to the test split.
val testData_v = T.transform(testData).select(col("features"), col("label"))

testData_v = [features: vector, label: double]


[features: vector, label: double]

## Hyperparameter optimization with cross-validation

In [18]:
// Logistic-regression estimator, capped at 100 iterations. regParam and
// elasticNetParam are supplied later by the parameter grid.
val lr = (
  new LogisticRegression()
    .setMaxIter(100)
)

lr = logreg_17a7a3a16604


logreg_17a7a3a16604

In [19]:
// Evaluator that scores predictions with the "f1" metric. It is used both for
// model selection during cross-validation and for the final test-set score.
val evaluator = (
  new MulticlassClassificationEvaluator()
    .setMetricName("f1")
)

evaluator = mcEval_6fafbce3a836


mcEval_6fafbce3a836

In [20]:
// Search grid: 5 regularisation strengths x 5 elastic-net mixing values,
// giving 25 candidate parameter combinations.
val paramGrid = (
  new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.01, 0.1, 1.0, 10.0, 100.0))
    .addGrid(lr.elasticNetParam, Array(0.2, 0.4, 0.6, 0.8, 1.0))
    .build()
)

paramGrid = 


Array({
	logreg_17a7a3a16604-elasticNetParam: 0.2,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.4,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.6,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.8,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 1.0,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.2,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.4,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.6,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.8,
	logreg_17a7a3a16604-r...


[{
	logreg_17a7a3a16604-elasticNetParam: 0.2,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.4,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.6,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.8,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 1.0,
	logreg_17a7a3a16604-regParam: 0.01
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.2,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.4,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.6,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.8,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 1.0,
	logreg_17a7a3a16604-regParam: 0.1
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.2,
	logreg_17a7a3a16604-regParam: 1.0
}, {
	logreg_17a7a3a16604-elasticNetParam: 0.4,
	logreg_17a7a3a16604-regParam: 1.0
}

In [21]:
// 5-fold cross-validator: fits `lr` for every entry of `paramGrid` and
// compares the candidates using `evaluator`.
val CV = (
  new CrossValidator()
    .setNumFolds(5)
    .setEstimatorParamMaps(paramGrid)
    .setEvaluator(evaluator)
    .setEstimator(lr)
)

CV = cv_6c35447a1d4e


cv_6c35447a1d4e

In [22]:
// Run the cross-validated grid search on the vectorized training data.
val model: org.apache.spark.ml.tuning.CrossValidatorModel = CV.fit(trainingData_v)

model = cv_6c35447a1d4e


cv_6c35447a1d4e

In [23]:
// Persist the fitted cross-validator model, replacing any previous save at this path.
model.write.overwrite().save("Model_Parameters")

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


Best model selected by cross-validation: regParam = 0.01, elasticNetParam = 0.4

## Make predictions using the best model

In [24]:
// Score the held-out test data with the cross-validated model and preview
// the appended rawPrediction / probability / prediction columns.
val predictions: org.apache.spark.sql.DataFrame = model.transform(testData_v)
predictions.show(numRows = 5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10826,[3,2100,31...|  0.0|[5.05390135226295...|[0.99365612460064...|       0.0|
|(10826,[3,12,169,...|  0.0|[5.03891607876171...|[0.99356096077091...|       0.0|
|(10826,[3,92,138,...|  0.0|[5.03891607876171...|[0.99356096077091...|       0.0|
|(10826,[0,3,15,19...|  0.0|[4.98622297270566...|[0.99321493334980...|       0.0|
|(10826,[14,180,20...|  0.0|[5.02393080526047...|[0.99346437878770...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



predictions = [features: vector, label: double ... 3 more fields]


[features: vector, label: double ... 3 more fields]

In [25]:
// Confusion matrix: rows are true labels, columns are predicted labels.
(
  predictions.stat
    .crosstab("label", "prediction")
    .orderBy("label_prediction")
    .show()
)

|label_prediction| 0.0|1.0|
+----------------+----+---+
|             0.0|1419|  2|
|             1.0|  53|178|
+----------------+----+---+



## Evaluate the model

In [26]:
// F1 score of the selected model on the held-out test predictions.
val f1: Double = evaluator.evaluate(predictions)

f1 = 0.9649348560020584


0.9649348560020584