In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

## Load the dataset

In [2]:
// Load the tab-separated SMS file. The header option is off, so the first
// line is read as data; the two columns are then named explicitly.
val df = spark.read.
  format("csv").
  options(Map(
    "header" -> "false",
    "inferschema" -> "true",
    "delimiter" -> "\t"
  )).
  load("../Datasets/Sms_spam.csv").
  toDF("label_s", "message")

df = [label_s: string, message: string]


[label_s: string, message: string]

## Explore the dataset

In [3]:
// Peek at the first five rows to check that the two columns parsed as expected.
df.show(5)

+-------+--------------------+
|label_s|             message|
+-------+--------------------+
|    ham|Go until jurong p...|
|    ham|Ok lar... Joking ...|
|   spam|Free entry in 2 a...|
|    ham|U dun say so earl...|
|    ham|Nah I don't think...|
+-------+--------------------+
only showing top 5 rows



In [4]:
// Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- label_s: string (nullable = true)
 |-- message: string (nullable = true)



In [5]:
// Number of columns in the DataFrame.
df.columns.size

2

In [6]:
// Total number of rows (this is an action and triggers a Spark job).
df.count()

5572

In [7]:
// Class balance: how many messages carry each label.
df.groupBy("label_s").
  count().
  show()

+-------+-----+
|label_s|count|
+-------+-----+
|    ham| 4825|
|   spam|  747|
+-------+-----+



## Split into train and test sets

In [8]:
// Hold out roughly 30% of the rows for testing. A fixed seed is passed so the
// split, and every figure computed from it further down, is the same on each
// run; without it randomSplit draws a fresh random seed every time.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), 42L)

trainingData = [label_s: string, message: string]
testData = [label_s: string, message: string]


[label_s: string, message: string]

## Vectorize the messages

In [9]:
// Stage 1: split each message into an array of tokens.
val tokenizer = new Tokenizer().setInputCol("message").setOutputCol("raw_words")

tokenizer = tok_5dee125b1c6c


tok_5dee125b1c6c

In [10]:
// Stage 2: drop stop words from the token arrays.
val remover = new StopWordsRemover().setInputCol("raw_words").setOutputCol("filtered_words")

remover = stopWords_3113d3328381


stopWords_3113d3328381

In [11]:
// Stage 3: turn the filtered tokens into term-count vectors.
val cv = new CountVectorizer().setInputCol("filtered_words").setOutputCol("rawFeatures")

cv = cntVec_1d724a1724e0


cntVec_1d724a1724e0

In [12]:
// Stage 4: rescale the term counts by inverse document frequency.
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")

idf = idf_e1ef0884083f


idf_e1ef0884083f

In [13]:
// Chain the four text-processing stages: message -> raw_words ->
// filtered_words -> rawFeatures -> features.
val pipeline = new Pipeline().setStages(
  Array(tokenizer, remover, cv, idf)
)

pipeline = pipeline_681918432f6d


pipeline_681918432f6d

In [14]:
// Column function that encodes the text label as a number: "ham" becomes 0.0
// and every other value (including null) becomes 1.0.
val labelToDouble = udf { (tag: String) =>
  tag match {
    case "ham" => 0.0
    case _     => 1.0
  }
}

labelToDouble = UserDefinedFunction(<function1>,DoubleType,Some(List(StringType)))


UserDefinedFunction(<function1>,DoubleType,Some(List(StringType)))

In [15]:
// Fit the feature pipeline on the training split only; the fitted model T is
// what later transforms both the training and the test data.
val T = pipeline.fit(trainingData)

T = pipeline_681918432f6d


pipeline_681918432f6d

In [16]:
// Vectorize the training messages and keep only the two columns the
// classifier consumes: the feature vector and the numeric label.
val trainingData_v = T.
  transform(trainingData).
  select(
    col("features"),
    labelToDouble(col("label_s")).as("label")
  )
trainingData_v.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(10724,[2,7,3664,...|  0.0|
|(10724,[2,7,39,25...|  0.0|
|(10724,[2,87,154,...|  0.0|
|(10724,[2,643,205...|  0.0|
|(10724,[2,11,165,...|  0.0|
+--------------------+-----+
only showing top 5 rows



trainingData_v = [features: vector, label: double]


[features: vector, label: double]

In [17]:
// Apply the pipeline fitted on the training split to the held-out messages,
// producing the same (features, label) layout as the training set.
val testData_v = T.
  transform(testData).
  select(
    col("features"),
    labelToDouble(col("label_s")).as("label")
  )

testData_v = [features: vector, label: double]


[features: vector, label: double]

## Hyperparameter optimization with cross-validation

In [18]:
// Disabled: 5-fold cross-validated grid search over the random forest's
// maxDepth, scored by F1, with the fitted result saved to "Model_Parameters".
// NOTE(review): the maxDepth used in the training cell was presumably picked
// from this search - confirm before changing either.
//val rf = new RandomForestClassifier().setNumTrees(100)

//val evaluator = new MulticlassClassificationEvaluator().setMetricName("f1")

//val paramGrid = new ParamGridBuilder().
//  addGrid(rf.maxDepth, Array(10, 12, 14, 16, 20)).
//  build()

//val CV = new CrossValidator().
//  setEstimator(rf).
//  setEvaluator(evaluator).
//  setEstimatorParamMaps(paramGrid).
//  setNumFolds(5)

// val model = CV.fit(trainingData_v)

// model.write.overwrite.save("Model_Parameters")

Name: Syntax Error.
Message: 
StackTrace: 

## Train the model

In [19]:
// Random forest configuration: 100 trees, each limited to depth 16.
val rf = new RandomForestClassifier().setMaxDepth(16).setNumTrees(100)

rf = rfc_34b4b0097e57


rfc_34b4b0097e57

In [20]:
// Train the random forest on the vectorized training split.
val model = rf.fit(trainingData_v)

model = RandomForestClassificationModel (uid=rfc_c04b68debba4) with 100 trees


RandomForestClassificationModel (uid=rfc_c04b68debba4) with 100 trees

## Make predictions

In [21]:
// Score the held-out split; transform appends the rawPrediction, probability
// and prediction columns to the (features, label) input.
val predictions = model.transform(testData_v)
predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10724,[2,7,328,2...|  0.0|[92.3506681400142...|[0.92350668140014...|       0.0|
|(10724,[15,169,20...|  0.0|[92.3506681400142...|[0.92350668140014...|       0.0|
|(10724,[3,12,33,5...|  0.0|[88.7843670133305...|[0.88784367013330...|       0.0|
|(10724,[0,6,10,37...|  0.0|[92.2167271574494...|[0.92216727157449...|       0.0|
|(10724,[108,134,2...|  0.0|[92.3506681400142...|[0.92350668140014...|       0.0|
|(10724,[0,2,7,12,...|  0.0|[88.7620807027139...|[0.88762080702713...|       0.0|
|(10724,[20,33,95]...|  0.0|[87.0047389233171...|[0.87004738923317...|       0.0|
|(10724,[196,239],...|  0.0|[92.3506681400142...|[0.92350668140014...|       0.0|
|(10724,[3,4,36,19...|  0.0|[89.1836212851904...|[0.89183621285190...|       0.0|
|(10724,[6,22,35

predictions = [features: vector, label: double ... 3 more fields]


[features: vector, label: double ... 3 more fields]

In [22]:
// Confusion matrix: one row per true label, one column per predicted label.
predictions.stat.
  crosstab("label", "prediction").
  sort("label_prediction").
  show()

+----------------+----+---+
|label_prediction| 0.0|1.0|
+----------------+----+---+
|             0.0|1493|  0|
|             1.0| 147| 62|
+----------------+----+---+



## Evaluate the model

In [23]:
// F1 evaluator comparing the "label" column against the "prediction" column
// (both are the evaluator's default column names, spelled out here).
val evaluator = new MulticlassClassificationEvaluator().
  setLabelCol("label").
  setPredictionCol("prediction").
  setMetricName("f1")

evaluator = mcEval_c4865c4b2f93


mcEval_c4865c4b2f93

In [24]:
// F1 on the held-out predictions. The evaluator's "f1" metric is averaged
// over the classes weighted by their frequency, so with far more ham than
// spam it mostly reflects ham performance; read it together with the
// confusion matrix rather than on its own.
val f1 = evaluator.evaluate(predictions)

f1 = 0.8922324370531045


0.8922324370531045