In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

In [2]:
# Reuse the Spark session if the kernel already provides one; otherwise create
# it, so the notebook also survives "Restart & Run All" on a plain Python kernel.
spark = SparkSession.builder.getOrCreate()

# The input is read as tab-delimited text without a header row; the two columns
# are named here: 'spam' holds the class label (e.g. 'ham') and 'message' the SMS text.
# Forward slashes keep the relative path usable on non-Windows hosts as well.
raw = (
    spark.read
    .option("delimiter", "\t")
    .csv('../case study 1 dataset/SMSSpamCollection')
    .toDF('spam', 'message')
)
raw.show(2)

+----+--------------------+
|spam|             message|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
+----+--------------------+
only showing top 2 rows



In [3]:
# Tokenize: turn each 'message' string into an array of lower-cased tokens ('words')
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='message', outputCol='words')
transformed = tokenizer.transform(raw)
transformed.show(1)

+----+--------------------+--------------------+
|spam|             message|               words|
+----+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|
+----+--------------------+--------------------+
only showing top 1 row



In [4]:
# Filter the default stop-word list out of 'words', writing the result to 'filtered'
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
cleaned = remover.transform(transformed)
cleaned.show(1)

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



In [5]:
# Extend the default stop-word list with a bare '-' token and redo the filtering.
# This replaces the `remover` / `cleaned` objects built with the default list.
stopwords = [*StopWordsRemover().getStopWords(), '-']
remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stopwords)
cleaned = remover.transform(transformed)

In [6]:
# Bag-of-words features: learn a vocabulary from 'filtered', then turn every
# message into a term-count vector stored in 'features'
from pyspark.ml.feature import CountVectorizer
cvmodel = CountVectorizer(inputCol='filtered', outputCol='features').fit(cleaned)
featured = cvmodel.transform(cleaned)

In [7]:
# Encode the string class in 'spam' as a numeric 'label' column for the classifiers
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='spam', outputCol='label').fit(featured)
indexed = indexer.transform(featured)

In [8]:
# Split into train and test sets (roughly 70% / 30%).
# A fixed seed makes the split, and every metric computed from it, repeatable
# across kernel restarts.
training, test = indexed.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Logistic Regression
# Elastic-net regularised model: regParam is the overall penalty strength and
# elasticNetParam=0.8 weights the penalty mostly towards L1.
# NOTE(review): the penalty looks strong for sparse count features and may push
# every coefficient to zero (a constant prediction) -- confirm and tune if so.
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = log_reg.fit(training)
predictions = lrModel.transform(test)
predictions.select('features', 'label', 'prediction').show(2)

# Area under the ROC curve must be computed from the continuous scores in
# 'rawPrediction'. Feeding the thresholded 0/1 'prediction' column collapses the
# curve to a single operating point and badly understates the model's ranking quality.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('rawPrediction').setMetricName('areaUnderROC')
AUC = evaluator.evaluate(predictions)
print(AUC)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(13459,[3,7,44,21...|  0.0|       0.0|
|(13459,[3,87,117,...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 2 rows

0.5


In [10]:
%%time
# Random Forest
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier().setLabelCol('label').setFeaturesCol('features').setNumTrees(10)
model = rf.fit(training)
predictions = model.transform(test)

evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName("areaUnderROC")
AUC = evaluator.evaluate(predictions)
print(AUC)

0.5040983606557377
Wall time: 2min 51s


In [11]:
%%time
# Introduce bi-gram and note the change in accuracy
from pyspark.ml.feature import NGram
bigram = NGram().setN(2).setInputCol('filtered').setOutputCol('bigrams')
bigramDataFrame = bigram.transform(cleaned)
bigramDataFrame.select('bigrams').show(2, False)

cvmodel = CountVectorizer().setInputCol('bigrams').setOutputCol('features').fit(bigramDataFrame)
featured = cvmodel.transform(bigramDataFrame)
indexed = indexer.transform(featured)
indexed.show(2)

training, test = indexed.randomSplit([0.7, 0.3])
lrModel = log_reg.fit(training)
predictions = lrModel.transform(test)
evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName("areaUnderROC")
print(evaluator.evaluate(predictions))

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|bigrams                                                                                                                                                                                      |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go jurong, jurong point,, point, crazy.., crazy.. available, available bugis, bugis n, n great, great world, world la, la e, e buffet..., buffet... cine, cine got, got amore, amore wat...]|
|[ok lar..., lar... joking, joking wif, wif u, u oni...]                                                                                                                                      |
+---------------------------------------

It seems that bi-grams do not improve prediction.

In [12]:
%%time
# Decide on a strategy and generate a data pipeline
from pyspark.ml import Pipeline
tokenizer = Tokenizer().setInputCol('message').setOutputCol('words')
stopwords = StopWordsRemover().getStopWords() + ['-']
remover = StopWordsRemover().setStopWords(stopwords).setInputCol('words').setOutputCol('filtered')
cvmodel = CountVectorizer().setInputCol('filtered').setOutputCol('features').fit(cleaned)
indexer = StringIndexer().setInputCol('spam').setOutputCol('label').fit(featured)

rf = RandomForestClassifier().setLabelCol('label').setFeaturesCol('features').setNumTrees(10)
pipeline = Pipeline().setStages([tokenizer, remover, cvmodel, indexer, rf])

training, test = raw.randomSplit([0.7, 0.3])
model = pipeline.fit(training)
predictions = model.transform(test)

evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName("areaUnderROC")
AUC = evaluator.evaluate(predictions)
print(AUC)

0.5025641025641026
Wall time: 3min
