# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Read-the-data" data-toc-modified-id="Read-the-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read the data</a></div><div class="lev1 toc-item"><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Engineering</a></div><div class="lev1 toc-item"><a href="#NLP-cleaning" data-toc-modified-id="NLP-cleaning-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>NLP cleaning</a></div><div class="lev1 toc-item"><a href="#Vectorizing-data" data-toc-modified-id="Vectorizing-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Vectorizing data</a></div><div class="lev1 toc-item"><a href="#train-test-split" data-toc-modified-id="train-test-split-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>train test split</a></div><div class="lev1 toc-item"><a href="#Modelling" data-toc-modified-id="Modelling-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Modelling</a></div><div class="lev1 toc-item"><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Model evaluation</a></div>

In [10]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# Record the versions of the core libraries this notebook was run with.
print([(lib.__name__, lib.__version__) for lib in (np, pd, pyspark)])

# Create (or reuse) the Spark session for this notebook and expose the
# lower-level handles derived from it.
spark = SparkSession.builder.appName('bhishan').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)  # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")

[('numpy', '1.15.4'), ('pandas', '0.24.2'), ('pyspark', '2.4.3')]


In [17]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import NGram
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import CountVectorizer

# Read the data

In [3]:
# Peek at the first lines of the raw file to see its layout before loading it
# (the output below shows a label, a tab, then the message text; no header row).
!head  ../data/smsspamcollection/SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only

In [5]:
# Load the raw SMS collection. The file is tab-separated with no header row:
# the first column is the class label (ham/spam), the second the message text.
#
# - quote='' switches off CSV quote handling. The messages are free text, so a
#   '"' inside an SMS is ordinary content and must not be interpreted as a
#   field-quoting character.
# - inferSchema is left at its default (off): both columns are strings, so
#   schema inference would only add an extra pass over the file.
df = spark.read.csv(
    '../data/smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=False,
    quote='',
)

# Quick sanity checks: row count, column count, schema and a small sample.
print(df.count())
print(len(df.columns))
df.printSchema()

df.show(5)

5574
2
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



# Feature Engineering

In [7]:
# Replace the positional CSV column names with meaningful ones:
# `_c0` becomes `class` and `_c1` becomes `text`.
df = (
    df
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)
df.show(n=2)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
+-----+--------------------+
only showing top 2 rows



In [8]:
# Add the character length of each message as an extra numeric column.
df = df.withColumn('length', F.length(F.col('text')))
df.show(n=2)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
+-----+--------------------+------+
only showing top 2 rows



In [9]:
# Average of every numeric column per class. `length` is the only numeric
# column at this point, so this compares ham vs. spam message length.
df.groupBy('class').avg().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



# NLP cleaning

In [22]:
# Individual stages of the text-preparation pipeline (assembled further down).

# Encode the string `class` column as a numeric `label` column.
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')

# Text chain: text -> token_text -> stop_tokens -> c_vec -> tf_idf.
# Each stage reads the column written by the one before it.
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [18]:
# Combine the TF-IDF vector and the message length into a single `features`
# vector column.
clean_up = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

# Vectorizing data

In [21]:
from pyspark.ml import Pipeline

In [23]:
# Chain the preparation stages in the order they must run.
data_prep_pipe = Pipeline(
    stages=[
        ham_spam_to_num,  # class       -> label
        tokenizer,        # text        -> token_text
        stop_remove,      # token_text  -> stop_tokens
        count_vec,        # stop_tokens -> c_vec
        idf,              # c_vec       -> tf_idf
        clean_up,         # tf_idf + length -> features
    ]
)

In [25]:
# Fit the preparation pipeline (label index, vocabulary, IDF weights) on `df`.
# NOTE(review): this fits on every row, and the train/test split is only made
# later on the transformed data, so test-set statistics influence the features.
# Consider splitting first and fitting on the training part only.
cleaner = data_prep_pipe.fit(dataset=df)

In [26]:
# Apply the fitted pipeline to produce the `label` and `features` columns.
clean_data = cleaner.transform(dataset=df)

In [27]:
# Keep only the two columns the classifier needs.
clean_data = clean_data.select(['label', 'features'])

In [28]:
# Preview a few of the prepared rows.
clean_data.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
+-----+--------------------+
only showing top 5 rows



# train test split

In [29]:
# 80/20 train/test split. A fixed seed is passed so that the split, and with
# it the trained model and the reported accuracy, can be reproduced when the
# notebook is re-run.
train, test = clean_data.randomSplit([0.8, 0.2], seed=42)

# Modelling

In [19]:
from pyspark.ml.classification import NaiveBayes

In [20]:
# Naive Bayes classifier. The column names are spelled out but equal the
# estimator's defaults; all other parameters are left at their defaults.
nb = NaiveBayes(featuresCol='features', labelCol='label')

In [30]:
# Train the classifier on the training split.
spam_detector = nb.fit(dataset=train)

In [31]:
# Schema of the original frame `df` (class, text, length). This is not the
# vectorised `train`/`test` data that the model was fitted on.
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [32]:
# Score the held-out split; the model appends its prediction columns.
test_results = spam_detector.transform(dataset=test)

In [33]:
# Preview a few scored rows.
test_results.show(n=5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,4,50,...|[-848.20646834786...|[1.0,2.9012212538...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1164.9709711336...|[1.0,6.4799744385...|       0.0|
|  0.0|(13424,[0,1,9,14,...|[-537.25297677048...|[1.0,1.7893400122...|       0.0|
|  0.0|(13424,[0,1,24,31...|[-339.25267160870...|[1.0,2.0536925359...|       0.0|
|  0.0|(13424,[0,1,416,6...|[-303.47487668321...|[0.99999999999999...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



# Model evaluation

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [35]:
# Accuracy evaluator. The label/prediction column names are spelled out but
# equal the evaluator's defaults.
acc_eval = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [36]:
# Compute accuracy on the scored test split.
acc = acc_eval.evaluate(dataset=test_results)

In [37]:
# Report the accuracy measured on the test split.
# NOTE(review): accuracy alone can look good when classes are imbalanced;
# class balance is not checked in this notebook, so consider also reporting
# f1 / precision / recall.
print('Accuracy of NB')
acc

Accuracy of NB


0.9151943462897526