### Spam detection system

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('NLP').getOrCreate()

In [3]:
data = spark.read.csv('smsspam.csv',inferSchema=True,header=True,sep='\t')

In [4]:
data.show()

+----+---------------------------------------------------------------------------------------------------------------+
| ham|Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...|
+----+---------------------------------------------------------------------------------------------------------------+
| ham|                                                                                           Ok lar... Joking ...|
|spam|                                                                                           Free entry in 2 a...|
| ham|                                                                                           U dun say so earl...|
| ham|                                                                                           Nah I don't think...|
|spam|                                                                                           FreeMsg Hey there...|
| ham|                                          

In [5]:
data.columns

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']

In [6]:
data = data.withColumnRenamed('ham', 'class')

In [7]:
data = data.withColumnRenamed('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...','text')

In [8]:
data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
|  ham|Is that seriously...|
+-----+--------------------+
only showing top 20 rows



In [9]:
print(data.describe())

DataFrame[summary: string, class: string, text: string]


In [10]:
from pyspark.sql.functions import length
data = data.withColumn('length',length(data['text']))

In [11]:
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.44612515540821|
| spam|138.6706827309237|
+-----+-----------------+



In [12]:
from pyspark.ml.feature import (Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer)

In [13]:
tokenizer = Tokenizer(inputCol='text',outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text',outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token',outputCol='c_vec')
idf = IDF(inputCol='c_vec',outputCol='tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol='class',outputCol='label')

In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

In [16]:
from pyspark.ml.classification import NaiveBayes

In [17]:
nb = NaiveBayes()

In [18]:
from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[ham_spam_to_numeric,tokenizer,stop_remove,count_vec,idf,clean_up])

In [19]:
cleaner = data_prep_pipe.fit(data)

In [20]:
clean_data = cleaner.transform(data)

In [22]:
clean_data = clean_data.select('features','label')

In [24]:
train,test = clean_data.randomSplit([0.7,0.3])

In [25]:
spam_detector = nb.fit(train)

In [28]:
data.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [29]:
test_results = spam_detector.transform(test)

In [30]:
test_results.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(13419,[0,1,2,4,3...|  1.0|[-1204.7996105808...|[3.68528876146571...|       1.0|
|(13419,[0,1,2,5,5...|  1.0|[-933.21999007962...|[0.99999999999979...|       0.0|
|(13419,[0,1,2,12,...|  1.0|[-1129.5879507551...|[1.58684546358926...|       1.0|
|(13419,[0,1,2,12,...|  1.0|[-1122.2841220531...|[2.38265787795616...|       1.0|
|(13419,[0,1,2,13,...|  0.0|[-611.85159827953...|[0.99999999999999...|       0.0|
|(13419,[0,1,4,13,...|  1.0|[-1421.9035661521...|[3.61868481283933...|       1.0|
|(13419,[0,1,5,15,...|  0.0|[-999.89064446695...|[1.0,9.4657681561...|       0.0|
|(13419,[0,1,14,78...|  0.0|[-686.93297490287...|[1.0,4.4058224236...|       0.0|
|(13419,[0,1,18,20...|  0.0|[-842.59186068667...|[1.0,1.5940616375...|       0.0|
|(13419,[0,1,20,

In [31]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [32]:
acc_eval = MulticlassClassificationEvaluator()

In [33]:
acc = acc_eval.evaluate(test_results)

In [35]:
print('Acc of NB model ' +str(acc))

Acc of NB model 0.9265568934408892
