In [45]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.linalg import Vector
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [46]:
sparksession=SparkSession.builder.appName('textClassification').getOrCreate()

In [63]:
textdata=sparksession.read.option("header","false").option("inferSchema", "false").csv("SPAM text message 20170820 - Data.csv")

In [64]:
textdata=textdata.filter(textdata['_c0']!='Category')
textdata=textdata.withColumnRenamed('_c0','Category').withColumnRenamed('_c1','Message')
textdata.show()

+--------+--------------------+
|Category|             Message|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
|    spam|FreeMsg Hey there...|
|     ham|Even my brother i...|
|     ham|As per your reque...|
|    spam|WINNER!! As a val...|
|    spam|Had your mobile 1...|
|     ham|I'm gonna be home...|
|    spam|SIX chances to wi...|
|    spam|URGENT! You have ...|
|     ham|I've been searchi...|
|     ham|I HAVE A DATE ON ...|
|    spam|XXXMobileMovieClu...|
|     ham|Oh k...i'm watchi...|
|     ham|Eh u remember how...|
|     ham|Fine if thats th...|
|    spam|England v Macedon...|
+--------+--------------------+
only showing top 20 rows



In [65]:
textdata=textdata.withColumn('textlength',length(textdata['Message']))
#textdata.show()
textgroupeddata=textdata.groupby('Category').mean()

In [66]:
textgroupeddata.show()

+--------------------+-----------------+
|            Category|  avg(textlength)|
+--------------------+-----------------+
|ham\tHI BABE UAWA...|             19.0|
|                 ham| 71.2058031088083|
|                spam|137.7550200803213|
|           ham\tYeah|             38.0|
+--------------------+-----------------+



In [67]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [72]:
message=textdata.select('Message').collect()
messages=[]
for i in message:
    messages.append(i["Message"])
#print(messages)

messagelabel=textdata.select('Category').collect()
messagelabels=[]
for i in messagelabel:
    if(i['Category']=='ham'):
        messagelabels.append(1)
    elif(i['Category']=='spam'):
        messagelabels.append(0)
    else:
        messagelabels.append(1)
print(messagelabels)

[1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [73]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(messages)
word_index=tokenizer.word_index
sequences=tokenizer.texts_to_sequences(messages)
padded_sequences=pad_sequences(sequences)

In [74]:
print(padded_sequences)

[[   0    0    0 ...   59 4409  145]
 [   0    0    0 ...  471    7 1936]
 [   0    0    0 ...  659  388 2990]
 ...
 [   0    0    0 ...  108  251 8976]
 [   0    0    0 ...  199   13   48]
 [   0    0    0 ...    3   62  267]]


In [76]:
trainingSize=int(len(messages)*0.7)
trainingSeq=padded_sequences[:trainingSize]
trainingLabel=np.array(messagelabels[:trainingSize])
testingSeq=padded_sequences[trainingSize:]
testingLabel=np.array(messagelabels[trainingSize:])

In [78]:
print(trainingSeq.dtype)
print(trainingLabel.dtype)

int32
int64


In [79]:
tfmodel = tf.keras.Sequential([\
                               tf.keras.layers.Embedding(input_dim=len(word_index) + 1,output_dim=16,input_length=trainingSeq.shape[1]),\
                               tf.keras.layers.Flatten(),\
                               tf.keras.layers.Dense(32, activation='relu'),\
                               tf.keras.layers.Dense(1, activation='sigmoid')
])

In [80]:
tfmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [81]:
tfmodel.fit(trainingSeq,trainingLabel,epochs=5,validation_data=(testingSeq,testingLabel))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7a4c6cb0dc90>

In [83]:
loss,accuracy=tfmodel.evaluate(testingSeq,testingLabel)
print("accuracy : {0}".format(accuracy*100))

accuracy : 98.98386001586914
