In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Dataset: https://www.kaggle.com/uciml/sms-spam-collection-dataset/home

In [34]:
# Download the dataset CSV from Google Drive by file ID; it is saved as spam.csv in the working directory.
# NOTE(review): newer gdown releases deprecate the `--id` flag in favour of passing the ID positionally — confirm against the installed version.
!gdown --id 13RB_Jvkvf97so7b4NE2Ba7Cg2W3ESlVb

Downloading...
From: https://drive.google.com/uc?id=13RB_Jvkvf97so7b4NE2Ba7Cg2W3ESlVb
To: /content/spam.csv
  0% 0.00/504k [00:00<?, ?B/s]100% 504k/504k [00:00<00:00, 72.9MB/s]


In [35]:
# Read the downloaded CSV into a DataFrame, decoding the file's bytes as Latin-1.
df = pd.read_csv("spam.csv", encoding="latin1")

In [36]:
# Preview the first rows to see which columns the CSV produced.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [37]:
# Summarise dtypes and non-null counts per column to spot sparsely populated columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


## Cleaning the dataset
Check whether the columns `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` contain any useful information.

In [38]:
# How many rows actually have a value in "Unnamed: 2", and what are those values?
print(df["Unnamed: 2"].notna().sum())
df["Unnamed: 2"].unique()

50


array([nan, ' PO Box 5249',
       ' the person is definitely special for u..... But if the person is so special',
       ' HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE JEN XXX\\""',
       ' wanted to say hi. HI!!!\\" Stop? Send STOP to 62468"',
       'this wont even start........ Datz confidence.."', 'GN',
       '.;-):-D"',
       'just been in bedbut mite go 2 thepub l8tr if uwana mt up?loads a luv Jenxxx.\\""',
       ' bt not his girlfrnd... G o o d n i g h t . . .@"',
       ' I\'ll come up"',
       ' don\'t miss ur best life for anything... Gud nyt..."',
       ' just as a shop has to give a guarantee on what they sell. B. G."',
       ' But at d end my love compromised me for everything:-(\\".. Gud mornin:-)"',
       ' the toughest is acting Happy with all unspoken pain inside..\\""',
       ' smoke hella weed\\""', '\\" not \\"what i need to do.\\""',
       'JUST GOT PAYED2DAY & I HAVBEEN GIVEN Aå£50 PAY RISE 4MY WORK & HAVEBEEN MADE PRESCHOOLCO-ORDINATOR 2I AM FEELINGOOD

In [39]:
# How many rows actually have a value in "Unnamed: 3", and what are those values?
print(df["Unnamed: 3"].notna().sum())
df["Unnamed: 3"].unique()

12


array([nan, ' MK17 92H. 450Ppw 16"', ' why to miss them', 'GE',
       'U NO THECD ISV.IMPORTANT TOME 4 2MORO\\""',
       'i wil tolerat.bcs ur my someone..... But',
       ' ILLSPEAK 2 U2MORO WEN IM NOT ASLEEP...\\""',
       'whoever is the KING\\"!... Gud nyt"', ' TX 4 FONIN HON',
       ' \\"OH No! COMPETITION\\". Who knew', 'IåÕL CALL U\\""'],
      dtype=object)

In [40]:
# How many rows actually have a value in "Unnamed: 4", and what are those values?
print(df["Unnamed: 4"].notna().sum())
df["Unnamed: 4"].unique()

6


array([nan, ' just Keep-in-touch\\" gdeve.."', 'GNT:-)"',
       ' Never comfort me with a lie\\" gud ni8 and sweet dreams"',
       ' CALL 2MWEN IM BK FRMCLOUD 9! J X\\""',
       ' one day these two will become FREINDS FOREVER!"'], dtype=object)

These three columns are almost entirely null (only 50, 12 and 6 non-null values out of 5,572 rows). The few non-null values appear to be stray fragments of message text rather than separate features.
It is safe to remove them.

In [41]:
# Keep only the label column (v1) and the message column (v2).
df_new = df.loc[:, ["v1", "v2"]]

In [42]:
# Confirm that only the v1 and v2 columns remain.
df_new.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
# Give the two remaining columns descriptive names: v1 holds the label, v2 the message body.
df_new = df_new.rename(columns = {"v1": "class", "v2": "text"})

In [44]:
# Confirm the columns are now called "class" and "text".
df_new.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [45]:
# Count the messages per label to check the class balance.
df_new["class"].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

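The classes are heavily imbalanced (4,825 ham vs. 747 spam); training on the data as-is may bias the classifier towards the majority class and lead to incorrect classification of spam.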

## Balancing the dataset

In [46]:
# All rows labelled as spam.
spam = df_new.loc[df_new["class"].eq("spam")]

In [47]:
# All rows labelled as ham (legitimate messages).
ham = df_new.loc[df_new["class"].eq("ham")]

In [48]:
# Compare the sizes of the two class subsets before balancing.
ham.shape, spam.shape

((4825, 2), (747, 2))

In [49]:
# Undersample ham down to the number of spam rows so both classes are equally
# represented. The fixed seed (the same value used for the train/test split)
# makes the drawn subset, and therefore every downstream metric, repeatable
# across Restart & Run All.
ham = ham.sample(n = spam.shape[0], random_state = 42)

In [50]:
# Confirm both subsets now contain the same number of rows.
ham.shape, spam.shape

((747, 2), (747, 2))

Join the two dataframes together.

In [51]:
# Stack the ham rows followed by the spam rows into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
data = pd.concat([ham, spam], ignore_index = True)

In [52]:
# The combined frame starts with the sampled ham rows.
data.head()

Unnamed: 0,class,text
0,ham,Yes when is the appt again?
1,ham,"Alright we're hooked up, where you guys at"
2,ham,Haha mayb u're rite... U know me well. Da feel...
3,ham,I'm watching lotr w my sis dis aft. So u wan 2...
4,ham,"Dear, will call Tmorrow.pls accomodate."


In [53]:
# The combined frame ends with the spam rows.
data.tail()

Unnamed: 0,class,text
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...
1493,spam,This is the 2nd time we have tried 2 contact u...


## Preprocessing using NLTK

*   Remove all non-alphabetic characters (digits, punctuation, symbols)
*   Change to lower case
*   Remove stopwords
*   Perform stemming








In [54]:
import re
import nltk
from nltk.stem.porter import PorterStemmer #lancaster stemmer can be too agressive
from nltk.corpus import stopwords

In [55]:
# Fetch the NLTK stopword corpus used by stopwords.words("english"); nothing is re-downloaded when it is already up to date.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
# Built once when this cell runs. Previously the stopword list was re-read and
# converted to a set for every single token, and a new stemmer was created on
# every call; both are loop-invariant.
ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
STEMMER = PorterStemmer()

def text_cleaning(text):
  """Normalise one message into a space-separated string of stemmed tokens.

  Steps, in order:
    1. replace every non-alphabetic character with a space,
    2. lower-case the text,
    3. split on whitespace,
    4. drop English stopwords,
    5. Porter-stem the remaining tokens.

  Args:
    text: the raw message as a str.

  Returns:
    A str with the surviving stemmed tokens joined by single spaces
    (empty if no token survives).
  """
  # Substitute all non-alphabetic characters with a space so they act as token separators.
  letters_only = re.sub("[^A-Za-z]", " ", text)
  tokens = letters_only.lower().split()
  # Stopword filtering happens before stemming, so membership is tested on the unstemmed token.
  stemmed = [STEMMER.stem(word) for word in tokens if word not in ENGLISH_STOPWORDS]
  return " ".join(stemmed)


In [57]:
# Testing the function on one raw message.
# Row 747 is the first spam message: the 747 sampled ham rows occupy index 0-746.
test = data.loc[747, "text"]
print(test)

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [58]:
# Run the sample message through the cleaning function and show the normalised result.
test = text_cleaning(test)
print(test)

free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli


In [59]:
# Applying the function to the entire dataset, preserving row order so that
# dataset[i] stays aligned with data["class"][i].
dataset = [text_cleaning(message) for message in data.text]

In [60]:
# Spot-check one cleaned message (row 1489 is a spam row, see data.tail()).
dataset[1489]

'want explicit sex sec ring cost p min gsex pobox wc n xx'

In [61]:
# Number of cleaned messages.
len(dataset)

1494

In [62]:
# Number of labels; this must equal the number of cleaned messages for the split below.
len(data["class"])

1494

## Train test split & Creating the pipeline

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [64]:
# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the ham/spam ratio the same in both partitions, and the fixed seed makes the
# split repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    dataset,
    data["class"],
    test_size = 0.20,
    shuffle = True,
    stratify = data["class"],
    random_state = 42,
)

Pipeline

In [77]:
# Candidate classifiers, each evaluated behind the same TF-IDF front end.
# The two tree-based models are seeded (same value as the train/test split)
# so their scores are repeatable across runs.
models = [MultinomialNB(), LogisticRegression(), RandomForestClassifier(random_state = 42),
          DecisionTreeClassifier(random_state = 42), SVC()]
for md in models:
  # A fresh pipeline per model: TF-IDF is fitted on the training texts only,
  # then the classifier is fitted on the resulting vectors.
  classifier = Pipeline([("tfidf", TfidfVectorizer()), ("clf", md)])
  classifier.fit(features_train, labels_train)
  labels_predict = classifier.predict(features_test)
  print(type(md))
  # The trailing "\n" belongs to print(), not accuracy_score(): passed to
  # accuracy_score it lands in the `normalize` slot, which is keyword-only on
  # current scikit-learn and raises a TypeError there.
  print("Accuracy score: ", accuracy_score(labels_test, labels_predict), "\n")
  print("Confusion matrix: \n", confusion_matrix(labels_test, labels_predict, labels = ["ham", "spam"]), "\n")
  print("Classification report: \n", classification_report(labels_test, labels_predict, labels = ["ham", "spam"]),"\n")


<class 'sklearn.naive_bayes.MultinomialNB'>
Accuracy score:  0.9331103678929766
Confusion matrix: 
 [[137  13]
 [  7 142]] 

Classification report: 
               precision    recall  f1-score   support

         ham       0.95      0.91      0.93       150
        spam       0.92      0.95      0.93       149

    accuracy                           0.93       299
   macro avg       0.93      0.93      0.93       299
weighted avg       0.93      0.93      0.93       299
 

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Accuracy score:  0.939799331103679
Confusion matrix: 
 [[146   4]
 [ 14 135]] 

Classification report: 
               precision    recall  f1-score   support

         ham       0.91      0.97      0.94       150
        spam       0.97      0.91      0.94       149

    accuracy                           0.94       299
   macro avg       0.94      0.94      0.94       299
weighted avg       0.94      0.94      0.94       299
 

<class 'sklearn.ensemble._f