In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Dataset: https://www.kaggle.com/uciml/sms-spam-collection-dataset/home

In [125]:
!gdown --id 13RB_Jvkvf97so7b4NE2Ba7Cg2W3ESlVb

Downloading...
From: https://drive.google.com/uc?id=13RB_Jvkvf97so7b4NE2Ba7Cg2W3ESlVb
To: /content/spam.csv
  0% 0.00/504k [00:00<?, ?B/s]100% 504k/504k [00:00<00:00, 72.4MB/s]


In [126]:
df = pd.read_csv("spam.csv", encoding="latin1")

In [127]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


## Cleaning the dataset
Checking whether columns unnamed 2,3,4 have any useful information 

In [129]:
print(sum(df.iloc[:, 2].notna()))
df.iloc[:, 2].unique()

50


array([nan, ' PO Box 5249',
       ' the person is definitely special for u..... But if the person is so special',
       ' HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE JEN XXX\\""',
       ' wanted to say hi. HI!!!\\" Stop? Send STOP to 62468"',
       'this wont even start........ Datz confidence.."', 'GN',
       '.;-):-D"',
       'just been in bedbut mite go 2 thepub l8tr if uwana mt up?loads a luv Jenxxx.\\""',
       ' bt not his girlfrnd... G o o d n i g h t . . .@"',
       ' I\'ll come up"',
       ' don\'t miss ur best life for anything... Gud nyt..."',
       ' just as a shop has to give a guarantee on what they sell. B. G."',
       ' But at d end my love compromised me for everything:-(\\".. Gud mornin:-)"',
       ' the toughest is acting Happy with all unspoken pain inside..\\""',
       ' smoke hella weed\\""', '\\" not \\"what i need to do.\\""',
       'JUST GOT PAYED2DAY & I HAVBEEN GIVEN Aå£50 PAY RISE 4MY WORK & HAVEBEEN MADE PRESCHOOLCO-ORDINATOR 2I AM FEELINGOOD

In [130]:
print(sum(df.iloc[:, 3].notna()))
df.iloc[:, 3].unique()

12


array([nan, ' MK17 92H. 450Ppw 16"', ' why to miss them', 'GE',
       'U NO THECD ISV.IMPORTANT TOME 4 2MORO\\""',
       'i wil tolerat.bcs ur my someone..... But',
       ' ILLSPEAK 2 U2MORO WEN IM NOT ASLEEP...\\""',
       'whoever is the KING\\"!... Gud nyt"', ' TX 4 FONIN HON',
       ' \\"OH No! COMPETITION\\". Who knew', 'IåÕL CALL U\\""'],
      dtype=object)

In [131]:
print(sum(df.iloc[:, 4].notna()))
df.iloc[:, 4].unique()

6


array([nan, ' just Keep-in-touch\\" gdeve.."', 'GNT:-)"',
       ' Never comfort me with a lie\\" gud ni8 and sweet dreams"',
       ' CALL 2MWEN IM BK FRMCLOUD 9! J X\\""',
       ' one day these two will become FREINDS FOREVER!"'], dtype=object)

It seems majority of the columns contain null values. The unique values contain some additional comments. 
It is safe to remove them.

In [132]:
df_new = df[["v1", "v2"]]

In [133]:
df_new.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [134]:
#adding labels to the columns
df_new.columns = ["class", "text"]

In [135]:
df_new.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [136]:
df_new["class"].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

data is very skewed, may lead to incorrect classification

## Balancing the dataset

In [137]:
spam = df_new[df_new["class"]=="spam"]

In [138]:
ham = df_new[df_new["class"]=="ham"]

In [139]:
ham.shape, spam.shape

((4825, 2), (747, 2))

In [140]:
ham = ham.sample(spam.shape[0])

In [141]:
ham.shape, spam.shape

((747, 2), (747, 2))

join the two dataframes together

In [142]:
data = ham.append(spam, ignore_index = True)

In [143]:
data.head()

Unnamed: 0,class,text
0,ham,Home so we can always chat
1,ham,Will do. Was exhausted on train this morning. ...
2,ham,That's my honeymoon outfit. :)
3,ham,Huh i cant thk of more oredi how many pages do...
4,ham,We know TAJ MAHAL as symbol of love. But the o...


In [144]:
data.tail()

Unnamed: 0,class,text
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...
1493,spam,This is the 2nd time we have tried 2 contact u...


## Preprocessing using NLTK

*   Remove all non-words
*   Change to lower case
*   Remove stopwords
*   Perform stemming








In [145]:
import re
import nltk
from nltk.stem.porter import PorterStemmer #lancaster stemmer can be too agressive
from nltk.corpus import stopwords

In [146]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [147]:
def text_cleaning(text):
  text = re.sub("[^A-Za-z]", " ", text) #substitute all non-alphabetic characters with a space
  text = text.lower()
  text = text.split()
  stemmer = PorterStemmer()
  text = [stemmer.stem(word) for word in text if word not in set(stopwords.words("english"))] #removing stopwords and stemming 
  text = " ".join(text)
  return text


In [148]:
#Testing the function
test = data.text[747]
print(test)

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [149]:
test = text_cleaning(test)
print(test)

free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli


In [150]:
# Appying the function to the entire dataset
dataset = []
for i in range(0, len(data)):
  text = text_cleaning(data.text[i])
  dataset.append(text)

In [151]:
dataset[1489]

'want explicit sex sec ring cost p min gsex pobox wc n xx'

## Modeling the data using bag of words


In [152]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [153]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(dataset).toarray()

In [154]:
features.shape

(1494, 3159)

In [155]:
labels = data["class"].values

In [156]:
labels[5:]

array(['ham', 'ham', 'ham', ..., 'spam', 'spam', 'spam'], dtype=object)

## Train-test split

In [157]:
from sklearn.model_selection import train_test_split

In [158]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.20, stratify = labels, shuffle= True, random_state = 42)

In [159]:
features_test.shape, features_train.shape, labels_test.shape, labels_train.shape

((299, 3159), (1195, 3159), (299,), (1195,))

In [160]:
print(vectorizer.get_feature_names()[:10])
print(vectorizer.get_feature_names()[-10:])

['aathi', 'ab', 'abel', 'aberdeen', 'abi', 'abiola', 'abl', 'abroad', 'absenc', 'absolut']
['yummi', 'yuou', 'yup', 'zealand', 'zebra', 'zed', 'zf', 'zoe', 'zouk', 'zs']


## Training a multinomial naive-bayes classifier

In [161]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [162]:
classifier_nb = MultinomialNB()
k_fold = StratifiedKFold(n_splits = 10)
scores = cross_val_score(classifier_nb, features_train, labels_train, cv= k_fold)

In [163]:
print("mean: ", scores.mean(), "std: ", scores.std())

mean:  0.9531302521008402 std:  0.012588771400575764


In [164]:
classifier_nb.fit(features_train, labels_train)
labels_predicted = classifier_nb.predict(features_test)


In [170]:
#labels_predicted

In [165]:
print("Accuracy score: ", accuracy_score(labels_test, labels_predicted))
print("Confusion matrix: \n", confusion_matrix(labels_test, labels_predicted, labels = ["ham", "spam"]))
print("Classification report: \n", classification_report(labels_test, labels_predicted, labels = ["ham", "spam"]))


Accuracy score:  0.9464882943143813
Confusion matrix: 
 [[141   9]
 [  7 142]]
Classification report: 
               precision    recall  f1-score   support

         ham       0.95      0.94      0.95       150
        spam       0.94      0.95      0.95       149

    accuracy                           0.95       299
   macro avg       0.95      0.95      0.95       299
weighted avg       0.95      0.95      0.95       299



## Testing other classifiers

*   Logistic regression
*   Decision tree
*   Random Forest
*   LinearSVC






In [166]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [167]:
models = [LogisticRegression(), RandomForestClassifier(), DecisionTreeClassifier(), SVC()]
for md in models:
  md.fit(features_train, labels_train)
  k_fold = StratifiedKFold(n_splits = 10)
  scores = cross_val_score(md, features_train, labels_train, cv = k_fold)
  print(type(md))
  print("mean: ", scores.mean(), "std: ", scores.std())
  predictions = md.predict(features_test)
  accuracy_score(labels_test, predictions)
  labels_predicted = md.predict(features_test)
  print("Accuracy score: ", accuracy_score(labels_test, labels_predicted,"\n"))
  print("Confusion matrix: \n", confusion_matrix(labels_test, labels_predicted, labels = ["ham", "spam"]), "\n")
  print("Classification report: \n", classification_report(labels_test, labels_predicted, labels = ["ham", "spam"]),"\n")


<class 'sklearn.linear_model._logistic.LogisticRegression'>
mean:  0.9422619047619047 std:  0.018850345263669983
Accuracy score:  0.9464882943143813
Confusion matrix: 
 [[150   0]
 [ 16 133]] 

Classification report: 
               precision    recall  f1-score   support

         ham       0.90      1.00      0.95       150
        spam       1.00      0.89      0.94       149

    accuracy                           0.95       299
   macro avg       0.95      0.95      0.95       299
weighted avg       0.95      0.95      0.95       299
 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
mean:  0.94640756302521 std:  0.013706184974226657
Accuracy score:  0.9531772575250836
Confusion matrix: 
 [[150   0]
 [ 14 135]] 

Classification report: 
               precision    recall  f1-score   support

         ham       0.91      1.00      0.96       150
        spam       1.00      0.91      0.95       149

    accuracy                           0.95       299
   macro avg       