In [26]:
# Source : https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html

In [27]:
import pandas as pd
from collections import Counter
from tqdm import tqdm
import re

In [28]:
df_sms = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

print(df_sms.shape)
df_sms.head()

# ham means normal message

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
df_sms['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [30]:
df_sms['SMS_cleaned'] = df_sms['SMS'].str.replace('\W', ' ').str.lower() # Removes punctuation and lower case
df_sms.head()

  df_sms['SMS_cleaned'] = df_sms['SMS'].str.replace('\W', ' ').str.lower() # Removes punctuation and lower case


Unnamed: 0,Label,SMS,SMS_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...


In [31]:
df_sms['SMS_words'] = df_sms['SMS_cleaned'].str.split()
df_sms

Unnamed: 0,Label,SMS,SMS_cleaned,SMS_words
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,..."
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that so any other s...,"[pity, was, in, mood, for, that, so, any, othe..."
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i d...,"[the, guy, did, some, bitching, but, i, acted,..."


In [32]:
vocabulary = []
for sms in df_sms['SMS_words']:
    vocabulary.extend(sms)

vocabulary = list(set(vocabulary))

In [33]:
len(vocabulary)

8753

In [34]:
# Create a new column in the DataFrame for every word in the vocabulary
# Initialise the column values to be zero
# These column will be used to store the number of times each word occurs in the given SMS



n_rows = len(df_sms)

# [0] * N creates a list of N zeros
# create a dictionary whose keys are the words from vocabulary and values are lists with all zero values
data = {col: [0] * n_rows for col in vocabulary}

# convert the dictionary into a dataframe
df_temp = pd.DataFrame(data)

# concatenate this df_temp dataframe to the original df_sms dataframe
df_sms = pd.concat([df_sms, df_temp], axis=1)

In [35]:
df_sms

Unnamed: 0,Label,SMS,SMS_cleaned,SMS_words,let,8am,first,12hrs,0,sehwag,...,neo69,pocy,creepy,arnt,thia,rest,daddy,wks,thet,dungerees
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,ham,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that so any other s...,"[pity, was, in, mood, for, that, so, any, othe...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i d...,"[the, guy, did, some, bitching, but, i, acted,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
df_sms.head()

Unnamed: 0,Label,SMS,SMS_cleaned,SMS_words,let,8am,first,12hrs,0,sehwag,...,neo69,pocy,creepy,arnt,thia,rest,daddy,wks,thet,dungerees
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Create a dictionary word_dict and populate it with the vocabulary words as keys and zero valued lists as values
word_dict = {}
for word in vocabulary:
  word_dict[word] = list(df_sms[word])

for i in tqdm(range(len(df_sms))):
  # for each row of the dataframe, get the words in the SMS 
  sms_word_list = df_sms.loc[i]['SMS_words'] 

  # for each SMS, get the number of times every word in the vocabulary occurs in it
  sms_word_counts = Counter(sms_word_list)

  # add this information to the word_dict dictionary
  for word in sms_word_list:
    word_dict[word][i] = sms_word_counts[word]

# write this word count information back in the df_sms dataframe
for word in vocabulary:
  df_sms[word] = word_dict[word]

100%|██████████| 5572/5572 [00:25<00:00, 221.51it/s]


In [38]:
df_sms.head()

Unnamed: 0,Label,SMS,SMS_cleaned,SMS_words,let,8am,first,12hrs,0,sehwag,...,neo69,pocy,creepy,arnt,thia,rest,daddy,wks,thet,dungerees
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Isolating spam and ham messages first
df_spam = df_sms[df_sms['Label'] == 'spam']
df_ham = df_sms[df_sms['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(df_spam) / len(df_sms)
p_ham = len(df_ham) / len(df_sms)

print(p_spam,p_ham)

0.13406317300789664 0.8659368269921034


In [40]:
# N_Spam
n_words_per_spam_message = df_spam['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = df_ham['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

print(n_spam,n_ham,n_vocabulary)

103587 344903 8753


In [41]:
# Initiate parameters
prob_spam = {unique_word:0 for unique_word in vocabulary}
prob_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   n_word_given_spam = df_spam[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
   prob_spam[word] = p_word_given_spam

   n_word_given_ham = df_ham[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   prob_ham[word] = p_word_given_ham

In [42]:
def classify_sms(message):
  message = re.sub('\W', ' ', message)
  message = message.lower().split()

  p_spam_given_message = p_spam
  p_ham_given_message = p_ham

  for word in message:
    if word in prob_spam:
      p_spam_given_message *= prob_spam[word]

    if word in prob_ham: 
      p_ham_given_message *= prob_ham[word]

  # print('P(Spam|message):', p_spam_given_message)
  # print('P(Ham|message):', p_ham_given_message)

  if p_ham_given_message > p_spam_given_message:
    return 'ham'
  elif p_ham_given_message < p_spam_given_message:
    return 'spam'
  else:
    return 'undecided'

In [43]:
message = 'WINNER!! This is the secret code to unlock the money: C3421.'
classify_sms(message)

'spam'

In [44]:
message = 'You are a lucky person.'
classify_sms(message)

'ham'

In [45]:
df_sms['predicted'] = df_sms['SMS'].apply(classify_sms)

In [46]:
df_sms

Unnamed: 0,Label,SMS,SMS_cleaned,SMS_words,let,8am,first,12hrs,0,sehwag,...,pocy,creepy,arnt,thia,rest,daddy,wks,thet,dungerees,predicted
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
5568,ham,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5569,ham,"Pity, * was in mood for that. So...any other s...",pity was in mood for that so any other s...,"[pity, was, in, mood, for, that, so, any, othe...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5570,ham,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i d...,"[the, guy, did, some, bitching, but, i, acted,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [47]:
correct = len(df_sms[df_sms['Label'] == df_sms['predicted']])
total = len(df_sms)

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 5517
Incorrect: 55
Accuracy: 0.9901292175161522


## Splitting Dataset into Training and Testing Sets

In the above code, we have used the whole dataset for training our classifier. This is not a good practice!

We need to split our dataset into a training and testing set, and train the classifier only using the training set. The testing set should then be used for checking the accuracy. This is to prevent overfitting, which we will learn about soon.

In [48]:
# Randomize the dataset
df_sms_randomized = df_sms.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(df_sms_randomized) * 0.8)

# Split into training and test sets
training_set = df_sms_randomized[:training_test_index].reset_index(drop=True)
test_set = df_sms_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 8758)
(1114, 8758)
