In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
df = pd.read_csv('spam_ham_dataset.csv',encoding='ISO-8859-1')
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [3]:
# Drop unnecessary columns
df.drop(["Unnamed: 0", "label_num"],axis=1,inplace=True)
df.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [4]:
# Change column names
df.columns = ['labels','content']
df.head()

Unnamed: 0,labels,content
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [5]:
# Check missing data in the dataframe
df.isnull().sum()

labels     0
content    0
dtype: int64

In [6]:
# Print shape of the dataframe
print(df.shape) 

(5171, 2)


In [7]:
# Calculate index for splittin the data
training_test_index = round(len(df) * 0.75)

# Split data into training and test sets
training_set = df[:training_test_index].reset_index(drop=True)
test_set = df[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(3878, 2)
(1293, 2)


In [8]:
# Removes punctuations and unnecesarry words from the content
training_set['content'] = training_set['content'].str.replace('\W',' ') 
training_set['content'] = training_set['content'].str.lower()
training_set['content'] = training_set['content'].str.replace('subject', '')
training_set['content'] = training_set['content'].str.replace('and', '')
training_set['content'] = training_set['content'].str.replace('or', '')
training_set['content'] = training_set['content'].str.replace('an', '')
training_set['content'] = training_set['content'].str.replace('a', '')
training_set['content'] = training_set['content'].str.replace('to', '')
training_set['content'] = training_set['content'].str.replace('from', '')
training_set['content'] = training_set['content'].str.replace('in', '')
training_set['content'] = training_set['content'].str.replace('for', '')
training_set['content'] = training_set['content'].str.replace('from', '')
training_set['content'] = training_set['content'].str.replace('on', '')
training_set['content'] = training_set['content'].str.replace('at', '')
training_set.head(3)

  training_set['content'] = training_set['content'].str.replace('\W',' ')


Unnamed: 0,labels,content
0,ham,enr methol meter 988291 this is foll...
1,ham,hpl nom f jury 9 2001 see ttched file ...
2,ham,ne retret ho ho ho we re round tht mos...


In [9]:
# Creating the Vocabulary
training_set['content'] = training_set['content'].str.split()

vocabulary = []
for text in training_set['content']:
    for word in text:
        vocabulary.append(word)
vocabulary = list(set(vocabulary))

len(vocabulary) # Number of unique word in all the messages

39972

In [10]:
word_counts_per_text = {unique_word: [0] * len(training_set['content']) for unique_word in vocabulary}

for index, data in enumerate(training_set['content']):
    for word in data:
        word_counts_per_text[word][index] +=1

In [11]:
word_counts = pd.DataFrame(word_counts_per_text)

In [12]:
# Concatenate training set df and words df
training_set_clean = pd.concat([training_set, word_counts],axis=1)
training_set_clean.head()

Unnamed: 0,labels,content,vcent,7400,2234,welfre,ims,tilimited,fqzryjnmo,tught,...,distributi,neighbourg,cusmer,pologies,westerly,gdoxnbmuf,ccchthm,5553,mroper,cssius
0,ham,"[enr, methol, meter, 988291, this, is, follow,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[hpl, nom, f, jury, 9, 2001, see, ttched, file...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[ne, retret, ho, ho, ho, we, re, round, tht, m...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,spam,"[phoshop, wdows, office, chep, m, trendg, bsem...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[re, di, sprgs, this, del, is, book, the, teco...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Isolating spam and ham messages 
spam_mail = training_set_clean[training_set_clean['labels'] == 'spam']
ham_mail = training_set_clean[training_set_clean['labels'] == 'ham']

# Calculating Probability(spam) , Probability(ham)
p_spam = len(spam_mail) / len(training_set_clean)
p_ham = len(ham_mail) / len(training_set_clean)

# Calculating number of spam
n_words_per_spam_mail = spam_mail['content'].apply(len)
n_spam = n_words_per_spam_mail.sum()

# Calculating number of ham
n_words_per_ham_mail = ham_mail['content'].apply(len)
n_ham = n_words_per_ham_mail.sum()

# Calculating number of vocabulary
n_vocabulary = len(vocabulary)

# Define Gaussian Smoothing constant
alpha = 1

In [14]:
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

In [15]:
for word in vocabulary:
    n_word_given_spam = spam_mail[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    parameters_spam[word] = p_word_given_spam

    n_word_given_ham = ham_mail[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [16]:
def classify(message):
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_mail = p_spam
    p_ham_given_mail = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_mail *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_mail *= parameters_ham[word]
            
    print('P(Spam|message):',p_spam_given_mail)
    print('P(Ham|message):',p_ham_given_mail)
    
    if (p_ham_given_mail > p_spam_given_mail):
        print('Label: Ham')
    elif (p_ham_given_mail < p_spam_given_mail):
        print('Label: Spam')
    else:
        print('Equal probabilities, need a human classify')

In [17]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 4.247935713669387e-22
P(Ham|message): 3.455156318537606e-23
Label: Spam


In [18]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 6.205877764759641e-22
P(Ham|message): 1.9489577884550845e-21
Label: Ham


In [26]:
def classify_test_set(mail):
   
    mail = re.sub('\W', ' ', mail)
    mail = mail.lower().split()

    p_spam_given_mail = p_spam
    p_ham_given_mail = p_ham

    for word in mail:
        if word in parameters_spam:
            p_spam_given_mail *= parameters_spam[word]

        if word in parameters_ham:
            p_ham_given_mail *= parameters_ham[word]

    if p_ham_given_mail >= p_spam_given_mail:
        return 'ham'
    elif p_spam_given_mail > p_ham_given_mail:
        return 'spam'

test_set['predicted'] = test_set['content'].apply(classify_test_set)
test_set 

Unnamed: 0,labels,content,predicted
0,ham,"Subject: enron / hpl actuals - nov . 7 , 2000\...",ham
1,spam,Subject: exclusive positions in montanayoocwo\...,spam
2,ham,Subject: natural gas nomination for december 2...,ham
3,ham,Subject: imperial sugar ' s volumes will be 14...,ham
4,ham,Subject: re : first delivery - wagner oil\r\nv...,ham
...,...,...,...
1288,ham,Subject: put the 10 on the ft\r\nthe transport...,ham
1289,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,ham
1290,ham,Subject: calpine daily gas nomination\r\n>\r\n...,ham
1291,ham,Subject: industrial worksheets for august 2000...,ham


In [27]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['labels'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1164
Incorrect: 129
Accuracy: 0.9002320185614849
