In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# load the sms database
# the sms database has real emails
sms_database = pd.read_csv('SMSSpamCollection', sep='\t',
header=None, names = ['LABEL', 'CONTENT'])

# the structure of the database
print(sms_database['LABEL'].value_counts())
sms_database.head()

LABEL
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,LABEL,CONTENT
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# clean the data
# transform all words to lowercase
sms_database['CONTENT'] = sms_database['CONTENT'].str.lower()

# split per sentence
sms_database['CONTENT'] = sms_database['CONTENT'].str.split()

sms_database.head()

Unnamed: 0,LABEL,CONTENT
0,ham,"[go, until, jurong, point,, crazy.., available..."
1,ham,"[ok, lar..., joking, wif, u, oni...]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor..., u, c, already..."
4,ham,"[nah, i, don't, think, he, goes, to, usf,, he,..."


In [4]:
# randomrize the database
randomized_database = sms_database.sample(frac=1, random_state=42).reset_index(drop=True)

# calculate the size of train and test database
# 80% is train database and 20% is test
train_size = int(sms_database.shape[0] * 0.8)

# split the database into train_database and test_database
train_database = randomized_database[ : train_size]
test_database = randomized_database[train_size : ]

# the size of train and test database
print(f'train database: {train_database.shape}')
print(f'test database: {test_database.shape}')

train database: (4457, 2)
test database: (1115, 2)


In [5]:
# calculate all the words in the train dataset
total_words = []

for content in train_database['CONTENT']:
    for word in content:
        total_words.append(word)

# record words without repetition
total_words = list(set(total_words))
print(f'There are different {len(total_words)} words in train database.')

There are different 11917 words in train database.


In [6]:
# calculate the frequency of each word in each email
word_freq_per_email = {}
row_size = train_database.shape[0]

# create a new void dictionary to record the frequency
# the size of dictionary = len(total_words) * rows of train_database
for word in total_words:
    word_freq_per_email[word] = [0] * row_size

# calculate the frequency of per word and save in the dictionary
for index, content in enumerate(train_database['CONTENT']):
    for word in content:
        word_freq_per_email[word][index] += 1

# transform word_freq from dictionary to dataframe
word_freq = pd.DataFrame(word_freq_per_email)
word_freq.head()

Unnamed: 0,classes,800,1-month,k.i,birds,iknow,germany,asia.,gong.,skye,...,armand,references..,londn,18,price,lac,lunch:),collages,08448350055,goodnite
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
word_freq = pd.concat([train_database, word_freq], axis=1, ignore_index=False)
word_freq.head()

Unnamed: 0,LABEL,CONTENT,classes,800,1-month,k.i,birds,iknow,germany,asia.,...,armand,references..,londn,18,price,lac,lunch:),collages,08448350055,goodnite
0,ham,"[squeeeeeze!!, this, is, christmas, hug.., if,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[and, also, i've, sorta, blown, him, off, a, c...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[mmm, thats, better, now, i, got, a, roast, do...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[mm, have, some, kanji, dont, eat, anything, h...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[so, there's, a, ring, that, comes, with, the,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# split the dataframe accroding to LABEL
word_freq_spam = word_freq.groupby('LABEL').get_group('spam')
word_freq_ham = word_freq.groupby('LABEL').get_group('ham')

In [9]:
# P(spam) = the number of words in spam / total number of words
prob_prior_spam = len(word_freq_spam) / len(word_freq)

# P(ham) = the number of words in ham / total number of words
prob_prior_ham = len(word_freq_ham) / len(word_freq)

# the number of words in spam email
number_words_spam = word_freq_spam['CONTENT'].apply(len).sum()

# the number of words in ham email
number_words_ham = word_freq_ham['CONTENT'].apply(len).sum()

# the number of total words
number_words = len(total_words)

# set the parameter of Laplace smoothing
alpha = 1

In [10]:
words_prob_spam = {word : 0 for word in total_words}
words_prob_ham = {word : 0 for word in total_words}

for word in total_words:
    # sum fo frequency of a unique word in spam email
    # just like the sum of a column in the dataframe
    word_total_freq_spam = word_freq_spam[word].sum()
    
    # P(word|spam) = the total frequency / total number of words in spam
    # Laplace smoothing: P(word|spam) = (the total frequency + 1) / (total number of words in spam + 1 * number of words)
    # Laplace is used to avoid P(word|spam) = 0, because there are some frequencies equal to 0
    # https://towardsdatascience.com/laplace-smoothing-in-na%C3%AFve-bayes-algorithm-9c237a8bdece
    word_prob_spam = (word_total_freq_spam + alpha) / (number_words_spam + alpha * number_words)
    words_prob_spam[word] = word_prob_spam

    # sum fo frequency of a unique word in ham email
    word_total_freq_ham = word_freq_ham[word].sum()

    # it is same as the P(word|spam)
    word_prob_ham = (word_total_freq_ham + alpha) / (number_words_ham + alpha * number_words)
    words_prob_ham[word] = word_prob_ham

In [11]:
def classify(email):

    # accroding to naive bayes, P(spam|word1, word2, ..., wordn) = P(spam) * P(word1|spam) * P(word2|spam) * ... * P(wordn|spam) 
    prob_spam = prob_prior_spam
    prob_ham = prob_prior_ham

    for word in email:
        if word in total_words:
            prob_spam *= words_prob_spam[word]
            prob_ham *= words_prob_ham[word]

    if prob_spam > prob_ham:
        return 'spam'
    else:
        return 'ham'

In [12]:
true_positive = 0
true_negtive = 0
false_positive = 0
false_negtive = 0

for label, content in zip(test_database['LABEL'], test_database['CONTENT']):
    if classify(content) == label:
        if label == 'spam':
            true_positive += 1
        else:
            true_negtive += 1
    else:
        if label == 'spam':
            false_positive += 1
        else:
            false_negtive += 1

In [13]:
print(f'ture positive: {true_positive}')
print(f'ture negtive: {true_negtive}')
print(f'false positive: {false_positive}')
print(f'false negtive: {false_negtive}')

print(f'accuracy: {(true_positive + true_negtive) / (true_positive + true_negtive + false_positive + false_negtive)}')

ture positive: 130
ture negtive: 957
false positive: 24
false negtive: 4
accuracy: 0.9748878923766816
