# Creating an Email Spam Filter using Naive Bayes

In [1]:
import pandas as pd
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label','SMS'])

In [2]:
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5572, 2)

In [4]:
df.info

<bound method DataFrame.info of      Label                                                SMS
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [5]:
ham_perc = df['Label'][df['Label']=='ham'].count()/df['Label'].count()*100
ham_perc

86.59368269921033

In [6]:
spam_perc = 100 - ham_perc
spam_perc

13.406317300789667

In [7]:
df_rand = df.sample(frac=1, random_state=1)

In [8]:
df_t = df_rand[0:4458].reset_index(drop=True)
df_t.shape

(4458, 2)

In [9]:
df_test = df_rand[4458:].reset_index(drop=True)
df_test.shape

(1114, 2)

In [10]:
df_t['Label'][df_t['Label']=='ham'].count()/df_t['Label'].count()

0.8654104979811574

In [11]:
df_test['Label'][df_test['Label']=='ham'].count()/df_test['Label'].count()

0.8680430879712747

In [12]:
df_t['SMS'] = df_t['SMS'].str.replace('\W',' ')
df_t['SMS'] = df_t['SMS'].str.lower()

In [13]:
len(df_t)

4458

In [14]:
vocabulary = []

In [15]:
df_t['SMS'] = df_t['SMS'].str.split()

In [16]:
for lst in df_t['SMS']:
    for word in lst:
        vocabulary.append(word)
vocabulary = list(set(vocabulary))
len(df_t)

4458

In [17]:
len(vocabulary)

7783

In [40]:
word_counts_per_sms = {unique_word: [0] * len(df_t['SMS']) for unique_word in vocabulary}

In [19]:
for index, sms in enumerate(df_t['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1
len(df_t)

4458

In [20]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.shape

(4458, 7783)

In [21]:
df_t.shape

(4458, 2)

In [22]:
df_train = pd.concat([df_t, word_counts], axis=1)
len(df_train)

4458

In [23]:
df_train.shape

(4458, 7785)

In [24]:
ham_messages = df_train[df_train['Label']=='ham']
ham_messages.shape

(3858, 7785)

In [25]:
spam_messages = df_train[df_train['Label']=='spam']
spam_messages.shape

(600, 7785)

In [26]:
p_spam = len(spam_messages)/len(df_train)
p_spam

0.13458950201884254

In [27]:
p_ham = len(ham_messages)/len(df_train)
p_ham

0.8654104979811574

In [28]:
p_spam + p_ham

1.0

In [29]:
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()
n_spam

15190

In [30]:
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()
n_ham

57237

In [31]:
n_voc = len(vocabulary)
n_voc

7783

In [32]:
alpha = 1

In [33]:
param_spam = {unique_word:0 for unique_word in vocabulary}
param_ham = {unique_word:0 for unique_word in vocabulary}

In [34]:
for word in vocabulary: 
    n_word_given_spam = spam_messages[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha)/(n_spam + alpha*n_voc)
    param_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha)/(n_ham + alpha*n_voc)
    param_ham[word] = p_word_given_ham

In [35]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message: 
        if word in param_spam: 
            p_spam_given_message *= param_spam[word]
        if word in param_ham:
            p_ham_given_message *= param_ham[word]


    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [36]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [37]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


In [48]:
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in param_spam:
            p_spam_given_message *= param_spam[word]

        if word in param_ham:
            p_ham_given_message *= param_ham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [49]:
df_test['predicted'] = df_test['SMS'].apply(classify_test_set)
df_test.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [53]:
correct = 0
total = 1114
for row in df_test.iterrows():
    if row[1]['Label'] == row[1]['predicted']:
        correct += 1
accuracy = correct/total
accuracy

0.9874326750448833