In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

In [3]:
df.shape

(5572, 2)

In [4]:
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [6]:
training = df.sample(frac=1, random_state=1)[:int((len(df)*.8))].reset_index(drop=True)

In [7]:
test = df.sample(frac=1, random_state=1)[int((len(df)*.8)):].reset_index(drop=True)

In [8]:
training['Label'].value_counts(normalize=True)

ham     0.86538
spam    0.13462
Name: Label, dtype: float64

In [9]:
test['Label'].value_counts(normalize=True)

ham     0.868161
spam    0.131839
Name: Label, dtype: float64

In [10]:
training['SMS'] = training['SMS'].str.replace('\W',' ').str.lower().str.split()

In [11]:
vocabulary = []
for row in training['SMS']:
    for word in row:
        vocabulary.append(word)     

In [12]:
vocabulary = set(vocabulary)

In [13]:
vocabulary = list(vocabulary)

In [14]:
vocabulary

['honi',
 'pizza',
 'offdam',
 'ollu',
 '650',
 'talking',
 'caller',
 'yan',
 '1million',
 'upping',
 'bright',
 'cliffs',
 'lobby',
 '69876',
 'pairs',
 'txtstop',
 'event',
 'downloaded',
 'sum',
 'windy',
 'dint',
 'show',
 'scream',
 'theatre',
 'yalrigu',
 'gv',
 '4d',
 'window',
 'walmart',
 'topic',
 'goverment',
 'uv',
 'fancy',
 'hudgi',
 'bbd',
 'lands',
 'standard',
 'resent',
 'figure',
 'stars',
 'scoring',
 'abeg',
 'serving',
 'wun',
 'tuesday',
 '500',
 'bruv',
 'opt',
 'twilight',
 'compass',
 'blur',
 'mob',
 'blood',
 'mutations',
 'bothering',
 'onum',
 'agent',
 'sambar',
 'joanna',
 'oblisingately',
 'intend',
 '09061790121',
 'six',
 'those',
 'jide',
 'prin',
 'coz',
 'unconvinced',
 'brothers',
 'heart',
 'camcorder',
 'algebra',
 'stash',
 'lotto',
 'desparately',
 'curfew',
 'upd8',
 'bluff',
 'quiet',
 'womdarfull',
 'speciale',
 'pics',
 'stones',
 'atten',
 'njan',
 'slightly',
 'lmao',
 'aftr',
 '440',
 'kisi',
 'congratulations',
 'dearly',
 '04',
 'fit

In [15]:
word_counts_per_sms = {unique_word: [0] * len(training['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [16]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [17]:
training = pd.concat([training,word_counts], axis=1)

In [18]:
p_spam = training['Label'].value_counts(normalize=True)[1]
p_ham = training['Label'].value_counts(normalize=True)[0]

In [19]:
n_spam = training[training['Label']=='spam'].iloc[:,2:].apply(sum).sum()

In [20]:
n_ham = training[training['Label']=='ham'].iloc[:,2:].apply(sum).sum()

In [21]:
n_vocab = len(training.iloc[:,2:].columns)

In [22]:
alpha = 1

In [23]:
p_words_given_spam = {unique_word: 0 for unique_word in vocabulary}
p_words_given_ham = {unique_word: 0 for unique_word in vocabulary}

In [24]:
spam_df = training[training['Label']=='spam']
ham_df = training[training['Label']=='ham']

In [25]:
for key in p_words_given_ham:
    n_word_given_ham = ham_df[key].sum()
    n_word_given_spam = spam_df[key].sum()
    p_words_given_ham[key] = (n_word_given_ham + alpha)/(n_ham + alpha * n_vocab)
    p_words_given_spam[key] = (n_word_given_spam + alpha)/(n_spam + alpha * n_vocab)

In [29]:
def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in vocabulary:
            p_spam_given_message *= p_words_given_spam[word]
            p_ham_given_message *= p_words_given_ham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'unclassifiable'

In [31]:
test['classification'] = test['SMS'].apply(classify)

In [32]:
test.head()

Unnamed: 0,Label,SMS,classification
0,ham,Wherre's my boytoy ? :-(,ham
1,ham,Later i guess. I needa do mcat study too.,ham
2,ham,But i haf enuff space got like 4 mb...,ham
3,spam,Had your mobile 10 mths? Update to latest Oran...,spam
4,ham,All sounds good. Fingers . Makes it difficult ...,ham


In [33]:
correct = 0
total = len(test)

In [47]:
for index, row in test.iterrows():
    if row['Label'] == row['classification']:
        correct +=1

In [51]:
correct/total * 100

98.7443946188341