In [28]:
# IMPORTS

import math
import re

import pandas as pd


In [29]:
# READ THE SMS CORPUS INTO THE TRAINING SET

# The file is tab-separated and is read without a header row, so the two
# columns are named explicitly here.
training_set = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

training_set.head()


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# FUNCTION TO CLEAN SMS

def clean(sms):
    """Normalise an SMS text before it is split into words.

    Every non-word character (anything other than a letter, digit or
    underscore) is replaced by a single space, and the result is
    lower-cased. Runs of punctuation therefore become runs of spaces;
    callers are expected to use ``str.split()`` to collapse them.

    Parameters
    ----------
    sms : str
        Raw message text.

    Returns
    -------
    str
        The lower-cased text with non-word characters replaced by spaces.
    """
    # Raw string: a backslash-W inside a normal string literal is an invalid
    # escape sequence, which newer Python versions warn about and will
    # eventually reject.
    return re.sub(r'\W', ' ', sms).lower()


In [31]:
# NORMALISE EVERY MESSAGE

# Run clean() over each message and store the result back in the SMS column.
training_set['SMS'] = training_set['SMS'].map(clean)

training_set.head()


Unnamed: 0,Label,SMS
0,ham,go until jurong point crazy available only ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives aro...


In [32]:
# TALLY HOW OFTEN EACH WORD APPEARS IN SPAM AND IN HAM

table = {'spam': {}, 'ham': {}}

# Walk through the labelled messages one at a time
for label, sms in zip(training_set.Label, training_set.SMS):
    # Whitespace-split the cleaned message into words
    for word in sms.split():
        # A word seen for the first time is registered under BOTH labels
        # with a starting count of 1, so that no word ever ends up with a
        # zero count (and hence a zero probability) for either label.
        for counts in table.values():
            counts.setdefault(word, 1)

        # Credit the occurrence to the label of the current message
        table[label][word] += 1

# Words become the index, labels become the columns
table = pd.DataFrame(table)

table.head()


Unnamed: 0,spam,ham
go,32,253
until,6,23
jurong,1,2
point,1,14
crazy,6,11


In [33]:
# DERIVE THE PER-LABEL CONSTANTS

# Share of training messages carrying each label (the class priors)
p = training_set['Label'].value_counts(normalize=True).to_dict()

# Column totals of the occurrence table, i.e. the summed word counts per label
n = table.sum().to_dict()

# Show both constants side by side, one row per label
pd.DataFrame({'words': n, 'probability': p}).head()


Unnamed: 0,words,probability
spam,27789,0.134063
ham,80100,0.865937


In [34]:
# TURN WORD COUNTS INTO PER-LABEL WORD PROBABILITIES

# Each column of counts is divided by that label's total word count.
p_table = pd.DataFrame({
    label: table[label] / n[label]
    for label in ('spam', 'ham')
})

p_table.head()


Unnamed: 0,spam,ham
go,0.001152,0.003159
until,0.000216,0.000287
jurong,3.6e-05,2.5e-05
point,3.6e-05,0.000175
crazy,0.000216,0.000137


In [35]:
# CLASSIFY A NEW MESSAGE

def is_spam(sms):
    """Classify a message as spam or ham using the trained word probabilities.

    The message is cleaned and split into words. Starting from the label
    priors in ``p``, the per-label probability of every word found in
    ``p_table`` is folded into a spam score and a ham score. Words that are
    not in ``p_table`` are skipped.

    The scores are accumulated as sums of logarithms rather than as a product
    of probabilities. A direct product of many small factors underflows to
    0.0 for long messages, at which point both scores compare equal and the
    message would always be labelled ham.

    Parameters
    ----------
    sms : str
        Raw message text.

    Returns
    -------
    bool
        True when the spam score is strictly greater than the ham score.

    Side effects
    ------------
    Prints both scores, converted back from log space.
    """
    words = clean(sms).split()

    # Start from the log of the class priors
    log_spam_given_message = math.log(p['spam'])
    log_ham_given_message = math.log(p['ham'])

    for word in words:
        # Membership on a Series tests its index, i.e. the known vocabulary
        if word not in p_table['spam']:
            continue
        log_spam_given_message += math.log(p_table['spam'][word])
        log_ham_given_message += math.log(p_table['ham'][word])

    # Converted back for display only; these may show 0.0 for very long
    # messages, but the decision below is taken in log space and is unaffected.
    print('P(spam|message):', math.exp(log_spam_given_message))
    print('P(ham|message):', math.exp(log_ham_given_message))
    return log_spam_given_message > log_ham_given_message


# Read the test message inside a context manager so the file handle is
# closed as soon as the text has been read, even if classification raises.
with open("TestData", "r") as file:
    result = is_spam(file.read())

print('Label:', 'spam' if result else 'ham')


P(spam|message): 1.642916025269595e-66
P(ham|message): 2.077687710984501e-89
Label: spam
