In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("SMSSpamCollection.csv", delimiter="\t", header=None, names=['Label', 'SMS'])

In [3]:
data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [4]:
data.tail()

Unnamed: 0,Label,SMS
5567,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other suggestions?"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free
5571,ham,Rofl. Its true to its name


In [5]:
data['Label'].value_counts(normalize=True)*100

NumExpr defaulting to 8 threads.


ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [6]:
new_data = data.sample(frac=1, random_state=1)

In [7]:
new_data.head()

Unnamed: 0,Label,SMS
1078,ham,"Yep, by the pretty sculpture"
4028,ham,"Yes, princess. Are you going to make me moan?"
958,ham,Welp apparently he retired
4642,ham,Havent.
4674,ham,I forgot 2 ask ü all smth.. There's a card on da present lei... How? Ü all want 2 write smth or sign on it?


In [8]:
training_set = new_data[0:4458].reset_index(drop=True)

In [9]:
test_set = new_data[4458:].reset_index(drop=True)

In [10]:
training_set.shape

(4458, 2)

In [11]:
test_set.shape

(1114, 2)

In [12]:
# training set percentage spam
training_set['Label'].value_counts(normalize=True)*100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [13]:
# test set percentage spam
test_set['Label'].value_counts(normalize=True)*100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

In [14]:
training_set["SMS"] = training_set["SMS"].str.replace('\W', ' ', regex=True).str.lower()

In [15]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))

In [16]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [17]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,kl341,oveable,research,wedlunch,nordstrom,wannatell,grace,shivratri,reduce,bath,...,miles,gettin,65,excuses,slightly,goodmorning,link,prefer,basket,laughed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,kl341,oveable,research,wedlunch,nordstrom,wannatell,grace,shivratri,...,miles,gettin,65,excuses,slightly,goodmorning,link,prefer,basket,laughed
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me, moan]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a, card, on, da, present, lei, how, ü, all, want, 2, write, smth, or, sign, on, it]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
training_set_clean["Label"].value_counts()

ham     3858
spam     600
Name: Label, dtype: int64

In [20]:
training_set_clean.shape

(4458, 7785)

In [21]:
prob_spam = 600/4458
prob_ham = 3858/4458
n_vocabulary = len(vocabulary)
n_spam = training_set_clean.loc[training_set_clean.Label=="spam"].iloc[:,2:].sum().sum()
n_ham = training_set_clean.loc[training_set_clean.Label=="ham"].iloc[:,2:].sum().sum()
alpha=1

In [22]:
training_spam = training_set_clean[training_set_clean["Label"] == "spam"]
training_ham = training_set_clean[training_set_clean["Label"] == "ham"]

In [23]:
spam_prob_dict = {i : 0 for i in vocabulary}
ham_prob_dict = {i : 0 for i in vocabulary}

In [24]:
for word in spam_prob_dict:
    value = (training_spam[word].sum()+alpha)/(n_spam+alpha*n_vocabulary)
    spam_prob_dict[word] = value
for word in ham_prob_dict:
    value = (training_ham[word].sum()+alpha)/(n_ham+alpha*n_vocabulary)
    ham_prob_dict[word] = value

In [25]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = prob_spam 
    p_ham_given_message = prob_ham
    
    for word in message:
        if word in spam_prob_dict:
            p_spam_given_message *= spam_prob_dict[word]
        if word in ham_prob_dict:
            p_ham_given_message *= ham_prob_dict[word]    

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [26]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [27]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


In [28]:
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = prob_spam 
    p_ham_given_message = prob_ham
    
    for word in message:
        if word in spam_prob_dict:
            p_spam_given_message *= spam_prob_dict[word]
        if word in ham_prob_dict:
            p_ham_given_message *= ham_prob_dict[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [29]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head(15)

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Orange camera/video phones for FREE. Save £s with Free texts/weekend calls. Text YES for a callback orno to opt out,spam
3,ham,All sounds good. Fingers . Makes it difficult to type,ham
4,ham,"All done, all handed in. Don't know if mega shop in asda counts as celebration but thats what i'm doing!",ham
5,ham,But my family not responding for anything. Now am in room not went to home for diwali but no one called me and why not coming. It makes me feel like died.,ham
6,ham,U too...,ham
7,ham,Boo what time u get out? U were supposed to take me shopping today. :(,ham
8,ham,Genius what's up. How your brother. Pls send his number to my skype.,ham
9,ham,I liked the new mobile,ham


In [33]:
total = test_set["Label"].count()
test_set['correct'] = test_set.apply(lambda x: 1 if x["Label"] == x["predicted"] else 0, axis=1)
correct = test_set['correct'].sum()
print("Accuracy of our test was", correct/total)

Accuracy of our test was 0.9874326750448833


In [34]:
test_set["correct"].value_counts()

1    1100
0      14
Name: correct, dtype: int64