# Filter SMS spam with Naive Bayes

# Import dataset

In [118]:
import pandas as pd

sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Randomize dataset

In [119]:
# randomly mix the entire dataset, use random_state to ensure reproducibility of results
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

# Divide dataset on train and test dataset

In [120]:
# Randomize the dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Training/Test split
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


# Count the percent of spam in train, test and full datasets

As we can see, spam percents in train, test and full datasets are almost the same.

# Clean the dataset

In [121]:
# After cleaning
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


# Make a vocabulary from train set

In [122]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))

# Make a dictionary with word frequency for each SMS in dataset

In [123]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

# Transform dictionary to DataFrame

In [124]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,m227xy,groovying,dental,rounds,06,la3,2geva,piece,80488,managed,...,percent,sufficient,invnted,care,lemme,series,salary,agalla,mileage,lasting
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,m227xy,groovying,dental,rounds,06,la3,2geva,piece,...,percent,sufficient,invnted,care,lemme,series,salary,agalla,mileage,lasting
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Count probabilities for Spam and Ham 

In [126]:
ham_len = training_set_clean.loc[training_set_clean['Label']=='ham'].shape[0]
P_ham = ham_len/training_set_clean.shape[0]
P_spam = 1 - P_ham
print(P_ham)
print(P_spam)

0.8654104979811574
0.13458950201884257


# Count number of words in Ham and Spam categories and Vocabulary

In [150]:
ham_set = training_set_clean.loc[training_set_clean['Label']=='ham'].copy()
spam_set = training_set_clean.loc[training_set_clean['Label']=='spam'].copy()

N_ham = ham_set['SMS'].apply(len).sum()
N_spam = spam_set['SMS'].apply(len).sum()
N_voc = len(vocabulary)
print(N_ham)
print(N_spam)
print(N_voc)

57237
15190
7783


# Count frequencies for each word in Ham and Spam categories

# Write function for Naive Bayes

In [151]:
# make a dictionary for word classification

parameters_ham = {unique_word:0 for unique_word in vocabulary}
parameters_spam = {unique_word:0 for unique_word in vocabulary}

for unique_word in vocabulary:
    P_word_ham = (ham_set[unique_word].sum() + 1)/(N_ham + N_voc)
    parameters_ham[unique_word] = P_word_ham
    P_word_spam = (spam_set[unique_word].sum() + 1)/(N_spam + N_voc)
    parameters_spam[unique_word] = P_word_spam

In [152]:
import re

def naive_bayes(sms):
    sms = re.sub('\W', ' ', sms)
    sms = sms.lower().split()
    
    P_sms_ham = P_ham
    P_sms_spam = P_spam
    
    for word in sms:
        if word in parameters_ham:
            P_sms_ham *= parameters_ham[word]
        if word in parameters_spam:
            P_sms_spam *= parameters_spam[word]
    
    if P_word_ham > P_word_spam:
        return 'ham'
    if P_word_spam > P_word_ham:
        return 'spam'
    return 'unknown'

In [153]:
naive_bayes('WINNER!! This is the secret code to unlock the money: C3421.')

'spam'

In [154]:
test_set['Filter'] = test_set['SMS'].apply(naive_bayes)
filter_precise = test_set.loc[test_set['Label'] == test_set['Filter']].shape[0]/test_set.shape[0]
filter_precise

0.1319569120287253