# SMS Spam Filter Using Naive Bayes

This notebook loads a dataset of labelled SMS messages, learns from those labelled examples how often each word appears in spam and in non-spam ("ham") messages, and uses that to classify new messages as spam or ham.

In [1]:
import math

import pandas as pd

# Load the SMS corpus. The file is tab-delimited and has no header row,
# so the two column names are supplied explicitly.
sms_data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

In [3]:
# preview the first five messages together with their labels
sms_data.head(n=5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (number of rows, number of columns) of the loaded data frame;
# the two columns are the 'Label' and 'SMS' names given when the file was read
sms_data.shape

(5572, 2)

In [18]:
# Class balance: which share of all messages is ham and which share is spam.
total_messages = len(sms_data)

# Every label that is not exactly "ham" is tallied as spam.
ham = sum(1 for label in sms_data['Label'] if label == "ham")
spam = total_messages - ham

proportion_ham = ham / total_messages
proportion_spam = spam / total_messages

print('Ham: {0:.2f}%'.format(proportion_ham * 100))
print('Spam: {0:.2f}%'.format(proportion_spam * 100))

Ham: 86.59%
Spam: 13.41%


In [22]:
# count how many messages carry each label (shown as the cell output)
sms_data['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

## Divide Data into Training Set and Test Set

Below, the data set is shuffled and then split: the first 80% of the shuffled rows form the training set, and the remaining 20% form the test set.

In [23]:
# Shuffle all rows into a new data frame (frac=1 keeps every row);
# the fixed random_state makes the shuffle repeatable.
random_sms = sms_data.sample(random_state=1, frac=1)

# preview the first five shuffled rows
random_sms.head(n=5)

Unnamed: 0,Label,SMS
1078,ham,"Yep, by the pretty sculpture"
4028,ham,"Yes, princess. Are you going to make me moan?"
958,ham,Welp apparently he retired
4642,ham,Havent.
4674,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [41]:
# Position of the cut: the first 80% of the shuffled rows go to training.
train_len_index = round(0.8 * len(random_sms))

# Slice by position and renumber each part from 0.
sms_train = random_sms.iloc[:train_len_index].reset_index(drop=True)
sms_test = random_sms.iloc[train_len_index:].reset_index(drop=True)

# show the size of both parts, one per line
print(sms_train.shape, sms_test.shape, sep='\n')

(4458, 2)
(1114, 2)


In [42]:
# Share of ham and spam in each split, to check that both splits have a
# class balance similar to the full data set.
#
# Counts are looked up by label rather than by position: value_counts() is
# ordered by frequency, so [0] / [1] only mean ham / spam while ham is the
# majority class, and integer keys on a label-indexed Series rely on a
# deprecated positional fallback. .get(label, 0) also covers a split that
# happens to contain no message of one class.
train_label_counts = sms_train['Label'].value_counts()
test_label_counts = sms_test['Label'].value_counts()

percent_ham_train = train_label_counts.get('ham', 0) / len(sms_train) * 100
percent_spam_train = train_label_counts.get('spam', 0) / len(sms_train) * 100
percent_ham_test = test_label_counts.get('ham', 0) / len(sms_test) * 100
percent_spam_test = test_label_counts.get('spam', 0) / len(sms_test) * 100

print('Ham in training set: {0:.2f}%'.format(percent_ham_train))
print('Spam in training set: {0:.2f}%'.format(percent_spam_train))
print('Ham in test set: {0:.2f}%'.format(percent_ham_test))
print('Spam in test set: {0:.2f}%'.format(percent_spam_test))

Ham in training set: 86.54%
Spam in training set: 13.46%
Ham in test set: 86.80%
Spam in test set: 13.20%


In [43]:
# Clean the training and test sets: replace every non-word character
# (punctuation etc.) with a space, then lower-case the text.
#
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the punctuation
# untouched. The raw string keeps '\W' from being parsed as an (invalid)
# string escape sequence.
sms_train['SMS'] = sms_train['SMS'].str.replace(r'\W', ' ', regex=True).str.lower()
sms_test['SMS'] = sms_test['SMS'].str.replace(r'\W', ' ', regex=True).str.lower()

In [44]:
# inspect the first five cleaned training messages
sms_train.head(n=5)

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [45]:
# inspect the first five cleaned test messages
sms_test.head(n=5)

Unnamed: 0,Label,SMS
0,ham,later i guess i needa do mcat study too
1,ham,but i haf enuff space got like 4 mb
2,spam,had your mobile 10 mths update to latest oran...
3,ham,all sounds good fingers makes it difficult ...
4,ham,all done all handed in don t know if mega sh...


In [46]:
# Tokenise the training messages: each cleaned string becomes a list of words.
# This overwrites the 'SMS' column of sms_train in place (strings -> lists).
sms_train['SMS'] = sms_train['SMS'].str.split()

# Vocabulary = every distinct word that occurs in the training messages.
# The set removes duplicates; sorted() turns it back into a list with a
# stable order. Iterating a set of strings can give a different order in
# every kernel session (string hash randomisation), which would reshuffle
# every structure built from the vocabulary afterwards.
vocabulary = sorted({word for msg in sms_train['SMS'] for word in msg})

In [49]:
# Build a word-count table: one list per vocabulary word, holding one counter
# per training message (all counters start at zero).
message_total = len(sms_train['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * message_total

# Walk through the tokenised messages and bump the counter of each word
# at the position of the message it occurs in.
for row_number, tokens in enumerate(sms_train['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row_number] += 1

# one column per word, one row per message
word_counts_df = pd.DataFrame(word_counts_per_sms)

# put the labels/messages and the word counts side by side
train_word_count_combo = pd.concat([sms_train, word_counts_df], axis=1)

# preview the combined frame
train_word_count_combo.head()

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [50]:
# size of the vocabulary, i.e. the number of entries in the vocabulary list
print(len(vocabulary))

7783


## Perform Calculations for Algorithm

The code below will define the necessary variables, and then use the Naive Bayes algorithm to calculate probability of spam vs. not spam, and ultimately classify messages from the test set.

In [62]:
# Prior probabilities P(Spam) and P(Ham): the share of each label among the
# training messages.
#
# The counts are looked up by label, not by position: value_counts() is
# ordered by frequency, so [1] / [0] only mean spam / ham while ham is the
# majority class, and integer keys on a label-indexed Series rely on a
# deprecated positional fallback. .get(label, 0) yields a prior of 0 when a
# class is absent from the training data.
training_label_counts = train_word_count_combo['Label'].value_counts()
p_spam = training_label_counts.get('spam', 0) / len(train_word_count_combo)
p_ham = training_label_counts.get('ham', 0) / len(train_word_count_combo)

print(p_spam)
print(p_ham)

0.13458950201884254
0.8654104979811574


In [58]:
# Split the training frame by label.
spam_temp = train_word_count_combo[train_word_count_combo['Label'] == 'spam']
ham_temp = train_word_count_combo[train_word_count_combo['Label'] == 'ham']

# Each 'SMS' entry is a list of words, so summing the list lengths gives the
# total number of words in the spam and in the ham messages.
n_spam = sum(len(words) for words in spam_temp['SMS'])
n_ham = sum(len(words) for words in ham_temp['SMS'])

# number of distinct words
n_vocabulary = len(vocabulary)

# smoothing constant added to every word count
alpha = 1

# show the three totals
print('Number of words from spam messages: ', n_spam)
print('Number of words from ham messages: ', n_ham)
print('Number of words in vocabulary: ', n_vocabulary)

Number of words from spam messages:  15190
Number of words from ham messages:  57237
Number of words in vocabulary:  7783


In [64]:
# Smoothed conditional probabilities for every vocabulary word:
#   P(word | Spam) = (count of word in spam + alpha) / (n_spam + alpha * n_vocabulary)
#   P(word | Ham)  = (count of word in ham  + alpha) / (n_ham  + alpha * n_vocabulary)
# The denominators do not depend on the word, so they are computed once.
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

spam_parameters = {
    word: (spam_temp[word].sum() + alpha) / spam_denominator
    for word in vocabulary
}
ham_parameters = {
    word: (ham_temp[word].sum() + alpha) / ham_denominator
    for word in vocabulary
}

## Create the spam filter

The code below will classify whether a new message is spam or not spam.

In [65]:
import re

# define a function to classify each new message
def classify_message(msg):
    """Print the Naive Bayes scores and the resulting label for one message.

    The message is cleaned like the training data: non-word characters are
    replaced by spaces, the text is lower-cased and split on whitespace.
    Words without an entry in the parameter dictionaries are skipped.

    Reads the module-level ``p_spam``, ``p_ham``, ``spam_parameters`` and
    ``ham_parameters``.

    Args:
        msg: Raw message text.

    Returns:
        None. The two scores and the label are printed.
    """
    def _log(probability):
        # log(0) is undefined; treat a zero probability as -infinity
        return math.log(probability) if probability > 0 else float('-inf')

    # raw string: '\W' in a normal string literal is an invalid escape sequence
    words = re.sub(r'\W', ' ', msg).lower().split()

    # Products that are printed (prior times the parameter of every known word).
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    # The same scores as sums of logarithms. The label is decided on these:
    # the plain products underflow to 0.0 for long messages, which would make
    # both classes look equally likely.
    log_spam_score = _log(p_spam)
    log_ham_score = _log(p_ham)

    for word in words:
        if word in spam_parameters:
            p_spam_given_message *= spam_parameters[word]
            log_spam_score += _log(spam_parameters[word])
        if word in ham_parameters:
            p_ham_given_message *= ham_parameters[word]
            log_ham_score += _log(ham_parameters[word])

    # print the calculations
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    # provide the label
    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [66]:
# try the classifier on two sample messages
for sample_message in (
    'WINNER!! This is the secret code to unlock the money: C3421.',
    'Sounds good, Tom, then see u there',
):
    classify_message(sample_message)

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam
P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


## Test the classifier function on the test data set

In [72]:
# define a function to classify the test set, that returns results instead of printing them
def classify_test_set(message):
    """Return the Naive Bayes label for one message.

    The message is cleaned like the training data: non-word characters are
    replaced by spaces, the text is lower-cased and split on whitespace.
    Words without an entry in the parameter dictionaries are skipped.

    Reads the module-level ``p_spam``, ``p_ham``, ``spam_parameters`` and
    ``ham_parameters``.

    Args:
        message: Raw message text.

    Returns:
        'ham' or 'spam', whichever class has the higher score, or
        'needs human classification' when both scores are equal.
    """
    def _log(probability):
        # log(0) is undefined; treat a zero probability as -infinity
        return math.log(probability) if probability > 0 else float('-inf')

    # raw string: '\W' in a normal string literal is an invalid escape sequence
    words = re.sub(r'\W', ' ', message).lower().split()

    # Scores are accumulated as sums of logarithms instead of products of
    # probabilities: the product underflows to 0.0 for long messages, which
    # would make both classes compare equal. The logarithm is monotonic, so
    # the comparison below gives the same answer as comparing the products.
    log_spam_score = _log(p_spam)
    log_ham_score = _log(p_ham)

    for word in words:
        if word in spam_parameters:
            log_spam_score += _log(spam_parameters[word])

        if word in ham_parameters:
            log_ham_score += _log(ham_parameters[word])

    if log_ham_score > log_spam_score:
        return 'ham'
    elif log_ham_score < log_spam_score:
        return 'spam'
    else:
        return 'needs human classification'

In [73]:
# Take the test rows straight from the shuffled data (uncleaned text),
# renumber them from 0 and add a 'prediction' column holding the label the
# classifier returns for every message.
test_set = (
    random_sms.iloc[train_len_index:]
    .reset_index(drop=True)
    .assign(prediction=lambda frame: frame['SMS'].apply(classify_test_set))
)

# preview the first ten predictions
test_set.head(n=10)

Unnamed: 0,Label,SMS,prediction
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham
5,ham,But my family not responding for anything. Now...,ham
6,ham,U too...,ham
7,ham,Boo what time u get out? U were supposed to ta...,ham
8,ham,Genius what's up. How your brother. Pls send h...,ham
9,ham,I liked the new mobile,ham


## Model Accuracy

The code below measures how accurate the predictions for the test set are by comparing each prediction with the true label from the original SMS data set.

In [81]:
# Accuracy = percentage of test messages whose prediction equals the true label.
total = len(test_set)

# compare the true label and the prediction row by row
correct = sum(
    1
    for actual, predicted in zip(test_set['Label'], test_set['prediction'])
    if actual == predicted
)

accuracy = (correct / total) * 100

print('The model predicted with accuracy percentage of ', accuracy)

The model predicted with accuracy percentage of  98.74326750448833


# Conclusion

The Naive Bayes model classified the messages in the test set with almost 99% accuracy (98.74%). That is well above the roughly 87% that always predicting "ham" would achieve on this test set, so the model can be considered reliable for telling spam from non-spam SMS messages.