# Classification of messages as spam or not spam using Naive Bayes algorithm 

In [23]:
import pandas as pd

In [24]:
# Import Dataset
df = pd.read_table('SMS', sep='\t', header=None,names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [25]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['label'] = df.label.map({'ham':0,'spam':1})

In [26]:
# Randomize the dataset
df = df.sample(frac=1, random_state=1)
df

Unnamed: 0,label,sms_message
1078,0,"Yep, by the pretty sculpture"
4028,0,"Yes, princess. Are you going to make me moan?"
958,0,Welp apparently he retired
4642,0,Havent.
4674,0,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
905,0,"We're all getting worried over here, derek and..."
5192,0,Oh oh... Den muz change plan liao... Go back h...
3980,0,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,1,Text & meet someone sexy today. U can find a d...


In [27]:
# Split into training and test sets
training_test_index = round(len(df) * 0.8)

training = df[:training_test_index].reset_index(drop=True)
test = df[training_test_index:].reset_index(drop=True)

print(training.shape)
print(test.shape)

(4458, 2)
(1114, 2)


In [28]:
#  data cleaning 
training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
training['sms_message'] = training['sms_message'].str.lower() ### making all the words lowercase

test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation
test['sms_message'] = test['sms_message'].str.lower() ### making all the words lowercase
training

  training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
  test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation


Unnamed: 0,label,sms_message
0,0,yep by the pretty sculpture
1,0,yes princess are you going to make me moan
2,0,welp apparently he retired
3,0,havent
4,0,i forgot 2 ask ü all smth there s a card on ...
...,...,...
4453,0,sorry i ll call later in meeting any thing re...
4454,0,babe i fucking love you too you know fuck...
4455,1,u ve been selected to stay in 1 of 250 top bri...
4456,0,hello my boytoy geeee i miss you already a...


In [29]:
### creating vocabulary from training data
training['sms_message'] = training['sms_message'].str.split()
vocabulary = []

# Step 2
n_ham = 0
n_spam = 0

# Step 3
n_w_ham = {}
n_w_spam = {}

# Commandeering this nested for loop to do the things we want it to do :)
i = 0
for sms in training['sms_message']:
    for word in sms:
        vocabulary.append(word)
        
        # Check if given sms is ham
        if training['label'][i] == 0:
            # Calculating the number of words in a spam message
            n_ham += 1
            
            # Check if the current word is in the n_w_ham dictionary
            # If so, increment its value
            if word in n_w_ham.keys():
                n_w_ham[word] += 1
            # Otherwise, append it to the dictionary with a value of 1
            else:
                n_w_ham[word] = 1
        # Otherwise, given sms is spam
        else:
            # Calculating the number of words in a spam message
            n_spam += 1
            
            # Check if the current word is in the n_w_spam dictionary
            # If so, increment its value
            if word in n_w_spam.keys():
                n_w_spam[word] += 1
            # Otherwise, append it to the dictionary with a value of 1
            else:
                n_w_spam[word] = 1
    i += 1

vocabulary = list(set(vocabulary))  ### only count the number of unique words

In [30]:
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary}

for index, sms in enumerate(training['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,phrase,6,09066362206,noooooooo,das,onluy,top,acl03530150pm,randomly,aids,...,tmw,lotto,james,giv,sankranti,kolathupalayam,freefone,69696,wanted,restaurant
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
training_new = pd.concat([training, word_counts], axis=1)

training_new

Unnamed: 0,label,sms_message,phrase,6,09066362206,noooooooo,das,onluy,top,acl03530150pm,...,tmw,lotto,james,giv,sankranti,kolathupalayam,freefone,69696,wanted,restaurant
0,0,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,"[sorry, i, ll, call, later, in, meeting, any, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,"[babe, i, fucking, love, you, too, you, know, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,1,"[u, ve, been, selected, to, stay, in, 1, of, 2...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Laplace smoothing
alpha = 1

# You will start from here.

In [33]:
# Hints:
# Step 1: caculate P(Spam) and P(Ham)
spam_count = 0
ham_count = 0
label_col = df['label']
sms_col = df['sms_message']
spam_word_dict = {}
ham_word_dict = {}

import re 
#label_col.shape[0]
for i in range(100):
    # ham row
    sms_message = sms_col[i]
    sms_message = re.sub(r'[^\w\s]', '', sms_message)
    sms_message = sms_message.lower()
    
    if label_col[i] == 0:
        ham_count += 1
        for sms in sms_message.split():
            if sms not in ham_word_dict:
                ham_word_dict[sms] = 1
            else :
                ham_word_dict[sms] += 1
                
    else: #spam row 
        spam_count +=1
        
        for sms in sms_message.split():
            if sms not in spam_word_dict:
                spam_word_dict[sms] = 1
            else :
                spam_word_dict[sms] += 1

p_spam = spam_count / len(label_col)
p_ham = ham_count / len(label_col)


# yNums = []
# for i in range(y.shape[0]):
#     if y[i] == "sdfd":
#        yNums.append(1)
#     else:
#        yNums.append(0)

# print(ham_word_dict)

# Step 2: count N_Spam, N_Ham, N_Vocabulary
# number of total spam words, number of total ham words, number of UNIQUE words in whole data set len(vocabulary) already calculated
n_vocabulary = len(vocabulary)
# COMPLETED ABOVE

# Step 3: count the number of times the word w occurs in spam/ham message: N_w_spam
# COMPLETED ABOVE

# Step 4: p(w|spam)=(N_w_spam+alpha)/(N_Spam+alpha*N_Vocabulary)
#         p(w|Ham)=(N_w_ham+alpha)/(N_Ham+alpha*N_Vocabulary)

p_w_given_spam = {}
p_w_given_ham = {}

# Calculates probability for each unique word in vocabulary
for word in vocabulary:
    if word in n_w_spam:
        p_w_given_spam[word] = (n_w_spam.get(word)+alpha)/(n_spam+alpha*n_vocabulary)
    else:
        p_w_given_spam[word] = 0
    
    if word in n_w_ham:
        p_w_given_ham[word] = (n_w_ham.get(word)+alpha)/(n_ham+alpha*n_vocabulary)
    else:
        p_w_given_ham[word] = 0

p_w_given_spam

{'phrase': 0,
 '6': 0.0003482348844295477,
 '09066362206': 8.705872110738693e-05,
 'noooooooo': 0,
 'das': 0,
 'onluy': 0,
 'top': 0.0006094110477517085,
 'acl03530150pm': 8.705872110738693e-05,
 'randomly': 0.0001305880816610804,
 'aids': 0,
 'olage': 0,
 'outsomewhere': 0,
 'thinl': 0,
 'tait': 0,
 'paracetamol': 0,
 'jerry': 0,
 'cm': 8.705872110738693e-05,
 'specs': 0,
 'er': 0,
 'thkin': 0,
 'contented': 0,
 'praises': 0,
 'ws': 0,
 '6days': 0.0001305880816610804,
 'habit': 0,
 'yuou': 0,
 'magic': 0,
 'pop': 0,
 'shld': 0,
 'realized': 0,
 'brandy': 0,
 'hits': 0,
 'sacked': 0,
 'wiv': 8.705872110738693e-05,
 'disconnect': 0,
 'prizeswith': 8.705872110738693e-05,
 'bffs': 0,
 'mystery': 8.705872110738693e-05,
 'cam': 8.705872110738693e-05,
 'meaning': 0,
 'causing': 0,
 'pen': 0,
 'different': 0,
 'alaikkum': 0,
 'profile': 0,
 'phne': 0,
 'sozi': 0,
 'tactless': 0,
 '0906346330': 8.705872110738693e-05,
 'geeee': 0,
 '89555': 8.705872110738693e-05,
 'bag': 0,
 'age': 0.0003047055

# Calculate accuracy, precision, recall and F1_score. 

In [34]:
# Model Evaluation 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# print('Accuracy score: {}'.format(accuracy_score(test['label'], test['predicted'])))
# print('Precision score: {}'.format(precision_score(test['label'], test['predicted'])))
# print('Recall score: {}'.format(recall_score(test['label'], test['predicted'])))
# print('F1 score: {}'.format(f1_score(test['label'], test['predicted'])))
