In [84]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix  
import matplotlib.pyplot as plt
import regex as re

In [85]:
data = pd.read_csv("sms_spam.csv")
data.columns.values[0] = "Label"
data.columns.values[1] = "SMS"
data

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [86]:
data['SMS'].value_counts(normalize=True)

SMS
Sorry, I'll call later                                                                                                                                      0.005382
I cant pick the phone right now. Pls send a message                                                                                                         0.002153
Ok...                                                                                                                                                       0.001794
Ok                                                                                                                                                          0.000718
Ok.                                                                                                                                                         0.000718
                                                                                                                                                              ...   
I gott

In [87]:
data_randomized = data.sample(frac=1, random_state=1)
data_randomized

Unnamed: 0,Label,SMS
1447,ham,Looks like u wil b getting a headstart im leav...
2032,ham,"I noe la... U wana pei bf oso rite... K lor, o..."
4432,ham,2mro i am not coming to gym machan. Goodnight.
4888,spam,Todays Vodafone numbers ending with 4882 are s...
5276,ham,"Hi. Hope ur day * good! Back from walk, table ..."
...,...,...
905,ham,"We're all getting worried over here, derek and..."
5192,spam,Our records indicate u maybe entitled to 5000 ...
3980,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,spam,Text & meet someone sexy today. U can find a d...


In [88]:
'''
Here we can't use train_test_split as the data is not processed. There are white spaces, punctuation
marks and so on. We need a bag of words to work on.
'''
training_test_index = round(len(data_randomized)*0.8)
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)
training_set.shape, test_set.shape

((4459, 2), (1115, 2))

In [89]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,Looks like u wil b getting a headstart im leav...
1,ham,"I noe la... U wana pei bf oso rite... K lor, o..."
2,ham,2mro i am not coming to gym machan. Goodnight.
3,spam,Todays Vodafone numbers ending with 4882 are s...
4,ham,"Hi. Hope ur day * good! Back from walk, table ..."


In [90]:
training_set['SMS'] = training_set['SMS'].str.replace(r'\W', ' ', regex=True)# Removes punctuation
training_set['SMS'] = training_set['SMS'].str.lower()
training_set['SMS'][1]

'i noe la    u wana pei bf oso rite    k lor  other days den   '

In [91]:
training_set['SMS'] = training_set['SMS'].str.split()

In [92]:
vocabulary = set()
for text in training_set['SMS']:
   for word in text:
      vocabulary.add(word)

vocabulary = list(vocabulary)
len(vocabulary)
if "text" in vocabulary:
   print("yes")

yes


In [93]:
word_counts_per_text = {'secret': [2,1,1],
                       'prize': [2,0,1],
                       'claim': [1,0,1],
                       'now': [1,0,1],
                       'coming': [0,1,0],
                       'to': [0,1,0],
                       'my': [0,1,0],
                       'party': [0,1,0],
                       'winner': [0,0,1]
                      }
word_counts = pd.DataFrame(word_counts_per_text)
word_counts.head()

Unnamed: 0,secret,prize,claim,now,coming,to,my,party,winner
0,2,2,1,1,0,0,0,0,0
1,1,0,0,0,1,1,1,1,0
2,1,1,1,1,0,0,0,0,1


In [94]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
    
word_counts_per_sms.keys()



1. Remove punctuations
2. Create vocabulary
3. Do train test split
4. Feed to NB classifier
5. Share the accuracy

In [95]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,nimya,evng,3hrs,laundry,pouch,traffic,rum,deliveredtomorrow,family,practical,...,08708034412,realised,horny,couldn,mention,improved,mobypobox734ls27yf,successful,077xxx,hardcore
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,nimya,evng,3hrs,laundry,pouch,traffic,rum,deliveredtomorrow,...,08708034412,realised,horny,couldn,mention,improved,mobypobox734ls27yf,successful,077xxx,hardcore
0,ham,"[looks, like, u, wil, b, getting, a, headstart...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[i, noe, la, u, wana, pei, bf, oso, rite, k, l...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[2mro, i, am, not, coming, to, gym, machan, go...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,spam,"[todays, vodafone, numbers, ending, with, 4882...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[hi, hope, ur, day, good, back, from, walk, ta...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']
spam_messages.head()

Unnamed: 0,Label,SMS,nimya,evng,3hrs,laundry,pouch,traffic,rum,deliveredtomorrow,...,08708034412,realised,horny,couldn,mention,improved,mobypobox734ls27yf,successful,077xxx,hardcore
3,spam,"[todays, vodafone, numbers, ending, with, 4882...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,spam,"[we, tried, to, contact, you, re, our, offer, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,spam,"[freemsg, why, haven, t, you, replied, to, my,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,spam,"[loans, for, any, purpose, even, if, you, have...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,spam,"[would, you, like, to, see, my, xxx, pics, the...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)
p_spam, p_ham

(0.1347835837631756, 0.8652164162368244)

In [99]:
# N_Ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()
n_words_per_ham_message, n_ham

(0       34
 1       14
 2        9
 4       20
 5       15
         ..
 4453    24
 4454     6
 4455    17
 4456    10
 4458    27
 Name: SMS, Length: 3858, dtype: int64,
 57367)

In [100]:
# N_Spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()
n_words_per_spam_message, n_spam
print(n_spam)

15239


In [101]:
# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = np.int64(1)
n_vocabulary, alpha

(7802, 1)

In [102]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha)/ (alpha * n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham
parameters_spam

{'nimya': 0.0001281722635221738,
 'evng': 0.0001281722635221738,
 '3hrs': 0.0002563445270443476,
 'laundry': 0.0001281722635221738,
 'pouch': 0.0001281722635221738,
 'traffic': 0.0001281722635221738,
 'rum': 0.0001281722635221738,
 'deliveredtomorrow': 0.0005126890540886952,
 'family': 0.0001281722635221738,
 'practical': 0.0001281722635221738,
 'blackberry': 0.0001281722635221738,
 'sacked': 0.0001281722635221738,
 'one': 0.0008972058446552166,
 'boss': 0.0001281722635221738,
 'accidentally': 0.0001281722635221738,
 'y': 0.0001281722635221738,
 'clas': 0.0001281722635221738,
 'mahfuuz': 0.0001281722635221738,
 'raining': 0.0001281722635221738,
 'painful': 0.0001281722635221738,
 'waaaat': 0.0001281722635221738,
 'mornin': 0.0001281722635221738,
 '07973788240': 0.0002563445270443476,
 'curfew': 0.0001281722635221738,
 'showr': 0.0001281722635221738,
 'vivek': 0.0001281722635221738,
 'howz': 0.0001281722635221738,
 'quoting': 0.0005126890540886952,
 'ahmad': 0.0001281722635221738,
 'tom

In [103]:
import re

def classify(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham: 
         p_ham_given_message *= parameters_ham[word]

   print('P(Spam|message):', p_spam_given_message)
   print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      print('Label: Ham')
   elif p_ham_given_message < p_spam_given_message:
      print('Label: Spam')
   else:
      print('Equal proabilities, have a human classify this!')

In [104]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 2.2593967154472026e-21
P(Ham|message): 1.9256070327302398e-27
Label: Spam


In [105]:
def classify_test_set(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 'ham'
   elif p_spam_given_message > p_ham_given_message:
      return 'spam'
   else:
      return 'needs human classification'

In [106]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
print(test_set.head())
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['Label'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total * 100)

  Label                                                SMS predicted
0   ham                           Wherre's my boytoy ? :-(       ham
1   ham          Later i guess. I needa do mcat study too.       ham
2   ham             But i haf enuff space got like 4 mb...       ham
3  spam  Had your mobile 10 mths? Update to latest Oran...      spam
4   ham  All sounds good. Fingers . Makes it difficult ...       ham
Correct: 846
Incorrect: 269
Accuracy: 75.8744394618834
