### Problem Statement

  - Implement Spam Detection and Spam Filtering

### Loading SMS Spam Dataset

In [42]:
import pandas as pd

sms_spam = pd.read_csv('SMSSpamCollection', sep='\t',
header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
sms_spam['Label'].value_counts(normalize=True)

Label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

### Data Preparation, Pre-Processing and Vocab Building

- Train, Test and Split

In [44]:
data_randomized = sms_spam.sample(frac=1, random_state=1)

training_test_index = round(len(data_randomized) * 0.8)

training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [45]:
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head(3)


Unnamed: 0,Label,SMS
0,ham,"yep, by the pretty sculpture"
1,ham,"yes, princess. are you going to make me moan?"
2,ham,welp apparently he retired


- Vocabulary Generation

In [46]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

- Bag of Words Representation

In [47]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
   for word in sms:
      word_counts_per_sms[word][index] += 1

word_counts = pd.DataFrame(word_counts_per_sms)

training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head(3)

Unnamed: 0,Label,SMS,ge:-)..,nitros.,repeating,08002988890,"movie,",lessons..,dumb?,gumby's,...,cried,plan.,"reason,",filthyguys.,meh...,disclose,renewal.,needed.salary,can't.,k...k:)why
0,ham,"[yep,, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes,, princess., are, you, going, to, make, m...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Naive Bayes Algorithm (Classification Approach)

Calculating:
- ```P(Spam)``` and ```P(Ham)```
- ```NSpam```, ```NHam```, ```NVocabulary```
    - NSpam is equal to the number of words in all the spam messages.
    - NHam is equal to the number of words in all the non-spam messages.

In [48]:
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

n_vocabulary = len(vocabulary)

alpha = 1

 - Calculating ```P(w|Spam)``` and ```P(w|Ham)```

In [49]:
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

for word in vocabulary:
   n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham

### Defining a function to accept a message and classify

In [61]:
import re

def classify(message):

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

  #  print('P(Spam|message):', p_spam_given_message)
  #  print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      # print('Label: Ham')
      return "ham"
   elif p_ham_given_message < p_spam_given_message:
      # print('Label: Spam')
      return "spam"
   else:
      # print('Equal proabilities, have a human classify this!')
      return "needs human classification"

- Test Case 1

In [57]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.1680023632078457e-26
P(Ham|message): 6.088544142463393e-28


'spam'

- Test Case 2

In [58]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.234299283967944e-26
P(Ham|message): 8.376346103813855e-22


'ham'

### Sample Filtering of Emails

In [59]:
emails = {
    1: "Hi there,\n\nHow are you doing today?\n\nJust wanted to check in and see if you had any questions about your recent order.\n\nThanks,\n\nThe Customer Service Team",
    2: "Hey,\n\nI'm writing to you today to let you know about a new product that we're launching next week.\n\nIt's a new type of coffee that we think you'll really enjoy.\n\nWe're offering a special discount to our existing customers, so be sure to check it out.\n\nThanks,\n\nThe Coffee Team",
    3: "Hi,\n\nI'm writing to you today to let you know about a new promotion that we're running.\n\nFor a limited time, you can get 20% off your next purchase.\n\nJust use the code SAVE20 at checkout.\n\nThanks,\n\nThe Marketing Team",
    4: "Hey,\n\nJust wanted to give you a quick update on your order.\n\nIt's currently being processed and should ship out within the next few days.\n\nYou'll receive an email notification once it's shipped.\n\nThanks for your patience.\n\nThe Shipping Team",
    5: "Hi,\n\nI'm writing to you today to let you know about a new feature that we've added to our website.\n\nYou can now track your order status online.\n\nJust go to our website and click on the 'Track Your Order' link.\n\nYou'll need to enter your order number and email address.\n\nThanks,\n\nThe Web Team",
    6: "You've been selected to receive a FREE iPhone!\n\nJust click on the link below to claim your prize.\n\n[link]",
    7: "Congratulations! You've won a $100 Amazon gift card!\n\nJust click on the link below to claim your prize.\n\n[link]",
    8: "Your computer is infected with a virus!\n\nClick on the link below to download a free antivirus software.\n\n[link]",
    9: "You're pre-approved for a loan!\n\nClick on the link below to get started.\n\n[link]",
    10: "Act now! This offer expires soon!\n\nClick on the link below to take advantage of this amazing offer.\n\n[link]"
}


In [62]:
print("Genuine or Non-Spam Emails:")
for key, value in emails.items():
   if classify(value) == "ham":
    print(key,end="\t")

Genuine or Non-Spam Emails:
1	2	3	4	5	

<hr>