In [None]:
import math

import pandas as pd

### Loading the dataset

In [None]:
# Path to the e-mail CSV (judging by the prefix, a Google Drive mount inside Colab).
emails_csv_path = "/content/drive/MyDrive/machine_learning_from_scratch/naive_bayes_spam_classification/emails.csv"

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(emails_csv_path)
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# Preview the last rows as well, to compare them with the head of the frame.
df.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


### Preparing the dataset

1. Extract the message text from the `text` column by removing the `Subject: ` marker

In [None]:
# The previewed messages all start with a literal "Subject: " header.
# Anchor the pattern with ^ so that only this leading prefix is removed; an
# unanchored replace would also delete "Subject: " wherever it appears inside a
# message body. regex=True is passed explicitly because the default of `regex`
# differs between pandas versions.
df['text'] = df['text'].str.replace(r"^Subject: ", "", regex=True)
df.head()

Unnamed: 0,text,spam
0,naturally irresistible your corporate identity...,1
1,the stock trading gunslinger fanny is merrill...,1
2,unbelievable new homes made easy im wanting t...,1
3,4 color printing special request additional i...,1
4,"do not have money , get software cds from here...",1


2. Strip, lower and remove punctuation from each message

In [None]:
import string

# Trim surrounding whitespace and lower-case every message.
df['text'] = df['text'].str.strip().str.lower()

# Delete every ASCII punctuation character in one literal pass.
# A translation table never interprets characters such as "." or "*" as regular
# expressions, so the result does not depend on the pandas default for `regex`
# (and no FutureWarning is emitted), and the column is scanned once instead of
# once per punctuation character.
punctuation_table = str.maketrans("", "", string.punctuation)
df['text'] = df['text'].str.translate(punctuation_table)

df.head()

  df['text'] = df['text'].str.replace(char, "")


Unnamed: 0,text,spam
0,naturally irresistible your corporate identity...,1
1,the stock trading gunslinger fanny is merrill...,1
2,unbelievable new homes made easy im wanting t...,1
3,4 color printing special request additional i...,1
4,do not have money get software cds from here ...,1


3. Split each message into a list of words

In [None]:
# str.split() without arguments splits on runs of whitespace, so every row of
# the new `words` column holds the list of tokens of that message.
df['words'] = df['text'].str.split()
df.head()  # preview the frame with the new `words` column

Unnamed: 0,text,spam,words
0,naturally irresistible your corporate identity...,1,"[naturally, irresistible, your, corporate, ide..."
1,the stock trading gunslinger fanny is merrill...,1,"[the, stock, trading, gunslinger, fanny, is, m..."
2,unbelievable new homes made easy im wanting t...,1,"[unbelievable, new, homes, made, easy, im, wan..."
3,4 color printing special request additional i...,1,"[4, color, printing, special, request, additio..."
4,do not have money get software cds from here ...,1,"[do, not, have, money, get, software, cds, fro..."


### What are we trying to estimate?

* $p(spam \mid w_1 \land w_2 \land w_3 \land \ldots \land w_n) = \frac{p(spam) \cdot p(w_1 \mid spam) \cdot p(w_2 \mid spam) \cdots p(w_n \mid spam)}{p(spam) \cdot p(w_1 \mid spam) \cdot p(w_2 \mid spam) \cdots p(w_n \mid spam) + p(ham) \cdot p(w_1 \mid ham) \cdot p(w_2 \mid ham) \cdots p(w_n \mid ham)}$

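* $p(w_i \mid spam) = \frac{\# \ of \ occurrences \ of \ w_i \ in \ spam \ messages}{\# \ of \ occurrences \ of \ w_i \ in \ all \ messages}$

* $p(w_i \mid ham) = \frac{\# \ of \ occurrences \ of \ w_i \ in \ ham \ messages}{\# \ of \ occurrences \ of \ w_i \ in \ all \ messages}$

* Note: these ratios describe how the occurrences of $w_i$ are split between the two classes. They are the simplified estimate used in this notebook; the textbook Naive Bayes likelihood would instead divide by the total number of words in the class.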


### What fraction of all the messages are spam e-mails? (estimating p(`spam`) )

In [None]:
# normalize=True reports each label's share of the rows instead of raw counts;
# the share of label 1 is the estimate of p(spam).
df['spam'].value_counts(normalize = True)

0    0.761173
1    0.238827
Name: spam, dtype: float64

p(`spam`) = 0.238827

In [None]:
# Prior probability of spam: the share of rows labelled 1.
# It is computed from the labels rather than copied by hand from the cell
# output, so it keeps full precision and stays correct if the dataset changes.
p_spam = float((df['spam'] == 1).mean())

### Creating word count dictionary

In [None]:
# word_count maps every word to its number of occurrences per class:
#   word_count[word] == {'spam': <count in spam>, 'ham': <count in ham>}
word_count = dict()

labels = df['spam'].to_list()
words = df['words'].to_list()

# The loop variable has its own name so that it does not overwrite the `words`
# list while that list is being iterated.
for label, message_words in zip(labels, words):
  label_key = 'spam' if label == 1 else 'ham'
  for word in message_words:
    if word not in word_count:
      word_count[word] = {'spam': 1, 'ham': 1} # we add new word to both spam and ham to avoid potential division by zero
    # Count every occurrence, including the first one: the +1/+1 start above is
    # only smoothing and must not replace the observation that created the entry.
    word_count[word][label_key] += 1

### Building a classifier

In [None]:
def _log_count(count):
  """Return log(count), or -inf for a non-positive count (a word never seen in that class)."""
  return math.log(count) if count > 0 else float('-inf')


def spam_or_ham(message, p_spam = p_spam, word_count = word_count):
  """
  Classifies message as either spam or ham.

  The message is lower-cased, stripped of punctuation and split on whitespace.
  Words that are missing from `word_count` are ignored. The evidence is
  accumulated in log space, so long messages do not underflow to 0/0.

  Args:
    message (string): email being classified
    p_spam (float): probability of a message being a spam
    word_count (dict): dictionary of word counts, obtained from training data;
      maps word -> {'spam': count, 'ham': count}

  Returns:
    (p, decision): p (float): the probability of a message being spam, decision (string): either 'spam' if p >= 0.5 or 'ham' if p < 0.5

  Raises:
    ZeroDivisionError: if the known words rule out both classes, i.e. at least
      one has a non-positive spam count and at least one a non-positive ham count.
  """
  # Degenerate priors decide the outcome regardless of the message.
  if p_spam <= 0:
    return (0.0, 'ham')
  if p_spam >= 1:
    return (1.0, 'spam')
  p_ham = 1 - p_spam

  # str.replace returns a new string, so calling it without using the result
  # changes nothing; translate() deletes all punctuation in one pass.
  punctuation_table = str.maketrans("", "", string.punctuation)
  words_in_message = message.lower().strip().translate(punctuation_table).split()

  # Per-word evidence follows the notebook's formula
  #   count_in_class / (count_in_spam + count_in_ham).
  # The denominator is shared by both classes and cancels in the posterior, so
  # only the logs of the raw class counts have to be summed.
  log_spam_evidence = 0.0
  log_ham_evidence = 0.0
  for word in words_in_message:
    if word not in word_count:
      continue # if word is unknown - skip it
    log_spam_evidence += _log_count(word_count[word]['spam'])
    log_ham_evidence += _log_count(word_count[word]['ham'])

  if math.isinf(log_spam_evidence) and math.isinf(log_ham_evidence):
    raise ZeroDivisionError("word counts give zero probability to both spam and ham")

  # p = p_spam*S / (p_spam*S + p_ham*H), rewritten so that only the exponential
  # of a non-positive number is taken (the result is in [0, 1], never overflows).
  log_ratio = log_spam_evidence - log_ham_evidence
  if log_ratio >= 0:
    ham_to_spam = math.exp(-log_ratio)
    p = p_spam / (p_spam + p_ham * ham_to_spam)
  else:
    spam_to_ham = math.exp(log_ratio)
    p = p_spam * spam_to_ham / (p_spam * spam_to_ham + p_ham)

  return (p, 'spam' if p >= 0.5 else 'ham')

In [None]:
# Try the classifier on a few hand-written messages and print each (p, decision) pair.
sample_messages = (
  'congratulation, you won easy money in lottery',
  "jdhjqskdhsjdnks", # random word, should return p_spam
  "Hey mom, I'm coming on monday",
)
for sample_message in sample_messages:
  print(spam_or_ham(sample_message))

(0.8252316688832554, 'spam')
(0.238827, 'ham')
(7.300843735124657e-05, 'ham')
