## Import the necessary libraries

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('punkt')

#### Text preprocessing
* Decontraction (I'm => I am)
* Remove all punctuation, tags, etc.
* Convert string into an array of words
* Stemming or Lemmatization
* Remove stopwords (words that occur a lot such as [I, he, this, there, is, had, ...]

In [2]:
# Import stopwords and stemmer
stopwords_to_remove = stopwords.words('english')
stemmer = SnowballStemmer('english')

# Decontract abbreviations
def decontracted(phrase):
    # Special cases
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # General cases
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def preprocess_text(sentence):
    sentence =  decontracted(sentence.lower())# Expand words (I'm to I am) and change all letters to lowercase
    sentence =  re.sub(r'[^\w\s]','',sentence)# Remove all characters that are not english words
    sentence =  nltk.word_tokenize(sentence)# Convert string into an array of words
    sentence =  [word for word in sentence if word not in stopwords_to_remove]# Remove stopwords
    sentence =  [stemmer.stem(word) for word in sentence]# Stemming
    return sentence

preprocess_text("Hi, this is a nice day isn't it. Hope your are doing great!")

['hi', 'nice', 'day', 'hope', 'great']

## Creating the naive bayes classifier

In [3]:
class naive_bayes():
    # takes input the data as a numpy array that contains multiple arrays (same as the number of classes), each contains preprocessed text data belonging to a class. The class is indicated by the index of the array
    def __init__(self,data_X, data_y):
        self.data_X = data_X
        self.data_y = data_y
        self.num_classes = 2
        self.class_priors = np.zeros(self.num_classes) #[0,0] = [P(not spam),P(spam)]
        self.word_dict_likelihood = {}
        self.word_dict_total = {}
        
    def train(self):
        # Calculating class priors
        self.class_priors[1] = np.sum(self.data_y)/len(self.data_y)
        self.class_priors[0] = 1-self.class_priors[1]
        
        # Calculate word likelihoods
        for email,label in tqdm(zip(self.data_X,self.data_y)):
            preprocessed_email = preprocess_text(email)
            for word in preprocessed_email:
                if word not in self.word_dict_total:
                    self.word_dict_total[word] = 1
                else:
                    self.word_dict_total[word] += 1
            if label == 1:
                for word in preprocessed_email:
                    if word not in self.word_dict_likelihood:
                        self.word_dict_likelihood[word] = 1
                    else:
                        self.word_dict_likelihood[word] += 1
                    
        # Normalize likelihoods
        for word in self.word_dict_likelihood:
            self.word_dict_likelihood[word] /= self.word_dict_total[word]
    
    def predict(self,text_to_predict):
        # Preprocess given text
        preprocessed_email = preprocess_text(text_to_predict)
        
        # Multiply probabilities for all words and apply bayes rule
        P_spam = 1
        P_not_spam = 1
        for word in preprocessed_email:
            if word in self.word_dict_likelihood:
                P_b_a = self.word_dict_likelihood[word]
                P_b_not_a = 1 - P_b_a
                P_spam *= P_b_a
                P_not_spam *= P_b_not_a
        
        P_spam *= self.class_priors[1]
        P_not_spam *= self.class_priors[0]
        if P_spam > P_not_spam:
            return 1
        else:
            return 0

## Read, explore, and prepare our dataset

In [4]:
df = pd.read_csv("emails.csv")
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,text,spam
1961,Subject: telephone interview with the houston ...,0
5304,Subject: entouch newsletter business highligh...,0
4891,Subject: free two week ft - online trial - - ...,0
4919,Subject: re : fw : energy book promotion juli...,0
2182,Subject: re : fw : energy book promotion than...,0


In [5]:
training_data = df[:5300]
test_data = df[5300:]
training_x = training_data["text"]
training_y = training_data["spam"]
testing_x = test_data["text"]
testing_y = test_data["spam"]

## Create our model,train, and test it

In [6]:
naive_bayes_classifier = naive_bayes(training_x,training_y)
naive_bayes_classifier.train()

5300it [00:15, 344.70it/s]


In [7]:
from sklearn.metrics import accuracy_score

preds = []
for text in tqdm(testing_x):
    prediction = naive_bayes_classifier.predict(text)
    preds.append(prediction)
accuracy = accuracy_score(preds,testing_y)
accuracy

100%|███████████████████████████████████████████████████████████████████████████████| 428/428 [00:01<00:00, 350.03it/s]


0.8995327102803738

### Our naive bayes model achieved 89.9% 