### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('punkt')

### Text Preprocessing

In [2]:
stopwords_to_remove = stopwords.words('english')
stemmer = SnowballStemmer('english')

def decontracted(phrase):
    # Special cases
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # General cases
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def preprocess_text(sentence):
    sentence = decontracted(sentence.lower()) # Expand words (I'm to I am) and change all letters to lowercase
    sentence = re.sub(r'[^\w\s]','',sentence) # Remove all characters that are not english words
    sentence = nltk.word_tokenize(sentence) # Convert string into an array of words
    sentence = [stemmer.stem(word) for word in sentence] # Stemming
    sentence = [word for word in sentence if word not in stopwords_to_remove] # Remove stopwords
    return sentence

preprocess_text("Hi, this is a nice day isn't it. Hope your are doing great!")

['hi', 'nice', 'day', 'hope', 'great']

### Preparing the data

In [3]:
#Import the data and shuffle the rows
df = pd.read_csv("emails.csv")
df = df.reindex(np.random.permutation(df.index))

# Divide the dataset into training and testing, each divided into X and y
training_data = df.iloc[:5300]
testing_data = df.iloc[5300:]
training_X = training_data["text"]
training_y = training_data["spam"]
test_x = testing_data["text"]
test_y = testing_data["spam"]

### Creating and training our model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
count_vect = CountVectorizer()
X_new_counts = count_vect.fit_transform(df["text"])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_new_counts)
clf = MultinomialNB()
clf.fit(X_train_tfidf[:5300], training_y[:5300])
print("Training Complete")

Training Complete


### Testing our model

In [5]:
from sklearn.metrics import accuracy_score

preds = []
for text in tqdm(X_train_tfidf[5300:]):
    prob = clf.predict(text)
    label = prob[0]
    preds.append(label)
accuracy = accuracy_score(preds,test_y)
accuracy 

428it [00:00, 2804.88it/s]


0.8878504672897196

### Our Naive Bayes model achieved 88.9% accuracy on the email spam classification task