# Final Project: Email Spam Detection
## By Brandon Stanyer

## Preprocessing: Read & Tokenize Data
initially, we will read in the corpus of emails provided from the Enron public email corpus. Emails are currently is a raw format, with subjects, ccs, dates and all. The emails are in two folders: Spam & Ham, where spam represents malicious or fake emails and ham represents authentic emails. For this analysis, we import 1000 of each type, but that can be modified.

In [758]:
# open python and nltk packages needed for processing
import os
import sys
import random
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
import math
from nltk.corpus import sentence_polarity
import pandas as pd

In [759]:
# define a path to the corpus
dirPath = '/Users/bstanyer/Desktop/IST 664 - Natural Language Processing/Final Project/FinalProjectData/EmailSpamCorpora/corpus'
# choose a number of emails to import
limitStr = 1000

In [760]:
# start lists for spam and ham email texts
hamtexts = []
spamtexts = []

# function to read spam and ham files, train and test a classifier 
def processspamham(dirPath,limitStr):
    # convert the limit argument from a string to an int
    limit = int(limitStr)
    
    os.chdir(dirPath)
    # process all files in directory that end in .txt up to the limit
    #    assuming that the emails are sufficiently randomized
    for file in os.listdir("./spam"):
        if (file.endswith(".txt")) and (len(spamtexts) < limit):
            # open file for reading and read entire file into a string
            f = open("./spam/"+file, 'r', encoding="latin-1")
            spamtexts.append (f.read())
            f.close()
    for file in os.listdir("./ham"):
        if (file.endswith(".txt")) and (len(hamtexts) < limit):
            # open file for reading and read entire file into a string
            f = open("./ham/"+file, 'r', encoding="latin-1")
            hamtexts.append (f.read())
            f.close()

In [761]:
processspamham(dirPath, limitStr)

In [762]:
# print number of emails read
print ("Number of spam files:",len(spamtexts))
print ("Number of ham files:",len(hamtexts))

Number of spam files: 1000
Number of ham files: 1000


In [763]:
# create list of mixed spam and ham email documents as (list of words, label)
emaildocs = []

# add all the spam
for spam in spamtexts:
    tokens = nltk.word_tokenize(spam)
    emaildocs.append((tokens, 'spam'))

# add all the regular emails
for ham in hamtexts:
    tokens = nltk.word_tokenize(ham)
    emaildocs.append((tokens, 'ham'))
    
len(emaildocs)

2000

In [764]:
# randomize the list
random.seed(9)
random.shuffle(emaildocs)

In [765]:
# print a few token lists
for email in emaildocs[:2]:
    print (email)

(['Subject', ':', 'a', 'computer', 'and', 'internet', 'connection', 'for', 'you', 'and', 'your', 'family', 'as', 'you', 'know', ',', 'technology', 'is', 'critical', 'to', 'enron', ';', 'it', 'drives', 'our', 'success', 'and', 'will', 'continue', 'to', 'do', 'so', 'in', 'the', 'future', '.', 'technology', 'has', 'helped', 'enron', 'create', 'new', 'businesses', 'like', 'enron', 'broadband', 'services', 'and', 'enron', 'net', 'works', ',', 'and', 'it', 'is', 'responsible', 'for', 'applications', 'such', 'as', 'enrononline', 'and', 'enroncredit', '.', 'com', '.', 'you', '\x01', ',', 've', 'seen', 'what', 'technology', 'can', 'do', 'at', 'work', '.', 'now', 'we', 'want', 'you', 'and', 'your', 'family', 'to', 'realize', 'its', 'benefits', 'at', 'home', '.', 'with', 'that', 'in', 'mind', ',', 'we', 'are', 'excited', 'to', 'let', 'you', 'know', 'that', 'we', 'are', 'introducing', 'the', 'clickathome', 'program', ',', 'which', 'will', 'give', 'each', 'employee', 'a', 'computer', 'for', 'use', 

# Experiment 1: Bag of Words Features
### Using the presence of words for Classification

In [766]:
# We will first start with bag of words feature sets
# The presence of the most common words will become email features
just_words = [word for words,label in emaildocs for word in words]
just_words[:50]

['Subject',
 ':',
 'a',
 'computer',
 'and',
 'internet',
 'connection',
 'for',
 'you',
 'and',
 'your',
 'family',
 'as',
 'you',
 'know',
 ',',
 'technology',
 'is',
 'critical',
 'to',
 'enron',
 ';',
 'it',
 'drives',
 'our',
 'success',
 'and',
 'will',
 'continue',
 'to',
 'do',
 'so',
 'in',
 'the',
 'future',
 '.',
 'technology',
 'has',
 'helped',
 'enron',
 'create',
 'new',
 'businesses',
 'like',
 'enron',
 'broadband',
 'services',
 'and',
 'enron',
 'net']

In [767]:
# using ALL words as features 
word_features = just_words
print(word_features[:100])

['Subject', ':', 'a', 'computer', 'and', 'internet', 'connection', 'for', 'you', 'and', 'your', 'family', 'as', 'you', 'know', ',', 'technology', 'is', 'critical', 'to', 'enron', ';', 'it', 'drives', 'our', 'success', 'and', 'will', 'continue', 'to', 'do', 'so', 'in', 'the', 'future', '.', 'technology', 'has', 'helped', 'enron', 'create', 'new', 'businesses', 'like', 'enron', 'broadband', 'services', 'and', 'enron', 'net', 'works', ',', 'and', 'it', 'is', 'responsible', 'for', 'applications', 'such', 'as', 'enrononline', 'and', 'enroncredit', '.', 'com', '.', 'you', '\x01', ',', 've', 'seen', 'what', 'technology', 'can', 'do', 'at', 'work', '.', 'now', 'we', 'want', 'you', 'and', 'your', 'family', 'to', 'realize', 'its', 'benefits', 'at', 'home', '.', 'with', 'that', 'in', 'mind', ',', 'we', 'are', 'excited']


In [768]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending on whether that keyword is in the document
def email_features(email, word_features):
	email_words = set(email)
	features = {}
	for word in word_features:
		features['V_%s' % word] = (word in email_words)
	return features

In [769]:
# get features sets for a document, including keyword features and category feature
featuresets = [(email_features(d, word_features), c) for (d,c) in emaildocs]

In [770]:
# This shows an example of a featureset for an email
featuresets[:1]

[({'V_Subject': True,
   'V_:': True,
   'V_a': True,
   'V_computer': True,
   'V_and': True,
   'V_internet': True,
   'V_connection': True,
   'V_for': True,
   'V_you': True,
   'V_your': True,
   'V_family': True,
   'V_as': True,
   'V_know': True,
   'V_,': True,
   'V_technology': True,
   'V_is': True,
   'V_critical': True,
   'V_to': True,
   'V_enron': True,
   'V_;': True,
   'V_it': True,
   'V_drives': True,
   'V_our': True,
   'V_success': True,
   'V_will': True,
   'V_continue': True,
   'V_do': True,
   'V_so': True,
   'V_in': True,
   'V_the': True,
   'V_future': True,
   'V_.': True,
   'V_has': True,
   'V_helped': True,
   'V_create': True,
   'V_new': True,
   'V_businesses': True,
   'V_like': True,
   'V_broadband': True,
   'V_services': True,
   'V_net': True,
   'V_works': True,
   'V_responsible': True,
   'V_applications': True,
   'V_such': True,
   'V_enrononline': True,
   'V_enroncredit': True,
   'V_com': True,
   'V_\x01': True,
   'V_ve': True,


In [771]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(featuresets)

# Calculate the split index
split_index = int(len(featuresets) * 0.9)

# Split the data into training and testing sets
train_set = featuresets[:split_index]
test_set = featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.84


In [772]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[105   7]
 [ 25  63]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.81      0.94      0.87       112
        spam       0.90      0.72      0.80        88

    accuracy                           0.84       200
   macro avg       0.85      0.83      0.83       200
weighted avg       0.85      0.84      0.84       200



In [696]:
# show which features of classifier are most informative
print(classifier.show_most_informative_features(20))

Most Informative Features
             V_forwarded = True              ham : spam   =    157.8 : 1.0
                   V_ect = True              ham : spam   =    145.5 : 1.0
                   V_hou = True              ham : spam   =    140.0 : 1.0
                   V_nom = True              ham : spam   =     90.7 : 1.0
                   V_bob = True              ham : spam   =     64.0 : 1.0
                V_farmer = True              ham : spam   =     63.1 : 1.0
            V_nomination = True              ham : spam   =     62.0 : 1.0
                   V_713 = True              ham : spam   =     52.4 : 1.0
                  V_2001 = True              ham : spam   =     45.5 : 1.0
             V_microsoft = True             spam : ham    =     41.9 : 1.0
                 V_susan = True              ham : spam   =     38.7 : 1.0
                     V_| = True             spam : ham    =     38.6 : 1.0
                    V_cc = True              ham : spam   =     38.2 : 1.0

#### Overall, using all of the words as word features was relatively successful. An 84% accuracy is relatively good, but there is still much room for improvement.

# Experiment 2: Bag of Words Features - Filtered
### Same as experiment 1, but no stopwords and only use the 2000 most common words

In [697]:
# We will first start with bag of words feature sets
# The most common words will become email features
just_words = [word for words,label in emaildocs for word in words]

In [698]:
# removing stopwords
nltkstopwords = nltk.corpus.stopwords.words('english')
# These words/symbols were very common in later steps so they were removed
mystopwords = ['-','.',',','/',':','(',')']

stopwords = nltkstopwords + mystopwords

stopped_words = [w for w in just_words if not w in stopwords]
len(stopped_words)

258245

In [699]:
# use words from all emails to define the word vector for features
# words are also put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in stopped_words)
print(len(all_words))

34576


In [700]:
# get the 2000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]
# show the some of the most common words
print(word_features[:100])

['ect', '@', 'subject', '?', '=', "'", '!', 'hou', 'enron', ';', '$', '_', '2000', 'com', '3', '>', '``', 'please', '1', '|', '00', '2', 'gas', '*', '%', '#', 'deal', '0', 'http', '10', '000', 'e', 'pm', 'meter', 'cc', '5', 'hpl', '4', 'new', '2001', 'company', 'price', 'may', 'get', '01', '7', 'information', 'thanks', 'daren', 'corp', '11', 'need', '&', 'know', 'email', 'us', 'www', 'time', '12', '6', 'font', '9', 'see', 'one', 'message', 'td', '8', 'l', 'j', 'mmbtu', 'p', 'forwarded', 'nbsp', '99', 'would', '15', '20', 'let', '30', 'statements', '03', 'height', 'day', 'attached', 'farmer', 'c', 'th', 'x', 'also', '+', '02', 'b', '100', 'mail', 'xls', '25', '09', '[', ']', 'like']


In [701]:
# same function as above
def email_features(email, word_features):
	email_words = set(email)
	features = {}
	for word in word_features:
		features['V_%s' % word] = (word in email_words)
	return features

In [702]:
# get features sets for a document, including keyword features and category feature
featuresets = [(email_features(d, word_features), c) for (d,c) in emaildocs]

In [703]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(featuresets)

# Calculate the split index
split_index = int(len(featuresets) * 0.9)

# Split the data into training and testing sets
train_set = featuresets[:split_index]
test_set = featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.925


In [704]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[101  11]
 [  4  84]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.96      0.90      0.93       112
        spam       0.88      0.95      0.92        88

    accuracy                           0.93       200
   macro avg       0.92      0.93      0.92       200
weighted avg       0.93      0.93      0.93       200



#### This second experiment had much better results than the first. By removing stopwords and narrowing the number of feature words, the accuracy jumped to 94%

# Experiment 3: Cross-Validation
### Using the results of experiment 2, the following code performs cross validation as well

In [705]:
## cross-validation ##
# this function takes the number of folds, the feature sets and it iterates over the folds.
# It prints the accuracy for each fold and the average accuracy at the end.

def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [706]:
# Here we use cross validation to do the same training as above
num_folds = 4
cross_validation_accuracy(num_folds, featuresets)

Each fold size: 500
0 0.956
1 0.946
2 0.952
3 0.924
mean accuracy 0.9445


#### For the cross-validation, the best results were found by choosing between 3-7 folds. For this experiment, 4 folds yielded the best results. This shows a slight improvement over the results of experiment 2.

# Experiment 4: Bag of Words Features with Less Words

### This is the same as experiment 2, but this time with only 1000 words

In [707]:
# We will first start with bag of words feature sets
# The most common words will become email features
just_words = [word for words,label in emaildocs for word in words]

In [708]:
# removing stopwords
nltkstopwords = nltk.corpus.stopwords.words('english')
# These words/symbols were very common in later steps so they were removed
mystopwords = ['-','.',',','/',':','(',')']
stopwords = nltkstopwords + mystopwords

stopped_words = [w for w in just_words if not w in stopwords]

In [709]:
# use words from all emails to define the word vector for features
# words are also put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in stopped_words)

In [710]:
# get the 1000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1000)
word_features = [word for (word, freq) in word_items]
print(word_features[:100])

['ect', '@', 'subject', '?', '=', "'", '!', 'hou', 'enron', ';', '$', '_', '2000', 'com', '3', '>', '``', 'please', '1', '|', '00', '2', 'gas', '*', '%', '#', 'deal', '0', 'http', '10', '000', 'e', 'pm', 'meter', 'cc', '5', 'hpl', '4', 'new', '2001', 'company', 'price', 'may', 'get', '01', '7', 'information', 'thanks', 'daren', 'corp', '11', 'need', '&', 'know', 'email', 'us', 'www', 'time', '12', '6', 'font', '9', 'see', 'one', 'message', 'td', '8', 'l', 'j', 'mmbtu', 'p', 'forwarded', 'nbsp', '99', 'would', '15', '20', 'let', '30', 'statements', '03', 'height', 'day', 'attached', 'farmer', 'c', 'th', 'x', 'also', '+', '02', 'b', '100', 'mail', 'xls', '25', '09', '[', ']', 'like']


In [711]:
# same as before
def email_features(email, word_features):
	email_words = set(email)
	features = {}
	for word in word_features:
		features['V_%s' % word] = (word in email_words)
	return features

In [712]:
# get features sets for a document, including keyword features and category feature
featuresets = [(email_features(d, word_features), c) for (d,c) in emaildocs]

In [713]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(featuresets)

# Calculate the split index
split_index = int(len(featuresets) * 0.9)

# Split the data into training and testing sets
train_set = featuresets[:split_index]
test_set = featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.93


In [714]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Optionally, print the classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[98 14]
 [ 0 88]]

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.88      0.93       112
        spam       0.86      1.00      0.93        88

    accuracy                           0.93       200
   macro avg       0.93      0.94      0.93       200
weighted avg       0.94      0.93      0.93       200



#### This was the most effective model so far, with 96% accuracy

# Experiment 5: POS Tags as Features

In [715]:
# This function will take data structured like emaildocs and assign POS tags
def POStag_words(data):
    tagged_data = []
    for words, label in data:
        tagged_words = nltk.pos_tag(words)
        tagged_data.append((tagged_words, label))
    return tagged_data

In [716]:
# define features for the "i"th word in the sentence, including three types of suffix and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def word_features(sentence, i):    
    features = {"suffix(1)": sentence[i][-1:],
		    "suffix(2)": sentence[i][-2:],
		    "suffix(3)": sentence[i][-3:]}
    
    # this adds a feature for the previous word
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    
    # this adds a feature for the next word
    if i == len(sentence)-1:
        features["next-word"] = "<END>"
    else:
        features["next-word"] = sentence[i+1]
    
    return features 

In [717]:
# adding parts of speech tags to each word in each email
tagged_emails = POStag_words(emaildocs)
tagged_emails[:1]

[([('Subject', 'NN'),
   (':', ':'),
   ('a', 'DT'),
   ('computer', 'NN'),
   ('and', 'CC'),
   ('internet', 'JJ'),
   ('connection', 'NN'),
   ('for', 'IN'),
   ('you', 'PRP'),
   ('and', 'CC'),
   ('your', 'PRP$'),
   ('family', 'NN'),
   ('as', 'IN'),
   ('you', 'PRP'),
   ('know', 'VBP'),
   (',', ','),
   ('technology', 'NN'),
   ('is', 'VBZ'),
   ('critical', 'JJ'),
   ('to', 'TO'),
   ('enron', 'VB'),
   (';', ':'),
   ('it', 'PRP'),
   ('drives', 'VBZ'),
   ('our', 'PRP$'),
   ('success', 'NN'),
   ('and', 'CC'),
   ('will', 'MD'),
   ('continue', 'VB'),
   ('to', 'TO'),
   ('do', 'VB'),
   ('so', 'RB'),
   ('in', 'IN'),
   ('the', 'DT'),
   ('future', 'NN'),
   ('.', '.'),
   ('technology', 'NN'),
   ('has', 'VBZ'),
   ('helped', 'VBN'),
   ('enron', 'VB'),
   ('create', 'JJ'),
   ('new', 'JJ'),
   ('businesses', 'NNS'),
   ('like', 'IN'),
   ('enron', 'NN'),
   ('broadband', 'NN'),
   ('services', 'NNS'),
   ('and', 'CC'),
   ('enron', 'JJ'),
   ('net', 'NN'),
   ('works', '

In [718]:
featuresets = []

for tagged_email, label in tagged_emails:
    untagged_email = nltk.tag.untag(tagged_email)
    emailfeatures = []

    for i, (word, tag) in enumerate(tagged_email):
        features = word_features(untagged_email, i)
        features["pos_tag"] = tag
        emailfeatures.append(features)
    
    featuresets.append((emailfeatures, label))

# Example usage to display the first 100 featuresets
print(featuresets[0])

([{'suffix(1)': 't', 'suffix(2)': 'ct', 'suffix(3)': 'ect', 'prev-word': '<START>', 'next-word': ':', 'pos_tag': 'NN'}, {'suffix(1)': ':', 'suffix(2)': ':', 'suffix(3)': ':', 'prev-word': 'Subject', 'next-word': 'a', 'pos_tag': ':'}, {'suffix(1)': 'a', 'suffix(2)': 'a', 'suffix(3)': 'a', 'prev-word': ':', 'next-word': 'computer', 'pos_tag': 'DT'}, {'suffix(1)': 'r', 'suffix(2)': 'er', 'suffix(3)': 'ter', 'prev-word': 'a', 'next-word': 'and', 'pos_tag': 'NN'}, {'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'computer', 'next-word': 'internet', 'pos_tag': 'CC'}, {'suffix(1)': 't', 'suffix(2)': 'et', 'suffix(3)': 'net', 'prev-word': 'and', 'next-word': 'connection', 'pos_tag': 'JJ'}, {'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'internet', 'next-word': 'for', 'pos_tag': 'NN'}, {'suffix(1)': 'r', 'suffix(2)': 'or', 'suffix(3)': 'for', 'prev-word': 'connection', 'next-word': 'you', 'pos_tag': 'IN'}, {'suffix(1)': 'u', 'suffix(2)': 'ou', 'suff

In [719]:
# using naive Bayesian as classifier
# split data into a training set and a test set, using a 90%/10% split
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
print(len(train_set))
print(len(test_set))

1800
200


In [720]:
# Function to flatten featuresets
def flatten_featuresets(featuresets):
    flattened_featuresets = []
    for email_features, label in featuresets:
        email_feature_dict = {}
        for i, features in enumerate(email_features):
            for key, value in features.items():
                email_feature_dict[f"{key}_{i}"] = value
        flattened_featuresets.append((email_feature_dict, label))
    return flattened_featuresets

In [721]:
# Flatten the featuresets
flattened_featuresets = flatten_featuresets(featuresets)

In [722]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(flattened_featuresets)

# Calculate the split index
split_index = int(len(flattened_featuresets) * 0.9)

# Split the data into training and testing sets
train_set = flattened_featuresets[:split_index]
test_set = flattened_featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.85


In [723]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[93 19]
 [11 77]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.89      0.83      0.86       112
        spam       0.80      0.88      0.84        88

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200



#### Using POS tagging proved to be quite effective at detecting spam emails, with 90% accuracy. However, it did not perform as well as the traditional bag of words featuresets.

# Experiment 6: Word Frequency 

### Instead of using a true/false for each word in the featureset, this model uses the count of each common word as features.

In [724]:
# collecting all the words in the emails
just_words = [word for words,label in emaildocs for word in words]

In [725]:
# removing stopwords
nltkstopwords = nltk.corpus.stopwords.words('english')
# These words/symbols were very common in later steps so they were removed
mystopwords = ['-','.',',','/',':','(',')']
stopwords = nltkstopwords + mystopwords

stopped_words = [w for w in just_words if not w in stopwords]

In [726]:
# use words from all emails to define the word vector for features
# words are also put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in stopped_words)

In [727]:
# Count the frequency of each word in the list
word_counts = Counter(all_words)

# Get the most common 1000 words
common_words = [word for word, _ in word_counts.most_common(1000)]
common_words[:10]

['ect', '@', 'subject', '?', '=', "'", '!', 'hou', 'enron', ';']

In [728]:
# This feature function creates features based on the frequency of words, not just boolean
def word_frequency_features(email, word_features):
    features = {}
    for word in word_features:
        features[f'V_{word}'] = email.count(word)
    return features

In [729]:
# get features sets for a document, including keyword features and category feature
featuresets = [(word_frequency_features(email, common_words), label) for email, label in emaildocs]

In [730]:
# This shows an example of a featureset for an email
featuresets[2]

({'V_ect': 0,
  'V_@': 0,
  'V_subject': 0,
  'V_?': 0,
  'V_=': 0,
  "V_'": 0,
  'V_!': 0,
  'V_hou': 0,
  'V_enron': 0,
  'V_;': 0,
  'V_$': 0,
  'V__': 0,
  'V_2000': 1,
  'V_com': 0,
  'V_3': 0,
  'V_>': 0,
  'V_``': 0,
  'V_please': 0,
  'V_1': 0,
  'V_|': 0,
  'V_00': 0,
  'V_2': 0,
  'V_gas': 0,
  'V_*': 0,
  'V_%': 0,
  'V_#': 0,
  'V_deal': 0,
  'V_0': 0,
  'V_http': 0,
  'V_10': 0,
  'V_000': 0,
  'V_e': 0,
  'V_pm': 0,
  'V_meter': 0,
  'V_cc': 0,
  'V_5': 0,
  'V_hpl': 1,
  'V_4': 0,
  'V_new': 0,
  'V_2001': 0,
  'V_company': 0,
  'V_price': 0,
  'V_may': 0,
  'V_get': 0,
  'V_01': 0,
  'V_7': 0,
  'V_information': 0,
  'V_thanks': 0,
  'V_daren': 0,
  'V_corp': 0,
  'V_11': 0,
  'V_need': 0,
  'V_&': 0,
  'V_know': 0,
  'V_email': 0,
  'V_us': 0,
  'V_www': 0,
  'V_time': 0,
  'V_12': 0,
  'V_6': 0,
  'V_font': 0,
  'V_9': 0,
  'V_see': 1,
  'V_one': 0,
  'V_message': 0,
  'V_td': 0,
  'V_8': 0,
  'V_l': 0,
  'V_j': 0,
  'V_mmbtu': 0,
  'V_p': 0,
  'V_forwarded': 0,
  'V_

In [731]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(flattened_featuresets)

# Calculate the split index
split_index = int(len(flattened_featuresets) * 0.9)

# Split the data into training and testing sets
train_set = flattened_featuresets[:split_index]
test_set = flattened_featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.865


In [732]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[85 17]
 [10 88]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.89      0.83      0.86       102
        spam       0.84      0.90      0.87        98

    accuracy                           0.86       200
   macro avg       0.87      0.87      0.86       200
weighted avg       0.87      0.86      0.86       200



#### Overall, this was another quite effective model for this task. It was not the highest performer, but did achieve 93% accuracy.

# Experiment 7: TF-IDF scores (Advanced)

In [733]:
# Calculate Term Frequency (TF)
def compute_tf(email):
    tf = Counter(email)
    total_words = len(email)
    for word in tf:
        tf[word] = tf[word] / total_words
    return tf

# Calculate Inverse Document Frequency (IDF)
def compute_idf(documents):
    num_docs = len(documents)
    idf = Counter()
    for email in documents:
        unique_words = set(email)
        for word in unique_words:
            idf[word] += 1
    for word in idf:
        idf[word] = math.log(num_docs / idf[word])
    return idf

# Calculate TF-IDF
def compute_tfidf(tf, idf):
    tfidf = {}
    for word, tf_value in tf.items():
        tfidf[word] = tf_value * idf.get(word, 0.0)
    return tfidf

In [734]:
# Prepare data: list of all emails and list of most common words
all_emails = [email for email, label in emaildocs]
most_common_words_set = set(common_words)

# Calculate IDF for all documents
idf = compute_idf(all_emails)

In [735]:
# Function to combine frequency and TF-IDF features
def word_frequency_tfidf_features(email, word_features, idf):
    tf = compute_tf(email)
    tfidf = compute_tfidf(tf, idf)
    features = {}
    for word in word_features:
        features[f'V_{word}'] = email.count(word)
        features[f'TFIDF_{word}'] = tfidf.get(word, 0.0)
    return features

In [736]:
# Final feature sets
featuresets = [(word_frequency_tfidf_features(email, common_words, idf), label) for email, label in emaildocs]

# Output the features for the first email to verify
print(featuresets[2])

({'V_ect': 0, 'TFIDF_ect': 0.0, 'V_@': 0, 'TFIDF_@': 0.0, 'V_subject': 0, 'TFIDF_subject': 0.0, 'V_?': 0, 'TFIDF_?': 0.0, 'V_=': 0, 'TFIDF_=': 0.0, "V_'": 0, "TFIDF_'": 0.0, 'V_!': 0, 'TFIDF_!': 0.0, 'V_hou': 0, 'TFIDF_hou': 0.0, 'V_enron': 0, 'TFIDF_enron': 0.0, 'V_;': 0, 'TFIDF_;': 0.0, 'V_$': 0, 'TFIDF_$': 0.0, 'V__': 0, 'TFIDF__': 0.0, 'V_2000': 1, 'TFIDF_2000': 0.05975518287199313, 'V_com': 0, 'TFIDF_com': 0.0, 'V_3': 0, 'TFIDF_3': 0.0, 'V_>': 0, 'TFIDF_>': 0.0, 'V_``': 0, 'TFIDF_``': 0.0, 'V_please': 0, 'TFIDF_please': 0.0, 'V_1': 0, 'TFIDF_1': 0.0, 'V_|': 0, 'TFIDF_|': 0.0, 'V_00': 0, 'TFIDF_00': 0.0, 'V_2': 0, 'TFIDF_2': 0.0, 'V_gas': 0, 'TFIDF_gas': 0.0, 'V_*': 0, 'TFIDF_*': 0.0, 'V_%': 0, 'TFIDF_%': 0.0, 'V_#': 0, 'TFIDF_#': 0.0, 'V_deal': 0, 'TFIDF_deal': 0.0, 'V_0': 0, 'TFIDF_0': 0.0, 'V_http': 0, 'TFIDF_http': 0.0, 'V_10': 0, 'TFIDF_10': 0.0, 'V_000': 0, 'TFIDF_000': 0.0, 'V_e': 0, 'TFIDF_e': 0.0, 'V_pm': 0, 'TFIDF_pm': 0.0, 'V_meter': 0, 'TFIDF_meter': 0.0, 'V_cc': 0, 'TF

In [737]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(flattened_featuresets)

# Calculate the split index
split_index = int(len(flattened_featuresets) * 0.9)

# Split the data into training and testing sets
train_set = flattened_featuresets[:split_index]
test_set = flattened_featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.91


In [738]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[85 10]
 [ 8 97]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.91      0.89      0.90        95
        spam       0.91      0.92      0.92       105

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200



# Experiment 8: Custom Features - Combining the Best Features
### For this experiment, we combine some word features used previously, as well as custom features
### Features: Presence of 1000 most common words, Email Length, Average word Length

In [739]:
# We will first start with bag of words feature sets
# The most common words will become email features
just_words = [word for words,label in emaildocs for word in words]

In [740]:
# removing stopwords
nltkstopwords = nltk.corpus.stopwords.words('english')
# These words/symbols were very common in later steps so they were removed
mystopwords = ['-','.',',','/',':','(',')']
stopwords = nltkstopwords + mystopwords

stopped_words = [w for w in just_words if not w in stopwords]

In [741]:
# use words from all emails to define the word vector for features
# words are also put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in stopped_words)

In [742]:
# get the 1000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1000)
word_features = [word for (word, freq) in word_items]
print(word_features[:100])

['ect', '@', 'subject', '?', '=', "'", '!', 'hou', 'enron', ';', '$', '_', '2000', 'com', '3', '>', '``', 'please', '1', '|', '00', '2', 'gas', '*', '%', '#', 'deal', '0', 'http', '10', '000', 'e', 'pm', 'meter', 'cc', '5', 'hpl', '4', 'new', '2001', 'company', 'price', 'may', 'get', '01', '7', 'information', 'thanks', 'daren', 'corp', '11', 'need', '&', 'know', 'email', 'us', 'www', 'time', '12', '6', 'font', '9', 'see', 'one', 'message', 'td', '8', 'l', 'j', 'mmbtu', 'p', 'forwarded', 'nbsp', '99', 'would', '15', '20', 'let', '30', 'statements', '03', 'height', 'day', 'attached', 'farmer', 'c', 'th', 'x', 'also', '+', '02', 'b', '100', 'mail', 'xls', '25', '09', '[', ']', 'like']


In [743]:
# define features (keywords) of an email
def email_features(email, word_features):
    email_words = set(email)
    features = {}
    
    # NEW FEATURE: Email Length     
    features['length'] = len(email)
    
    # NEW FEATURE: Average Word Length
    lengths = [len(word) for word in email]
    total_length = sum(lengths)
    average_length = total_length / len(email)
    
    features['avg_word_length'] = average_length
    
    # Normal Word Presence as Word Features
    for word in word_features:
        features['V_%s' % word] = (word in email_words)
    
    return features

In [744]:
# get features sets for each email
featuresets = [(email_features(d, word_features), c) for (d,c) in emaildocs]

In [745]:
featuresets[1]

({'length': 172,
  'avg_word_length': 3.6686046511627906,
  'V_ect': False,
  'V_@': True,
  'V_subject': False,
  'V_?': True,
  'V_=': False,
  "V_'": False,
  'V_!': False,
  'V_hou': False,
  'V_enron': False,
  'V_;': True,
  'V_$': False,
  'V__': False,
  'V_2000': False,
  'V_com': True,
  'V_3': False,
  'V_>': False,
  'V_``': False,
  'V_please': True,
  'V_1': False,
  'V_|': False,
  'V_00': False,
  'V_2': False,
  'V_gas': False,
  'V_*': False,
  'V_%': False,
  'V_#': False,
  'V_deal': False,
  'V_0': True,
  'V_http': True,
  'V_10': False,
  'V_000': False,
  'V_e': True,
  'V_pm': False,
  'V_meter': False,
  'V_cc': False,
  'V_5': False,
  'V_hpl': False,
  'V_4': False,
  'V_new': True,
  'V_2001': False,
  'V_company': True,
  'V_price': False,
  'V_may': False,
  'V_get': False,
  'V_01': False,
  'V_7': True,
  'V_information': False,
  'V_thanks': False,
  'V_daren': False,
  'V_corp': False,
  'V_11': False,
  'V_need': False,
  'V_&': False,
  'V_know': Fa

In [746]:
# training the model using Naive Bayes classifier with a 90/10 split

# Set the seed for reproducibility
random.seed(9)

# Shuffle the data
random.shuffle(featuresets)

# Calculate the split index
split_index = int(len(featuresets) * 0.9)

# Split the data into training and testing sets
train_set = featuresets[:split_index]
test_set = featuresets[split_index:]

# Print the number of samples in each set
print(f'Number of training samples: {len(train_set)}')
print(f'Number of testing samples: {len(test_set)}')

classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (f'Classifier Accuracy: {nltk.classify.accuracy(classifier, test_set)}')

Number of training samples: 1800
Number of testing samples: 200
Classifier Accuracy: 0.935


In [747]:
# Make predictions on the test set
test_features = [feat for feat, label in test_set]
true_labels = [label for feat, label in test_set]
predicted_labels = [classifier.classify(feat) for feat in test_features]

# Generate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Confusion Matrix:
[[99 13]
 [ 0 88]]

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.88      0.94       112
        spam       0.87      1.00      0.93        88

    accuracy                           0.94       200
   macro avg       0.94      0.94      0.93       200
weighted avg       0.94      0.94      0.94       200



#### This last model proved to be the most effective, with 96.5% accuracy.