# **Importing libraries**

In [19]:
# Mount Google Drive at /content/drive so the email folders stored there can be read.
# The `google.colab` module is used here, so this cell is meant to run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import os               # 'os' module: list folders and build file paths (ANDREA)
import numpy as np      # 'numpy' library for numerical arrays (ANDREA)
from collections import Counter    # 'Counter' class for counting occurrences of elements (ANDREA)

# Import all other necessary libraries.
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes classifier from scikit-learn
from sklearn import metrics                # scikit-learn 'metrics' module for performance evaluation
import pandas as pd                        # 'pandas' library, aliased as 'pd', used to display the confusion matrix as a table

In [21]:

def make_Dictionary(root_dir, max_words=3000):
  """Build the vocabulary: the most frequent words across all emails in a folder.

  Every file directly inside ``root_dir`` is opened, read line by line and
  split on whitespace. Tokens that are not purely alphabetic (e.g. "2", ".",
  "hi!") and tokens that are a single character (e.g. "a") are discarded.

  Args:
    root_dir: Path of the folder that holds the email files.
    max_words: Maximum number of words to keep. Defaults to 3000, the value
      that used to be hard-coded. ``None`` keeps every remaining word.

  Returns:
    A list of ``(word, count)`` tuples ordered from most to least frequent,
    e.g. ``[('the', 1143), ('and', 966), ..., ('to', 762)]``.
  """
  emails = [os.path.join(root_dir, f) for f in os.listdir(root_dir)]

  # Count words as the lines are read instead of first collecting every token
  # of every email in one big list.
  word_counts = Counter()
  for mail in emails:
    with open(mail) as m:
      for line in m:
        word_counts.update(line.split())

  # Keep alphabetic tokens longer than one character. Building the filtered
  # Counter from the original one preserves first-seen order, which is what
  # most_common() uses to order words that have the same count.
  vocabulary = Counter({
      word: count
      for word, count in word_counts.items()
      if word.isalpha() and len(word) > 1
  })

  return vocabulary.most_common(max_words)

In [22]:
def extract_features(mail_dir, vocabulary=None, n_features=3000):
  """Turn every email in a folder into a word-count vector plus a spam label.

  For each file in ``mail_dir`` only the third line (line index 2) is used
  (presumably the line where the dataset stores the message body -- confirm
  against the data). The number of times each vocabulary word occurs on that
  line is written into the file's row of the feature matrix.

  Args:
    mail_dir: Path of the folder that holds the email files.
    vocabulary: Sequence of ``(word, count)`` tuples as returned by
      ``make_Dictionary``; the position of a word is its column index. When
      omitted, the module-level variable ``dictionary`` is used, which is how
      this function worked before the parameter existed.
    n_features: Number of columns of the feature matrix. Defaults to 3000.
      Vocabulary entries beyond this position are ignored.

  Returns:
    A tuple ``(features_matrix, train_labels)``:
      * ``features_matrix``: array of shape ``(number of files, n_features)``
        holding the word counts.
      * ``train_labels``: array of shape ``(number of files,)`` holding 1 for
        files whose name starts with "spmsg" (SPAM) and 0 otherwise (NOT SPAM).
  """
  if vocabulary is None:
    # Backward-compatible fallback: rely on the notebook-level `dictionary`.
    vocabulary = dictionary

  # word -> column index. A dict lookup replaces scanning the whole vocabulary
  # once for every word of every email.
  word_to_column = {
      entry[0]: column
      for column, entry in zip(range(n_features), vocabulary)
  }

  files = [os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir)]
  features_matrix = np.zeros((len(files), n_features))
  train_labels = np.zeros(len(files))  # 0 = NOT SPAM, 1 = SPAM

  body_line_index = 2  # only the third line of each file is turned into features

  for doc_id, path in enumerate(files):
    with open(path) as fh:
      for line_index, line in enumerate(fh):
        if line_index == body_line_index:
          # Count each distinct word once instead of calling list.count()
          # again for every repeated occurrence.
          for word, occurrences in Counter(line.split()).items():
            column = word_to_column.get(word)
            if column is not None:
              features_matrix[doc_id, column] = occurrences
          break  # nothing after the third line is used

    # os.path.basename works with whatever separator os.path.join produced;
    # splitting on '/' returns the wrong token on platforms that use another one.
    if os.path.basename(path).startswith("spmsg"):
      train_labels[doc_id] = 1

  return features_matrix, train_labels

In [23]:
# Enter the "path" of your "train_mails" and "test-mails"
# for example: TRAIN_DIR = '../../train-mails'
#              TEST_DIR = '../../test-mails'


# Both folders are located under the Google Drive mount point (/content/drive).
# They are passed to make_Dictionary() and extract_features(), which read every file inside them.
TRAIN_DIR = r"/content/drive/MyDrive/Data/train-mails"
TEST_DIR = r"/content/drive/MyDrive/Data/test-mails"

In [24]:
# Build the vocabulary from the TRAINING emails only.
# NOTE: extract_features() reads the module-level variable `dictionary`, so this
# assignment must run before the two calls below.
dictionary = make_Dictionary(TRAIN_DIR)

print ("reading and processing emails from TRAIN and TEST folders")
# Both feature matrices are built against the same (training) vocabulary, so a
# given column refers to the same word in the training and in the test matrix.
features_matrix, labels = extract_features(TRAIN_DIR)
test_features_matrix, test_labels = extract_features(TEST_DIR)

print("working")

reading and processing emails from TRAIN and TEST folders
working


In [25]:
# instantiate a Gaussian Naive Bayes classifier
nb_classifier = GaussianNB() # nb_classifier is an object of the GaussianNB class, which can be trained on data

# fit the classifier to the training data
nb_classifier.fit(features_matrix, labels) # features_matrix = features/IVs/columns of the TRAINING data; labels = target/DV of the TRAINING data

# predict a label (0 = NOT SPAM, 1 = SPAM) for every test email
pred = nb_classifier.predict(test_features_matrix) # test_features_matrix = features/IVs/columns of the TESTING data

# calculate the accuracy score (share of test emails whose predicted label equals the true label)
score = metrics.accuracy_score(test_labels, pred)
print("The accuracy score is: {:.4f}%".format(score*100)) # our accuracy score is the same as the one given in the template

# calculate the confusion matrix; in scikit-learn the rows are the TRUE labels and the columns the PREDICTED labels
spam_no_spam = metrics.confusion_matrix(test_labels, pred, labels=[0,1])

# convert the confusion matrix to a pandas DataFrame and display it as a table (index = true label, columns = predicted label)
confusion_df = pd.DataFrame(spam_no_spam, index=['Not Spam', 'Spam'], columns=["Not Spam", "Spam"])
confusion_df
    # 8 SPAM emails are incorrectly labeled as NOT SPAM whereas 1 NOT SPAM email is incorrectly labeled as SPAM



# additional control - uncomment if you want to check it out
# 1. the numbers in the confusion matrix add up to 260 emails, which is the TEST data (i.e., unseen data)
    # 260 emails (test data) out of 702 emails (entire data) means that we used 37% of the entire dataset as TEST data
# print(129 + 1 + 8 + 122) # numbers taken from the confusion matrix
    # 260 emails in the unseen data set, which equals 37% of the entire dataset

# 2. the accuracy score is 96.538%
    # double check that the inaccuracy score complements the accuracy score by calculating the share of incorrectly labeled items (in %)
# print((9/260)*100) # share of incorrectly labeled items, based on the confusion matrix
# print(96.53846153846153+3.4615384615384617) # the accuracy and inaccuracy scores add up to 100

The accuracy score is: 96.5385%


Unnamed: 0,Not Spam,Spam
Not Spam,129,1
Spam,8,122
