### Download Spam Mail Dataset


In [1]:
import nltk
nltk.download("stopwords")
nltk.download("names")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ben8169/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /Users/ben8169/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ben8169/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Dataset

In [2]:
import glob, os

# init
"""
emails: a set of email
labels: a set of label representing whetere the gien email is spam or ham
  - spam: 1
  - ham: 0
"""

emails, labels = [], []
parition = 0

In [3]:
# load spam dataset
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: # [!important] check encofing format
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      emails.append(f.read())
      labels.append(0)

print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))

# of emails = 5172
# of labels = 5172


### Data Preprocessing
  - remove number and punctuation
  - remove name entity
  - remove stopword
  - lemmatization

In [4]:
# remove number and punctuation
def letters_only(word):
    return word.isalpha()

# remove name entity
from nltk.corpus import names
all_names = set(names.words())

# lemmaization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# put all together to clean texts
def clean_text(doc):
    cleaned_doc = []
    for word in doc.split(' '): # split doc. by blank (' ')
        word = word.lower() # ABD -> abd
        if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
            cleaned_doc.append(lemmatizer.lemmatize(word))

    return ' '.join(cleaned_doc)

cleaned_emails = [clean_text(doc) for doc in emails]

### Data Preparation
* Split data into train and test set.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=486)

cv = CountVectorizer(stop_words='english', max_features=500)

term_docs_train = cv.fit_transform(X_train) # get counter vector for X_train
term_docs_test = cv.transform(X_test) # get counter vector for X_test

### Import Model and Train

In [6]:
import numpy as np
from naivebayes import NaiveBayesClassifier

gnb = NaiveBayesClassifier()
gnb.fit(term_docs_train.toarray(), Y_train)
y_pred = gnb.predict(term_docs_test.toarray())

[9.64840556e-05 7.35895339e-05 6.54129191e-05]
[9.64840556e-05 7.35895339e-05 6.54129191e-05]
[0.7724030315694503, 0.22759696835330928]
[1.1628103722983156e-22, 1.4384713139390404e-52]
[6.417232206653173e-21, 1.2197051587441163e-37]
[1.5871215407698757e-93, 5.247635027776813e-170]
[8.871161242775693e-06, 7.275903074323451e-34]
[0.00013199695149996836, 1.2417824253702314e-27]
[0.9999999926829956, 7.216926753230566e-09]
[1.2662477756353852e-207, 6.984281818351729e-219]
[1.7973550810883953e-33, 6.567844084652539e-38]
[1.4383151009644662e-120, 7.516001052762138e-209]
[0.0, 0.0]
[1.159665214807366e-286, 0.0]
[4.17280452591833e-27, 3.4926832525708887e-63]
[2.0260131380758257e-265, 2.743474993405362e-168]
[0.0002075520403912045, 1.7935501855594745e-27]
[5.3103218262353243e-132, 3.2745333431727e-182]
[0.9999999987181384, 1.1810300446325472e-09]
[2.3040475353325043e-09, 3.484287313719377e-26]
[8.297503651205367e-249, 0.0]
[1.0847908252328022e-15, 2.4202494961121103e-32]
[0.0, 0.0]
[2.6537222357

### Simple Evaluation

In [7]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(Y_test, y_pred)
print("Accuracy of the model is: {:.2f}".format(acc))

Accuracy of the model is: 0.76
