### Download Spam Mail Dataset


In [1]:
import nltk
nltk.download("stopwords")
nltk.download("names")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ben81\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\ben81\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ben81\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Dataset

In [2]:
import glob, os

# init
"""
emails: a set of email
labels: a set of label representing whetere the gien email is spam or ham
  - spam: 1
  - ham: 0
"""

emails, labels = [], []
parition = 0

In [3]:
# load spam dataset
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: # [!important] check encofing format
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      emails.append(f.read())
      labels.append(0)

print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))

# of emails = 5172
# of labels = 5172


### Data Preprocessing
  - remove number and punctuation
  - remove name entity
  - remove stopword
  - lemmatization

In [4]:
# remove number and punctuation
def letters_only(word):
    return word.isalpha()

# remove name entity
from nltk.corpus import names
all_names = set(names.words())

# lemmaization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# put all together to clean texts
def clean_text(doc):
    cleaned_doc = []
    for word in doc.split(' '): # split doc. by blank (' ')
        word = word.lower() # ABD -> abd
        if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
            cleaned_doc.append(lemmatizer.lemmatize(word))

    return ' '.join(cleaned_doc)

cleaned_emails = [clean_text(doc) for doc in emails]

### Data Preparation
* Split data into train and test set.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=486)

cv = CountVectorizer(stop_words='english', max_features=500)

term_docs_train = cv.fit_transform(X_train) # get counter vector for X_train
term_docs_test = cv.transform(X_test) # get counter vector for X_test

### Import Model and Train

In [6]:
import numpy as np
from naivebayes import NaiveBayesClassifier

gnb = NaiveBayesClassifier()
gnb.fit(term_docs_train.toarray(), Y_train)
y_pred = gnb.predict(term_docs_test.toarray())

[1.836775682254954e-60, 9.68206113265682e-78]
[2.200711595188241e-146, 6.872110515143724e-172]
[1.0038900556258506e-133, 1.7036207698022388e-213]
[1.5000311206614335e-304, 0.0]
[3.1620300140631717e-23, 2.283423710953752e-40]
[1.5049217975272397e-223, 0.0]
[5.71982723846124e-15, 3.458174986956248e-12]
[0.0, 0.0]
[5.601619893536607e-16, 3.832598583166474e-14]
[9.400826888238206e-34, 1.6034698414020119e-41]
[0.0, 0.0]
[3.9754192223759794e-50, 5.577749353188025e-63]
[3.637705943898482e-25, 7.86220769406868e-36]
[2.370063515502829e-19, 7.454061572980774e-15]
[3.991073599964298e-27, 5.367243753365189e-43]
[1.304268637379485e-32, 4.325860835758118e-49]
[3.636370689754691e-05, 2.3088052554112524e-05]
[7.681643311253788e-17, 1.703775012964433e-19]
[2.605363401094868e-44, 6.098446338281646e-49]
[6.189402800055175e-41, 4.4085697948633744e-57]
[1.1018012465914637e-31, 1.7773719835351902e-39]
[3.6160067387086905e-32, 7.209755791027091e-49]
[7.563829069122719e-16, 7.585221419203408e-19]
[1.158544323

### Simple Evaluation

In [7]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(Y_test, y_pred)
print("Accuracy of the model is: {:.2f}".format(acc))

Accuracy of the model is: 0.90
