### Download Spam Mail Dataset


In [1]:
import nltk
nltk.download("stopwords")
nltk.download("names")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ben8169/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /Users/ben8169/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ben8169/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Dataset

In [2]:
import glob, os

# init
"""
emails: a set of email
labels: a set of label representing whetere the gien email is spam or ham
  - spam: 1
  - ham: 0
"""

emails, labels = [], []
parition = 0

In [3]:
# load spam dataset
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: # [!important] check encofing format
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      emails.append(f.read())
      labels.append(0)

print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))

# of emails = 5172
# of labels = 5172


### Data Preprocessing
  - remove number and punctuation
  - remove name entity
  - remove stopword
  - lemmatization

In [4]:
# remove number and punctuation
def letters_only(word):
    return word.isalpha()

# remove name entity
from nltk.corpus import names
all_names = set(names.words())

# lemmaization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# put all together to clean texts
def clean_text(doc):
    cleaned_doc = []
    for word in doc.split(' '): # split doc. by blank (' ')
        word = word.lower() # ABD -> abd
        if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
            cleaned_doc.append(lemmatizer.lemmatize(word))

    return ' '.join(cleaned_doc)

cleaned_emails = [clean_text(doc) for doc in emails]

### Data Preparation
* Split data into train and test set.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=486)

cv = CountVectorizer(stop_words='english', max_features=500)

term_docs_train = cv.fit_transform(X_train) # get counter vector for X_train
term_docs_test = cv.transform(X_test) # get counter vector for X_test

### Import Model and Train

In [6]:
import numpy as np
from naivebayes import NaiveBayesClassifier

gnb = NaiveBayesClassifier()
gnb.fit(term_docs_train.toarray(), Y_train)
y_pred = gnb.predict(term_docs_test.toarray())

[0.7053391052275036, 0.2946608946724964]
[1.1742755160373667e-32, 1.4731652010609038e-52]
[6.475214231292896e-31, 1.246306254942001e-37]
[1.6358664802165493e-103, 5.6468712493999315e-170]
[8.929441493564858e-16, 7.392954968988382e-34]
[1.3277198579030115e-14, 1.2593308809853276e-27]
[0.9460928595185744, 0.053172669868427204]
[1.3267519190728619e-217, 7.808412789937248e-219]
[1.8165839851481158e-43, 6.736948371779167e-38]
[1.4922111169079024e-130, 8.216110075126716e-209]
[0.0, 0.0]
[1.2482232724307683e-296, 0.0]
[4.2208341762493175e-37, 3.5911817629693583e-63]
[2.120749109385953e-275, 3.0252500332192614e-168]
[2.0878641056360098e-14, 1.8190303200107556e-27]
[5.500381645100764e-142, 3.563720751909421e-182]
[0.9042240653211135, 0.08826929448149932]
[2.319173278250472e-19, 3.539341457296481e-26]
[8.829541972974457e-259, 0.0]
[1.0936970524324775e-25, 2.46818281643492e-32]
[0.0, 0.0]
[2.673326833291867e-24, 2.7635078468332225e-36]
[1.254297823204917e-61, 4.2772504502088137e-75]
[2.1675547260

### Simple Evaluation

In [7]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(Y_test, y_pred)
print("Accuracy of the model is: {:.2f}".format(acc))

Accuracy of the model is: 0.90


In [15]:
print(len(term_docs_test.toarray()[3]))
print(term_docs_test.toarray()[3])

500
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 