In [1]:
import os, re
import email
import nltk
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB 
from sklearn.svm import LinearSVC, SVC 
from sklearn import metrics
from sklearn.metrics import confusion_matrix 
from nltk.corpus import stopwords 
from os import path
from collections import Counter

In [2]:
TRAIN_DATA_PATH = 'Data\Train'
CROSSVAL_DATA_PATH = 'Data\CrossVal'
STOP_WORDS = set(stopwords.words('English'))
DELIMITERS = [',', '.', '!', '?', '/', '&', '-', ':', ';', '@', '"', "'", '#', '*', '+','=', '[', ']', '(', ')', '{', '}', '%', '<', '>']

In [3]:
VOCAB_SIZE = 3000
vocabulary = []

In [4]:
def extract_email_body(email_path):
    email_content = email.message_from_file(open(email_path))
    text = None
    if email_content.is_multipart():
        html = None
        for part in email_content.get_payload():
            if part.get_content_charset() is None:
                text = str(part.get_payload())
                continue
            charset = part.get_content_charset()
        
            if part.get_content_type() == 'text/plain':
                text = str(part.get_payload())
            
            if part.get_content_type() == 'text/html':
                html = str(part.get_payload())
            ## Should be indented out               
        if text is not None:
            return text.strip()
        else:
            return html.strip()   
            
    else:
        text  = str(email_content.get_payload())
        return text.strip()

In [5]:
email_body = extract_email_body('Data\Train\ham\\1011.eml')
email_body = re.sub(r"<.*?>", "", email_body)
print(email_body)

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.10 (GNU/Linux)

iEYEABECAAYFAku/UegACgkQmRvqrKWZhMezcwCgn61eFAoTG8blmc/IbhgpHC2i
QGQAn31TKsN5jiNDUMFNiJwZsAr6tc6D
=S78v
-----END PGP SIGNATURE-----


In [6]:
def process_email(email_body):
    ### Takes full body of email
    ### Returns Dictionary of email words and their counts 
    
    email_words = []
    ## Dictionary to Return
    email_word_counts = None
    
    ## Remove html
    email_body = re.sub(r"<.*?>", "", email_body)
    ## Handle http
    email_body = re.sub(r"(http|https)://[^\s]*", "httpaddr", email_body)
    ## Handle Email Addresses
    email_body = re.sub(r"[^\s]+@[^\s]+", "emailaddr", email_body)
    ## Handle Numbers
    email_body = re.sub(r"\s+[0-9]+\s+", "num", email_body)
    
    words = email_body.strip().split()
    email_words.extend(''.join(w for w in word.lower() if w not in DELIMITERS) for word in words)
    email_word_counts = Counter(email_words)
    
    email_word_counts.pop('', None)
    
    for stop_word in STOP_WORDS: 
        email_word_counts.pop(stop_word, None)
    
    return email_words, email_word_counts

In [7]:
email_words, email_word_counts = process_email(email_body)
print(email_word_counts)

Counter({'pgp': 2, 'signature': 2, 'begin': 1, 'version': 1, 'gnupg': 1, 'v1410': 1, 'gnulinux': 1, 'ieyeabecaayfakuuegacgkqmrvqrkwzhmezcwcgn61efaotg8blmcibhgphc2i': 1, 'qgqan31tksn5jindumfnijwzsar6tc6d': 1, 's78v': 1, 'end': 1})


In [8]:
def construct_vocabulary(data_directory):
    ### Takes Input path to (training) data 
    ### Return list of vocabulary (Most common N words where N = VOCAB_SIZE)

    all_words = []
    all_word_counts = None
    
    for email_class in os.listdir(data_directory):
        email_class_path = path.join(data_directory, email_class)
        for email in os.listdir(email_class_path):
            email_path = path.join(email_class_path, email)
            email_body = extract_email_body(email_path)
            email_words, email_word_counts = process_email(email_body)
        
            all_words.extend(email_words)
            
    all_word_counts = Counter(all_words) 
    
    all_word_counts.pop('', None)
    
    for stop_word in STOP_WORDS: 
        all_word_counts.pop(stop_word, None)
    
    return np.array(all_word_counts.most_common(VOCAB_SIZE))[:, 0]

In [9]:
vocabulary = construct_vocabulary(TRAIN_DATA_PATH)
print(vocabulary)

['httpaddr' 'emailaddr' 'email' ... 'experimental' 'upstream' 'contacts']


In [10]:
def extract_email_features(email_path, model = 'svm'):
    ### Takes path to an email file
    ### Returns feature vector of email
    email_body = extract_email_body(email_path)
    _ , email_word_counts = process_email(email_body)
    
    feature_vector = []
    
    ## For SVM and multinomial NB - we add word count to  feature vector 
    if model == 'svm' or model == 'multinomialNB':
        for word in vocabulary:
            feature_vector.extend([email_word_counts.get(word, 0)])
    ## For bernouolli bNB - we add 1 if word in email and 0 otherwise        
    elif model == 'bernoulliNB':
        for word in vocabulary:
            if word in email_word_counts:
                feature_vector.extend([1])
            else:
                feature_vector.extend([0])
    else:
        print('No support for such model')
    return feature_vector            

In [11]:
feature_vector = extract_email_features('Data\Train\spam\\1.eml')
print(feature_vector)

[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 1, 5, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 2, 2, 1, 4, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 2, 6, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
def prepare_data_files(data_path, model = 'svm'):
    
    ## Matrix of size (m, VOCAB_SIZE)
    features = []
    ## Vector of size (m)
    labels = []
    
    for email_class in os.listdir(data_path):
        if email_class == r'Ham':
            label = 0
        else:
            label = 1
        emails_dir = path.join(data_path, email_class)
        for email in os.listdir(emails_dir):
            email_path = path.join(emails_dir, email)
            features.append(extract_email_features(email_path, model))
            labels.extend([label])
            
    X = np.array(features)
    y = np.array(labels)
    
#    print(X.shape)
#    print(y.shape)

    return X, y    

In [13]:
X_train, y_train = prepare_data_files(TRAIN_DATA_PATH)

In [14]:
print(X_train.shape)
print(y_train.shape)
print(y_train)

(1800, 3000)
(1800,)
[0 0 0 ... 1 1 1]


In [15]:
X_cval, y_cval = prepare_data_files(CROSSVAL_DATA_PATH)

In [16]:
print(X_cval.shape)
print(y_cval.shape)
print(y_cval)

(700, 3000)
(700,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0

In [17]:
### Training Support vector Machine
## Set different values of C
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000, 2000]
train_score = np.zeros(len(C_values))
train_recall = np.zeros(len(C_values))
train_precision = np.zeros(len(C_values))

cv_score = np.zeros(len(C_values))
cv_recall = np.zeros(len(C_values))
cv_precision = np.zeros(len(C_values))

idx = 0
for c_val in C_values:
    svm = SVC(C = c_val)
    svm.fit(X_train, y_train)    
    
    train_score[idx] = svm.score(X_train, y_train)
    train_recall[idx] = metrics.recall_score(y_train, svm.predict(X_train))
    train_precision[idx] = metrics.precision_score(y_train, svm.predict(X_train))
    
    cv_score[idx] = svm.score(X_cval, y_cval)
    cv_recall[idx] = metrics.recall_score(y_cval, svm.predict(X_cval))
    cv_precision[idx] = metrics.precision_score(y_cval, svm.predict(X_cval))
    
    idx += 1

  'precision', 'predicted', average, warn_for)


In [18]:
matrix = np.matrix(np.c_[C_values, train_score, train_recall, train_precision, cv_score, cv_recall, cv_precision])
models = pd.DataFrame(data = matrix, columns = ['C', 'Train Accuracy', 'Train Recall', 'Train Precision', 'CV Accuracy', 'CV Recall', 'CV Precision'])

models.head(n = 9)

Unnamed: 0,C,Train Accuracy,Train Recall,Train Precision,CV Accuracy,CV Recall,CV Precision
0,0.001,0.666667,0.0,0.0,0.744286,0.0,0.0
1,0.01,0.666667,0.0,0.0,0.744286,0.0,0.0
2,0.1,0.668889,0.006667,1.0,0.747143,0.011173,1.0
3,1.0,0.846667,0.551667,0.97929,0.87,0.536313,0.923077
4,10.0,0.976667,0.938333,0.991197,0.961429,0.899441,0.947059
5,100.0,0.995556,1.0,0.986842,0.954286,0.955307,0.876923
6,500.0,0.998333,1.0,0.995025,0.952857,0.949721,0.876289
7,1000.0,0.998889,1.0,0.996678,0.952857,0.949721,0.876289
8,2000.0,0.999444,1.0,0.998336,0.955714,0.944134,0.889474


In [19]:
## get model with Precision = 1 and biggest Accuracy
best_model_idx =  models[models['CV Precision']==1]['CV Accuracy'].idxmax()
best_C = C_values[best_model_idx]

models.iloc[best_model_idx, :]

svm = SVC(C = best_C)
svm.fit(X_train, y_train) 

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
train_confusion_matrix = confusion_matrix(y_train, svm.predict(X_train))

In [21]:
print(train_confusion_matrix)

[[1200    0]
 [ 596    4]]


In [22]:
cv_confusion_matrix = confusion_matrix(y_cval, svm.predict(X_cval))

In [23]:
print(cv_confusion_matrix)

[[521   0]
 [177   2]]


In [24]:
### Training MultinomialNB
## Set different values of Alpha
alpha_values = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 20]
train_score = np.zeros(len(alpha_values))
train_recall = np.zeros(len(alpha_values))
train_precision = np.zeros(len(alpha_values))

cv_score = np.zeros(len(alpha_values))
cv_recall = np.zeros(len(alpha_values))
cv_precision = np.zeros(len(alpha_values))

idx = 0
for alpha_val in alpha_values:
    mNB = MultinomialNB(alpha = alpha_val)
    mNB.fit(X_train, y_train)    
    
    train_score[idx] = mNB.score(X_train, y_train)
    mNB_predictions = mNB.predict(X_train)
    train_recall[idx] = metrics.recall_score(y_train, mNB_predictions)
    train_precision[idx] = metrics.precision_score(y_train, mNB_predictions)
    
    cv_score[idx] = mNB.score(X_cval, y_cval)
    mNB_predictions = mNB.predict(X_cval)
    cv_recall[idx] = metrics.recall_score(y_cval, mNB_predictions)
    cv_precision[idx] = metrics.precision_score(y_cval, mNB_predictions)
    
    idx += 1

In [25]:
matrix = np.matrix(np.c_[alpha_values, train_score, train_recall, train_precision, cv_score, cv_recall, cv_precision])
models = pd.DataFrame(data = matrix, columns = ['Alpha', 'Train Accuracy', 'Train Recall', 'Train Precision', 'CV Accuracy', 'CV Recall', 'CV Precision'])

models.head(n = 9)

Unnamed: 0,Alpha,Train Accuracy,Train Recall,Train Precision,CV Accuracy,CV Recall,CV Precision
0,0.001,0.942222,0.856667,0.966165,0.942857,0.849162,0.921212
1,0.005,0.941667,0.855,0.966102,0.942857,0.849162,0.921212
2,0.01,0.941111,0.855,0.964286,0.944286,0.849162,0.926829
3,0.05,0.940556,0.853333,0.964218,0.945714,0.849162,0.932515
4,0.1,0.94,0.851667,0.964151,0.945714,0.849162,0.932515
5,0.5,0.938889,0.85,0.962264,0.944286,0.849162,0.926829
6,1.0,0.938333,0.848333,0.962193,0.945714,0.854749,0.927273
7,10.0,0.932778,0.828333,0.965049,0.934286,0.832402,0.90303
8,20.0,0.927778,0.81,0.968127,0.932857,0.798883,0.928571


In [26]:
## Get index of model with max precision
best_model_index = models['CV Precision'].idxmax()
best_alpha = alpha_values[best_model_index]

models.iloc[best_model_index, :]

mNB = MultinomialNB(alpha = best_alpha)
mNB.fit(X_train, y_train)

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

In [27]:
X_train_bNB, y_train_bNB = prepare_data_files(TRAIN_DATA_PATH, 'bernoulliNB')
X_cval_bNB, y_cval_bNB = prepare_data_files(CROSSVAL_DATA_PATH, 'bernoulliNB')

In [28]:
### Training MultinomialNB
## Set different values of Alpha
alpha_values = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 20]
train_score = np.zeros(len(alpha_values))
train_recall = np.zeros(len(alpha_values))
train_precision = np.zeros(len(alpha_values))

cv_score = np.zeros(len(alpha_values))
cv_recall = np.zeros(len(alpha_values))
cv_precision = np.zeros(len(alpha_values))

idx = 0
for alpha_val in alpha_values:
    bNB = BernoulliNB(alpha = alpha_val)
    bNB.fit(X_train_bNB, y_train_bNB)    
    
    train_score[idx] = bNB.score(X_train_bNB, y_train_bNB)
    bNB_predictions = bNB.predict(X_train_bNB)
    train_recall[idx] = metrics.recall_score(y_train_bNB, bNB_predictions)
    train_precision[idx] = metrics.precision_score(y_train_bNB, bNB_predictions)
    
    cv_score[idx] = bNB.score(X_cval_bNB, y_cval_bNB)
    bNB_predictions = bNB.predict(X_cval_bNB)
    cv_recall[idx] = metrics.recall_score(y_cval_bNB, bNB_predictions)
    cv_precision[idx] = metrics.precision_score(y_cval_bNB, bNB_predictions)
    
    idx += 1

In [29]:
matrix = np.matrix(np.c_[alpha_values, train_score, train_recall, train_precision, cv_score, cv_recall, cv_precision])
models = pd.DataFrame(data = matrix, columns = ['Alpha', 'Train Accuracy', 'Train Recall', 'Train Precision', 'CV Accuracy', 'CV Recall', 'CV Precision'])

models.head(n = 9)

Unnamed: 0,Alpha,Train Accuracy,Train Recall,Train Precision,CV Accuracy,CV Recall,CV Precision
0,0.001,0.901667,0.765,0.927273,0.914286,0.776536,0.874214
1,0.005,0.901667,0.765,0.927273,0.908571,0.776536,0.852761
2,0.01,0.900556,0.763333,0.925253,0.907143,0.776536,0.847561
3,0.05,0.896667,0.761667,0.914,0.902857,0.77095,0.836364
4,0.1,0.894444,0.758333,0.91,0.902857,0.77095,0.836364
5,0.5,0.885,0.741667,0.895372,0.895714,0.748603,0.82716
6,1.0,0.877778,0.721667,0.890947,0.887143,0.72067,0.816456
7,10.0,0.781111,0.436667,0.823899,0.811429,0.430168,0.719626
8,20.0,0.733333,0.28,0.777778,0.782857,0.268156,0.695652


In [30]:
## Get index of model with max precision
best_model_index = models['CV Precision'].idxmax()
best_alpha = alpha_values[best_model_index]

models.iloc[best_model_index, :]

bNB = BernoulliNB(alpha = best_alpha)
bNB.fit(X_train, y_train)

BernoulliNB(alpha=0.001, binarize=0.0, class_prior=None, fit_prior=True)