In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from html.parser import HTMLParser
import re
import os
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pickle
from sklearn.svm import SVR

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if(tag=='a'):
            for (att, val) in attrs:
                if (att=='href'):
                    self.fed.append(' hreflink ')
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


vocab = []
word_list = list(set(words.words()))
labels = []

def read_unlabeled_email(filename):
    try:
        with open(filename, 'r') as File:  
            return re.sub(r'[^a-zA-Z]', ' ', strip_tags(re.sub(r'http\S+', ' hreflink ', File.read()))).strip().lower()
    except UnicodeDecodeError:
        print('couldnt read {!s}',filename)
        return None
def read_email(filename,label):
    print(filename)
    try:
        with open(filename, 'r') as File:  
            
            email = re.sub(r'[^a-zA-Z]', ' ', strip_tags(re.sub(r'http\S+', ' hreflink ', File.read()))).strip().lower()
            for word in word_tokenize(email):
                if (word in word_list) and (len(word) >= 2):
                    vocab.append(word)
            labels.append(label)
            return email
    except UnicodeDecodeError:
        print('couldnt read {!s}',filename)
        return None



    




In [None]:
corpus = []
spam_train_path = './training/spam/'
for filename in os.listdir(spam_train_path):
    em = read_email((spam_train_path+filename),1)
    if (em is None):
        continue
    corpus.append(em)
    
    
ham_train_path = './training/ham/'
for filename in os.listdir(ham_train_path):
    em = read_email((ham_train_path+filename),0)
    if (em is None):
        continue
    corpus.append(em)


vocab = list(set(vocab))
print(len(vocab))

#saving vars
f = open('./vars/corpus.pckl', 'wb')
pickle.dump(corpus, f)
f.close()

f = open('./vars/vocab.pckl', 'wb')
pickle.dump(vocab, f)
f.close()

f = open('./vars/labels.pckl', 'wb')
pickle.dump(labels, f)
f.close()



In [4]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from html.parser import HTMLParser
import re
import os
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pickle
from sklearn.svm import SVR


# dumping vars to files (processing takes a long time, so it's better to do it once and save the vars into a file for later use)
f = open('./vars/corpus.pckl', 'rb')
corpus = pickle.load(f)
f.close()

f = open('./vars/vocab.pckl', 'rb')
vocab = pickle.load(f)
f.close()

f = open('./vars/labels.pckl', 'rb')
labels = pickle.load(f)
f.close()

#vectorization tf-idf style
vectorizer = TfidfVectorizer(vocabulary=vocab)
tfidf = vectorizer.fit_transform(corpus)
#print(tfidf.toarray())
print(len(vectorizer.get_feature_names()))
#print(vectorizer.idf_)


#SVM model

#Trying different params
C_params = [0.1,1,10,100,1000]
gamma_params = [0.01,0.1,1,10]
svr_rbf_array = []

start = time.time()
for C_param in C_params:
    for gamma_param in gamma_params:
        svr_rbf = SVR(kernel='rbf',C=C_param, gamma=gamma_param)
        svr_rbf.fit(tfidf.toarray(), labels) 
        svr_rbf_array.append(svr_rbf)
end = time.time()
print('done in %.2f s' % (end - start))



12687
done in 1978.64 s


In [25]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# Cross-validation
ham_cv_corpus = []
spam_cv_corpus = []
ham_cv_path = './cross_validation/ham/'
spam_cv_path = './cross_validation/spam/'


# Testings
ham_test_corpus = []
spam_test_corpus = []
ham_test_path = './testing/ham/'
spam_test_path = './testing/spam/'


# read ham cross-validation data
for filename in os.listdir(ham_cv_path):
    em = read_unlabeled_email((ham_cv_path+filename))
    if(em is None):
        continue
    ham_cv_corpus.append(em)

    
# read spam cross-validation data
for filename in os.listdir(spam_cv_path):
    em = read_unlabeled_email((spam_cv_path+filename))
    if(em is None):
        continue
    spam_cv_corpus.append(em)

    
spam_cv_tfidf = vectorizer.fit_transform(spam_cv_corpus)
ham_cv_tfidf = vectorizer.fit_transform(ham_cv_corpus)

#pred = svr_rbf.predict(spam_cv_tfidf.toarray())
#print('performance on spam cross-validation set {0:.0%}'.format(sum(num >= 0.5 for num in pred)/len(pred)))


preds_spam = []
preds_ham = []
for m in svr_rbf_array:
    pred = m.predict(ham_cv_tfidf.toarray())
    preds_ham.append(sum(num < 0.5 for num in pred)/len(pred))
    pred = m.predict(spam_cv_tfidf.toarray())
    preds_spam.append(sum(num >= 0.5 for num in pred)/len(pred))
    
print('done')

    







done


In [35]:
zip_preds = zip(preds_ham, preds_spam)
best_svm = svr_rbf_array[np.argmax([x + y for x, y in zip_preds])]
print(best_svm)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [36]:
# read ham test data
i = 0
for filename in os.listdir(ham_test_path):
    #i = i+1
    #if(i>302):
    #    break
    em = read_unlabeled_email((ham_test_path+filename))
    if(em is None):
        continue
    ham_test_corpus.append(em)

    
# read spam test data
i = 0
for filename in os.listdir(spam_test_path):
    #i = i+1
    #if(i>302):
    #    break
    em = read_unlabeled_email((spam_test_path+filename))
    if(em is None):
        continue
    spam_test_corpus.append(em)

    
spam_test_tfidf = vectorizer.fit_transform(spam_test_corpus)
ham_test_tfidf = vectorizer.fit_transform(ham_test_corpus)

pred = best_svm.predict(spam_test_tfidf.toarray())
print('performance on spam test set {0:.0%}'.format(sum(num >= 0.5 for num in pred)/len(pred)))

pred = best_svm.predict(ham_test_tfidf.toarray())
print('performance on ham test set {0:.0%}'.format(sum(num < 0.5 for num in pred)/len(pred)))



performance on spam test set 96%
performance on ham test set 93%
