In [9]:
import numpy as np
import pandas as ps
import matplotlib.pyplot as plt
import os
import sklearn
import sklearn.datasets as skd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model, naive_bayes

In [10]:
# I had to load the data this way because the skd.load_files didn't work for some reason
bare_train_data = []
bare_train_target = []
bare_test_data = []
bare_test_target = []
path = "lingspam_public/bare/"
for n in range(1,9):
    for filename in os.listdir(path + "part{}".format(n)):
        f = open(path + "part{}/".format(n) +filename, "r")
        bare_train_data.append(f.read())
        bare_train_target.append((filename[:3] == 'spm'))
        
for filename in os.listdir(path+"part10"):
    f = open(path+"part10/"+filename, "r")
    bare_test_data.append(f.read())
    bare_test_target.append((filename[:3] == 'spm'))


In [11]:

lemm_train_data = []
lemm_train_target = []
lemm_test_data = []
lemm_test_target = []
path = "lingspam_public/lemm/"
for n in range(1,9):
    for filename in os.listdir(path + "part{}".format(n)):
        f = open(path + "part{}/".format(n) +filename, "r")
        lemm_train_data.append(f.read())
        lemm_train_target.append((filename[:3] == 'spm'))
        
for filename in os.listdir(path+"part10"):
    f = open(path+"part10/"+filename, "r")
    lemm_test_data.append(f.read())
    lemm_test_target.append((filename[:3] == 'spm'))


In [12]:

lemm_stop_train_data = []
lemm_stop_train_target = []
lemm_stop_test_data = []
lemm_stop_test_target = []
path = "lingspam_public/lemm_stop/"
for n in range(1,9):
    for filename in os.listdir(path + "part{}".format(n)):
        f = open(path + "part{}/".format(n) +filename, "r")
        lemm_stop_train_data.append(f.read())
        lemm_stop_train_target.append((filename[:3] == 'spm'))
        
for filename in os.listdir(path+"part10"):
    f = open(path+"part10/"+filename, "r")
    lemm_stop_test_data.append(f.read())
    lemm_stop_test_target.append((filename[:3] == 'spm'))

In [13]:
count_vect = CountVectorizer()
X_train_bare = count_vect.fit_transform(bare_train_data)
X_test_bare = count_vect.transform(bare_test_data)
X_test_bare.shape

(291, 53456)

In [14]:
X_train_lemm = count_vect.fit_transform(lemm_train_data)
X_test_lemm = count_vect.transform(lemm_test_data)

In [15]:
X_train_lemm_stop = count_vect.fit_transform(lemm_stop_train_data)
X_test_lemm_stop = count_vect.transform(lemm_stop_test_data)

In [16]:
from sklearn.metrics import mutual_info_score

# this function gets the Informatino Gain on each word and returns the words sorted on IG
def get_words_IG(data, y, words):
    IG = np.zeros(data.shape[1])
    for j in range(0,data.shape[1]):
        IG[j] = sklearn.metrics.mutual_info_score(data[:,j].toarray()[:,0], y)
    return [x for _,x in sorted(zip(-IG,words))], sorted(-IG)    
    


In [17]:
#I get a sorted list of words
words,IG = get_words_IG(X_test_lemm_stop, lemm_stop_test_target, count_vect.get_feature_names())

In [18]:
# I then create all the datasets i need for testing using sklearn
count_vect_tf_10 = CountVectorizer(vocabulary=words[:10])
X_train_tf_10 = count_vect_tf_10.fit_transform(lemm_stop_train_data)
X_test_tf_10 = count_vect_tf_10.transform(lemm_stop_test_data)
count_vect_tf_10.get_feature_names()

['1998',
 'language',
 'university',
 'linguistic',
 'papers',
 'conference',
 'remove',
 'click',
 'free',
 'research']

In [19]:
count_vect_tf_100 = CountVectorizer(vocabulary=words[:100])
X_train_tf_100 = count_vect_tf_100.fit_transform(lemm_stop_train_data)
X_test_tf_100 = count_vect_tf_100.transform(lemm_stop_test_data)

In [20]:
count_vect_tf_1000 = CountVectorizer(vocabulary=words[:1000])
X_train_tf_1000 = count_vect_tf_1000.fit_transform(lemm_stop_train_data)
X_test_tf_1000 = count_vect_tf_1000.transform(lemm_stop_test_data)

In [21]:
count_vect_bin_10 = CountVectorizer(vocabulary=words[:10], binary=True)
X_train_bin_10 = count_vect_bin_10.fit_transform(lemm_stop_train_data)
X_test_bin_10 = count_vect_bin_10.transform(lemm_stop_test_data)
count_vect_bin_10.get_feature_names()

['1998',
 'language',
 'university',
 'linguistic',
 'papers',
 'conference',
 'remove',
 'click',
 'free',
 'research']

In [22]:
count_vect_bin_100 = CountVectorizer(vocabulary=words[:100], binary=True)
X_train_bin_100 = count_vect_bin_100.fit_transform(lemm_stop_train_data)
X_test_bin_100 = count_vect_bin_100.transform(lemm_stop_test_data)

In [23]:
count_vect_bin_1000 = CountVectorizer(vocabulary=words[:1000], binary=True)
X_train_bin_1000 = count_vect_bin_1000.fit_transform(lemm_stop_train_data)
X_test_bin_1000 = count_vect_bin_1000.transform(lemm_stop_test_data)

In [24]:
bern = naive_bayes.BernoulliNB()
bern.fit(X_train_bin_10, lemm_stop_train_target)
bern_y_hat = bern.predict(X_test_bin_10)
bern_acc = bern.score(X_test_bin_10, lemm_stop_test_target)
bern_prec = sklearn.metrics.precision_score(lemm_stop_test_target, bern_y_hat)
bern_rec = sklearn.metrics.recall_score(lemm_stop_test_target, bern_y_hat)
print("bern 10 acc: {}\tprecision: {}\trecall: {}".format(bern_acc, bern_prec, bern_rec))

bern.fit(X_train_bin_100, lemm_stop_train_target)
bern_y_hat = bern.predict(X_test_bin_100)
bern_acc = bern.score(X_test_bin_100, lemm_stop_test_target)
bern_prec = sklearn.metrics.precision_score(lemm_stop_test_target, bern_y_hat)
bern_rec = sklearn.metrics.recall_score(lemm_stop_test_target, bern_y_hat)
print("bern 100 acc: {}\tprecision: {}\recall: {}".format(bern_acc, bern_prec, bern_rec))

bern.fit(X_train_bin_1000, lemm_stop_train_target)
bern_y_hat = bern.predict(X_test_bin_1000)
bern_acc = bern.score(X_test_bin_1000, lemm_stop_test_target)
bern_prec = sklearn.metrics.precision_score(lemm_stop_test_target, bern_y_hat)
bern_rec = sklearn.metrics.recall_score(lemm_stop_test_target, bern_y_hat)
print("bern 1000 acc: {}\tprecision: {}\trecall: {}".format(bern_acc, bern_prec, bern_rec))


bern 10 acc: 0.9862542955326461	precision: 0.9411764705882353	recall: 0.9795918367346939
bern 100 acc: 0.9965635738831615	precision: 1.0ecall: 0.9795918367346939
bern 1000 acc: 0.9896907216494846	precision: 1.0	recall: 0.9387755102040817


In [25]:
nm_bin = naive_bayes.MultinomialNB()
nm_bin.fit(X_train_bin_10, lemm_stop_train_target)
nm_bin_y_hat = nm_bin.predict(X_test_bin_10)
nm_bin_acc = nm_bin.score(X_test_bin_10, lemm_stop_test_target)
nm_bin_prec = sklearn.metrics.precision_score(lemm_stop_test_target, nm_bin_y_hat)
nm_bin_rec = sklearn.metrics.recall_score(lemm_stop_test_target, nm_bin_y_hat)
print("multinomial binary  10 acc: {}\tprecision: {}\trecall: {}".format(nm_bin_acc, nm_bin_prec, nm_bin_rec))

nm_bin = naive_bayes.MultinomialNB()
nm_bin.fit(X_train_bin_100, lemm_stop_train_target)
nm_bin_y_hat = nm_bin.predict(X_test_bin_100)
nm_bin_acc = nm_bin.score(X_test_bin_100, lemm_stop_test_target)
nm_bin_prec = sklearn.metrics.precision_score(lemm_stop_test_target, nm_bin_y_hat)
nm_bin_rec = sklearn.metrics.recall_score(lemm_stop_test_target, nm_bin_y_hat)
print("multinomial binary  100 acc: {}\tprecision: {}\trecall: {}".format(nm_bin_acc, nm_bin_prec, nm_bin_rec))


nm_bin = naive_bayes.MultinomialNB()
nm_bin.fit(X_train_bin_1000, lemm_stop_train_target)
nm_bin_y_hat = nm_bin.predict(X_test_bin_1000)
nm_bin_acc = nm_bin.score(X_test_bin_1000, lemm_stop_test_target)
nm_bin_prec = sklearn.metrics.precision_score(lemm_stop_test_target, nm_bin_y_hat)
nm_bin_rec = sklearn.metrics.recall_score(lemm_stop_test_target, nm_bin_y_hat)
print("multinomial binary  1000 acc: {}\tprecision: {}\trecall: {}".format(nm_bin_acc, nm_bin_prec, nm_bin_rec))

multinomial binary  10 acc: 0.9484536082474226	precision: 0.9473684210526315	recall: 0.7346938775510204
multinomial binary  100 acc: 0.9896907216494846	precision: 1.0	recall: 0.9387755102040817
multinomial binary  1000 acc: 0.9896907216494846	precision: 1.0	recall: 0.9387755102040817


In [26]:
mn_tf = naive_bayes.MultinomialNB()
mn_tf.fit(X_train_tf_10, lemm_stop_train_target)
mn_tf_y_hat = mn_tf.predict(X_test_tf_10)
mn_tf_acc = mn_tf.score(X_test_tf_10, lemm_stop_test_target)
mn_tf_prec = sklearn.metrics.precision_score(lemm_stop_test_target, mn_tf_y_hat)
mn_tf_rec = sklearn.metrics.recall_score(lemm_stop_test_target, mn_tf_y_hat)
print("multinomial TF  10 acc: {}\tprecision: {}\trecall: {}".format(mn_tf_acc, mn_tf_prec, mn_tf_rec))

mn_tf = naive_bayes.MultinomialNB()
mn_tf.fit(X_train_tf_100, lemm_stop_train_target)
mn_tf_y_hat = mn_tf.predict(X_test_tf_100)
mn_tf_acc = mn_tf.score(X_test_tf_100, lemm_stop_test_target)
mn_tf_prec = sklearn.metrics.precision_score(lemm_stop_test_target, mn_tf_y_hat)
mn_tf_rec = sklearn.metrics.recall_score(lemm_stop_test_target, mn_tf_y_hat)
print("multinomial TF  100 acc: {}\tprecision: {}\trecall: {}".format(mn_tf_acc, mn_tf_prec, mn_tf_rec))

mn_tf = naive_bayes.MultinomialNB()
mn_tf.fit(X_train_tf_1000, lemm_stop_train_target)
mn_tf_y_hat = mn_tf.predict(X_test_tf_1000)
mn_tf_acc = mn_tf.score(X_test_tf_1000, lemm_stop_test_target)
mn_tf_prec = sklearn.metrics.precision_score(lemm_stop_test_target, mn_tf_y_hat)
mn_tf_rec = sklearn.metrics.recall_score(lemm_stop_test_target, mn_tf_y_hat)
print("multinomial TF  1000 acc: {}\tprecision: {}\trecall: {}".format(mn_tf_acc, mn_tf_prec, mn_tf_rec))


multinomial TF  10 acc: 0.9484536082474226	precision: 0.9473684210526315	recall: 0.7346938775510204
multinomial TF  100 acc: 0.9965635738831615	precision: 1.0	recall: 0.9795918367346939
multinomial TF  1000 acc: 0.9896907216494846	precision: 1.0	recall: 0.9387755102040817


In [27]:
# I then begin to create my own SVM to evaluate the data

from sklearn import svm
from sklearn.model_selection import cross_val_score
SV = svm.SVC()


In [28]:
# I use cross validation to pick the best vocabulary size
for i in range(10,1000,10):
    cv = CountVectorizer(vocabulary=words[:i])
    X_train_svm = cv.fit_transform(lemm_stop_train_data)
    l = X_train_svm.shape[0]
    trainx = X_train_svm[:(l//4)*3,:]
    valx = X_train_svm[(l//4)*3:,:]
    SV.fit(trainx, lemm_stop_train_target[:(l//4)*3])
    print(i, " : ", SV.score(valx, lemm_stop_train_target[(l//4)*3:]))

10  :  0.9568221070811744
20  :  0.9585492227979274
30  :  0.9602763385146805
40  :  0.9654576856649395
50  :  0.9620034542314335
60  :  0.9654576856649395
70  :  0.9620034542314335
80  :  0.9568221070811744
90  :  0.9533678756476683
100  :  0.9533678756476683
110  :  0.9499136442141624
120  :  0.9516407599309153
130  :  0.9516407599309153
140  :  0.9516407599309153
150  :  0.9516407599309153
160  :  0.9481865284974094
170  :  0.9533678756476683
180  :  0.9550949913644214
190  :  0.9516407599309153
200  :  0.9516407599309153
210  :  0.9516407599309153
220  :  0.9516407599309153
230  :  0.9533678756476683
240  :  0.9499136442141624
250  :  0.9499136442141624
260  :  0.9533678756476683
270  :  0.9550949913644214
280  :  0.9568221070811744
290  :  0.9533678756476683
300  :  0.9499136442141624
310  :  0.9516407599309153
320  :  0.9516407599309153
330  :  0.9499136442141624
340  :  0.9499136442141624
350  :  0.9533678756476683
360  :  0.9533678756476683
370  :  0.9533678756476683
380  :  0.

In [29]:
# The best vocab size i got was 60
# I then use cross validatoin to get the best kernel, linear was the best
cv = CountVectorizer(vocabulary=words[:60])
X_train_svm = cv.fit_transform(lemm_stop_train_data)
l = X_train_svm.shape[0]
trainx = X_train_svm[:(l//4)*3,:]
valx = X_train_svm[(l//4)*3:,:]
kernels = ['poly', 'linear', 'rbf', 'sigmoid']
for k in kernels:
    SV = svm.SVC(kernel=k)
    SV.fit(trainx, lemm_stop_train_target[:(l//4)*3])
    print(k, " : ", SV.score(valx, lemm_stop_train_target[(l//4)*3:]))
    

poly  :  0.8963730569948186
linear  :  0.9861830742659758
rbf  :  0.9654576856649395
sigmoid  :  0.9360967184801382


In [30]:
# I then finally test it on the test data
SV = svm.SVC(kernel='linear')
SV.fit(X_train_svm, lemm_stop_train_target)
X_test_svm = cv.transform(lemm_stop_test_data)
acc = SV.score(X_test_svm, lemm_stop_test_target)
print(acc)

0.979381443298969


In [36]:
# EVAL
#loading in the data
eval_data = []
eval_target = []
path = './eval/all/'
for filename in os.listdir(path):
    f = open(path + filename, "r")
    eval_data.append(f.read())
    eval_target.append((filename[:3] == 'spm'))

# getting the X and y
X_eval = cv.transform(eval_data)
eval_y_hat = SV.predict(X_eval)

# Scoring the data
acc = SV.score(X_eval, eval_target)
prec = sklearn.metrics.precision_score(eval_target, eval_y_hat)
rec = sklearn.metrics.recall_score(eval_target, eval_y_hat)

results = open('eval/results.txt', 'w+')
for y in eval_y_hat:
    results.write('{}\n'.format(int(y)))

In [4]:
# Extra credit
NB10 = naive_bayes.MultinomialNB()
NB10.fit(X_train_bin_10, lemm_stop_train_target)
spam_emails = [x for i,x in enumerate(X_train_bin_10) if  lemm_stop_train_target[i]== True]
len(spam_emails)

NameError: name 'X_train_bin_10' is not defined

In [136]:
def add_words(email, classifier):
    for i in range(1,10-np.sum(email)):
        for j in range(0,i):
            for w in range(0,len(email)):
                if(email[0,w] == 0):
                    email[0,w] = 1
                    if(not classifier.predict(email)):
                        return email, i
                    
    return None, 0

for email in spam_emails:
    new_email, words_added = add_words(email[0].todense(), NB10)
    if(words_added):
        print("changed {} words to classify email as legit".format(words_added))
        print("from: {}\t to {}\n\n".format(email[0].todense(),new_email))



changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	 to [[1 0 0 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	 to [[1 0 0 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	 to [[1 0 0 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	 to [[1 0 0 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	 to [[1 0 0 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 1 0 0 1 0]]	 to [[1 0 0 0 0 1 0 0 1 0]]


changed 1 words to classify email as legit
from: [[0 0 1 0 0 0 0 0 0 0]]	 to [[1 0 1 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 1]]	 to [[1 0 0 0 0 0 0 0 0 1]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	 to [[1 0 0 0 0 0 0 0 0 0]]


changed 1 words to classify email as legit
from: [[0 0 0 0 0 0 0 0 0 0]]	