In [1]:
%matplotlib inline

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC

In [3]:
def readFile(name):
    path = './data/ex6/' + name
    try:
        fh = open(path, 'r')
        mail = fh.read()
        fh.close()
    except Exception as e:
        print("Unable to open file", str(e))
    return mail

In [4]:
sample_mail = readFile('emailSample1.txt')


In [5]:
re.findall('\n', sample_mail)

['\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n']

In [6]:
sample_mail

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [7]:
def getVocab():
    fh_vocab = open('./data/ex6/vocab.txt', 'r')
    word_dict = {}
    vocab_list = []
    line = fh_vocab.readline()
    while (line):
    #     print("line", line)
        line = re.sub('[\t]', ' ', line)
        line = re.sub('[\n]', '', line)
        line = re.sub('[ ]+', ' ', line)
        line = line.strip()
    #     print("Line", line)
        word = line.split()
    #     print("Word", word)
        
        word_dict[word[1]] = int(word[0])
#         vocab_list.append(word_dict)
        line = fh_vocab.readline()
#     return vocab_list
    return word_dict

In [28]:
def processEmail(mail):
    mail = mail.lower()
    mail = re.sub('<.*?>', ' ', mail)
    mail = re.sub('.*@.+\.com', 'emailaddr', mail)
    mail = re.sub('(http|https):.*\.com', 'httpaddr', mail)
    mail = re.sub('[0-9]+', 'number', mail)
    mail = re.sub('\$+', 'dollar', mail)
    mail = re.sub('\n', ' ', mail)
    bad_symbols = re.compile("[><\?/\\\.\[\]:,']")
    mail = bad_symbols.sub(' ', mail)
    mail = re.sub('[ ]+',' ', mail)
    mail = mail.strip()
    print(mail)
#     print(mail)
    vocab_list = getVocab()
    ps = PorterStemmer()
    tokenized_mail = word_tokenize(mail)
    word_indices = []
    for word in tokenized_mail:
        word = ps.stem(word)
#         print(word)
        if word in vocab_list.keys():
            word_indices.append(vocab_list[word])
    
    return word_indices


In [29]:
clean_mail_indices = processEmail(sample_mail)

anyone knows how much it costs to host a web portal well it depends on how many visitors you re expecting this can be anywhere from less than number bucks a month to a couple of dollarnumber you should checkout httpaddr or perhaps amazon ecnumber if youre running something big to unsubscribe yourself from this mailing list send an email to emailaddr


In [34]:
def emailFeatures(word_indices):
    vocab_dict = getVocab()
#     print(vocab_dict)
    no_of_features = len(vocab_dict)
#     print(no_of_features)
    X = [0]*no_of_features
#     print(len(X))
    for value in vocab_dict.values():
#         print(value)
#         value = int(value)
        if value in word_indices:
#             print(index)
            X[(value - 1)] = 1
    return X

In [35]:
feature_vector = emailFeatures(clean_mail_indices)

In [40]:
data = loadmat('./data/ex6/spamTrain.mat')

In [44]:
X = data['X']
y = data['y']

In [46]:
def trainModel(C, X, y, kernel, sigma=1):
    if kernel == 'linear':
        linear_model = SVC(kernel=kernel, C= C)
        linear_model.fit(X, y)
        return linear_model
    else:
        gamma = 1/(2*(sigma**2))
        model = SVC(kernel=kernel, C = C, gamma=gamma)

In [48]:
C = 0.1
linear_model = trainModel(C, X, y.flatten(), 'linear') 

In [50]:
y_pred = linear_model.predict(X)

In [61]:
accuracy = np.mean(np.double(y_pred == y.flatten())*100)

In [52]:
accuracy

99.825

In [53]:
data_test = loadmat('./data/ex6/spamTest.mat')

In [58]:
X_test = data_test['Xtest']
y_test = data_test['ytest']

In [59]:
y_pred_test = linear_model.predict(X_test)

In [62]:
accuracy_test = np.mean(np.double(y_pred_test == y_test.flatten())*100)

In [63]:
accuracy_test

98.9

In [68]:
linear_model.decision_function(X)

array([ 2.73912663,  1.81225137, -1.00010388, ...,  1.56809078,
       -1.0001831 , -2.88377927])

In [82]:
np.where?

In [85]:
np.where((linear_model.coef_[0])==0)

(array([  58,  334,  474,  689,  702,  727,  807,  813,  889,  971, 1126,
        1127, 1128, 1129, 1131, 1132, 1134, 1138, 1140, 1142, 1147, 1193,
        1245, 1251, 1253, 1282, 1338, 1478, 1561, 1564, 1598, 1626, 1627,
        1748, 1862, 1882], dtype=int64),)

Important Concept

In [123]:
from collections import OrderedDict
t = sorted(list(enumerate(linear_model.coef_[0])),key=lambda e: e[1], reverse=True)
d = OrderedDict(t)
# print(d.keys())
# print(d.values())
idx = list(d.keys())
weight = list(d.values())
vocabList = getVocab()
# print(vocabList)
vocabList = {v:k for k,v in vocabList.items()}
# vocabList is word:index dictionary list it needs to be changed in index:word dictionary
print ('Top predictors of spam: ')
for i in range(15):
#     print(idx[i])
    print (vocabList[idx[i]], weight[i])

print ('Program paused. Press enter to continue.')

Top predictors of spam: 
otherwis 0.5006137361746403
clearli 0.465916390688888
remot 0.42286911706104086
gt 0.38362160179406524
visa 0.367710398245535
base 0.3450640979461706
doesn 0.3236320357963838
wife 0.2697241060374008
previous 0.2672977146177071
player 0.2611688867001495
mortgag 0.2572981979518164
natur 0.2539414551595324
ll 0.25346652431419925
futur 0.24829699045568662
hot 0.24640435783158998
Program paused. Press enter to continue.
