In [1]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

In [2]:
def make_Dictionary(train_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words in a mail corpus.

    Every entry of ``train_dir`` is opened as a text file and only its third
    line (index 2) is tokenised on whitespace. Tokens that are not purely
    alphabetic, and single-character tokens, are discarded before ranking.

    Parameters
    ----------
    train_dir : str
        Directory containing the mail files.
    vocab_size : int, optional
        Maximum number of vocabulary entries to return. Defaults to 3000,
        the value that used to be hard-coded.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs ordered from most to least frequent. Despite
        the function name the result is a list of tuples, not a dict.
    """
    # os.listdir() yields entries in arbitrary order. Sorting keeps the
    # first-seen order of words, and therefore the tie-breaking among equally
    # frequent words at the cutoff, reproducible across machines and runs.
    emails = [os.path.join(train_dir, f) for f in sorted(os.listdir(train_dir))]

    word_counts = Counter()
    for mail in emails:
        with open(mail) as m:
            for line_no, line in enumerate(m):
                if line_no == 2:
                    word_counts.update(line.split())
                    break  # nothing after the third line is used

    # Keep only alphabetic words longer than one character.
    vocabulary = Counter({
        word: count
        for word, count in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return vocabulary.most_common(vocab_size)


In [3]:
def extract_features(mail_dir, vocabulary=None):
    """Turn every mail in ``mail_dir`` into a row of word counts.

    Only the third line (index 2) of each file is tokenised on whitespace.
    For each token that appears in the vocabulary, the cell at the token's
    vocabulary position is set to the number of times the token occurs on
    that line.

    Parameters
    ----------
    mail_dir : str
        Directory containing the mail files.
    vocabulary : sequence of (str, int), optional
        ``(word, count)`` pairs as returned by ``make_Dictionary``; a word's
        position in the sequence is its column index. Words are expected to
        be unique. When omitted, the notebook-level ``dictionary`` variable
        is used, which is what this function always read before.

    Returns
    -------
    numpy.ndarray
        Float array with one row per file and at least 3000 columns (more
        only when the vocabulary itself is longer than 3000 entries).
        Rows follow the sorted file-name order.
    """
    if vocabulary is None:
        # Backward-compatible default: fall back to the global built earlier
        # in the notebook.
        vocabulary = dictionary
    vocabulary = list(vocabulary)

    # One-time word -> column lookup instead of scanning the whole
    # vocabulary for every token.
    word_to_column = {entry[0]: column for column, entry in enumerate(vocabulary)}

    # os.listdir() yields entries in arbitrary order, while the notebook
    # assigns labels by row position; sorting makes the row order stable.
    files = [os.path.join(mail_dir, fi) for fi in sorted(os.listdir(mail_dir))]
    features_matrix = np.zeros((len(files), max(3000, len(vocabulary))))

    for docID, fil in enumerate(files):
        with open(fil) as fi:
            for line_no, line in enumerate(fi):
                if line_no == 2:
                    for word, count in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[docID, column] = count
                    break  # nothing after the third line is used
    return features_matrix

In [4]:
# Location of the training mails. The path is relative, so it is resolved
# against the kernel's current working directory.
train_dir = 'Desktop/lingspam_public/lemm_stop/train_1a8'
# Vocabulary as a list of (word, count) pairs; extract_features reads this
# module-level name directly.
dictionary = make_Dictionary(train_dir)

In [5]:
# One label per training mail, initialised to 0.
# NOTE(review): 2313 is hard-coded; it has to equal the number of files in
# train_dir (the row count of the matrix extract_features builds) — confirm.
train_labels = np.zeros(2313)




In [6]:
# Display the freshly initialised label vector.
train_labels

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [7]:
# Mark positions 1929..2312 (384 mails) as class 1, leaving the rest as 0.
# NOTE(review): presumably 1 = spam and 0 = legitimate mail. The labelling is
# purely positional, so it is only valid if the rows of the feature matrix
# follow the same file order — verify against the directory listing.
train_labels[1929:2313] = 1

In [8]:
# Display the label vector after the class-1 range has been filled in.
train_labels

array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [9]:
# Word-count features for the training mails: one row per file in train_dir,
# one column per vocabulary slot.
train_matrix = extract_features(train_dir)


In [10]:
# Display the training feature matrix.
train_matrix

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [11]:
# Two classifiers, both with default hyper-parameters, to be compared on the
# same features.
# NOTE(review): LinearSVC is created without random_state (the echoed repr
# shows random_state=None); consider fixing it if runs must be repeatable.
model1 = LinearSVC()
model2 = MultinomialNB()

In [12]:
# Train the linear SVM on the training features and labels.
model1.fit(train_matrix,train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [13]:
# Train the multinomial naive Bayes model on the same features and labels.
model2.fit(train_matrix,train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Location of the held-out mails (relative to the kernel's working directory).
test_dir = 'Desktop/lingspam_public/lemm_stop/test_9e10'
# Features for the test mails, built against the vocabulary held in the
# module-level `dictionary` that extract_features reads.
test_matrix = extract_features(test_dir)

In [15]:
# Display the test feature matrix.
test_matrix

array([[ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 7.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [16]:
# Number of rows, i.e. number of files found in test_dir.
len(test_matrix)

580

In [17]:
# Ground-truth labels for the test mails, assigned by row position:
# rows 0..382 are class 0, rows 383..579 are class 1.
# NOTE(review): this labels 197 of 580 test mails (~34%) as class 1, whereas
# the training labels mark 384 of 2313 (~17%). Confirm the boundary index 383
# against the actual file listing; a wrong boundary would distort the
# confusion matrices. The positional scheme also assumes the rows of
# test_matrix follow the same file order — verify.
test_labels = np.zeros(580)
test_labels[383:580] = 1

In [18]:
# Predicted labels from the linear SVM for every test mail.
result1 = model1.predict(test_matrix)

In [19]:
# Predicted labels from the naive Bayes model for every test mail.
result2 = model2.predict(test_matrix)

In [20]:
# Confusion matrix for the linear SVM: rows are the true labels from
# test_labels, columns are the predicted labels.
print(confusion_matrix(test_labels, result1))

[[378   5]
 [ 95 102]]


In [21]:
# Confusion matrix for the naive Bayes model: rows are the true labels from
# test_labels, columns are the predicted labels.
print(confusion_matrix(test_labels, result2))

[[382   1]
 [109  88]]
