In [2]:
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from nltk import pos_tag,word_tokenize,sent_tokenize
import pyphen
import enchant
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer


In [7]:
data=[]
labels=[]

for fname in glob.glob('data/lingspam_public/bare/*/*'):
    with open(fname,encoding='utf-8') as f:
        data.append('\n'.join(f.readlines()))
        if("spm" in fname):
            labels.append(1)#spam
        else:
            labels.append(0) #notspam
print(len(data))
print(len(labels))
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=101)


2893
2893


In [8]:
#using a simple count vectorizer
vectorizer = CountVectorizer()
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)
print(X_train.shape)

(2314, 53930)


In [9]:
def classify(clf,X_train,X_test,y_train,y_test):
    
    clf.fit(X_train, y_train)
    y_pred=clf.predict(X_test)
    print('Precision  : ',metrics.precision_score(y_test, y_pred, average='macro'))
    print('Recall  : ',metrics.recall_score(y_test, y_pred, average='macro'))
    print('F1_score  : ',metrics.f1_score(y_test, y_pred, average='macro'))


In [10]:
#MultinomialNB with countVectorizer
clf= MultinomialNB()
print("MultinomialNB")
classify(clf, X_train, X_test, y_train, y_test)

MultinomialNB
Precision  :  0.987223148369
Recall  :  0.992027417027
F1_score  :  0.989607576836


In [11]:
#K-means neigbout with countVecotrizer
clf=KNeighborsClassifier(n_neighbors=3)
print("K-means")
classify(clf, X_train, X_test, y_train, y_test)

K-means
Precision  :  0.952777777778
Recall  :  0.911616161616
F1_score  :  0.930742062818


In [12]:
#Random Forest with countVectorizer
clf_RF=RandomForestClassifier(n_estimators=10,random_state=0)
print("Random Forest")
classify(clf, X_train, X_test, y_train, y_test)

Random Forest
Precision  :  0.952777777778
Recall  :  0.911616161616
F1_score  :  0.930742062818


In [13]:
def features_extraction(email):
    dic={
        'F1':len(sent_tokenize(email)),
        'F2':number_verbs(word_tokenize(email)),
        'F3':hasNumbers(word_tokenize(email)),
        'F4':in_spam_list(word_tokenize(email)),
        'F5':num_words_syl(word_tokenize(email))[0],
        'F6':num_words_syl(word_tokenize(email))[1],
        #'F7':num_misspelled(word_tokenize(email)),
        'F9':sum_tfidf([email])
                             }
    return dic;

In [14]:
def number_verbs(email):
    tagged=[]
    tagged+=pos_tag(email)
    return (len(set([word for (word, tag) in tagged if 'VB' in tag[:2] ])))  

In [15]:
def hasNumbers(email):
    hasD=False;
    hasA=False;
    count=0
    for word in email:
            for char in word:
                if(char.isdigit()):
                    hasD=True
                if(char.isalpha()):
                    hasA=True
            if(hasD and hasA):
                count+=1
            hasD=False
            hasA=False
    return count
                
     
                    


In [16]:
def in_spam_list(email):
    count=0
    with open('data/spamlist.txt',encoding='latin-1') as f:
        spamlist=f.readlines()
    for word in email:
        if word in spamlist:
            count+=1
    return count

In [17]:
def num_syl(word):
    dic = pyphen.Pyphen(lang='en_GB')
    syl=dic.inserted(word)
    return len(syl.split('-'))

In [18]:
def num_words_syl(email):
    count_3=0
    avg=0
    for word in email:
        if(num_syl(word)>3):
            count_3+=1
            avg+=num_syl(word)
    return (count_3, avg/len((email)))

In [19]:
def num_misspelled(email):
    count=0
    d = enchant.Dict('en_GB')
    for word in email:
        if(not d.check(word)):
            count+=1
    return count

In [20]:
def sum_tfidf(email):
    vectorizer = TfidfVectorizer()
    tfidf=vectorizer.fit_transform(email).toarray()
    return np.sum(tfidf[0])

In [21]:
#apply feature extraction on the data
fe_data=[]
for email in data:
    fe_data.append(features_extraction(email))

    
X_train, X_test, y_train, y_test = train_test_split(fe_data, labels, test_size=0.2, random_state=101)





In [22]:
#transfomr list of dic to list of list
vec = DictVectorizer()
X_train= vec.fit_transform(X_train)
X_test= vec.transform(X_test)



In [23]:
#MultinomialNB with feature extraction
clf= MultinomialNB()
print("MultinomialNB")
classify(clf, X_train, X_test, y_train, y_test)

MultinomialNB
Precision  :  0.69189569143
Recall  :  0.733080808081
F1_score  :  0.70845763602


In [24]:
#K-means neigbout with feature extraction
clf=KNeighborsClassifier(n_neighbors=3)
print("K-means")
classify(clf, X_train, X_test, y_train, y_test)

K-means
Precision  :  0.738331799231
Recall  :  0.711724386724
F1_score  :  0.723683959353


In [25]:
#Random Forest with feature extraction
clf=RandomForestClassifier(n_estimators=10,random_state=0)
print("Random Forest")
classify(clf, X_train, X_test, y_train, y_test)

Random Forest
Precision  :  0.780027453672
Recall  :  0.715873015873
F1_score  :  0.741363907088
