In [13]:
import os
import re

import numpy as np
import pandas as pd

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Todo: There is snowball stemmer too
from nltk.stem.porter import PorterStemmer

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from collections import Counter

import random

## Loading Data

The first step is to load our sample data for both spam and ham. For this, we created a utility called loader which, given an input file path, reads the content from the file and append it to a python list.

In [2]:
def loader(file_input):
    data = []
    for (dirpath, dirnames, filenames) in os.walk(file_input):
        for file in filenames:
            path = os.path.join(dirpath, file)
            with open(path, encoding='latin-1') as f:
                data.append(f.read())
                f.close()
    return data

In [3]:
file_input = './data/enron1/ham'
ham = loader(file_input)

In [4]:
file_input = './data/enron1/spam'
spam = loader(file_input)

In [5]:
# Tokenize
patt = re.compile(r'\W')
stops = set(stopwords.words('english'))
ps = PorterStemmer()

def process_words(data):
    words = word_tokenize(data)
    
    # Lowercase
    words = [word.lower() for word in words]

    # Remove stop words
    words = [word for word in words if word not in stops]

    # Remove special characters
    words = [word for word in words if not patt.search(word)]

    # Remove digit
    words = [word for word in words if not word.isdigit()]
    
    # Strip
    words = [word.strip() for word in words]

    # Stem words
    words = [ps.stem(word) for word in words]
    
#     return dict([(word, True) for word in words])
    return ' '.join(words)

In [6]:
ham_data = [(process_words(words), 0) for words in ham] # 0 for ham
spam_data = [(process_words(words), 1) for words in spam] # 1 for spam
all_data = spam_data + ham_data
print('done')

done


In [7]:
all_data = np.array(all_data)
X = all_data[:, 0]
y = all_data[:, 1]

In [8]:
vectorizer = CountVectorizer(max_features=3000)
X = vectorizer.fit_transform(X).toarray()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [11]:
estimators = [('clf', LinearSVC())]
pipe = Pipeline(estimators)

param_grid = dict(clf__C = [1, 10, 100],
                  clf__random_state = [42],
                  clf__max_iter = [1000, 10000])
clf = GridSearchCV(pipe, param_grid = param_grid)
clf.fit(X_train, y_train)
'ok'

'ok'

In [18]:
print('Best parameters set found on development set:')
clf.best_params_

Best parameters set found on development set:


{'clf__C': 1, 'clf__max_iter': 1000, 'clf__random_state': 42}

In [15]:
print('Grid scores on development set:')
df = pd.DataFrame.from_dict(clf.cv_results_)

df = df[['mean_fit_time', 
         'mean_score_time',
         'mean_test_score',
         'param_clf__C',
         'param_clf__max_iter',
         'param_clf__random_state',
         'split0_test_score',
         'split1_test_score',
         'split2_test_score',
         'std_fit_time',
         'std_score_time',
         'std_test_score'
        ]]
df

Grid scores on development set:




Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_clf__C,param_clf__max_iter,param_clf__random_state,split0_test_score,split1_test_score,split2_test_score,std_fit_time,std_score_time,std_test_score
0,2.495874,1.201997,0.954978,1,1000,42,0.947186,0.955844,0.961905,0.764956,0.136037,0.00604
1,2.130771,1.050004,0.954978,1,10000,42,0.947186,0.955844,0.961905,0.311174,0.004181,0.00604
2,1.855962,1.002008,0.946898,10,1000,42,0.941126,0.948052,0.951515,0.237287,0.027871,0.004319
3,1.966587,0.972643,0.946898,10,10000,42,0.941126,0.948052,0.951515,0.25172,0.030333,0.004319
4,1.661111,0.922435,0.945743,100,1000,42,0.94026,0.945455,0.951515,0.059258,0.025758,0.0046
5,1.842381,0.929047,0.945743,100,10000,42,0.94026,0.945455,0.951515,0.115369,0.013006,0.0046


In [19]:
text = 'fake babe is amazing'
text_data = vectorizer.transform([process_words(text)]).toarray()
clf.predict(text_data)
# text_data

array(['1'], 
      dtype='<U20041')

In [20]:
text = 'hello world'
text_data = vectorizer.transform([process_words(text)]).toarray()
clf.predict(text_data)

array(['1'], 
      dtype='<U20041')

In [21]:
y_pred = clf.predict(X_test)
y_pred

array(['0', '0', '1', ..., '0', '1', '0'], 
      dtype='<U20041')

In [30]:
confusion_matrix(y_true = y_test, 
                 y_pred = y_pred, 
                 labels = ['1', '0'])

array([[ 490,   26],
       [  37, 1154]])

In [29]:
print(classification_report(y_test, y_pred, target_names = ['0', '1']))

             precision    recall  f1-score   support

          0       0.98      0.97      0.97      1191
          1       0.93      0.95      0.94       516

avg / total       0.96      0.96      0.96      1707

