In [1]:
import os
import re
import random
import concurrent.futures
# from timeit import default_timer as timer

# Utils
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# NOTE: classification_report contains accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

# print(joblib.cpu_count())
# print(os.cpu_count())

## Loading Data

The first step is to load our sample data for both spam and ham. For this, we created a utility called loader which, given an input file path, reads the content from the file and append it to a python list.

In [2]:
def loader(file_input):
    data = []
    for (dirpath, dirnames, filenames) in os.walk(file_input):
        for file in filenames:
            path = os.path.join(dirpath, file)
            with open(path, encoding='latin-1') as f:
                data.append(f.read())
                f.close()
    return data

In [3]:
file_input = './data/enron1/ham'
ham = loader(file_input)

In [4]:
file_input = './data/enron1/spam'
spam = loader(file_input)

In [5]:
ham_data = [(words, 0) for words in ham] # 0 for ham
spam_data = [(words, 1) for words in spam] # 1 for spam
all_data = spam_data + ham_data
'ok'

'ok'

In [6]:
all_data = np.array(all_data)
X = all_data[:, 0]
y = all_data[:, 1]
'ok'

'ok'

In [7]:
class Mapper(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None, **fit_params):
        return self
    
    def transform(self, X):
        return X.toarray()

In [8]:
class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip = True, stem = True, symbols = True, stemmer = None):
        self.strip = strip
        self.stem = stem
        self.symbols = symbols
        self.stemmer = stemmer or PorterStemmer()
        self.pattern = re.compile(r'\W')
        
    def fit(self, X, y = None, **fit_params):
        return self

    def transform(self, X):
        # ProcessPoolExecutor is for CPU intensive stuff.
        # ThreadPoolExecutor is better suited for network operations or I/O.
        with concurrent.futures.ProcessPoolExecutor(max_workers = os.cpu_counts()) as executor:
            pids = [executor.submit(self.tokenize, sentence) for sentence in X]
            X_done = [pid.result() for pid in pids]
            return X_done
    
    def tokenize(self, sentence):
        words = word_tokenize(sentence)
        out = []
        for word in words:
            if self.symbols and self.pattern.search(word):
                continue
            word = word.strip() if self.strip else word
            word = self.stemmer.stem(word) if self.stem else word
            out.append(word)
        return ' '.join(out)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
'ok'

'ok'

In [10]:
def linear_svc_clf():
    from sklearn.svm import LinearSVC
    
    estimators = [('linear_svc', LinearSVC())]
    param_grid = dict(linear_svc__C = [1, 10, 100],
                      linear_svc__random_state = [42],
                      linear_svc__max_iter = [1000, 10000])

    return estimators, param_grid

In [11]:
def multinomial_nb_clf():
    from sklearn.naive_bayes import MultinomialNB
    
    estimators = [('multinomial_nb', MultinomialNB())]
    param_grid = dict(multinomial_nb__alpha = [1, 10, 100],
                      multinomial_nb__fit_prior = [True],
                      multinomial_nb__class_prior = [None])

    return estimators, param_grid

In [12]:
def random_forest_clf():
    from sklearn.ensemble import RandomForestClassifier

    estimators = [('random_forest', RandomForestClassifier())]
    param_grid = dict(random_forest__n_estimators = [10, 20, 30],
                      random_forest__criterion = ['gini', 'entropy'],
                      random_forest__max_features = ['auto', 'sqrt', 'log2'],
#                       random_forest__max_depth = [None],
                      random_forest__min_samples_split = [2],
                      random_forest__min_samples_leaf = [1],
                      random_forest__min_weight_fraction_leaf = [0],
                      random_forest__max_leaf_nodes = [None],
                      random_forest__min_impurity_decrease = [0],
                      random_forest__bootstrap = [True],
                      random_forest__oob_score = [False],
                      random_forest__n_jobs = [-1],
                      random_forest__random_state = [42],
                      random_forest__warm_start = [False],
                      random_forest__class_weight = ['balanced'])

    return estimators, param_grid

In [13]:
def gaussian_nb_clf():
    from sklearn.naive_bayes import GaussianNB
    
    estimators = [('gaussian_nb', GaussianNB())]
    param_grid = dict()
    return estimators, param_grid

In [14]:
classifiers = [# linear_svc_clf(),
               # multinomial_nb_clf(),
               random_forest_clf(),
               gaussian_nb_clf()]

def build_model(estimators, param_grid):
    clf_name = estimators[0][0]
    print('Start')
    print('Classifier name: {}'.format(clf_name))
    
    pipeline_estimators = [('transformer', Transformer()),
                           ('vectorizer', CountVectorizer(stop_words = 'english')),
                           ('tfidf', TfidfTransformer()),
                           ('mapper', Mapper()),
                           estimators[0]]
    
    pipeline = Pipeline(pipeline_estimators)
    clf = GridSearchCV(pipeline, param_grid = param_grid)
    clf.fit(X_train, y_train)
    
    print('Best params:\n', clf.best_params_, '\n')
    print('Classifier results:\n', clf.cv_results_, '\n')
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_true = y_test, 
                          y_pred = y_pred, 
                          labels = ['1', '0'])
    print('Confusion matrix:\n', cm)
    report = classification_report(y_test, 
                                   y_pred, 
                                   target_names = ['0', '1'])
    
    # Save model
    print('Classification report:\n', report)
    print('End')
    return clf_name, clf

# Unable to parallelize: Multiprocessing backed parallel loops cannot be nested below threads, setting n_jobs=1
# with concurrent.futures.ThreadPoolExecutor(max_workers = 4) as executor:
#     futures = { executor.submit(build_model, estimators, param_grid): estimators[0][0]
#                for (estimators, param_grid) in classifiers}
#     for future in concurrent.futures.as_completed(future_to_url):
#         out = futures[future]
#         (clf_name, clf) = future.result()
#     print('model saved as {}.pkl'.format(clf_name))
#     joblib.dump(clf, '{}.pkl'.format(clf_name)) 

results = [build_model(estimators, param_grid) 
           for (estimators, param_grid) in classifiers]

for (clf_name, clf) in results:
    joblib.dump(clf, '{}.pkl'.format(clf_name)) 
    print('model saved as {}.pkl'.format(clf_name))

Start
Classifier name: random_forest
Best params:
 {'random_forest__bootstrap': True, 'random_forest__class_weight': 'balanced', 'random_forest__criterion': 'entropy', 'random_forest__max_features': 'auto', 'random_forest__max_leaf_nodes': None, 'random_forest__min_impurity_decrease': 0, 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 2, 'random_forest__min_weight_fraction_leaf': 0, 'random_forest__n_estimators': 20, 'random_forest__n_jobs': -1, 'random_forest__oob_score': False, 'random_forest__random_state': 42, 'random_forest__warm_start': False} 

Classifier results:
 {'mean_fit_time': array([  14.98731804,   14.53175004,   13.63925799,   12.15075421,
         12.70283731,   13.35229111,   11.81410162,   11.9166801 ,
         12.22733434,  102.97441045,   14.40297143,   14.48356374,
         13.2359523 ,   13.28904843,   14.42465528,   12.65140525,
         13.09362133,   14.4357899 ]), 'std_fit_time': array([  6.20726309e-01,   2.52685225e-01,   1.3336880

In [19]:
clf = joblib.load('gaussian_nb.pkl')
# clf = joblib.load('random_forest.pkl')

4
4


In [20]:
features = ['sexy babe', 'hello world']
clf.predict(features)

array(['1', '0'], 
      dtype='<U1')