In [23]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

# %%writefile file_name.py
# %load file_name.py

## Loading Data

The first step is to load our sample data for both spam and ham. For this, we created a utility called `loader` which, given an input file path, reads the content of the file and appends it to a Python list.

In [24]:
# %load train_server/utils/csv.py
import pandas as pd

def to_csv(file_name, features, labels):
    """Persist features and labels side by side as a two-column CSV file.

    The frame is written with pandas' default settings, so the row index is
    stored as an unnamed leading column next to ``features`` and ``labels``.
    A confirmation line is printed once the file has been written.
    """
    destination = '{}'.format(file_name)
    columns = {'features': features, 'labels': labels}
    pd.DataFrame(columns).to_csv(destination)
    print('Wrote to {}'.format(file_name))

def read_csv(file_name):
    """Load the ``features`` and ``labels`` columns of a CSV file.

    Parameters
    ----------
    file_name : str or path-like
        CSV file containing at least a ``features`` and a ``labels`` column.
        Any other column (e.g. the index column written by ``to_csv``) is
        ignored.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows,)
        The ``features`` column, values as parsed by pandas.
    y : numpy.ndarray of str, shape (n_rows,)
        The ``labels`` column with every value converted to a string.

    Raises
    ------
    KeyError
        If either expected column is missing from the file.
    """
    df = pd.read_csv(file_name)
    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the column and
    # calling to_numpy() yields the same values as a 1-D array.
    X = df['features'].to_numpy().flatten()
    # Cast on the NumPy side so y keeps a fixed-width unicode dtype, as before.
    y = df['labels'].to_numpy().astype(str).flatten()
    return X, y

In [25]:
# %load train_server/pipeline/array_transformer.py
from sklearn.base import BaseEstimator, TransformerMixin

class ArrayTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that converts a sparse matrix into a dense array.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only changes the container type of its input.
    """

    def fit(self, X, y = None, **fit_params):
        """Do nothing and return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray()`` (such as SciPy sparse matrices) are
        densified with it. Anything else is assumed to be dense already and
        is returned unchanged instead of raising ``AttributeError``.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return X

In [26]:
# %load train_server/pipeline/nltk_preprocessor.py
import os
import re

from concurrent.futures import ProcessPoolExecutor

from sklearn.base import BaseEstimator, TransformerMixin

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw text documents with NLTK.

    Each document is tokenised with ``word_tokenize``; the tokens are then
    optionally filtered, stripped and stemmed, and finally re-joined with
    single spaces so the result is again one plain string per document.

    Parameters
    ----------
    strip : bool, default True
        Strip leading/trailing whitespace from every token.
    stem : bool, default True
        Replace every token by ``stemmer.stem(token)``.
    symbols : bool, default True
        Drop every token that contains at least one character matched by
        ``self.pattern`` (a non-word character, e.g. punctuation).
    stemmer : object, optional
        Object providing a ``stem(word)`` method. A ``PorterStemmer`` is
        created when the argument is omitted (or falsy).
    """

    # ProcessPoolExecutor raises ValueError for more than 61 workers on
    # Windows, so the pool size is never allowed to exceed this value.
    _MAX_POOL_SIZE = 61

    def __init__(self, strip = True, stem = True, symbols = True, stemmer = None):
        self.strip = strip
        self.stem = stem
        self.symbols = symbols
        self.stemmer = stemmer or PorterStemmer()
        # Matches any single non-word character; used to detect symbol tokens.
        self.pattern = re.compile(r'\W')

    def fit(self, X, y = None, **fit_params):
        """Do nothing and return ``self``; the preprocessing has no state to learn."""
        return self

    def transform(self, X):
        """Tokenise every document of ``X`` in a pool of worker processes.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of str
            One normalised string per input document, in input order.
        """
        # NOTE(review): the worker processes must be able to unpickle this
        # instance; with the 'spawn' start method a class defined only in a
        # notebook cell may not be importable there - confirm on the target
        # platform.
        documents = list(X)
        if not documents:
            return []

        # os.cpu_count() may return None when the count cannot be determined.
        cpu_count = os.cpu_count() or 1
        # Never start more processes than there are documents to handle.
        n_workers = min(len(documents), cpu_count * 5, self._MAX_POOL_SIZE)
        # Send documents in batches so the bound method (and therefore this
        # instance) is pickled once per batch rather than once per document.
        chunksize = max(1, len(documents) // (n_workers * 4))

        with ProcessPoolExecutor(max_workers = n_workers) as executor:
            # map() yields the results in input order and re-raises any
            # exception raised by a worker.
            return list(executor.map(self.tokenize, documents, chunksize = chunksize))

    def tokenize(self, sentence):
        """Normalise a single document and return it as a space-joined string."""
        words = word_tokenize(sentence)
        out = []
        for word in words:
            # Skip tokens containing a non-word character when filtering is on.
            if self.symbols and self.pattern.search(word):
                continue
            word = word.strip() if self.strip else word
            word = self.stemmer.stem(word) if self.stem else word
            out.append(word)
        return ' '.join(out)


In [27]:
# Load the feature column (X) and the string-typed label column (y) from the CSV file.
X, y = read_csv('train_server/data.csv')
# Sanity check: print both array shapes plus the first sample and its label.
print(X.shape, y.shape, X[0], y[0])

(5172,) (5172,) Subject: what up , , your cam babe
what are you looking for ?
if your looking for a companion for friendship , love , a date , or just good ole '
fashioned * * * * * * , then try our brand new site ; it was developed and created
to help anyone find what they ' re looking for . a quick bio form and you ' re
on the road to satisfaction in every sense of the word . . . . no matter what
that may be !
try it out and youll be amazed .
have a terrific time this evening
copy and pa ste the add . ress you see on the line below into your browser to come to the site .
http : / / www . meganbang . biz / bld / acc /
no more plz
http : / / www . naturalgolden . com / retract /
counterattack aitken step preemptive shoehorn scaup . electrocardiograph movie honeycomb . monster war brandywine pietism byrne catatonia . encomia lookup intervenor skeleton turn catfish .
 1


In [28]:
# Hold out 33% of the samples for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
# Bare expression so the cell displays 'ok' once the split has finished.
'ok'

'ok'

In [29]:
# %load train_server/classifiers.py
def linear_svc_clf():
    """Build the LinearSVC pipeline step and the grid to search for it.

    Returns
    -------
    tuple
        ``(estimators, param_grid)`` where ``estimators`` is a one-element
        list holding the ``('linear_svc', LinearSVC())`` step and
        ``param_grid`` maps ``linear_svc__<param>`` names to candidate values.
    """
    from sklearn.svm import LinearSVC

    search_space = {
        'linear_svc__C': [1, 10, 100],
        'linear_svc__random_state': [42],
        'linear_svc__max_iter': [1000, 10000],
    }
    steps = [('linear_svc', LinearSVC())]
    return steps, search_space

def multinomial_nb_clf():
    """Build the MultinomialNB pipeline step and the grid to search for it.

    Returns
    -------
    tuple
        ``(estimators, param_grid)`` where ``estimators`` is a one-element
        list holding the ``('multinomial_nb', MultinomialNB())`` step and
        ``param_grid`` maps ``multinomial_nb__<param>`` names to candidate
        values.
    """
    from sklearn.naive_bayes import MultinomialNB

    search_space = {
        'multinomial_nb__alpha': [1, 10, 100],
        'multinomial_nb__fit_prior': [True],
        'multinomial_nb__class_prior': [None],
    }
    steps = [('multinomial_nb', MultinomialNB())]
    return steps, search_space

def random_forest_clf():
    """Build the RandomForestClassifier pipeline step and its search grid.

    Returns
    -------
    tuple
        ``(estimators, param_grid)`` where ``estimators`` is a one-element
        list holding the ``('random_forest', RandomForestClassifier())`` step
        and ``param_grid`` maps ``random_forest__<param>`` names to candidate
        values. Only ``n_estimators``, ``criterion`` and ``max_features`` have
        more than one candidate; the remaining entries pin a single value.
    """
    from sklearn.ensemble import RandomForestClassifier

    estimators = [('random_forest', RandomForestClassifier())]
    param_grid = dict(random_forest__n_estimators = [10, 20, 30],
                      random_forest__criterion = ['gini', 'entropy'],
                      # 'auto' is intentionally not listed: for classifiers it
                      # is an alias of 'sqrt' (so it only duplicated grid
                      # points) and recent scikit-learn releases reject it.
                      random_forest__max_features = ['sqrt', 'log2'],
#                       random_forest__max_depth = [None],
                      random_forest__min_samples_split = [2],
                      random_forest__min_samples_leaf = [1],
                      random_forest__min_weight_fraction_leaf = [0],
                      random_forest__max_leaf_nodes = [None],
                      random_forest__min_impurity_decrease = [0],
                      random_forest__bootstrap = [True],
                      random_forest__oob_score = [False],
                      random_forest__n_jobs = [-1],
                      random_forest__random_state = [42],
                      random_forest__warm_start = [False],
                      random_forest__class_weight = ['balanced'])

    return estimators, param_grid

def gaussian_nb_clf():
    """Build the GaussianNB pipeline step together with an empty search grid.

    Returns
    -------
    tuple
        ``(estimators, param_grid)`` where ``estimators`` is a one-element
        list holding the ``('gaussian_nb', GaussianNB())`` step and
        ``param_grid`` is an empty dict, i.e. no hyper-parameter is varied.
    """
    from sklearn.naive_bayes import GaussianNB

    steps = [('gaussian_nb', GaussianNB())]
    return steps, {}

In [30]:
# Each entry is the (estimators, param_grid) pair returned by one of the *_clf
# factories; un-comment a factory call to train that classifier as well.
classifiers = [# linear_svc_clf(),
               # multinomial_nb_clf(),
               # random_forest_clf(),
               gaussian_nb_clf()]

# Class labels as strings (read_csv converts the label column to str); used by
# build_model for the confusion matrix and the classification report.
labels = ['0', '1']

def build_model(estimators, param_grid):
    """Grid-search one classifier inside the text pipeline and report on it.

    The classifier step is appended to a fixed preprocessing chain
    (NLTKPreprocessor -> CountVectorizer -> TfidfTransformer ->
    ArrayTransformer). The resulting pipeline is tuned with ``GridSearchCV``
    on the notebook-level ``X_train`` / ``y_train`` and evaluated on
    ``X_test`` / ``y_test``; the notebook-level ``labels`` list fixes the
    class order of both reports. Progress and results are printed.

    Parameters
    ----------
    estimators : list of (str, estimator)
        Only the first ``(name, estimator)`` pair is used, as the final
        pipeline step.
    param_grid : dict
        Grid handed to ``GridSearchCV``; keys use the ``<name>__<param>``
        form.

    Returns
    -------
    tuple
        ``(clf_name, clf)``: the step name and the fitted ``GridSearchCV``.
    """
    clf_name = estimators[0][0]
    print('Start')
    print('Classifier name: {}'.format(clf_name))

    pipeline_estimators = [('nltk_preprocessor', NLTKPreprocessor()),
                           ('vectorizer', CountVectorizer(stop_words = 'english')),
                           ('tfidf', TfidfTransformer()),
                           ('transformer', ArrayTransformer()),
                           estimators[0]]

    pipeline = Pipeline(pipeline_estimators)
    clf = GridSearchCV(pipeline, param_grid = param_grid)
    clf.fit(X_train, y_train)

    print('Best params:\n', clf.best_params_, '\n')
    print('Classifier results:\n', clf.cv_results_, '\n')
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_true = y_test,
                          y_pred = y_pred,
                          labels = labels)
    print('Confusion matrix:\n', cm)
    # Pass `labels` explicitly, as for the confusion matrix: target_names are
    # matched to classes by position, so without it a label missing from the
    # test data would shift the names onto the wrong rows.
    report = classification_report(y_test,
                                   y_pred,
                                   labels = labels,
                                   target_names = labels)

    print('Classification report:\n', report)
    print('End')
    return clf_name, clf

# Train every selected classifier; each result is a (clf_name, fitted search) pair.
results = [build_model(estimators, param_grid)
           for (estimators, param_grid) in classifiers]

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing the first model.
os.makedirs('models', exist_ok = True)

for (clf_name, clf) in results:
    joblib.dump(clf, 'models/{}.pkl'.format(clf_name))
    print('model saved as models/{}.pkl'.format(clf_name))

Start
Classifier name: gaussian_nb
Best params:
 {} 

Classifier results:
 {'mean_fit_time': array([ 10.05631177]), 'std_fit_time': array([ 0.67966023]), 'mean_score_time': array([ 5.20344122]), 'std_score_time': array([ 0.14778877]), 'params': [{}], 'split0_test_score': array([ 0.94978355]), 'split1_test_score': array([ 0.94199134]), 'split2_test_score': array([ 0.94978355]), 'mean_test_score': array([ 0.94718615]), 'std_test_score': array([ 0.00367328]), 'rank_test_score': array([1], dtype=int32), 'split0_train_score': array([ 0.98787879]), 'split1_train_score': array([ 0.98787879]), 'split2_train_score': array([ 0.98787879]), 'mean_train_score': array([ 0.98787879]), 'std_train_score': array([ 0.])} 

Confusion matrix:
 [[1136   55]
 [  40  476]]
Classification report:
              precision    recall  f1-score   support

          0       0.97      0.95      0.96      1191
          1       0.90      0.92      0.91       516

avg / total       0.94      0.94      0.94      1707

E

In [31]:
# clf = joblib.load('gaussian_nb.pkl')
# # clf = joblib.load('random_forest.pkl')

In [32]:
# features = ['sexy babe', 'hello world']
# clf.predict(features)