# NB classifier


We use scikit-learn's implementation of multinomial Naive Bayes and its cross-validation tools. http://scikit-learn.org/

## Installation

To install all of the Python dependencies for this notebook in a virtual environment:

```bash
# create environment in directory named 'venv'
python -m venv venv
# or:
# virtualenv venv

# activate environment
source venv/bin/activate

# install dependencies
pip3 install -r requirements.txt
```

In [9]:
from class_utils import *
import pickle
import numpy as np

from nltk.tokenize.casual import casual_tokenize

from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [10]:
# globals
# `iteration` tags this run; it is interpolated into the saved-model file name
# below and into the POS training file name when the data is parsed.
iteration="not-critical"
# Destination path for the pickled best model.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
# NOTE(review): the file name says "svc" although the pipeline in this notebook
# uses MultinomialNB.
model_filename = "/Users/amyburkhardt/Documents/NLP/Code/combining-machine-qual/saved_models/best_svc_{}.pickle".format(iteration)

## Parse data sets

Here we parse data from our training files, and then randomly select a portion to be held out for evaluation. The training set is used both to train the Naive Bayes classifier and to select parameters using k-fold cross validation.

The `parse_training_data()` function is provided in the external `class_utils.py` file.

In [11]:
# Switch the working directory to the training-data folder so the relative
# file names given to parse_training_data() in the next cell resolve.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
import os
os.chdir("/Users/amyburkhardt/Documents/NLP/Code/combining-machine-qual/training_data/")
# echo the current directory to confirm the change took effect
!pwd

/Users/amyburkhardt/Documents/NLP/Code/combining-machine-qual/training_data


In [12]:
# parse data from files
# The NEG file name is fixed (it contains no iteration placeholder); only the
# POS file name is tagged with the current `iteration`.
classes = ['NEG', 'POS']
docs, targets = parse_training_data(['NEG.txt', 'POS-{}.txt'.format(iteration)], classes)

# convert the targets array of strings to binary labels (0=NEG, 1=POS)
# With two classes LabelBinarizer yields a single column, which ravel()
# flattens to a 1-D label vector.
lb = LabelBinarizer(sparse_output=False)
lb.fit(classes)
bin_targets = lb.transform(targets).ravel()

# split the data set into training and evaluation sets
# X_test/y_test are held out and not used during the
# k-fold training and parameter search below
#
# The percentage of samples to hold out is determined by the `test_size`
# parameter; for this iteration (iter2) the holdout is only 10%.
# random_state is fixed so the split is reproducible.
# NOTE(review): the split is not stratified; consider stratify=bin_targets
# if the class balance of the held-out set matters.
X_train, X_test, y_train, y_test = train_test_split(
    docs, bin_targets, test_size=0.10, random_state=0)

In [13]:
# number of documents in the training split
len(X_train)

604

In [14]:
# number of documents in the held-out evaluation split
len(X_test)

68

## Create sklearn pipeline

Here we set up a scikit-learn pipeline to create vectors from our training sample vocabulary (`CountVectorizer`), normalize words based on frequency (`TfidfTransformer`), and train a multinomial Naive Bayes classifier (`MultinomialNB`). http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

We evaluate parameters based on the `fscore_prec`, an F-beta score whose `beta` sets the precision/recall trade-off (beta < 1 favors precision, beta > 1 favors recall). We also calculate accuracy, precision, recall, and f1 scores for each of the k-fold training sessions.

Using a pipeline makes it easy to search a range of hyperparameters using sklearn's `GridSearchCV`. http://scikit-learn.org/stable/modules/grid_search.html

# beta = .5

In [15]:
# Text-classification pipeline: token counts -> (optional) tf-idf weighting
# -> multinomial Naive Bayes.
# NOTE: the `svc_` prefix is kept only because later cells refer to
# `svc_search`; the classifier here is MultinomialNB, not an SVC.
svc_pl = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyperparameter grid: 1 * 10 * 1 * 2 * 3 * 2 * 10 = 1200 candidates.
parameters = {
    'vect__preprocessor': [normalize_tweet],  # alternatives: normalize_simple, None
    'vect__max_df': np.linspace(0.3, 1.0, 10),
    'vect__tokenizer': [word_tokenize],  # alternatives: casual_tokenize, None
    'vect__stop_words': ['english', None],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],  # upper bound is the largest n-gram
    'tfidf__use_idf': [True, False],
    'clf__alpha': np.linspace(0.05, 0.2, 10),  # additive smoothing strength
}

# define the scores we want to calculate during each k-fold training
# beta=.5 (< 1) weights precision more heavily than recall.
fscore_prec = make_scorer(fbeta_score, beta=.5)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'fscore_prec': fscore_prec
}

# create the GridSearchCV object.
# by setting refit='fscore_prec', the model which maximizes that score
# will be selected and retrained on all training data.
# cv is pinned to 3 folds (the fold count of the recorded run) because the
# library default for cv differs between scikit-learn versions.
svc_search = GridSearchCV(svc_pl, parameters, cv=3, n_jobs=-1, verbose=1,
                          scoring=scoring, refit='fscore_prec')

In [2]:
# Scratch check: preview the ten alpha values used for 'clf__alpha' in the grid.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, and numpy is already imported in the imports cell at the top.
import numpy as np
np.linspace(0.05, 0.2, 10)

array([0.05      , 0.06666667, 0.08333333, 0.1       , 0.11666667,
       0.13333333, 0.15      , 0.16666667, 0.18333333, 0.2       ])

In [16]:
# Here we do the actual training: every candidate in the parameter grid is
# fitted on each cross-validation fold, then the best one is refit.
# Can take a long time depending on the range of parameters given
# in the parameters dict above (the recorded run below took about 39 minutes).
svc_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 39.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__preprocessor': [<function normalize_tweet at 0x1122fe730>], 'vect__max_df': array([0.3    , 0.37778, 0.45556, 0.53333, 0.61111, 0.68889, 0.76667,
       0.84444, 0.92222, 1.     ]), 'vect__tokenizer': [<function word_tokenize at 0x1152258c8>], 'vect__stop_words': ['english', None]...([0.05   , 0.06667, 0.08333, 0.1    , 0.11667, 0.13333, 0.15   ,
       0.16667, 0.18333, 0.2    ])},
       pre_dispatch='2*n_jobs', refit='fscore_prec

In [17]:
# The parameters selected by the grid search, i.e. the candidate with the
# highest mean 'fscore_prec' (the refit metric) across the folds
svc_search.best_params_

{'clf__alpha': 0.06666666666666668,
 'tfidf__use_idf': False,
 'vect__max_df': 0.45555555555555555,
 'vect__ngram_range': (1, 3),
 'vect__preprocessor': <function class_utils.normalize_tweet(item)>,
 'vect__stop_words': 'english',
 'vect__tokenizer': <function nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)>}

In [18]:
# Report, for the winning candidate only, each metric's mean test score
# across the cross-validation folds.
fields = ['precision', 'recall', 'f1', 'fscore_prec']
cv_results = svc_search.cv_results_
best_idx = svc_search.best_index_

for metric in fields:
    score = cv_results["mean_test_%s" % metric][best_idx]
    print("%s: %.3f" % (metric, score))

precision: 0.860
recall: 0.522
f1: 0.646
fscore_prec: 0.758


In [19]:
# use model to predict held out set (X_test) and print score table
# Note that in binary classification, accuracy is the same as the
# micro-averaged recall
# `best_model` is the pipeline refit on the full training set with the
# selected parameters; `predictions` is reused by the next cell.
best_model = svc_search.best_estimator_
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions, target_names=classes))

             precision    recall  f1-score   support

        NEG       0.78      0.96      0.86        47
        POS       0.80      0.38      0.52        21

avg / total       0.78      0.78      0.75        68



In [20]:
# Held-out metrics printed one per line, in this order:
# precision, recall, F1, accuracy.
scores = [precision_score, recall_score, f1_score, accuracy_score]
for metric_fn in scores:
    score = metric_fn(y_test, predictions)
    print(score)


0.8
0.38095238095238093
0.5161290322580645
0.7794117647058824


## Results

We check how it works by running the best classifier from the grid search on our held out set.

# beta = 1

In [21]:
# Text-classification pipeline: token counts -> (optional) tf-idf weighting
# -> multinomial Naive Bayes.
# NOTE: the `svc_` prefix is kept only because later cells refer to
# `svc_search`; the classifier here is MultinomialNB, not an SVC.
svc_pl = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyperparameter grid: 1 * 10 * 1 * 2 * 3 * 2 * 10 = 1200 candidates.
parameters = {
    'vect__preprocessor': [normalize_tweet],  # alternatives: normalize_simple, None
    'vect__max_df': np.linspace(0.3, 1.0, 10),
    'vect__tokenizer': [word_tokenize],  # alternatives: casual_tokenize, None
    'vect__stop_words': ['english', None],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],  # upper bound is the largest n-gram
    'tfidf__use_idf': [True, False],
    'clf__alpha': np.linspace(0.05, 0.2, 10),  # additive smoothing strength
}

# define the scores we want to calculate during each k-fold training
# beta=1 weights precision and recall equally, so this scorer coincides
# with the plain 'f1' score; the `fscore_prec` name is kept for the cells
# below that look it up.
fscore_prec = make_scorer(fbeta_score, beta=1)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'fscore_prec': fscore_prec
}

# create the GridSearchCV object.
# by setting refit='fscore_prec', the model which maximizes that score
# will be selected and retrained on all training data.
# cv is pinned to 3 folds (the fold count of the recorded run) because the
# library default for cv differs between scikit-learn versions.
svc_search = GridSearchCV(svc_pl, parameters, cv=3, n_jobs=-1, verbose=1,
                          scoring=scoring, refit='fscore_prec')

In [22]:
# Here we do the actual training: every candidate in the parameter grid is
# fitted on each cross-validation fold, then the best one is refit.
# Can take a long time depending on the range of parameters given
# in the parameters dict above (the recorded run below took about 41 minutes).
svc_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 41.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__preprocessor': [<function normalize_tweet at 0x1122fe730>], 'vect__max_df': array([0.3    , 0.37778, 0.45556, 0.53333, 0.61111, 0.68889, 0.76667,
       0.84444, 0.92222, 1.     ]), 'vect__tokenizer': [<function word_tokenize at 0x1152258c8>], 'vect__stop_words': ['english', None]...([0.05   , 0.06667, 0.08333, 0.1    , 0.11667, 0.13333, 0.15   ,
       0.16667, 0.18333, 0.2    ])},
       pre_dispatch='2*n_jobs', refit='fscore_prec

In [23]:
# The parameters selected by the grid search, i.e. the candidate with the
# highest mean 'fscore_prec' (the refit metric) across the folds
svc_search.best_params_

{'clf__alpha': 0.05,
 'tfidf__use_idf': False,
 'vect__max_df': 0.45555555555555555,
 'vect__ngram_range': (1, 3),
 'vect__preprocessor': <function class_utils.normalize_tweet(item)>,
 'vect__stop_words': 'english',
 'vect__tokenizer': <function nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)>}

In [24]:
# Report, for the winning candidate only, each metric's mean test score
# across the cross-validation folds.
fields = ['precision', 'recall', 'f1', 'fscore_prec']
cv_results = svc_search.cv_results_
best_idx = svc_search.best_index_

for metric in fields:
    score = cv_results["mean_test_%s" % metric][best_idx]
    print("%s: %.3f" % (metric, score))

precision: 0.811
recall: 0.549
f1: 0.654
fscore_prec: 0.654


In [25]:
# use model to predict held out set (X_test) and print score table
# Note that in binary classification, accuracy is the same as the
# micro-averaged recall
# `best_model` is the pipeline refit on the full training set with the
# selected parameters; `predictions` is reused by the next cell.
best_model = svc_search.best_estimator_
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions, target_names=classes))

             precision    recall  f1-score   support

        NEG       0.80      0.91      0.85        47
        POS       0.71      0.48      0.57        21

avg / total       0.77      0.78      0.76        68



In [26]:
# Held-out metrics printed one per line, in this order:
# precision, recall, F1, accuracy.
scores = [precision_score, recall_score, f1_score, accuracy_score]
for metric_fn in scores:
    score = metric_fn(y_test, predictions)
    print(score)


0.7142857142857143
0.47619047619047616
0.5714285714285714
0.7794117647058824


# beta = 1.5

In [30]:
# Text-classification pipeline: token counts -> (optional) tf-idf weighting
# -> multinomial Naive Bayes.
# NOTE: the `svc_` prefix is kept only because later cells refer to
# `svc_search`; the classifier here is MultinomialNB, not an SVC.
svc_pl = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyperparameter grid: 1 * 10 * 1 * 2 * 3 * 2 * 10 = 1200 candidates.
parameters = {
    'vect__preprocessor': [normalize_tweet],  # alternatives: normalize_simple, None
    'vect__max_df': np.linspace(0.3, 1.0, 10),
    'vect__tokenizer': [word_tokenize],  # alternatives: casual_tokenize, None
    'vect__stop_words': ['english', None],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],  # upper bound is the largest n-gram
    'tfidf__use_idf': [True, False],
    'clf__alpha': np.linspace(0.05, 0.2, 10),  # additive smoothing strength
}

# define the scores we want to calculate during each k-fold training
# beta=1.5 (> 1) weights recall more heavily than precision, despite the
# `fscore_prec` name, which is kept for the cells below that look it up.
fscore_prec = make_scorer(fbeta_score, beta=1.5)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'fscore_prec': fscore_prec
}

# create the GridSearchCV object.
# by setting refit='fscore_prec', the model which maximizes that score
# will be selected and retrained on all training data.
# cv is pinned to 3 folds (the fold count of the recorded run) because the
# library default for cv differs between scikit-learn versions.
svc_search = GridSearchCV(svc_pl, parameters, cv=3, n_jobs=-1, verbose=1,
                          scoring=scoring, refit='fscore_prec')

In [31]:
# Here we do the actual training: every candidate in the parameter grid is
# fitted on each cross-validation fold, then the best one is refit.
# Can take a long time depending on the range of parameters given
# in the parameters dict above (the recorded run below took about 38 minutes).
svc_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 38.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__preprocessor': [<function normalize_tweet at 0x1122fe730>], 'vect__max_df': array([0.3    , 0.37778, 0.45556, 0.53333, 0.61111, 0.68889, 0.76667,
       0.84444, 0.92222, 1.     ]), 'vect__tokenizer': [<function word_tokenize at 0x1152258c8>], 'vect__stop_words': ['english', None]...([0.05   , 0.06667, 0.08333, 0.1    , 0.11667, 0.13333, 0.15   ,
       0.16667, 0.18333, 0.2    ])},
       pre_dispatch='2*n_jobs', refit='fscore_prec

In [32]:
# The parameters selected by the grid search, i.e. the candidate with the
# highest mean 'fscore_prec' (the refit metric) across the folds
svc_search.best_params_

{'clf__alpha': 0.05,
 'tfidf__use_idf': True,
 'vect__max_df': 0.6111111111111112,
 'vect__ngram_range': (1, 3),
 'vect__preprocessor': <function class_utils.normalize_tweet(item)>,
 'vect__stop_words': 'english',
 'vect__tokenizer': <function nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)>}

In [33]:
# Report, for the winning candidate only, each metric's mean test score
# across the cross-validation folds.
fields = ['precision', 'recall', 'f1', 'fscore_prec']
cv_results = svc_search.cv_results_
best_idx = svc_search.best_index_

for metric in fields:
    score = cv_results["mean_test_%s" % metric][best_idx]
    print("%s: %.3f" % (metric, score))

precision: 0.773
recall: 0.565
f1: 0.649
fscore_prec: 0.613


In [34]:
# use model to predict held out set (X_test) and print score table
# Note that in binary classification, accuracy is the same as the
# micro-averaged recall
# `best_model` is the pipeline refit on the full training set with the
# selected parameters; `predictions` is reused by the next cell.
best_model = svc_search.best_estimator_
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions, target_names=classes))

             precision    recall  f1-score   support

        NEG       0.82      0.89      0.86        47
        POS       0.71      0.57      0.63        21

avg / total       0.79      0.79      0.79        68



In [35]:
# Held-out metrics printed one per line, in this order:
# precision, recall, F1, accuracy.
scores = [precision_score, recall_score, f1_score, accuracy_score]
for metric_fn in scores:
    score = metric_fn(y_test, predictions)
    print(score)


0.7058823529411765
0.5714285714285714
0.6315789473684211
0.7941176470588235
