In [1]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# load training and testing datasets
data_train = load_files('aclImdb/train/', categories=['neg', 'pos'])
data_test = load_files('aclImdb/test/')

In [3]:
# converts documents to word count vectors, then transforms to smoothed TFIDF scores
tfidf_vect = TfidfVectorizer(decode_error='replace',  # replace chars with decoding problems
                             strip_accents='unicode', # replace accented chars
                             stop_words='english',    # remove stopwords with built-in list
                             lowercase=True,          # convert all words to lowercase
                             analyzer='word',         # use words as the basic unit
                             smooth_idf=True)         # add 1 to DF

# multinomial naive bayes classifier
bayes = MultinomialNB()

# pipeline for our learning strategy
pipeline = Pipeline([('tfidf_vect', tfidf_vect),
                     ('bayes', bayes)])

# different options we want to explore
params = {
    'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3), (1,4)], # range of feature ngram size
    'tfidf_vect__use_idf': [True, False],             # use TFIDF or only TF
    'tfidf_vect__sublinear_tf': [True, False],        # use 1+log(TF)
    'tfidf_vect__min_df': [0, 1, 2],                  # minimum df to consider
    'bayes__alpha': [0.01, 0.1, 0.5, 1, 2, 10, 100],  # smoothing param for NB
}

# performs parallel search on all combinations of parameters
# evaluates all options and selects the one with the best score
clf = GridSearchCV(pipeline, params, n_jobs=2)        # use all cpu cores

# perform the search by fitting on training data
clf = clf.fit(data_train.data, data_train.target)

In [4]:
# resultant classifier model
print 'Chosen params: ' + str(clf.best_params_)
print 'Model score: ' + str(clf.best_score_)

Chosen params: {'tfidf_vect__sublinear_tf': True, 'tfidf_vect__use_idf': True, 'tfidf_vect__min_df': 0, 'tfidf_vect__ngram_range': (1, 3), 'bayes__alpha': 0.1}
Model score: 0.88224


In [5]:
# use it to predict on the testing set
predicted = clf.predict(data_test.data)

# show metrics
acc = np.mean(predicted == data_test.target)
print 'Test accuracy: ' + str(100*acc) + '%'
print classification_report(data_test.target, predicted, target_names=data_test.target_names)
print confusion_matrix(data_test.target, predicted)

Test accuracy: 85.556%
             precision    recall  f1-score   support

        neg       0.83      0.89      0.86     12500
        pos       0.88      0.82      0.85     12500

avg / total       0.86      0.86      0.86     25000

[[11132  1368]
 [ 2243 10257]]


In [6]:
x = ["I understand a lot of people have problems with the movie's ending. I can understand the frustration."
     " The climactic twist takes away from the central premise, it reminded me of 'The Reaping' in some way."
     " And besides being incongruous to theherok main storyline, it further had the problem of being very poorly executed.",
     "This movie is pretty great. The acting is somewhat bad but the direction and writing makes up for it."]
y = clf.predict(foo)
print [data_test.target_names[i] for i in y]

['neg', 'pos']
