In [1]:
from sklearn.datasets import load_files
import numpy as np

# Sub-folder names to load as classes; note that '05' and '06' are not in the
# list (presumably these folders are review ratings - TODO confirm layout).
categories = ['01', '02', '03', '04', '07', '08', '09', '10']

# Same category filter for both splits so their label sets line up.
train = load_files(container_path='../aclImdb/train', categories=categories)
test = load_files(container_path='../aclImdb/test', categories=categories)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams, bigrams and trigrams, with English stop
# words removed, accents stripped, and sub-linear term-frequency scaling.
tfidf = TfidfVectorizer(
    decode_error='replace',
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    analyzer='word',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Labels as produced by load_files.
# NOTE(review): these look like positions in the category list rather than the
# numeric folder names - confirm before interpreting label differences.
train_y = train.target
test_y = test.target

# The vocabulary and IDF weights are learned from the training split only and
# then re-used unchanged for the test split.
train_X = tfidf.fit_transform(train.data, y=train_y)
test_X = tfidf.transform(test.data)

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Base estimator whose smoothing strength is tuned by the grid search.
nb = MultinomialNB()

# Candidate values for the `alpha` hyper-parameter.
params = dict(alpha=[0.001, 0.01, 0.1, 1, 2, 4, 8])

def err1(y, pred):
    """Return the fraction of predictions that are at most 1 away from the truth.

    Despite the ``err`` in the name, this is an accuracy-style quantity:
    1.0 means every prediction is within one label step, 0.0 means none is.

    Parameters
    ----------
    y : numpy array of true labels.
    pred : numpy array of predicted labels, same length as ``y``.

    Returns
    -------
    float in [0, 1].
    """
    # Element-wise absolute distance between prediction and truth.
    distance = np.abs(pred - y)
    # Boolean mask -> mean gives the proportion of "close enough" predictions.
    within_one = distance <= 1
    return np.mean(within_one)

# Tune `alpha` by cross-validated grid search, scored with err1 (the share of
# predictions within one label step). make_scorer is used with its defaults,
# so a larger err1 value is treated as better.
#
# cv is pinned to 3 folds: that was the library default when this notebook
# was written, while newer scikit-learn releases default to 5 folds. Pinning
# it keeps the search comparable with the recorded output.
clf = GridSearchCV(nb, params, scoring=make_scorer(err1), cv=3, n_jobs=2)
clf.fit(train_X, train_y)

# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(clf.best_params_)
print(clf.best_score_)

{'alpha': 0.1}
0.36556

In [4]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict with the best estimator found by the grid search.
predicted = clf.predict(test_X)

# Absolute distance between true and predicted label values.
# NOTE(review): test_y comes from load_files targets, which appear to be
# positions in the category list rather than the numeric folder names; if so,
# the step from '04' to '07' counts as a distance of 1 here - confirm that
# this is the intended notion of "off by one".
diff = np.abs(test_y - predicted)

# print() with a single argument works on both Python 2 and Python 3.
print(np.mean(test_y == predicted))  # exact-match accuracy
print(np.mean(diff <= 1))            # share of predictions within 1 step
print(np.mean(diff <= 2))            # share of predictions within 2 steps
print(confusion_matrix(test_y, predicted))


0.35772
0.51604
0.68612
[[4905    0    0    0    0    0    0  117]
 [2169   15    0    0    0    0    0  118]
 [2292    0   24    0    0    0    0  225]
 [2175    0    0   23    1    3    0  433]
 [ 967    0    0    1    3    7    0 1329]
 [ 874    0    0    0    1   19    0 1956]
 [ 556    0    0    0    0    1    9 1778]
 [1052    0    0    0    1    1    0 3945]]
