In [1]:
from sklearn.datasets import load_files
import numpy as np

# Sub-folder names to load as classes; '05' and '06' are not in the list.
categories = ['01', '02', '03', '04', '07', '08', '09', '10']

# Load both splits with the same category filter so their label sets match.
train, test = (
    load_files(path, categories=categories)
    for path in ('../aclImdb/train', '../aclImdb/test')
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Labels come straight from the loaded train/test bunches.
train_y, test_y = train.target, test.target

# Word-level TF-IDF over 1- to 3-grams with English stop words removed,
# accent stripping, lower-casing and sublinear term-frequency scaling.
# Undecodable bytes are replaced rather than raising.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    decode_error='replace',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Fit on the training documents only, then apply the same fitted
# vectorizer to the test documents.
train_X = tfidf.fit_transform(train.data, train.target)
test_X = tfidf.transform(test.data)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Base estimator with library defaults; solver, multi_class and C are
# supplied by the grid below.
logreg = LogisticRegression()

# Hyper-parameter grid: 2 solvers x 2 multi-class schemes x 8 values of C
# (32 combinations in total).
params = dict(
    solver=['newton-cg', 'lbfgs'],
    multi_class=['multinomial', 'ovr'],
    C=[0.1, 1.0, 5, 10, 50, 100, 500, 1000],
)

def err1(y, pred, tol=1):
    """Return the fraction of predictions within ``tol`` of the true label.

    Despite the name, this is a score rather than an error: a larger value
    means more predictions landed close to the truth.

    Parameters
    ----------
    y : array-like of numbers
        True labels.
    pred : array-like of numbers
        Predicted labels, same length as ``y``.
    tol : number, optional
        Largest absolute difference still counted as a hit. Defaults to 1,
        which reproduces the original hard-coded behavior.

    Returns
    -------
    float
        Mean of the element-wise test ``|pred - y| <= tol``.

    Notes
    -----
    NOTE(review): the distance is measured on whatever numeric labels are
    passed in. If these are class indices rather than the ratings named by
    the category folders, the jump from '04' to '07' presumably counts as a
    distance of 1 -- confirm that this is intended.
    """
    # Coerce to arrays so plain Python sequences can be subtracted too.
    y = np.asarray(y)
    pred = np.asarray(pred)
    # Comparing against a scalar broadcasts; no need for an array of ones.
    return np.mean(np.abs(pred - y) <= tol)

# Grid search over `params`, scoring each candidate with the custom `err1`
# function wrapped by make_scorer; uses 2 parallel jobs.
clf = GridSearchCV(logreg, params, scoring=make_scorer(err1), n_jobs=2)
clf.fit(train_X, train_y)

# Parenthesized single-argument print: same output under Python 2, and
# valid syntax under Python 3 (the bare print statement is not).
print(clf.best_params_)

{'solver': 'newton-cg', 'multi_class': 'multinomial', 'C': 100}


In [4]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predict test labels with the fitted grid-search object.
predicted = clf.predict(test_X)

# Absolute distance between each true label and its prediction.
diff = np.abs(test_y - predicted)

# Parenthesized single-argument prints behave the same under Python 2 and
# are valid Python 3. Scalar thresholds broadcast, so no arrays of ones
# are needed.
print(np.mean(test_y == predicted))  # exact-match accuracy
print(np.mean(diff <= 1))            # fraction within 1 of the true label
print(np.mean(diff <= 2))            # fraction within 2 of the true label
print(confusion_matrix(test_y, predicted))

0.43056
0.69288
0.85532
[[4256   78  181  225   62   53   17  150]
 [1424  114  240  288   77   47   13   99]
 [1069  127  388  574  146   70   27  140]
 [ 720   81  378  793  312  190   26  135]
 [ 148   28   93  246  729  540   90  433]
 [ 149   17   68  145  575  736  179  981]
 [ 119   10   38   65  272  491  165 1184]
 [ 250   19   59   86  267  525  210 3583]]
