In [1]:
from sklearn.datasets import load_files

# Class labels to load; passed to load_files to select which category
# folders are read from each split directory.
categories = ['neg', 'pos']

# Load the training split first, then the test split, with the same
# category filter applied to both.
train, test = (
    load_files(split_dir, categories=categories)
    for split_dir in ('../aclImdb/train', '../aclImdb/test')
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Label vectors for both splits, taken straight from the loaded bunches.
train_y = train.target
test_y = test.target

# Word-level TF-IDF over unigrams, bigrams and trigrams, with English stop
# words removed, accents stripped and text lower-cased.
tfidf = TfidfVectorizer(
    decode_error='replace',
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    analyzer='word',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned from the training documents
# only; the test documents are merely projected onto that vocabulary.
train_X = tfidf.fit_transform(train.data, train_y)
test_X = tfidf.transform(test.data)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Base estimator; every hyper-parameter of interest is supplied by the grid.
logreg = LogisticRegression()

# Search space: 2 solvers x 2 multi-class schemes x 8 regularisation
# strengths = 32 candidate configurations.
params = {'solver': ['newton-cg', 'lbfgs'],
          'multi_class': ['multinomial', 'ovr'],
          'C': [0.1, 1.0, 5, 10, 50, 100, 500, 1000]}

# Cross-validated grid search over the training features, using two
# parallel workers. The cross-validation scheme is left at the library
# default.
clf = GridSearchCV(logreg, params, n_jobs=2)
clf.fit(train_X, train_y)

# Single-argument call form: prints the same text under Python 2 (where the
# parentheses merely group the expression) and is valid syntax on Python 3,
# unlike the bare `print x` statement.
print(clf.best_params_)

{'solver': 'newton-cg', 'multi_class': 'multinomial', 'C': 100}


In [4]:
# Imports sit at the top of the cell so its dependencies are visible before
# any statement runs.
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out test features with the fitted grid search.
predicted = clf.predict(test_X)

# Every print below takes exactly one argument, so the call form emits the
# same text under Python 2 and is valid syntax on Python 3 (the original
# `print x` statements are a SyntaxError there).

# Fraction of test documents whose predicted label equals the true label.
print(np.mean(predicted == test_y))

# Confusion matrix of true labels (first argument) against predictions.
print(confusion_matrix(test_y, predicted))

# Per-class precision / recall / F1, labelled with the category names.
print(classification_report(test_y, predicted, target_names=categories))

0.88788
[[11090  1410]
 [ 1393 11107]]
             precision    recall  f1-score   support

        neg       0.89      0.89      0.89     12500
        pos       0.89      0.89      0.89     12500

avg / total       0.89      0.89      0.89     25000
