In [1]:
from sklearn.datasets import load_files

# Only these two class folders are requested from each split.
categories = ['neg', 'pos']

# Load the training split first, then the test split, with the same
# category filter applied to both.
train, test = (
    load_files('../aclImdb/train', categories=categories),
    load_files('../aclImdb/test', categories=categories),
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Class labels are taken directly from the loaded train/test bunches.
train_y, test_y = train.target, test.target

# Word-level TF-IDF over unigrams, bigrams and trigrams, with English stop
# words removed, accents stripped, and undecodable bytes replaced rather than
# raising.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    lowercase=True,
    stop_words='english',
    strip_accents='unicode',
    decode_error='replace',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# The vectorizer is fitted on the training documents only; the test documents
# are transformed with that already-fitted vectorizer.
train_X = tfidf.fit_transform(train.data, train_y)
test_X = tfidf.transform(test.data)

In [3]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

svc = LinearSVC()

# Regularisation strengths tried for every supported solver configuration.
c_values = [0.1, 0.5, 1.0, 2, 4, 8, 12, 16, 32]

# LinearSVC does not support every (penalty, loss, dual) combination:
#   * penalty='l1' with loss='hinge' is not supported at all,
#   * penalty='l1' requires dual=False,
#   * penalty='l2' with loss='hinge' requires dual=True.
# A single flat dict would make GridSearchCV attempt those invalid
# combinations (failing or wasting fits), so the search space is given as a
# list of grids that contains only the supported ones.
params = [
    {'penalty': ['l2'], 'loss': ['hinge'],
     'dual': [True], 'C': c_values},
    {'penalty': ['l2'], 'loss': ['squared_hinge'],
     'dual': [True, False], 'C': c_values},
    {'penalty': ['l1'], 'loss': ['squared_hinge'],
     'dual': [False], 'C': c_values},
]

# Cross-validated exhaustive search over the grids above, using two workers.
clf = GridSearchCV(svc, params, n_jobs=2)
clf.fit(train_X, train_y)

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(clf.best_params_)
print(clf.best_score_)

{'loss': 'squared_hinge', 'penalty': 'l2', 'C': 2, 'dual': True}
0.8934


In [4]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the test matrix with the fitted grid-search object.
predicted = clf.predict(test_X)

# Single-argument parenthesised print produces the same output on Python 2
# and also runs on Python 3, where the bare print statement is a SyntaxError.
print(np.mean(predicted == test_y))  # fraction of test labels predicted correctly
print(confusion_matrix(test_y, predicted))
# NOTE(review): target_names assumes `categories` is listed in the same order
# as the integer labels in test_y -- confirm against the loader.
print(classification_report(test_y, predicted, target_names=categories))

0.89032
[[11116  1384]
 [ 1358 11142]]
             precision    recall  f1-score   support

        neg       0.89      0.89      0.89     12500
        pos       0.89      0.89      0.89     12500

avg / total       0.89      0.89      0.89     25000
