In [1]:
from sklearn.datasets import load_files
import numpy as np

# Sub-folders to load from each split; each listed folder becomes one class.
# NOTE(review): load_files encodes classes as integer indices rather than the
# folder names, so '04' and '07' presumably end up one label step apart --
# confirm that is what the within-one metric (err1) is meant to measure.
categories = ['01', '02', '03', '04', '07', '08', '09', '10']

# Load the training split first, then the test split, with the same class filter.
train, test = (
    load_files(split_dir, categories=categories)
    for split_dir in ('../aclImdb/train', '../aclImdb/test')
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Integer class labels for each split, taken straight from the loaded bunches.
train_y, test_y = train.target, test.target

# Word-level TF-IDF over unigrams through trigrams, with English stop words
# removed, accents stripped, undecodable bytes replaced and sublinear tf scaling.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    lowercase=True,
    stop_words='english',
    strip_accents='unicode',
    decode_error='replace',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# The vocabulary and idf weights are fitted on the training documents only;
# the test documents are merely projected into that fitted feature space.
train_X = tfidf.fit_transform(train.data, train_y)
test_X = tfidf.transform(test.data)

In [3]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Base estimator that the grid search clones for every parameter combination.
# A fixed random_state pins the solver's internal data shuffling so repeated
# runs of the notebook give the same scores.
svc = LinearSVC(random_state=0)
# Candidate regularisation strengths shared by every sub-grid below.
c_values = [0.1, 0.5, 1.0, 2, 4, 8, 12, 16, 32]

# A list of grids is searched as the union of the individual grids. Splitting
# the search this way keeps it to combinations LinearSVC actually accepts:
# a single cross-product would also generate l1+hinge, l2+hinge with dual=False
# and l1+squared_hinge with dual=True, all of which the solver rejects.
params = [
    # One-vs-rest, L2 penalty, hinge loss: only available in the dual form.
    {'multi_class': ['ovr'], 'penalty': ['l2'], 'loss': ['hinge'],
     'dual': [True], 'C': c_values},
    # One-vs-rest, L2 penalty, squared hinge loss: dual or primal.
    {'multi_class': ['ovr'], 'penalty': ['l2'], 'loss': ['squared_hinge'],
     'dual': [True, False], 'C': c_values},
    # One-vs-rest, L1 penalty: squared hinge loss in the primal form only.
    {'multi_class': ['ovr'], 'penalty': ['l1'], 'loss': ['squared_hinge'],
     'dual': [False], 'C': c_values},
    # Crammer-Singer joint multiclass objective. The option is spelled with an
    # underscore, and loss/penalty/dual are ignored for it, so only C varies.
    {'multi_class': ['crammer_singer'], 'C': c_values},
]

def err1(y, pred):
    """Return the fraction of predictions within one label step of the truth.

    Despite the name, this is an accuracy-style score: 1.0 means every
    prediction is at most one step away from its true label, and higher is
    better.

    Parameters
    ----------
    y : array-like of numbers
        True labels.
    pred : array-like of numbers
        Predicted labels, same length as ``y``.

    Returns
    -------
    float
        Mean of the element-wise test ``|pred - y| <= 1``.
    """
    # Coerce to float arrays so plain Python lists are accepted and unsigned
    # integer label dtypes cannot wrap around when subtracted.
    y = np.asarray(y, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.mean(np.abs(pred - y) <= 1)

# Search the `params` grid for `svc`, ranking candidates by the custom
# within-one score `err1` wrapped as a scorer. No explicit `cv` is passed,
# so the library's default cross-validation splitting is used.
clf = GridSearchCV(svc, params, scoring=make_scorer(err1), n_jobs=2)
clf.fit(train_X, train_y)

# Single-argument print(...) behaves the same under Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(clf.best_params_)
print(clf.best_score_)

{'loss': 'squared_hinge', 'penalty': 'l2', 'C': 2, 'multi_class': 'ovr', 'dual': True}
0.43244

In [4]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predict test labels with the fitted grid-search object.
predicted = clf.predict(test_X)

# Absolute distance, in label steps, between each true and predicted class.
diff = np.abs(test_y - predicted)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(np.mean(test_y == predicted))   # exact-match accuracy
print(np.mean(diff <= 1))             # share of predictions at most one step off
print(np.mean(diff <= 2))             # share of predictions at most two steps off
print(confusion_matrix(test_y, predicted))


0.42684
0.68876
0.85386
[[4220  111  178  216   55   56   26  160]
 [1412  136  225  283   85   46   23   92]
 [1070  163  382  523  154   85   31  133]
 [ 712  114  389  750  308  190   44  128]
 [ 138   42  101  246  675  517  134  454]
 [ 136   22   83  147  534  697  214 1017]
 [ 118   19   36   65  256  454  195 1201]
 [ 242   25   58   87  249  463  259 3616]]
