In [1]:
from sklearn.datasets import load_files
import numpy as np

# Sub-folder names to load from each split. load_files assigns integer
# targets by sorted folder name, so these eight folders become labels 0..7.
# NOTE(review): the names look like star ratings with 05/06 absent -- confirm
# against the on-disk layout of ../aclImdb.
categories = ['01', '02', '03', '04', '07', '08', '09', '10']

# Each call returns a Bunch; later cells read its .data (raw documents) and
# .target (integer labels) attributes.
train = load_files(container_path='../aclImdb/train', categories=categories)
test = load_files(container_path='../aclImdb/test', categories=categories)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams through trigrams, with English stop words
# removed, accents stripped and sub-linear (log-scaled) term frequencies.
tfidf = TfidfVectorizer(
    decode_error='replace',
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    analyzer='word',
    ngram_range=(1,3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Integer class labels for each split.
train_y = train.target
test_y = test.target

# Learn the vocabulary and IDF weights from the training documents only,
# then project the test documents onto that same feature space.
train_X = tfidf.fit_transform(train.data, train_y)
test_X = tfidf.transform(test.data)

In [3]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

svr = LinearSVR()

# Values tried for every loss/solver combination.
epsilon_grid = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
C_grid = [0.1, 1, 5, 10, 50, 100, 500, 1000]

# LinearSVR (liblinear) cannot solve the primal problem for the plain
# epsilon-insensitive loss: loss='epsilon_insensitive' with dual=False raises
# "Unsupported set of arguments". A single crossed grid therefore contains
# 56 candidates that can only fail (aborting the search, or scoring NaN,
# depending on GridSearchCV's error_score). Listing the valid combinations
# as separate sub-grids keeps every usable candidate and drops the rest.
params = [
    {'epsilon': epsilon_grid,
     'C': C_grid,
     'loss': ['epsilon_insensitive'],
     'dual': [True]},
    {'epsilon': epsilon_grid,
     'C': C_grid,
     'loss': ['squared_epsilon_insensitive'],
     'dual': [True, False]},
]

def err1(y, pred, tol=1, max_label=7):
    """Fraction of predictions that land within ``tol`` labels of the truth.

    Despite the name, this is an accuracy-style score: 1.0 is perfect and
    higher is better, which is what ``make_scorer`` assumes by default.

    Parameters
    ----------
    y : array-like of numbers
        True integer labels.
    pred : array-like of numbers
        Raw (real-valued) regression outputs, same length as ``y``.
    tol : number, optional
        Largest absolute label difference still counted as a hit
        (default 1; use 0 for exact-match accuracy).
    max_label : number, optional
        Highest valid label; rounded predictions are clipped to
        ``[0, max_label]`` (default 7).

    Returns
    -------
    float
        Mean of the per-sample hit indicator (NaN for empty input).
    """
    # Snap continuous outputs to the nearest valid label before comparing.
    pred = np.clip(np.round(pred), 0, max_label)
    # A scalar threshold broadcasts; no need to build an array of ones.
    return np.mean(np.abs(pred - y) <= tol)

# Cross-validated search over `params` using two worker processes. Candidates
# are ranked by err1 wrapped with make_scorer, which treats a larger score as
# better by default.
clf = GridSearchCV(svr, params, scoring=make_scorer(err1), n_jobs=2)
clf.fit(train_X, train_y)

# Single-argument call form: prints the same on Python 2 and is valid
# syntax on Python 3 (the bare print statement is not).
print(clf.best_params_)

{'epsilon': 0.001, 'C': 50, 'loss': 'epsilon_insensitive', 'dual': True}


In [4]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Real-valued outputs from the best estimator found by the grid search.
predicted = clf.predict(test_X)

# Snap each output to the nearest label and clamp to the valid range 0..7,
# mirroring what the err1 scorer does during model selection.
predicted = np.clip(np.round(predicted), 0, 7)
diff = np.abs(test_y - predicted)

# Single-argument print(...) calls behave the same on Python 2 and are
# valid on Python 3, unlike the bare print statement.
print(np.mean(test_y == predicted))  # exact-match accuracy
print(np.mean(diff <= 1))            # share of predictions off by at most one label
print(np.mean(diff <= 2))            # share of predictions off by at most two labels
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(test_y, predicted))

0.32412
0.67856
0.88612
[[2565  965  784  451  189   50    8   10]
 [ 815  521  450  307  142   54   11    2]
 [ 547  524  614  531  222   77   19    7]
 [ 301  411  640  650  408  178   38    9]
 [   7   41  141  320  593  630  382  193]
 [   3   15   77  265  547  783  618  542]
 [   4   10   40  144  376  581  544  645]
 [   3   28   67  245  602 1034 1187 1833]]
