In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def load_reviews(path):
    """Read one review per line from ``path`` and return them as a str array.

    The file is read directly instead of through ``np.loadtxt`` because:
    * recent NumPy releases reject a newline as ``delimiter``;
    * ``np.loadtxt`` treats ``#`` as a comment marker by default, which
      silently truncates any review containing that character.

    Blank lines are skipped. latin-1 maps every byte to a character, so
    decoding can never fail regardless of the file's contents.
    NOTE(review): confirm latin-1 is the intended encoding of the corpus.
    """
    with open(path, encoding='latin-1') as handle:
        lines = [line.rstrip('\r\n') for line in handle]
    return np.array([line for line in lines if line.strip()], dtype=str)


pos_data = load_reviews('data/rt-polaritydata/rt-polarity.pos')
neg_data = load_reviews('data/rt-polaritydata/rt-polarity.neg')

# Label convention used throughout the notebook: positive -> 0, negative -> 1.
pos_label = np.zeros(len(pos_data), dtype=int)
neg_label = np.ones(len(neg_data), dtype=int)

# Positive reviews first, then negative; `label[i]` belongs to `data[i]`.
data = np.concatenate([pos_data, neg_data])
label = np.concatenate([pos_label, neg_label])
assert len(data) == len(label)

In [3]:
# Turn every review in `data` into a TF-IDF feature vector.
vectorizer = TfidfVectorizer(input='content', decode_error='ignore')
# Densified because the cells below store and index one dense row per review.
# NOTE(review): the dense matrix is (n_reviews x vocabulary) floats and can be
# very large; keep an eye on memory.
vectors = vectorizer.fit_transform(data).toarray()

# Pair each feature vector with its label in an (n, 2) object array:
# column 0 holds the row vector, column 1 holds the label.
# The array is allocated with dtype=object and filled cell by cell because
# np.array([row_vector, scalar]) is a ragged sequence, which newer NumPy
# versions refuse to build implicitly (ValueError: inhomogeneous shape).
vectors_and_labels = np.empty((len(data), 2), dtype=object)
for i in range(len(data)):
    vectors_and_labels[i, 0] = vectors[i]
    vectors_and_labels[i, 1] = label[i]

In [4]:
# Shuffle with a dedicated, seeded generator so the train/test split is the
# same after every "Restart Kernel -> Run All" and the global NumPy RNG state
# is left untouched.
SPLIT_SEED = 42
np.random.RandomState(SPLIT_SEED).shuffle(vectors_and_labels)  # in place, reorders rows
print(vectors_and_labels)

# The first 500 shuffled rows are held out for testing; the rest are for training.
test_data, train_data = vectors_and_labels[:500], vectors_and_labels[500:]

[[array([0., 0., 0., ..., 0., 0., 0.]) 1]
 [array([0., 0., 0., ..., 0., 0., 0.]) 1]
 [array([0., 0., 0., ..., 0., 0., 0.]) 1]
 ...
 [array([0., 0., 0., ..., 0., 0., 0.]) 0]
 [array([0., 0., 0., ..., 0., 0., 0.]) 0]
 [array([0., 0., 0., ..., 0., 0., 0.]) 1]]


In [12]:
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
from scipy.stats import uniform, randint
from sklearn.svm import LinearSVC, SVC


In [None]:
# Randomised hyper-parameter search for a *linear* SVM.
# The original cell built SVC(), whose default kernel is RBF, while the report
# below is labelled "Linear SVM". LinearSVC matches the label and scales far
# better to this many samples/features than the kernelised SVC.
base_svm = LinearSVC(random_state=0)
svm_params = {
    "tol": uniform(0, 1e-1),  # uniform(loc, scale): sampled from [0, 0.1]
    "C": uniform(0, 10),      # sampled from [0, 10]
}

# `svm` is the fitted search object; later cells call svm.score(...) on it.
# random_state makes the 20 sampled candidates reproducible.
svm = RandomizedSearchCV(
    base_svm,
    svm_params,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    verbose=10,
    random_state=0,
)

# Column 0 of train_data holds one feature vector per row, column 1 the label.
# Stack them into a proper 2-D matrix / 1-D integer vector for scikit-learn.
X_train = np.stack(train_data[:, 0])
y_train = train_data[:, 1].astype(int)

svm.fit(X_train, y_train)
print("Best params for Linear SVM")
print(svm.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Report the fitted search's score on the training rows and on the held-out
# rows. In both arrays column 0 holds the feature vectors and column 1 the labels.
for heading, split in (("Train Accuracy", train_data), ("Test Accuracy", test_data)):
    features = split[:, 0].tolist()
    targets = split[:, 1].tolist()
    print(heading)
    print(svm.score(features, targets))