In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [2]:
pos_data = np.loadtxt('data/rt-polaritydata/rt-polarity.pos', dtype=str, delimiter='\n')
neg_data = np.loadtxt('data/rt-polaritydata/rt-polarity.neg', dtype=str, delimiter='\n')
pos_label = np.array([0]*len(pos_data))
neg_label = np.array([1]*len(neg_data))
data = np.concatenate([pos_data, neg_data])
label = np.concatenate([pos_label, neg_label])

In [3]:
# vectorizer = TfidfVectorizer(input='content', decode_error='ignore', strip_accents='unicode', max_df=0.7, min_df=6)
vectorizer = CountVectorizer(input='content', decode_error='ignore')
vectors = vectorizer.fit_transform(data).toarray()
vectors_and_labels = np.array([np.array([vectors[i],label[i]]) for i in range(len(data))])

In [4]:
np.random.shuffle(vectors_and_labels)
print(vectors_and_labels)

test_data, train_data = vectors_and_labels[:500], vectors_and_labels[500:]

[[array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 1]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 1]
 ...
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 1]]


In [5]:
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
from scipy.stats import uniform, randint
from sklearn.svm import LinearSVC, SVC


In [6]:
svm = LinearSVC(max_iter=10000)
svm_params = {
    "tol": uniform(0,1),
    "C": uniform(0,10),
    "dual": (True, False),
#     "random_state": randint(0,64)
}

svm = RandomizedSearchCV(svm, svm_params, n_iter=100, scoring="accuracy", cv=5, verbose=1)

svm.fit(train_data[:,0].tolist(), train_data[:,1].tolist())
print("Best params for Linear SVM")
print(svm.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 21.1min finished


Best params for Linear SVM
{'C': 0.04955728788779923, 'dual': True, 'tol': 0.700979673423408}


In [7]:
print("Train Accuracy")
print(svm.score(train_data[:,0].tolist(), train_data[:,1].tolist()))
print("Test Accuracy")
print(svm.score(test_data[:,0].tolist(), test_data[:,1].tolist()))

Train Accuracy
0.9492225939775635
Test Accuracy
0.794


In [8]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(algorithm='kd_tree')
# knn.fit(train_data[:,0].tolist(), train_data[:,1].tolist())

In [9]:
# print("Train Accuracy")
# print(knn.score(train_data[:,0].tolist(), train_data[:,1].tolist()))
# print("Test Accuracy")
# print(knn.score(test_data[:,0].tolist(), test_data[:,1].tolist()))