In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [2]:
def _read_reviews(path):
    """Return the non-blank lines of ``path`` as a 1-D NumPy string array.

    The file is read directly instead of via ``np.loadtxt(delimiter='\\n')``:
    recent NumPy releases reject a newline delimiter, and ``loadtxt`` treats
    ``#`` as a comment marker by default, which silently truncates any review
    containing that character.
    """
    # latin-1 maps every byte to a character, so decoding can never fail.
    # NOTE(review): confirm this matches the corpus encoding.
    with open(path, encoding='latin-1') as handle:
        lines = [line.rstrip('\r\n') for line in handle]
    # Blank lines are dropped so they do not become empty documents.
    return np.array([line for line in lines if line.strip()], dtype=str)


pos_data = _read_reviews('data/rt-polaritydata/rt-polarity.pos')
neg_data = _read_reviews('data/rt-polaritydata/rt-polarity.neg')

# Label convention used throughout the notebook: 0 = positive, 1 = negative.
pos_label = np.zeros(len(pos_data), dtype=int)
neg_label = np.ones(len(neg_data), dtype=int)

# Positive reviews come first in both arrays, so rows stay aligned.
data = np.concatenate([pos_data, neg_data])
label = np.concatenate([pos_label, neg_label])

In [3]:
# Alternative TF-IDF featurisation, kept for reference:
# vectorizer = TfidfVectorizer(input='content', decode_error='ignore', strip_accents='unicode', max_df=0.7, min_df=6)
vectorizer = CountVectorizer(input='content', decode_error='ignore')
# Dense bag-of-words matrix: one row per review, one column per vocabulary term.
vectors = vectorizer.fit_transform(data).toarray()

# Pair every feature row with its label in an (n, 2) object array:
# column 0 holds the feature vector, column 1 the label.
# The array is allocated with dtype=object and filled cell by cell because
# letting np.array() infer the shape from [vector, scalar] pairs creates a
# ragged array, which newer NumPy versions refuse to build implicitly.
vectors_and_labels = np.empty((len(data), 2), dtype=object)
for row, (features, target) in enumerate(zip(vectors, label)):
    vectors_and_labels[row, 0] = features
    vectors_and_labels[row, 1] = target

In [4]:
# Shuffle with a fixed seed so the train/test split is identical on every run.
# A permuted copy is used instead of an in-place shuffle so that re-running
# this cell yields the same split rather than shuffling already-shuffled rows.
split_rng = np.random.default_rng(42)
shuffled = vectors_and_labels[split_rng.permutation(len(vectors_and_labels))]
print(shuffled)

# The first 500 shuffled rows are held out for testing; the rest are for training.
test_data, train_data = shuffled[:500], shuffled[500:]

[[array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 1]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]
 ...
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 1]
 [array([0, 0, 0, ..., 0, 0, 0], dtype=int64) 0]]


In [5]:
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit, GridSearchCV
from scipy.stats import uniform, randint
from sklearn.svm import LinearSVC, SVC


In [6]:
# Base estimator. It has its own name so that `svm` refers only to the fitted
# search object used by later cells. random_state fixes the solver's internal
# shuffling so repeated runs give the same model.
linear_svc = LinearSVC(max_iter=20000, random_state=42)

# Search space: tol and C are drawn uniformly from [0, 1] and [0, 10];
# both the dual and the primal formulation are tried.
svm_params = {
    "tol": uniform(0, 1),
    "C": uniform(0, 10),
    "dual": (True, False),
}

# random_state pins the sampled candidates, making the search reproducible.
svm = RandomizedSearchCV(
    linear_svc,
    svm_params,
    n_iter=500,
    scoring="accuracy",
    cv=5,
    verbose=1,
    random_state=42,
)

# Column 0 of train_data holds the feature vectors, column 1 the labels.
svm.fit(train_data[:,0].tolist(), train_data[:,1].tolist())
print("Best params for Linear SVM")
print(svm.best_params_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2500 out of 2500 | elapsed: 110.0min finished


Best params for Linear SVM
{'C': 0.03562303127855104, 'dual': False, 'tol': 0.5782808676307457}


In [7]:
# Report the tuned linear SVM's accuracy on the training rows and on the
# held-out test rows (column 0 = feature vectors, column 1 = labels).
for heading, split in (("Train Accuracy", train_data), ("Test Accuracy", test_data)):
    split_features = split[:, 0].tolist()
    split_labels = split[:, 1].tolist()
    print(heading)
    print(svm.score(split_features, split_labels))

Train Accuracy
0.9235386734894706
Test Accuracy
0.768


In [8]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the count features; only the variance-smoothing
# term is tuned, drawn uniformly from [0, 0.01].
NB_Classifier_freq = GaussianNB()
params = {'var_smoothing': uniform(0,1e-2)}
# random_state pins the sampled candidates, making the search reproducible.
NB_freq = RandomizedSearchCV(
    NB_Classifier_freq,
    params,
    n_iter=500,
    scoring="accuracy",
    cv=5,
    verbose=1,
    random_state=42,
)

# Column 0 holds the feature vectors, column 1 the labels; unpack each split once.
nb_train_X, nb_train_y = train_data[:,0].tolist(), train_data[:,1].tolist()
nb_test_X, nb_test_y = test_data[:,0].tolist(), test_data[:,1].tolist()

NB_freq.fit(nb_train_X, nb_train_y)

print("Best params for NB")
print(NB_freq.best_params_)
print("Train Accuracy")
print(NB_freq.score(nb_train_X, nb_train_y))
print("Test Accuracy")
print(NB_freq.score(nb_test_X, nb_test_y))


Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2500 out of 2500 | elapsed: 340.1min finished


Best params for NB
{'var_smoothing': 0.002736118051540739}
Train Accuracy
0.8980515646526275
Test Accuracy
0.772


In [9]:
from sklearn.tree import DecisionTreeClassifier

# random_state matters here because the "random" splitter is among the
# candidates; without it two runs with identical parameters can differ.
Tree_Classifier = DecisionTreeClassifier(random_state=42)

# Search space: both impurity criteria and both splitters, with the depth
# limit and the minimum leaf size drawn as integers from [1, 2000).
dt_params = {
    "criterion": ("gini", "entropy"),
    "splitter": ("best", "random"),
    "max_depth": randint(1, 2000),
    "min_samples_leaf": randint(1, 2000)
}

# Scored with "accuracy" so the numbers printed below match their labels and
# the other models. For single-label classification, micro-averaged F1 is
# numerically the same quantity as accuracy.
# random_state pins the sampled candidates, making the search reproducible.
dt = RandomizedSearchCV(
    Tree_Classifier,
    dt_params,
    n_iter=500,
    scoring="accuracy",
    cv=5,
    verbose=1,
    random_state=42,
)

# Column 0 holds the feature vectors, column 1 the labels; unpack each split once.
dt_train_X, dt_train_y = train_data[:,0].tolist(), train_data[:,1].tolist()
dt_test_X, dt_test_y = test_data[:,0].tolist(), test_data[:,1].tolist()

dt.fit(dt_train_X, dt_train_y)

print("Best params for dt")
print(dt.best_params_)
print("Train Accuracy")
print(dt.score(dt_train_X, dt_train_y))
print("Test Accuracy")
print(dt.score(dt_test_X, dt_test_y))


Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2500 out of 2500 | elapsed: 247.4min finished


Best params for dt
{'criterion': 'entropy', 'max_depth': 1662, 'min_samples_leaf': 28, 'splitter': 'random'}
Train Accuracy
0.6658138161779177
Test Accuracy
0.646


In [10]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(algorithm='kd_tree')

# k = [i for i in range(1, 200)]  # start at 1: n_neighbors=0 is not a valid setting

# params = {'n_neighbors': k}

# k_nn = GridSearchCV(knn, params, scoring="accuracy", cv=5, verbose=10)

# k_nn.fit(train_data[:,0].tolist(), train_data[:,1].tolist())

# print("Best params for kNN")
# print(k_nn.best_params_)
# print(k_nn.best_score_)

# print("Train Accuracy")
# print(k_nn.score(train_data[:,0].tolist(), train_data[:,1].tolist()))
# print("Test Accuracy")
# print(k_nn.score(test_data[:,0].tolist(), test_data[:,1].tolist()))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's bootstrap samples and feature sub-sampling
# so repeated runs with the same parameters give the same model.
RF_Classifier = RandomForestClassifier(random_state=42)

# NOTE(review): `dt_params` and `dt` are rebound here and now describe the
# random forest search, not the decision tree one. The names are kept so any
# later cell that reads them keeps working; consider renaming to rf_* once
# downstream usage has been checked.
# Search space: the number of trees is drawn from [100, 400) and the minimum
# leaf size from [3, 20).
dt_params = {
    "n_estimators": randint(100,400),
    "min_samples_leaf": randint(3, 20)
}
# random_state pins the sampled candidates, making the search reproducible.
dt = RandomizedSearchCV(
    RF_Classifier,
    dt_params,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    verbose=1,
    random_state=42,
)

# Column 0 holds the feature vectors, column 1 the labels; unpack each split once.
rf_train_X, rf_train_y = train_data[:,0].tolist(), train_data[:,1].tolist()
rf_test_X, rf_test_y = test_data[:,0].tolist(), test_data[:,1].tolist()

dt.fit(rf_train_X, rf_train_y)

# The label used to read "dt", copied from the decision tree cell.
print("Best params for random forest")
print(dt.best_params_)
print("Train Accuracy")
print(dt.score(rf_train_X, rf_train_y))
print("Test Accuracy")
print(dt.score(rf_test_X, rf_test_y))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
