In [20]:
import pickle

train_data, test_data = None, None
with open("../data/train_data.pickle", "rb") as f:
    train_data = pickle.load(f)
with open("../data/test_data.pickle", "rb") as f:
    test_data = pickle.load(f)

In [21]:
x_train = train_data['train_texts']
y_train = train_data['train_labels']

x_test = test_data['test_texts']
y_test = test_data['test_labels']

In [22]:
%%time

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
word_vec = TfidfVectorizer(min_df=5, ngram_range=(1,2), lowercase=False)
char_vec = TfidfVectorizer(min_df=5, ngram_range=(2,3), lowercase=False)

fu = FeatureUnion([
    ('word', word_vec),
    ('char', char_vec)
])

X_train = fu.fit_transform(x_train)

CPU times: user 4.53 s, sys: 132 ms, total: 4.67 s
Wall time: 4.67 s


In [23]:
X_test = fu.transform(x_test)

In [24]:
from sklearn.svm import LinearSVC

svm = LinearSVC()

In [25]:
%%time

svm.fit(X_train, y_train)

CPU times: user 957 ms, sys: 14 ms, total: 971 ms
Wall time: 972 ms


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [26]:
preds = svm.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, preds)

0.462

In [27]:
def get_chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

In [28]:
print(len(x_test))
print(len(y_test))

2500
2500


In [29]:
from statistics import mean
word_counts = [text.count(" ") for text in x_test]
mean(word_counts)

182.5416

In [30]:
# 182 words is quite short
# Try to join 5 tests texts together
longer_test_texts = get_chunks(x_test, 5)
longer_test_labels = get_chunks(y_test, 5)

In [31]:
all([len(set(x)) == 1 for x in longer_test_labels])  # Make sure that all combined labels are the same

True

In [33]:
longer_test_texts = ['\n'.join(chunk) for chunk in longer_test_texts]

In [34]:
longer_test_labels = [chunk[0] for chunk in longer_test_labels]

In [35]:
len(longer_test_texts)

500

In [36]:
X_test_longer = fu.transform(longer_test_texts)

In [37]:
preds = svm.predict(X_test_longer)

In [38]:
accuracy_score(longer_test_labels, preds)

0.02