In [10]:
import os
import re
import string

import gensim
import numpy as np
import sklearn.metrics
import sklearn.neighbors
import sklearn.svm

# Load the pretrained word vectors (word2vec-format file under ./models).
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(
        '.',
        'models',
        'ubercorpus.lowercased.tokenized.word2vec.300d',
    )
)

In [23]:
def normalize(doc):
    """Return ``doc`` lower-cased with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are stripped;
    whitespace and all other characters are left as they are.
    """
    # Translation table that maps each ASCII punctuation character to "delete".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return doc.lower().translate(strip_punctuation)

def word_filter(word, min_len=4):
    """Decide whether ``word`` should be kept as a document token.

    A word is kept when it has at least ``min_len`` characters and the
    embedding model has a vector for it.

    Args:
        word: Candidate token (already normalized).
        min_len: Minimum number of characters required. The default of 4
            preserves the original behaviour; pass 0 to filter on
            vocabulary membership only.

    Returns:
        bool: True if the word passes both checks.
    """
    # Membership is tested on the model itself rather than on ``.vocab``:
    # the ``in`` operator is supported by KeyedVectors in both gensim 3.x and
    # 4.x, whereas the ``.vocab`` attribute was removed in gensim 4.
    return len(word) >= min_len and word in w2v_model

def docs_dataset(dirpath):
    """Read every file directly inside ``dirpath`` into a list of token lists.

    Each file is read as UTF-8, passed through ``normalize``, split on
    whitespace and reduced to the words accepted by ``word_filter``. Files
    that end up with no accepted words are skipped. Sub-directories are not
    descended into.

    Args:
        dirpath: Directory holding one document per file.

    Returns:
        list[list[str]]: One list of kept words per non-empty document, in
        sorted file-name order.

    Raises:
        FileNotFoundError: If ``dirpath`` does not exist.
        NotADirectoryError: If ``dirpath`` is not a directory.
    """
    # os.listdir raises on a missing/invalid directory, whereas os.walk would
    # silently yield nothing. Sorting makes the document order independent of
    # the file system's listing order.
    filenames = sorted(
        name
        for name in os.listdir(dirpath)
        if not os.path.isdir(os.path.join(dirpath, name))
    )

    docs = []
    for fn in filenames:
        with open(os.path.join(dirpath, fn), 'r', encoding='UTF-8') as f:
            text = f.read()
        words = [word for word in normalize(text).split() if word_filter(word)]
        if words:
            docs.append(words)
    return docs
    
def doc_max_len(docs):
    """Return the length of the longest document in ``docs``.

    Args:
        docs: Iterable of sized documents (e.g. lists of words).

    Returns:
        int: The largest ``len(doc)``, or 0 when ``docs`` is empty, so that an
        empty dataset does not raise ``ValueError``.
    """
    return max(map(len, docs), default=0)
        
def array_assign_2d(a, b):
    """Copy ``b`` into the top-left corner of ``a``, in place.

    The region written is ``b.shape[-2]`` rows by ``b.shape[-1]`` columns;
    every element of ``a`` outside that region is left untouched. Nothing is
    returned.
    """
    n_rows = b.shape[-2]
    n_cols = b.shape[-1]
    a[slice(None, n_rows), slice(None, n_cols)] = b

def docs_to_vectors(docs, doc_max_len):
    """Embed every document as a zero-padded matrix of word vectors.

    Args:
        docs: Sequence of documents, each a list of words known to
            ``w2v_model``.
        doc_max_len: Number of rows reserved per document; documents with
            fewer words are padded with zero rows at the end.

    Returns:
        numpy.ndarray of shape ``(len(docs), doc_max_len, vector_len)`` where
        ``vector_len`` is the width of ``w2v_model.vectors``.
    """
    embedding_dim = w2v_model.vectors.shape[-1]
    padded = np.zeros((len(docs), doc_max_len, embedding_dim))
    # Iterating over ``padded`` yields one writable view per document slot.
    for slot, words in zip(padded, docs):
        array_assign_2d(slot, w2v_model[words])
    return padded

def shuffle_split(src, fraction, seed=None):
    """Shuffle a copy of ``src`` along its first axis and split it in two.

    Args:
        src: Array-like to split; it is copied, never modified.
        fraction: Share of the items (0..1) that goes into the first part.
            The split index is ``int(len * fraction)``.
        seed: Optional seed for a private random generator, making the split
            reproducible. With the default ``None`` the global ``np.random``
            state is used, exactly as before.

    Returns:
        tuple: ``(a, b)`` where ``a`` holds the first ``int(n * fraction)``
        shuffled items and ``b`` the remainder.
    """
    shuffled = np.array(src)
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(shuffled)
    # Compute the cut once so both halves are guaranteed to use the same index.
    cut = int(shuffled.shape[0] * fraction)
    return shuffled[:cut], shuffled[cut:]

In [24]:
# One directory of description files per product category.
phones_path, headphones_path, tablets_path = (
    os.path.join('.', 'data', subdir)
    for subdir in ('dscr_phones', 'dscr_headphones', 'dscr_tablets')
)

# Tokenized, filtered documents for each category.
phones_docs, headphones_docs, tablets_docs = map(
    docs_dataset, (phones_path, headphones_path, tablets_path)
)

# All categories are padded to the same length: that of the longest document.
doc_len = max(map(doc_max_len, (phones_docs, headphones_docs, tablets_docs)))

x_phones, x_headphones, x_tablets = (
    docs_to_vectors(category_docs, doc_len)
    for category_docs in (phones_docs, headphones_docs, tablets_docs)
)

In [32]:
# Seed the global NumPy RNG so the shuffled 80/20 split (and therefore the
# reported accuracy) is the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

x_train_phones, x_test_phones = shuffle_split(x_phones, 0.8)
x_train_headphones, x_test_headphones = shuffle_split(x_headphones, 0.8)
x_train_tablets, x_test_tablets = shuffle_split(x_tablets, 0.8)

train_parts = (x_train_phones, x_train_headphones, x_train_tablets)
test_parts = (x_test_phones, x_test_headphones, x_test_tablets)

# Class labels, in the same order as the parts above:
# 1 = phones, 2 = headphones, 3 = tablets.
class_labels = (1, 2, 3)
y_train = np.repeat(class_labels, [part.shape[0] for part in train_parts])
y_test = np.repeat(class_labels, [part.shape[0] for part in test_parts])

# Collapse each document to a single vector by averaging over the word axis.
# NOTE(review): the average runs over all doc_len rows, including the zero
# padding added by docs_to_vectors, so shorter documents get smaller vectors.
# An alternative that was considered is flattening each document instead:
# x.reshape((x.shape[0], -1)).
x_train = np.vstack(train_parts).mean(axis=1)
x_test = np.vstack(test_parts).mean(axis=1)

In [33]:
# k-nearest-neighbours with scikit-learn's default settings.
# Alternative classifier that was tried: sklearn.svm.SVC(gamma='scale').
clf = sklearn.neighbors.KNeighborsClassifier()
clf.fit(x_train, y_train)
print('finished training')

# Predict once and reuse the result for both the accuracy and the confusion
# matrix (clf.score would run a second, identical prediction pass).
y_pred = clf.predict(x_test)
acc = sklearn.metrics.accuracy_score(y_test, y_pred)
print('accuracy =', acc)
sklearn.metrics.confusion_matrix(y_test, y_pred)

finished training
accuracy = 0.8613445378151261


array([[57, 13,  9],
       [ 2, 76,  2],
       [ 5,  2, 72]], dtype=int64)

In [6]:
# Vocabulary size = number of rows in the embedding matrix. Read from
# ``.vectors`` because the ``.vocab`` attribute no longer exists in gensim 4.
w2v_model.vectors.shape[0]

538431

In [7]:
# Number of kept words in the longest document across the three datasets;
# this is the padded length used for every document matrix.
doc_len

794