Load word2vec and "global" clusters

In [1]:
from gensim.models import KeyedVectors
import numpy
word2vec = KeyedVectors.load_word2vec_format("classifier/dataset/ruwikiruscorpora_0_300_20.bin", binary=True)
clusters = numpy.load("classifier/dataset/clusters.npy")

Load dataset

In [2]:
import csv
from classifier import DatasetItem

with open("classifier/dataset/weather.csv", "r", encoding="utf-8") as src:
    data = list(csv.reader(src))
dataset = [DatasetItem(row[0], row[1:]) for row in data]

Initialize classifier builder

In [3]:
from classifier import W2vPosClusterTextVectorBuilder, ClassifierBuilder
from pymorphy2 import MorphAnalyzer

builder = ClassifierBuilder(word2vec, clusters, MorphAnalyzer(), dataset)

Get nearest clusters for each class

In [4]:
builder.labels_top_clusters(5)

{'температура': array([1170, 1889, 1070, 1825, 1703]),
 'условия': array([1889, 1170, 1319, 1070,  982])}

Print nearest words for "nearest" clusters

In [5]:
for cluster in [1170, 1889, 1319]:
    print(cluster, builder.cluster_words(cluster))

1170 ['холод_NOUN', 'озябнуть_VERB', 'зябкий_ADJ', 'жарко_ADV', 'отогреваться_VERB', 'морозить_VERB', 'зябнуть_VERB', 'продрогнуть_VERB', 'мерзнуть_VERB', 'согреваться_VERB']
1889 ['безветренный_ADJ', 'ветрено_ADV', 'ведренный_ADJ', 'пасмурный_ADJ', 'морозно_ADV', 'дождливо_ADV', 'облачно_ADV', 'ненастье_NOUN', 'пасмурно_ADV', 'ненастный_ADJ']
1319 ['ливневый::дождь_NOUN', 'ветер_NOUN', 'ливень_NOUN', 'шквалистый::ветер_NOUN', 'шторм_NOUN', 'штормовой_ADJ', 'пыльный::буря_NOUN', 'циклон_NOUN', 'ураган_NOUN', 'дождь_NOUN']


For each class - build mean vector, and get mean distance between it

In [6]:
builder.class_mean_metric()

0.2801143666217679

Test classifier

In [7]:
import random
from sklearn.metrics import mean_squared_error

split_dataset = 0.3
dataset_indices = list(range(0, len(dataset)))
train_before = int(split_dataset * len(dataset))
errors = []
morph = MorphAnalyzer()
for i in range(0, 100):
    random.shuffle(dataset_indices)
    train_set = [dataset[i] for i in dataset_indices[:train_before]]
    test_set = [dataset[i] for i in dataset_indices[:train_before]]
    classifier = ClassifierBuilder(word2vec, clusters, morph, train_set).create_classifier([1170, 1889, 1319], 0.5)
    test_predictions = [classifier.predict(item.text) for item in test_set]
    test_labels = numpy.array([
        [int(label in row.labels) for label in classifier.labels]
        for row in test_set
    ])
    test_labels_predicted = numpy.array([
        [row[label] for label in classifier.labels]
        for row in test_predictions
    ])
    error = mean_squared_error(test_labels, test_labels_predicted)
    errors.append(error)
errors_np = numpy.array(errors)
print("Error mean/std", errors_np.mean(), errors_np.std())

Error mean/std 0.17432 0.0578118388491
