In [1]:
import os
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
import re
import enchant
import gensim

%matplotlib inline

In [2]:
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while displaying a notebook progress widget.

    This is a generator: the widget is created and displayed when iteration
    starts, not when the function is called.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not
        given, it is treated as an iterator of unknown length.
    every : int, optional
        Refresh the widget every ``every`` items. Defaults to 1 for sizes up
        to 200 and to ``size // 200`` (about every 0.5%) for larger ones.
        Required when the length is unknown.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the length is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Floor division keeps `every` an integer. With true division
                # `index % every == 0` is almost never satisfied and the bar
                # would stay frozen until the loop ends.
                every = size // 200     # about every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full "info" bar and only count items in the label.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Any interruption (including the consumer abandoning the generator)
        # turns the bar red; the exception itself is always re-raised.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [3]:
# English dictionary from enchant; its check() method is used further down
# to keep only the tokens the dictionary accepts.
dict_eng = enchant.Dict('en')

* ### "Стандартная" [модель](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) для английского языка
* ### "Стандартная" [модель](https://nlpub.ru/Russian_Distributional_Thesaurus) для русского языка
* ### [Коллекция моделей](http://rusvectores.org/ru/models/) для русского языка

**Использование готовой модели:**

In [4]:
# Location of the pretrained word2vec binary. Set the WORD2VEC_PATH environment
# variable to point at your own copy; the fallback is the original local path.
word2vec_path = os.environ.get(
    'WORD2VEC_PATH',
    '/home/alexandr/distr/GoogleNews-vectors-negative300.bin.gz',
)

# Load the vectors in binary word2vec format, ignoring undecodable bytes in the vocabulary.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path,
                                                        unicode_errors='ignore', binary=True)

Получение вектора:

In [5]:
# Look up the vector stored for one word and print its shape.
print(model.word_vec('linguistic').shape)

(300,)


In [6]:
# Analogy-style query: 'woman' and 'king' as positive terms, 'man' as the
# negative term; show the three best matches.
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951)]

In [7]:
# The same positive/negative query, this time through most_similar_cosmul,
# so the two rankings and scores can be compared.
model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=3)

[('queen', 0.9314123392105103),
 ('monarch', 0.858533501625061),
 ('princess', 0.8476566076278687)]

In [8]:
# Similarity score for a single pair of words.
model.similarity('woman', 'man')

0.76640123

**Обучение собственной модели:**

In [4]:
# Fetch the training subset of 20 Newsgroups, asking scikit-learn to remove
# headers, footers and quotes; `.data` holds the document texts.
texts = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes')).data

# Number of documents fetched.
print(len(texts))

11314


In [5]:
# For each document: extract tokens of two or more word characters, keep the
# ones the English dictionary accepts, and re-join them as one lower-cased string.
texts_prep = [
    ' '.join(
        word
        for word in re.findall('(?u)\\b\\w\\w+\\b', text)
        if dict_eng.check(word)
    ).lower()
    for text in log_progress(texts)
]

VBox(children=(HTML(value=''), IntProgress(value=0, max=11314)))

In [6]:
# Tokenise every preprocessed document and keep only the non-empty token lists.
corpus = [
    tokens
    for tokens in (
        list(gensim.utils.tokenize(doc, lower=True))
        for doc in log_progress(texts_prep)
    )
    if tokens
]

VBox(children=(HTML(value=''), IntProgress(value=0, max=11314)))

In [7]:
# Train a Word2Vec model on the tokenised corpus (size=100, window=5,
# min_count=5, 4 workers).
# NOTE(review): this rebinds `model`, which the earlier cells use for the
# pretrained KeyedVectors; after this cell those queries hit the new model.
model = gensim.models.Word2Vec(corpus, size=100, window=5, min_count=5, workers=4)

In [11]:
# Query the trained vectors through `model.wv`: calling most_similar directly
# on the Word2Vec object is deprecated in favour of the KeyedVectors it wraps.
results = model.wv.most_similar(positive=['salary'], negative=['work'], topn=10)

# One "word: score" line per match.
for word, score in results:
    print('{}: {}'.format(word, score))

averaged: 0.7503620386123657
bears: 0.7275941371917725
provinces: 0.7267054319381714
olympic: 0.7264388799667358
avengers: 0.7162312269210815
blazers: 0.7111951112747192
rickey: 0.7108708620071411
manifestations: 0.7107887268066406
assists: 0.7098666429519653
totals: 0.706010103225708


  """Entry point for launching an IPython kernel.
