In [3]:
import os
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
import re
import gensim

%matplotlib inline

In [4]:
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while showing an ipywidgets progress bar.

    This is a generator: nothing (including the widget creation and the
    ``every`` check below) happens until iteration starts.

    Args:
        sequence: Any iterable. If ``len(sequence)`` raises ``TypeError`` and
            ``size`` is not given, it is treated as an iterator of unknown
            length and only a running counter is shown.
        every: Refresh the widget every ``every`` items (the first item always
            refreshes it). Defaults to 1 for up to 200 items and to
            ``size // 200`` (roughly every 0.5%) otherwise. Must be given when
            the length is unknown.
        size: Total number of items; when given, ``len(sequence)`` is not used.

    Yields:
        The items of ``sequence``, unchanged and in order.

    Raises:
        AssertionError: if the length is unknown and ``every`` was not given.
    """
    # Imported lazily so the notebook's other cells do not require ipywidgets.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Floor division keeps `every` an int. With true division
                # (`/` on Python 3) `every` is a float such as 1.25, and
                # `index % every == 0` then skips most refreshes.
                every = size // 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar and report progress via the label only.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Same reach as a bare `except:` -- this also sees GeneratorExit when
        # the consumer stops early. The bar is marked red and the exception
        # is always re-raised, never swallowed.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

* ### "Стандартная" [модель](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) для английского языка
* ### "Стандартная" [модель](https://nlpub.ru/Russian_Distributional_Thesaurus) для русского языка
* ### [Коллекция моделей](http://rusvectores.org/ru/models/) для русского языка

**Использование готовой модели:**

In [1]:
# Load the pretrained word2vec vectors from the gzip-compressed binary file
# in the working directory. unicode_errors='ignore' asks gensim to drop bytes
# that cannot be decoded instead of raising while reading the vocabulary.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    unicode_errors='ignore',
)

NameError: name 'gensim' is not defined

Получение вектора:

In [7]:
# Fetch the stored vector for the word 'linguistic' and print its shape.
print(model.get_vector('linguistic').shape)

(300,)


In [8]:
# Analogy query: the 3 words ranked closest to "king + woman - man"
# (positive words pull the query vector towards them, negative words push away).
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951)]

In [10]:
# Same analogy query, ranked with gensim's multiplicative combination
# objective (most_similar_cosmul) instead of the additive one.
model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=3)

[('queen', 0.9314123392105103),
 ('monarch', 0.858533501625061),
 ('princess', 0.8476566076278687)]

In [11]:
# Similarity score between the vectors of 'woman' and 'man'
# (gensim documents this as cosine similarity).
model.similarity('woman', 'man')

0.76640123

In [12]:
# Nearest neighbours of 'good'; topn is not passed, so the library default applies.
model.most_similar(positive=["good"])

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348341941833),
 ('nice', 0.6836092472076416),
 ('excellent', 0.644292950630188),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728850364685),
 ('solid', 0.5806034803390503),
 ('lousy', 0.576420247554779)]

In [13]:
# Nearest neighbours of 'bad'; topn is not passed, so the library default applies.
model.most_similar(positive=["bad"])

[('good', 0.7190051674842834),
 ('terrible', 0.6828612089157104),
 ('horrible', 0.6702597737312317),
 ('Bad', 0.669891893863678),
 ('lousy', 0.6647640466690063),
 ('crummy', 0.567781925201416),
 ('horrid', 0.5651682615280151),
 ('awful', 0.5527253150939941),
 ('dreadful', 0.5526429414749146),
 ('horrendous', 0.5445998311042786)]

In [17]:
# Dimensionality of the word vectors held by the loaded model.
model.vector_size

300

[('ASDs', 0.6904527544975281),
 ('autism_spectrum_disorders', 0.6598615646362305),
 ('autism_spectrum_disorder', 0.6474117636680603),
 ('Autism_Spectrum_Disorder_ASD', 0.6191781759262085),
 ('ADHD', 0.6040418148040771),
 ('autism_spectrum', 0.6007470488548279),
 ('autism', 0.5998409986495972),
 ('autistic_spectrum', 0.5933749675750732),
 ('Autism_Spectrum_Disorder', 0.584269106388092),
 ('autism_spectrum_disorders_ASD', 0.5795554518699646)]