In [None]:
# https://github.com/akutuzov/webvectors/blob/master/preprocessing/rus_preprocessing_mystem.py
# https://github.com/akutuzov/webvectors/blob/master/preprocessing/rusvectores_tutorial.ipynb

In [1]:
import sys
import requests

import gensim
from pymystem3 import Mystem

# Preprocessing

In [2]:
# Table that translates the POS tags reported by Mystem into the tags used in
# the `lemma_POS` tokens. The URL embeds a fixed revision hash.
mapping_url = 'https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map'

mystem2upos = {}
# `stream=True` was dropped: the body is read in full through `.text` anyway.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(mapping_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as a mapping.
r.raise_for_status()
for pair in r.text.split('\n'):
    pair = pair.split()
    # Keep only lines with at least two whitespace-separated fields:
    # the first is the source tag, the second is the tag it maps to.
    if len(pair) > 1:
        mystem2upos[pair[0]] = pair[1]

class Preprocessor:
    """Lemmatise text with Mystem and optionally attach a POS tag to each lemma.

    Tagged tokens have the form ``lemma_POS``, where ``POS`` is looked up in
    the mapping supplied at construction time.
    """

    def __init__(self, mapping):
        """Create the Mystem analyser and remember the tag mapping.

        Args:
            mapping: dict-like object (must support ``.get``) that translates
                the POS tag reported by Mystem into the tag appended to the
                lemma.
        """
        self.m = Mystem()
        self.mapping = mapping

    def process(self, text, postags=True):
        """Convert ``text`` into a list of lemmas or ``lemma_POS`` tokens.

        Args:
            text: string passed to ``Mystem.analyze``.
            postags: when True (default) each item is ``lemma_POS``; when
                False only the bare lemmas are returned.

        Returns:
            A list of strings, one per token for which Mystem produced an
            analysis. Tokens without an analysis (presumably whitespace,
            punctuation and unrecognised words) are skipped.
        """
        tagged = []
        for token in self.m.analyze(text):
            # A token may lack the "analysis" key or carry an empty list.
            # Indexing [0] on the empty list used to raise an uncaught
            # IndexError, so both cases are skipped explicitly.
            analyses = token.get("analysis")
            if not analyses:
                continue
            best = analyses[0]  # only the first analysis is used
            try:
                lemma = best["lex"].lower().strip()
                grammar = best["gr"]
            except KeyError:
                # Incomplete analysis record: skip the token.
                continue
            if not postags:
                # Emit the lemma directly rather than building `lemma_POS`
                # and splitting on '_' afterwards, which truncated lemmas
                # that themselves contain an underscore.
                tagged.append(lemma)
                continue
            # The POS tag is the leading part of the grammar string, before
            # the first ',' and the first '='.
            pos = grammar.split(',')[0].split('=')[0].strip()
            # Tags missing from the mapping fall back to 'X'.
            tagged.append(lemma + '_' + self.mapping.get(pos, 'X'))
        return tagged

# Shared preprocessor instance, built on the tag mapping loaded above.
phrases_processor = Preprocessor(mapping=mystem2upos)

Installing mystem to C:\Users\Artem_Panov/.local/bin\mystem.exe from http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip


BadZipFile: File is not a zip file

In [34]:
# Sanity check: lemmatise and POS-tag a sample sentence.
phrases_processor.process('Может ли камень платить налоги?', postags=True)

['мочь_VERB', 'ли_PART', 'камень_NOUN', 'платить_VERB', 'налог_NOUN']

# Word2Vec model

In [6]:
# Pretrained models are available for download at
# http://rusvectores.org/ru/models/

# Location of the gzipped, text-format word2vec file on the local machine.
model_path = (
    '/mnt/disk/datasets/word2vec_ru/'
    'ruwikiruscorpora-nobigrams_upos_skipgram_300_5_2018.vec.gz'
)
# binary=False: the file is read as the plain-text word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
)

In [26]:
# Ten nearest neighbours of the query token; keys use the `lemma_POS` form.
model.most_similar('платить_VERB', topn=10)

[('уплатить_VERB', 0.7811346650123596),
 ('доплачивать_VERB', 0.7447771430015564),
 ('выплачивать_VERB', 0.7416616082191467),
 ('уплачивать_VERB', 0.736369252204895),
 ('оплачивать_VERB', 0.7315130829811096),
 ('заплатить_VERB', 0.7277652025222778),
 ('приплачивать_VERB', 0.6747636795043945),
 ('задолжать_VERB', 0.664445698261261),
 ('расплачиваться_VERB', 0.6592860221862793),
 ('плащать_VERB', 0.654596209526062)]

In [35]:
# Look up the embedding stored for a single `lemma_POS` key.
word_vector = model['платить_VERB']
# Display the vector's shape as the cell output.
word_vector.shape

(300,)