In [6]:
from typing import List
import nltk
import string
import pymorphy2
import codecs

class PrepareNew():
  """Convert raw news texts into lists of normalised word tokens.

  Each text is lower-cased, split with NLTK's ``WordPunctTokenizer``,
  reduced to the ``normal_form`` of the first pymorphy2 parse, and then
  filtered: tokens containing ASCII punctuation, non-alphabetic tokens,
  stop words and one-character tokens are dropped.
  """

  # ASCII punctuation as a set, so the per-character membership test is O(1).
  _PUNCTUATION = frozenset(string.punctuation)

  def __init__(self, stopwords_path: str = '../rus_stopwords.txt'):
    """Create the analyser, the tokenizer and the stop-word set.

    Args:
      stopwords_path: Text file with one stop word per line, decoded as
        ``utf_8_sig``. Defaults to the path the notebook always used.
    """
    self.morph = pymorphy2.MorphAnalyzer()
    self.tokenizer = nltk.WordPunctTokenizer()
    self.stopwords = self._load_stopwords(stopwords_path)

  @staticmethod
  def _load_stopwords(path: str) -> set:
    """Read ``path`` and return the set of its whitespace-stripped lines."""
    # `with` guarantees the handle is closed; the original left it open.
    with codecs.open(path, "r", "utf_8_sig") as stopwords_file:
      return set(line.strip() for line in stopwords_file.readlines())

  def prepare_corp(self, news_list: List[str]) -> List[List[str]]:
    """Tokenise every text in ``news_list``; returns one token list per text."""
    return [self.newstext2token(news_text) for news_text in news_list]

  def newstext2token(self, news_text: str) -> List[str]:
    """Return the filtered, normalised tokens of a single text.

    Args:
      news_text: Raw text of one news item.

    Returns:
      Normal forms, in original order, that are alphabetic, longer than
      one character and not in ``self.stopwords``.
    """
    tokens = self.tokenizer.tokenize(news_text.lower())
    # Skip tokens with any ASCII punctuation before the (costly) morphological parse.
    normal_forms = [
        self.morph.parse(token)[0].normal_form
        for token in tokens
        if not any(char in self._PUNCTUATION for char in token)
    ]
    # Stop words are matched against the normal form, not the surface form.
    return [
        form for form in normal_forms
        if len(form) > 1 and form.isalpha() and form not in self.stopwords
    ]


In [7]:
# Load the corpus file and keep its non-blank lines, whitespace-stripped.
# The `with` block closes the file handle; the original one-liner leaked it.
with codecs.open('../ved.txt', "r", "utf_8_sig") as corpus_file:
    stripped_lines = (line.strip() for line in corpus_file.readlines())
    raw_data = [line for line in stripped_lines if line != ""]

In [8]:
# Keep only the entries at odd positions (2nd, 4th, ...) of the non-blank lines.
# NOTE(review): presumably the file alternates some other line with each title — confirm.
# WARNING: this rebinds `raw_data`, so re-running the cell halves the data again.
raw_data = [entry for position, entry in enumerate(raw_data) if position % 2 == 1]

In [9]:
# Preprocess every text: `my_corp[i]` is the list of normalised tokens for `raw_data[i]`.
# Constructing PrepareNew loads the morphological analyser and the stop-word file.
my_corp = PrepareNew().prepare_corp(raw_data)

In [10]:
from gensim.models import Word2Vec
from gensim.models import Phrases
# Detect frequent two-word collocations so they are trained as single
# underscore-joined tokens (e.g. 'чистый_отток').
bigram_transformer = Phrases(my_corp)

# Reproducibility: gensim documents that a fixed seed is only deterministic with a
# single worker thread (and, across interpreter launches, a fixed PYTHONHASHSEED).
W2V_SEED = 1
model = Word2Vec(
    bigram_transformer[my_corp],
    window=5,
    min_count=2,  # ignore tokens that occur fewer than 2 times
    seed=W2V_SEED,
    workers=1,
)

In [14]:
# Sanity check: confirm the trained object is a gensim Word2Vec model.
type(model)

gensim.models.word2vec.Word2Vec

In [23]:
# Ten nearest neighbours of the lemma 'рубль' with their similarity scores.
# NOTE(review): every similarity in the saved output is above 0.97, which may mean the
# model is under-trained on this small corpus — confirm before relying on the neighbours.
model.wv.most_similar('рубль', topn=10)

[('падать', 0.9817359447479248),
 ('цена_продовольствие', 0.9777151346206665),
 ('следующий_десятилетие', 0.9736192226409912),
 ('чистый_отток', 0.9734081029891968),
 ('оставаться_низкий', 0.9733332991600037),
 ('существенно_увеличиться', 0.973079264163971),
 ('сильно', 0.9725044369697571),
 ('доходность_облигация', 0.9714182615280151),
 ('говориться_обзор', 0.9708289504051208),
 ('снижаться', 0.9706270098686218)]

In [28]:
import numpy as np
def text2vec(tokens, embeddings, dim=100):
    """Average the embedding vectors of the tokens found in the model.

    Args:
        tokens: iterable of tokens representing one text.
        embeddings: word2vec model; its ``wv`` attribute is used for the
            membership test and the vector lookup.
        dim: length of the returned vector; must equal the length of the
            vectors stored in ``embeddings.wv``.

    Returns:
        A float64 array of shape ``(dim,)``: the mean vector of the tokens
        present in ``embeddings.wv``, or all zeros when none is present.
    """
    accumulated = np.zeros((dim,))
    hits = 0

    for token in tokens:
        # Tokens missing from the vocabulary contribute nothing.
        if token not in embeddings.wv:
            continue
        accumulated += embeddings.wv[token]
        hits += 1

    # No known token: return the zero vector rather than divide by zero.
    if hits == 0:
        return accumulated

    accumulated /= hits
    return accumulated


In [29]:
import pandas as pd
# Print both lengths first: the two columns below must line up row for row.
print(len(raw_data), len(my_corp))

# One row per news text: the raw string next to its list of tokens.
df = pd.DataFrame({'title': raw_data, 'tokens': my_corp})

3336 3336


In [32]:
# One averaged embedding per text, stacked into a 2-D array.
# A text with no in-vocabulary token becomes an all-zero row (see text2vec).
data_vectors = np.array(
    [text2vec(doc_tokens, model) for doc_tokens in my_corp]
)

In [33]:
# Expected shape: (number of texts, embedding dimension).
data_vectors.shape

(3336, 100)

In [34]:
from sklearn.manifold import TSNE
import time
# t-SNE is stochastic; a fixed random_state makes the 2-D layout repeatable across runs.
TSNE_SEED = 42

time_start = time.time()
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` — confirm
# against the installed version before upgrading.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=TSNE_SEED)
# Project the averaged text embeddings to 2-D: one output row per input row.
tsne_results = tsne.fit_transform(data_vectors)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 3336 samples in 0.001s...
[t-SNE] Computed neighbors for 3336 samples in 0.491s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3336
[t-SNE] Computed conditional probabilities for sample 2000 / 3336
[t-SNE] Computed conditional probabilities for sample 3000 / 3336
[t-SNE] Computed conditional probabilities for sample 3336 / 3336
[t-SNE] Mean sigma: 0.253937
[t-SNE] KL divergence after 250 iterations with early exaggeration: 72.496048
[t-SNE] KL divergence after 300 iterations: 1.662562
t-SNE done! Time elapsed: 3.8178114891052246 seconds
