# RuBERT и необходимые библиотеки

In [None]:
# Install the Hugging Face libraries used by this notebook.
# %pip (rather than !pip) installs into the environment of the running kernel,
# and the extras spec is quoted so the shell never treats [..] as a glob pattern.
%pip install datasets "transformers[sentencepiece]"



In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pickle
from tqdm import tqdm
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Identifier of the pretrained checkpoint; the tokenizer and the model must come
# from the same checkpoint, so the name is defined once.
MODEL_NAME = "sberbank-ai/sbert_large_nlu_ru"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True asks the model to return the per-layer hidden states
# in addition to its regular outputs.
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

# Tokenizer vocabulary: maps every token string to its integer id.
bert_dictionary = tokenizer.vocab

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/655 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

#  Словарь RuBERT в тензоры

In [None]:
# Build a lookup table: vocabulary token -> embedding of that token when it is
# fed to the model on its own (a one-token sequence, no special tokens added).
vocab_tensors = {}

for word, token_id in tqdm(bert_dictionary.items()):
    # Shape (1, 1): a batch holding one sequence that consists of one token.
    tokens_tensor = torch.tensor([[token_id]])

    with torch.no_grad():
        # Arguments are passed by keyword on purpose. The second positional
        # parameter of the model's forward() is the attention mask, so the
        # earlier positional call with a tensor of ones was really supplying an
        # all-ones attention mask, not segment ids. This call states that
        # explicitly and leaves segment ids at their default.
        encoded_layers = model(
            input_ids=tokens_tensor,
            attention_mask=torch.ones_like(tokens_tensor),
        )

    # Output 0 is the last layer's hidden states, indexed (batch, token, hidden).
    # The previous nested loops gathered several unrelated outputs and then used
    # only this one vector, so it is taken directly.
    word_embedding = encoded_layers[0][0][0]
    vocab_tensors[word] = word_embedding

# Persist the table so the slow pass over the whole vocabulary is done only once.
with open('vocab_tensors.pickle', 'wb') as f:
    pickle.dump(vocab_tensors, f)

### Загрузка полученного файла

In [None]:
# Reload the token -> embedding table saved to disk earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code; only open a file you produced yourself.
with open('vocab_tensors.pickle', 'rb') as pickle_file:
    vocab_tensors = pickle.load(pickle_file)

# Number of stored entries (displayed as the cell result).
len(vocab_tensors)

120138

# Поиск похожих слов по эмбеддингу

Введите в ```text``` своё слово

In [None]:
# Query word: the cells below embed it and rank vocabulary entries by similarity to it.
text = "университет"

Эмбеддинг для ```text```

In [None]:
# Embed the query word: tokenize it, run the tokens through the model without
# special tokens, and keep one vector from the last layer.
tokenized_text = tokenizer.tokenize(text)
if not tokenized_text:
    # A blank query yields no tokens; fail with a clear message instead of an
    # obscure tensor/index error further down.
    raise ValueError("text must contain at least one token")

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Shape (1, number_of_tokens): a batch with a single sequence.
tokens_tensor = torch.tensor([indexed_tokens])

with torch.no_grad():
    # Keyword arguments: the second positional parameter of forward() is the
    # attention mask, so the former positional tensor of ones acted as an
    # all-ones attention mask rather than as segment ids. Spelled out here.
    encoded_layers = model(
        input_ids=tokens_tensor,
        attention_mask=torch.ones_like(tokens_tensor),
    )

# Output 0 is the last layer's hidden states, indexed (batch, token, hidden).
# Only the FIRST token's vector is used (unchanged behaviour): a word that the
# tokenizer splits into several pieces is represented by its first piece.
text_embedding = encoded_layers[0][0][0]

print(f"Эмбеддинг: {text_embedding}")

Эмбеддинг: tensor([ 0.4884, -0.4788,  0.6102,  ..., -0.1668,  0.3430, -0.3652])


Расчет схожести слов

- по евклидову расстоянию ```euclidean_dist```
- по косинусному сходству ```cosine_sim```

In [None]:
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

# Number of vocabulary rows scored per library call. Scoring in chunks replaces
# one cdist/cosine_similarity call per word with a few dozen vectorized calls,
# while keeping the temporary matrices small.
CHUNK_SIZE = 4096

euclidean_dist = {}
cosine_sim = {}

# The query row does not change between iterations: convert it once.
query = text_embedding.detach().cpu().reshape(1, -1).numpy()
words = list(vocab_tensors)

for start in tqdm(range(0, len(words), CHUNK_SIZE)):
    chunk_words = words[start:start + CHUNK_SIZE]
    # Stack this chunk's embeddings into one (chunk, hidden) matrix.
    chunk_matrix = torch.stack(
        [vocab_tensors[word] for word in chunk_words]
    ).detach().cpu().numpy()

    # Each call returns a (1, chunk) array; row 0 holds the scores against the query.
    distances = cdist(query, chunk_matrix, metric='euclidean')[0]
    similarities = cosine_similarity(query, chunk_matrix)[0]

    for word, distance, similarity in zip(chunk_words, distances, similarities):
        # Both scores are oriented so that a larger value means "more similar":
        # 100 minus the Euclidean distance, and the cosine similarity times 100.
        # Values may differ from a per-word computation in the last few digits
        # because of floating-point summation order.
        euclidean_dist[word] = 100 - distance
        cosine_sim[word] = similarity * 100

100%|██████████| 120138/120138 [00:47<00:00, 2524.32it/s]


Топ слов по схожести

- Евклидово расстояние:

In [None]:
# Order the (word, score) pairs by ascending score, so the best matches come last.
sorted_tuples = sorted(euclidean_dist.items(), key=lambda pair: pair[1])
# Rebuild the dict in that order (dicts keep insertion order).
euclidean_dist = dict(sorted_tuples)

# Print the ten highest-scoring words; the best match is on the final line.
for word, score in sorted_tuples[-10:]:
    print(word, score)

бытовом 87.43234806718017
школьниц 87.585745469591
здание 87.61813242156639
газопровод 87.62179652156027
либералов 87.82348287887778
трон 88.01494920194176
прогни 88.17724603305031
дорога 88.19715160897564
корп 88.68821357950794
университет 100.0


- Косинусное сходство:

In [None]:
# Order the (word, score) pairs by ascending cosine score, so the best matches come last.
sorted_tuples = sorted(cosine_sim.items(), key=lambda pair: pair[1])
# Rebuild the dict in that order (dicts keep insertion order).
cosine_sim = dict(sorted_tuples)

# Print the ten highest-scoring words; the best match is on the final line.
for word, score in sorted_tuples[-10:]:
    print(word, score)

бытовом 80.58443069458008
школьниц 80.90823292732239
здание 81.08738660812378
газопровод 81.15297555923462
либералов 81.79396390914917
трон 82.32142329216003
дорога 82.6618492603302
прогни 82.73684978485107
корп 84.23560857772827
университет 100.0
