In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from tqdm.notebook import tqdm
from helper import check_path
from collections import defaultdict

import pandas as pd
import numpy as np

In [3]:
# Number of columns in the embedding matrix allocated by get_embedding; it must
# equal the width of the vectors returned by embed_bert_cls.
EMB_SIZE = 768
# Number of texts sent through the model per call to embed_bert_cls.
BATCH_SIZE = 1
# Prefix of the embedding column names and name of the output sub-directory.
EMB_NAME = 'deeppavlov'

In [4]:
# Load the pretrained tokenizer and encoder from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
# Move the model to the GPU only when one is actually available, so the notebook
# still runs top to bottom on CPU-only machines instead of raising here.
if torch.cuda.is_available():
    model.cuda()

def embed_bert_cls(text, model, tokenizer, max_length=512):
    """Embed one or more texts as L2-normalised first-token ([CLS]) vectors.

    Args:
        text: A string or a list of strings; passed straight to ``tokenizer``.
        model: Callable encoder. It must expose ``.device`` and return an object
            with a ``last_hidden_state`` tensor indexed as
            ``[batch, token, hidden]``.
        tokenizer: Callable returning a mapping of name -> tensor when invoked
            with ``return_tensors='pt'``.
        max_length: Maximum number of tokens kept per text. It is passed
            explicitly because ``truncation=True`` alone is ignored when the
            tokenizer has no predefined maximum length (see the "Default to no
            truncation" warning emitted by this notebook).
            NOTE(review): 512 is assumed to match the checkpoint's position
            limit -- confirm against ``model.config.max_position_embeddings``.

    Returns:
        ``numpy.ndarray`` with one row per input text, each row scaled to unit
        L2 norm.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in encoded.items()})
    # Keep the hidden state of the first token of every sequence.
    embeddings = model_output.last_hidden_state[:, 0, :]
    # Default normalize() arguments: L2 norm along dim=1, i.e. per row.
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Read the preprocessed train and test frames (in that order) from the
# directory configured in cfg.PREPROCESSED_DATA_PATH.
train, test = (
    pd.read_pickle(os.path.join(cfg.PREPROCESSED_DATA_PATH, split_file))
    for split_file in ('train.pkl', 'test.pkl')
)

In [6]:
def get_embedding(text: pd.Series, emb_name='') -> pd.DataFrame:
    """Embed every entry of ``text`` and return the vectors as a DataFrame.

    Texts are processed in consecutive slices of ``BATCH_SIZE`` rows; each text
    is cut to its first 2000 characters before being passed to
    ``embed_bert_cls`` together with the module-level ``model`` and
    ``tokenizer``.

    Args:
        text: Series of strings to embed.
        emb_name: Prefix for the output column names (``f'{emb_name}_{i}'``).

    Returns:
        DataFrame with ``len(text)`` rows and ``EMB_SIZE`` columns, indexed
        like ``text``.
    """
    n = len(text)
    embeddings = np.zeros(shape=(n, EMB_SIZE))
    # No explicit ``total``: tqdm reads it from len(range(...)), which also counts
    # a trailing partial batch. ``n // BATCH_SIZE`` under-counted whenever n was
    # not a multiple of BATCH_SIZE.
    for i in tqdm(range(0, n, BATCH_SIZE)):
        sentences = text.iloc[i:i+BATCH_SIZE].tolist()
        # Character-level cap applied before tokenization.
        sentences = [sentence[:2000] for sentence in sentences]
        embeddings[i:i+BATCH_SIZE, :] = embed_bert_cls(sentences, model, tokenizer)
    embeddings = pd.DataFrame(
        embeddings,
        columns=[f'{emb_name}_{c}' for c in range(EMB_SIZE)],
        index=text.index)
    return embeddings

In [7]:
# Embed the text column of each split. Every call runs the model over the whole
# series in batches of BATCH_SIZE; columns are named '<EMB_NAME>_<i>'.
train_embeddings = get_embedding(train[cfg.TEXT_COL], emb_name=EMB_NAME)
test_embeddings = get_embedding(test[cfg.TEXT_COL], emb_name=EMB_NAME)

  0%|          | 0/2000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Output directory for this embedding type: <cfg.DATA_PATH>/<EMB_NAME>.
emb_path = os.path.join(cfg.DATA_PATH, EMB_NAME)
# NOTE(review): check_path is a project helper; presumably it creates the
# directory when it is missing -- confirm in helper.check_path.
check_path(emb_path)

In [9]:
# Persist both embedding frames as pickles under emb_path. The frames carry the
# index of the series they were computed from.
train_embeddings.to_pickle(os.path.join(emb_path, 'train.pkl'))
test_embeddings.to_pickle(os.path.join(emb_path, 'test.pkl'))