In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForPreTraining
from src.constants import SEPARATOR_TOKEN, CLS_TOKEN
from src.utils import pool_and_normalize
from src.datasets_loader import prepare_tokenizer
from mteb import MTEB
# from huggingface_hub import notebook_login

In [2]:
class embedding_model(torch.nn.Module):
    """Text/code embedding model backed by the ``bigcode/bigcode-encoder`` checkpoint.

    Bundles the project tokenizer (via ``prepare_tokenizer``) with the pretrained
    encoder and exposes an ``encode`` method so the instance can be passed to
    ``MTEB.run``.
    """

    def __init__(self, ):
        super().__init__()

        _model_name = "bigcode/bigcode-encoder"
        # Both artifacts are fetched with `use_auth_token=True`, i.e. using the
        # Hugging Face credentials configured in the local environment.
        self.tokenizer = prepare_tokenizer(AutoTokenizer.from_pretrained(_model_name, use_auth_token=True))
        self.encoder = AutoModelForPreTraining.from_pretrained(_model_name, use_auth_token=True)

    def forward(self, input_sentences):
        """Embed one batch of strings.

        Each string is wrapped as ``CLS_TOKEN + text + SEPARATOR_TOKEN``, the batch is
        tokenized with padding, and the encoder's last hidden state is passed to
        ``pool_and_normalize`` together with the attention mask.

        Args:
            input_sentences: Iterable of strings to embed in a single batch.

        Returns:
            The tensor produced by ``pool_and_normalize`` (one row per input).
            Gradients are tracked unless the caller disables them.
        """
        # NOTE(review): no truncation is requested, so inputs longer than the
        # encoder's maximum sequence length may fail — confirm against the tokenizer setup.
        inputs = self.tokenizer(
            [f"{CLS_TOKEN}{sentence}{SEPARATOR_TOKEN}" for sentence in input_sentences],
            return_tensors="pt",
            padding=True,
        )
        # The tokenizer always produces CPU tensors; move them next to the encoder
        # weights so the module keeps working after `.to("cuda")`.
        inputs = inputs.to(self.encoder.device)
        outputs = self.encoder(**inputs)
        embedding = pool_and_normalize(outputs.hidden_states[-1], inputs.attention_mask)

        return embedding

    def encode(self, input_sentences, batch_size=32, **kwargs):
        """Inference entry point used by MTEB.

        Embeds the inputs in chunks of ``batch_size`` with autograd disabled, so
        large corpora are not padded into a single huge batch and no computation
        graph is retained.

        Args:
            input_sentences: Iterable of strings to embed.
            batch_size: Maximum number of strings per forward pass (values below 1
                are treated as 1).
            **kwargs: Accepted for compatibility with the caller; ignored.

        Returns:
            A CPU ``torch.Tensor`` without gradient tracking, with one row per
            input, in input order.
        """
        sentences = list(input_sentences)
        step = max(1, int(batch_size))
        # `max(len, 1)` keeps a single (empty) chunk for empty input, so that case
        # behaves exactly like calling `forward` directly.
        starts = range(0, max(len(sentences), 1), step)

        with torch.no_grad():
            chunks = [self.forward(sentences[start:start + step]).cpu() for start in starts]
            return torch.cat(chunks, dim=0)


In [3]:
# Instantiate the wrapper; its constructor loads the tokenizer and encoder
# for "bigcode/bigcode-encoder" via `from_pretrained`.
model = embedding_model()

In [4]:
# Smoke-test inputs: one natural-language string and one Python snippet.
input_sentences = ["Hello world!!", "def my_sum(a, b): return a+b"]

# Calling the module runs its forward pass on the whole list as one batch.
embeddings = model(input_sentences)

In [5]:
# Display the shape of the embedding tensor computed for the sample inputs.
embeddings.size()

torch.Size([2, 768])

In [None]:
# Build an MTEB benchmark restricted to the 'Clustering' and 'Retrieval' task
# types, then run it against `model` and keep the returned results.
evaluation = MTEB(task_types=['Clustering', 'Retrieval'])
results = evaluation.run(model)