In [18]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [20]:
output.last_hidden_state.shape

torch.Size([1, 13, 768])

In [3]:
from transformers import pipeline

qa_model = pipeline("question-answering")
question = "Where do I live?"
context = "My name is Yatra and I live in India."
qa_model(question = question, context = context)
## {'answer': 'India', 'end': 39, 'score': 0.953, 'start': 31}


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.9750055074691772, 'start': 31, 'end': 36, 'answer': 'India'}

In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch

def chroma_db_bert_embedding(text):
    # TF-IDF vectorization
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
  

    # BERT tokenization and embedding
    tokenizer = BertTokenizer.from_pretrained(f'bert-base-multilingual-cased')
    model = BertModel.from_pretrained(f'bert-base-multilingual-cased')

    # Generate BERT embeddings for the text
    tokens = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor(tokens).unsqueeze(0)
    outputs = model(input_ids)
    bert_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()  # Using mean pooling for simplicity

    # Combine BERT embeddings with TF-IDF weights
    combined_embedding = np.concatenate((bert_embedding, tfidf_matrix.toarray()), axis=1)

    return combined_embedding

# Example usage
text_example = "M. Olivier Chastel, ministre de la Coopération audéveloppement, chargé des Affaires européennes. – Je partagevos inquiétudes vis-à-vis des droits de l’homme au Rwanda.Le bilan des progrès dans ce pays est particulièrement mitigé.Certes, il a fait beaucoup de progrès dans le domaine socio-économique et dans la gouvernance économique, notammentdans la lutte contre la corruption, la gestion des financespubliques et la gestion macro-économique. Cela se traduitd’ailleurs par une croissance économique assez considérableet des progrès importants à l’égard de certains Objectifs deDéveloppement du Millénaire."
embedding_example = chroma_db_bert_embedding(text_example)
print("ChromaDB-like Embedding:", embedding_example)


ChromaDB-like Embedding: [[-4.06425260e-02 -1.54496491e-01  2.27500573e-01 -2.30786353e-01
  -2.37424416e-03 -1.53172076e-01 -3.25793207e-01  5.62416613e-01
  -2.15045244e-01  3.46150279e-01  2.81920638e-02  2.48170063e-01
   3.26444685e-01  4.16531444e-01  1.57037035e-01 -4.56295818e-01
   4.04685020e-01 -1.36212215e-01 -1.05830736e-01 -7.57065192e-02
  -4.56591398e-01 -1.35318503e-01 -5.31567752e-01  4.03620303e-01
  -1.88481987e-01  8.22765648e-01 -6.83822393e-01 -1.04918629e-01
  -2.05937594e-01 -5.01475632e-01 -2.22924873e-02  2.23720312e-01
  -5.38871586e-01  3.68942082e-01  1.62696436e-01  4.66618270e-01
   1.97043478e-01 -1.74015447e-01  3.33579600e-01  3.18424344e-01
   1.82746008e-01  2.02402771e-01  3.67932282e-02 -2.89180875e-01
   8.00300986e-02 -1.17490649e-01  6.48011208e-01 -9.25587863e-02
   3.07093393e-02  9.82022047e-01  1.63868561e-01 -9.18557718e-02
   4.99524593e-01 -3.71196926e-01 -3.60960364e-01  1.38067976e-01
  -1.76899716e-01  1.32506192e-01  4.31383938e-01 -

In [23]:
from transformers import BertTokenizer, BertModel
import torch
from typing import List

class MyEmbeddingFunction:
    def __init__(self, model_name='bert-base-multilingual-cased'):
        """
        Initialize the embedding function with a pre-trained BERT model.

        Parameters:
            model_name (str): Name of the pre-trained BERT model to use.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

    def __call__(self, texts: List[str]) -> List[torch.Tensor]:
        """
        Embed the given list of texts using pre-trained BERT.

        Parameters:
            texts (List[str]): List of input texts.

        Returns:
            List[torch.Tensor]: List of BERT embeddings for each input text.
        """
        # Tokenize the input texts
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

        # Generate BERT embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        # Return embeddings
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Example usage
text_samples = ["This is the first example.", "Here's the second example."]
embedding_function = MyEmbeddingFunction()
embeddings = embedding_function(text_samples)

# 'embeddings' now contains BERT embeddings for the input texts
print(embeddings.shape)  # Shape: (2, max_seq_length, embedding_dim)


(2, 768)


In [24]:
embeddings[0]

array([-1.12103432e-01, -2.51927942e-01,  8.30225229e-01,  1.12672329e-01,
        6.92149520e-01,  3.49491239e-01, -5.13610184e-01,  4.00701582e-01,
       -3.05360258e-02, -4.09553915e-01,  2.59677202e-01, -1.71613693e-03,
        3.62130880e-01, -5.36631942e-01, -1.05814539e-01, -4.44250077e-01,
        3.61989260e-01, -4.31834400e-01,  2.74647892e-01,  4.85665321e-01,
        1.86325401e-01, -1.35240406e-01, -2.93641627e-01,  7.67444670e-02,
        1.19166918e-01, -7.39476383e-01, -5.79746604e-01, -2.84853816e-01,
       -7.69184204e-03,  1.68713361e-01,  4.31787759e-01,  2.55637914e-01,
        1.22261196e-01,  5.53519130e-01, -2.13804781e-01,  4.75808412e-01,
       -2.18756184e-01,  2.32805729e-01, -1.06246948e-01, -1.93486050e-01,
        1.52260527e-01,  2.68003438e-02,  6.36437833e-02, -1.00743495e-01,
        3.51212233e-01, -5.92088878e-01, -1.78538382e-01,  1.36466324e-01,
       -7.24526167e-01, -2.54400313e-01, -1.58928424e-01, -6.26799047e-01,
        5.51933587e-01,  