<a href="https://colab.research.google.com/github/aurotripathy/raggedy-ann/blob/main/llama_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### LLama Embeddings from Weighted Mean-Pooling
Why not use LLama for generating embeddings?\
Decoder-only LLM-based embedding models are beginning to outperform \
BERT or T5-based embedding models in general-purpose text embedding tasks, \
including dense vector-based retrieval.\
This is in lieu of an embeddings model trained to generate embeddings.\
The weighted mean-pooling based embeddings idea is from:\
https://stackoverflow.com/questions/76926025/sentence-embeddings-from-llama-2-huggingface-opensource

In [1]:
# The installs are silent and may take some time
!pip install --upgrade -q transformers
!pip install -q torch --index-url https://download.pytorch.org/whl/cu121/
!python -m pip install -q huggingface_hub
%pip install -q accelerate>=0.26.0
!pip install -U -q scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Get your HF token from https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
print(torch.__version__)

2.4.1+cu121


In [5]:
import transformers
from transformers import AutoTokenizer, AutoModel
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# generate something to test
pipeline = transformers.pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="cuda:0"
)
pipeline("Hey how are you doing today?")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[{'generated_text': 'Hey how are you doing today? I’m good. I’m doing well. I’m excited'}]

In [9]:
print(f"GPU mem allocated (GB): {torch.cuda.max_memory_allocated(device=None)/ 1000**3} memory")

GPU mem allocated (GB): 31.088142848 memory


In [10]:
import numpy as np
def get_weighted_mean_pooled_embeddings(text):
    inputs = tokenizer(text, padding=True, return_tensors="pt")
    cuda_inputs = inputs.to("cuda")
    
    # with torch.no_grad():
    #     embeddings = model(**inputs).last_hidden_state
        
    # alternate way to ge the embeddings
    with torch.no_grad():
        last_hidden_state = model(**cuda_inputs, output_hidden_states=True).hidden_states[-1]
    print(f'last_hidden_state:\n{last_hidden_state}')
    print(f'size: {last_hidden_state.size()}')
    
    # assert torch.equal(embeddings, last_hidden_state)

    last_hidden_state = last_hidden_state.to("cpu").float()
    print(f'cpu last hidden state:\n{last_hidden_state}')
    
    weights_for_non_padding = inputs.attention_mask.to("cpu") * torch.arange(start=1, end=last_hidden_state.shape[1] + 1).unsqueeze(0)
    print(f'padding token weights:\n{weights_for_non_padding}')

    sum_embeddings = torch.sum(last_hidden_state * weights_for_non_padding.unsqueeze(-1), dim=1)
    num_of_none_padding_tokens = torch.sum(weights_for_non_padding, dim=-1).unsqueeze(-1)
    sentence_embeddings = sum_embeddings / num_of_none_padding_tokens
    
    print(f'sentence embedding size: {sentence_embeddings.size()}')
    return sentence_embeddings.mean(dim=0).squeeze().numpy()

In [11]:
texts = ["Hey how are you doing today?", "The weather was great in San Francisco yesterday.", "Hey how are you doing today?"]
sentence_embeddings_list = []
for text in texts:
    sentence_embeddings_list.append(get_weighted_mean_pooled_embeddings(text))
for sentence_embeddings in sentence_embeddings_list:
    print(sentence_embeddings.shape)

last_hidden_state:
tensor([[[ 1.6484,  2.4219,  0.7773,  ..., -1.6953,  1.7656,  2.8594],
         [-0.2168, -0.2812, -2.0156,  ..., -1.4531, -2.4375, -1.3438],
         [-2.5469,  1.8828,  3.0000,  ...,  2.3281,  0.5938,  0.9297],
         ...,
         [-1.0781, -0.5469,  0.2275,  ...,  1.3984, -4.3438, -0.1680],
         [-1.7422, -1.1797,  2.4375,  ...,  1.3594, -4.2500,  0.3027],
         [-1.3828, -2.2500,  0.3984,  ...,  1.3047, -1.1484,  2.2969]]],
       device='cuda:0', dtype=torch.bfloat16)
size: torch.Size([1, 8, 4096])
cpu last hidden state:
tensor([[[ 1.6484,  2.4219,  0.7773,  ..., -1.6953,  1.7656,  2.8594],
         [-0.2168, -0.2812, -2.0156,  ..., -1.4531, -2.4375, -1.3438],
         [-2.5469,  1.8828,  3.0000,  ...,  2.3281,  0.5938,  0.9297],
         ...,
         [-1.0781, -0.5469,  0.2275,  ...,  1.3984, -4.3438, -0.1680],
         [-1.7422, -1.1797,  2.4375,  ...,  1.3594, -4.2500,  0.3027],
         [-1.3828, -2.2500,  0.3984,  ...,  1.3047, -1.1484,  2.2969]]

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# # Calculate cosine similarity
similarity = cosine_similarity(sentence_embeddings_list[0].reshape(1,-1), sentence_embeddings_list[1].reshape(1,-1))
print(similarity)
similarity = cosine_similarity(sentence_embeddings_list[0].reshape(1,-1), sentence_embeddings_list[2].reshape(1,-1))
print(similarity)

[[0.54188967]]
[[1.0000005]]
