# Example of EmbeddingBag Caching
This example shows how to cache an EmbeddingBag for LightRetriever's Asymmetric Dense Retrieval.

### Load Fine-tuned LightRetriever as Huggingface Model

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel, LoraModel

# Hub id of the fine-tuned LightRetriever checkpoint (loaded below as a LoRA adapter).
model_name_or_path = "lightretriever/lightretriever-qwen2.5-1.5b"
# NOTE: this cell assumes a CUDA GPU is available; `flash_attention_2` below
# additionally requires the `flash-attn` package to be installed.
device = torch.device("cuda:0")

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Load Base HF Model & Peft Adapters
# The adapter config names the base checkpoint the LoRA weights apply to.
config = LoraConfig.from_pretrained(model_name_or_path)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map=device,
)
# `PeftModel.from_pretrained` returns a `PeftModel` wrapper, so it gets its own
# correctly-annotated name instead of being stored under `hf_model`.
peft_model: PeftModel = PeftModel.from_pretrained(base_model, model_name_or_path, config=config)
# Fold the adapter weights into the base model and drop the PEFT wrapper,
# leaving a plain Hugging Face model.
hf_model = peft_model.merge_and_unload()  # Merge to single HF Model

### Construct EmbeddingBag
Construct prompted token embedding bags. 

1. `construct_embedding_bag` constructs inputs formatted as `[bos] + [prompts] + [vocab_token_id] + [eos]`.
2. It pools the eos hidden states as the embedding of the corresponding `vocab_token_id`.
3. `vocab_token_id` ranges over `[0, len(tokenizer))`.

In [None]:
from lightretriever.finetune.nonctx_emb_utils import construct_embedding_bag

# Instruction prefix that is placed in front of every vocabulary token
# before it is encoded.
query_prompt = (
    "Instruct: Given a web search query, retrieve relevant passages that answer the query\n"
    "Query: "
)

# Encode each vocab token id with the merged model's `.model` submodule and
# collect the resulting per-token embeddings into an EmbeddingBag.
emb_bag = construct_embedding_bag(
    model=hf_model.model,
    tokenizer=tokenizer,
    prompt=query_prompt,
    batch_size=1000,  # presumably vocab tokens encoded per forward pass — TODO confirm
)

### Save EmbeddingBag
Save the EmbeddingBag matrix with shape `[vocab_size, hidden_dim]` to `save_path`


In [None]:
# Persist only the EmbeddingBag's weight tensor (not the module) to disk.
save_path = "web_search_en.emb_bag.pt"
torch.save(emb_bag.weight, save_path)