In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the LLM-Embedder model
tokenizer = AutoTokenizer.from_pretrained("BAAI/llm-embedder")
model = AutoModel.from_pretrained("BAAI/llm-embedder")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
def get_embeddings(texts, instruction=""):
    # Add instruction prefix if provided (useful for queries)
    if instruction:
        texts = [f"{instruction} {text}" for text in texts]
    
    # Tokenize
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Use mean pooling of last hidden states
        embeddings = outputs.last_hidden_state.mean(dim=1)
    
    return embeddings

In [None]:
# Sample knowledge base (documents)
documents = [
    "Real Madrid and Barcelona clashed in another thrilling edition of El Clásico.",
    "The match was filled with intensity, showcasing world-class football from both sides.",
    "Real Madrid took an early lead with a clinical finish from Jude Bellingham.",
    "Barcelona responded quickly with a brilliant goal by Robert Lewandowski.",
    "The midfield battle was fierce, with both teams pressing high and forcing turnovers.",
    "Vinícius Jr. caused constant problems for Barcelona’s defense with his pace and dribbling.",
    "A late goal from Federico Valverde sealed the win for Real Madrid.",
    "The Santiago Bernabéu erupted as Madrid secured a vital victory in La Liga.",
    "The win pushed Real Madrid to the top of the table, asserting dominance in the title race."
]

# Create embeddings for documents
print("Creating document embeddings...")
doc_embeddings = get_embeddings(documents)
print(doc_embeddings)
print(doc_embeddings.shape)

Creating document embeddings...
tensor([[-0.4256, -0.5326,  0.9291,  ..., -0.5737,  0.6766, -0.1039],
        [-0.7919, -0.7674,  0.6042,  ..., -0.1216,  0.0868,  0.5076],
        [-0.2909, -1.0550,  0.7306,  ..., -0.2639,  0.4091,  0.1456],
        ...,
        [-0.1506, -0.4367,  1.0117,  ..., -0.2462,  0.0858,  0.3055],
        [-0.2443, -0.1187,  0.9840,  ..., -0.1865, -0.1917,  0.2820],
        [-0.8901, -0.4655,  1.1325,  ..., -0.8435,  0.5171,  0.0556]])
torch.Size([9, 768])


In [8]:
def retrieve_and_answer(query, top_k=2):
    
    # Get query embedding with instruction
    query_embedding = get_embeddings([query], instruction="Represent this sentence for searching relevant passages:")
    
    # Calculate similarities
    similarities = cosine_similarity(query_embedding.numpy(), doc_embeddings.numpy())[0]
    
    # Get top-k most similar documents
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    print(f"\nQuery: {query}")
    print("=" * 50)
    
    retrieved_docs = []
    for i, idx in enumerate(top_indices):
        doc = documents[idx]
        score = similarities[idx]
        print(f"Retrieved Document {i+1} (similarity: {score:.3f}):")
        print(f"  {doc}")
        retrieved_docs.append(doc)
    
    # Create context for generation (in a real RAG system, this would go to an LLM)
    context = "\n".join(retrieved_docs)
    print(f"\nCombined Context for Generation:")
    print(f"  {context}")
    
    return retrieved_docs, context

In [9]:
# Example usage
if __name__ == "__main__":
    # Test queries
    queries = [
        "Who lead the first goal for real madrid?",
        "Who scored the win goal",
    ]
    
    for query in queries:
        retrieve_and_answer(query)
        print("\n" + "="*70 + "\n")


Query: Who lead the first goal for real madrid?
Retrieved Document 1 (similarity: 0.846):
  Real Madrid took an early lead with a clinical finish from Jude Bellingham.
Retrieved Document 2 (similarity: 0.823):
  A late goal from Federico Valverde sealed the win for Real Madrid.

Combined Context for Generation:
  Real Madrid took an early lead with a clinical finish from Jude Bellingham.
A late goal from Federico Valverde sealed the win for Real Madrid.



Query: Who scored the win goal
Retrieved Document 1 (similarity: 0.828):
  A late goal from Federico Valverde sealed the win for Real Madrid.
Retrieved Document 2 (similarity: 0.798):
  Barcelona responded quickly with a brilliant goal by Robert Lewandowski.

Combined Context for Generation:
  A late goal from Federico Valverde sealed the win for Real Madrid.
Barcelona responded quickly with a brilliant goal by Robert Lewandowski.


