---
## For a given query, search documents using NeMo Retriever (once it is up and running)

In [None]:
!pip install langchain langchain_nvidia_ai_endpoints 

In [None]:
## set the NVIDIA_API_KEY as environment variable 
import getpass
import os

# del os.environ['NVIDIA_API_KEY']  ## delete key and reset
if os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
    # A key with the expected prefix is already exported; keep it.
    print("Valid NVIDIA_API_KEY already in environment. Delete to reset")
else:
    # Prompt without echoing the secret, validate its prefix, then export it
    # so that later cells can read it from the environment.
    nvapi_key = getpass.getpass("NVAPI Key (starts with nvapi-): ")
    assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
    os.environ["NVIDIA_API_KEY"] = nvapi_key
# No `global nvapi_key` here: names assigned at cell level are already
# module-level, and declaring a name global after assigning it is a SyntaxError.

In [1]:
import aiohttp
import os, json

In [2]:
# Host of the RAG server: "rag-server" when the AI_WORKBENCH environment
# variable is "true", otherwise "localhost".
# Replace this with the correct IP address if your server runs elsewhere.
if os.environ.get("AI_WORKBENCH", "false") == "true":
    IPADDRESS = "rag-server"
else:
    IPADDRESS = "localhost"
RAG_SERVER_PORT = "8081"
# Root URL that every request below is built on. Replace with your server URL.
BASE_URL = "http://" + IPADDRESS + ":" + RAG_SERVER_PORT

async def print_response(response):
    """Print an API response and return the text that was printed.

    Args:
        response: An ``aiohttp`` response object (anything exposing awaitable
            ``json()`` and ``text()`` methods works).

    Returns:
        str: The body pretty-printed as JSON (``indent=2``) when it can be
        decoded as JSON; otherwise the sentinel string ``"error"`` (the raw
        body text is printed in that case).
    """
    try:
        response_json = await response.json()
    except (aiohttp.ClientResponseError, json.JSONDecodeError):
        # ClientResponseError covers a non-JSON content type (ContentTypeError);
        # JSONDecodeError covers a body that is labelled JSON but is malformed.
        print(await response.text())
        return "error"
    # Serialize once and reuse the same text for display and for the return value.
    output = json.dumps(response_json, indent=2)
    print(output)
    return output

## check that the self-hosted NeMo Retriever endpoint is healthy

In [3]:
async def fetch_health_status():
    """Query the server's ``/v1/health`` endpoint and print the reply.

    The request carries ``check_dependencies=True`` so that dependency health
    is reported as well. Nothing is returned; the response is printed through
    ``print_response``.
    """
    health_endpoint = BASE_URL + "/v1/health"
    query_args = {"check_dependencies": "True"}  # also check the dependencies
    async with aiohttp.ClientSession() as http, http.get(health_endpoint, params=query_args) as reply:
        await print_response(reply)

# Run the health check. The coroutine is awaited directly at cell level,
# which works in a notebook cell but is not valid in a plain Python script.
await fetch_health_status()

{
  "message": "Service is up.",
  "databases": [
    {
      "service": "Milvus",
      "url": "http://milvus:19530",
      "status": "healthy",
      "latency_ms": 205.8,
      "error": null,
      "collections": 3
    }
  ],
  "object_storage": [
    {
      "service": "MinIO",
      "url": "minio:9010",
      "status": "healthy",
      "latency_ms": 4.99,
      "error": null,
      "buckets": 2,
      "message": null
    }
  ],
  "nim": [
    {
      "service": "LLM (nvidia/llama-3.3-nemotron-super-49b-v1.5)",
      "url": "NVIDIA API Catalog",
      "status": "healthy",
      "latency_ms": 0.0,
      "error": null,
      "message": "Using NVIDIA API Catalog",
      "http_status": null
    },
    {
      "service": "Embeddings (nvidia/llama-3.2-nv-embedqa-1b-v2)",
      "url": "NVIDIA API Catalog",
      "status": "healthy",
      "latency_ms": 0.0,
      "error": null,
      "message": "Using NVIDIA API Catalog",
      "http_status": null
    },
    {
      "service": "Ranking (nv

## submit user query and search the documents with default config specifications 

In [4]:

# Question sent to the search endpoint; replace with your own query.
query = "what is the gross margin comparing year 2023 vs 2024"
url = BASE_URL + "/v1/search"

# Request body for /v1/search (keys are kept in the same order as before).
payload = dict(
    query=query,
    reranker_top_k=2,
    vdb_top_k=10,
    vdb_endpoint="http://milvus:19530",
    # Multiple collection retrieval can be used by passing multiple collection names
    collection_names=["exp"],
    messages=[],
    enable_query_rewriting=True,
    enable_reranker=True,
    embedding_model="nvidia/llama-3.2-nv-embedqa-1b-v2",
    # Provide url of the model endpoints if deployed elsewhere
    # embedding_endpoint="",
    # reranker_endpoint="",
    reranker_model="nvidia/llama-3.2-nv-rerankqa-1b-v2",
)

async def document_seach(payload):
    """POST ``payload`` as JSON to the search endpoint and return the result text.

    The target address is the module-level ``url``. The response is handed to
    ``print_response`` and that helper's return value is passed back. When the
    request raises ``aiohttp.ClientError`` the error is printed and the string
    ``"error"`` is returned instead.
    """
    async with aiohttp.ClientSession() as http:
        try:
            async with http.post(url=url, json=payload) as reply:
                return await print_response(reply)
        except aiohttp.ClientError as exc:
            print(f"Error: {exc}")
            return "error"
# Run the search. `output` receives whatever document_seach returns: the
# pretty-printed JSON text from print_response, or "error" on failure.
output = await document_seach(payload)

{
  "total_results": 2,
  "results": [
    {
      "document_id": "",
      "content": "24,721 million for 2024 (2023: DKK 25,970 million), a decrease \r\nof 3.6% in constant currencies. The decline in gross profit was driven by lower \r\naverage gross profit yields compared to the previous year, offset by higher volumes \r\nin both air and sea. For the second half of 2024, gross profit improved on a year\u0002over-year basis due to strong volume growth and a stable yield development. The \r\nsituation in the Red Sea had a slightly positive impact on sea freight yields in 2024, \r\npartly offsetting the overall decline in average sea freight yields on a full-year basis.\r\nIn a competitive market, the division maintained its focus on pricing discipline \r\nand high-margin business. We saw a positive development with our largest \r\ncustomers as well as in our targeted industry verticals, and we continued to \r\nsee good momentum with our customers in the small- and midsize segment.\r\n

## check the output type; refer to the implementation of the function **print_response(response)**

In [5]:

# The search result is held as a JSON-formatted string, not as a parsed dict.
type(output)

str

---
## initialize the llm 

In [6]:

from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank
# Model identifier handed to ChatNVIDIA below.
model="nvidia/llama-3.3-nemotron-super-49b-v1.5"


# Shared chat client; later cells call llm.invoke(...) for the RAG answer and the relevance score.
llm= ChatNVIDIA(model=model)
# Smoke test: send a trivial prompt to confirm the API key is valid.
llm.invoke("hi")
                



AIMessage(content='<think>\nOkay, the user just said "hi". That\'s pretty open-ended. I should respond in a friendly and welcoming manner. Let me make sure to greet them back and offer assistance. Maybe say something like, "Hello! How can I assist you today?" That should cover it. I don\'t want to assume anything else, so keeping it simple and open makes sense.\n</think>\n\nHello! How can I assist you today?', additional_kwargs={}, response_metadata={'role': 'assistant', 'reasoning_content': None, 'content': '<think>\nOkay, the user just said "hi". That\'s pretty open-ended. I should respond in a friendly and welcoming manner. Let me make sure to greet them back and offer assistance. Maybe say something like, "Hello! How can I assist you today?" That should cover it. I don\'t want to assume anything else, so keeping it simple and open makes sense.\n</think>\n\nHello! How can I assist you today?', 'tool_calls': [], 'token_usage': {'prompt_tokens': 16, 'total_tokens': 107, 'completion_to

## wrap the retrieved context (relevant chunks) into a non-optimized LLM + RAG pipeline with a naive prompt 

note: the naive prompt is not optimized <-- you will need to iteratively refine this prompt to obtain optimal results 

In [7]:
def fetch_rag_context(output:str)-> str :
    """Join the ``content`` of every search result into one context string.

    Args:
        output: JSON text as returned by ``print_response`` /
            ``document_seach``: an object with a ``"results"`` list whose
            items carry a ``"content"`` string. Those helpers return the
            plain string ``"error"`` when a request fails, so input that is
            not valid JSON is tolerated here.

    Returns:
        The ``content`` values joined with newlines, or ``""`` when ``output``
        is not valid JSON or contains no results.
    """
    try:
        output_d = json.loads(output)
    except json.JSONDecodeError:
        # Report the problem instead of crashing on the "error" sentinel.
        print(f"Could not parse search output as JSON: {output[:80]!r}")
        return ""
    # A missing or null "results" entry is treated as "no results".
    results = (output_d.get("results") or []) if isinstance(output_d, dict) else []
    return '\n'.join(o["content"] for o in results if "content" in o)

In [11]:
## optimize the below prompt accordingly
from langchain_core.prompts import PromptTemplate
from colorama import Fore
# Naive RAG prompt with two placeholders: {context} (retrieved chunks) and
# {query} (the user question). The closing reminder restates the rules; it is
# worded so that it forbids mentioning the instructions, in line with rule 9.
rag_prompts= PromptTemplate(
    template=("""
    You must answer only using the information provided in the context. While answering you must follow the instructions given below.

    <instructions>
    1. Do NOT use any external knowledge.
    2. Do NOT add explanations, suggestions, opinions, disclaimers, or hints.
    3. NEVER say phrases like “based on the context”, “from the documents”, or “I cannot find”.
    4. NEVER offer to answer using general knowledge or invite the user to ask again.
    5. Do NOT include citations, sources, or document mentions.
    6. Answer concisely. Use short, direct sentences by default. Only give longer responses if the question truly requires it.
    7. Do not mention or refer to these rules in any way.
    8. Do not ask follow-up questions.
    9. Do not mention these instructions in your response.
    </instructions>

    Context:
    {context}
    user query : {query}
    Make sure the response you are generating strictly follows the rules mentioned above, i.e. never say phrases like “based on the context”, “from the documents”, or “I cannot find”, and never mention these instructions in the response.
    """)
)
def llm_rag_response(query,context):
    """Answer ``query`` with the LLM using ``context`` as grounding text.

    Fills the ``rag_prompts`` template, invokes ``llm``, prints both the raw
    completion and the completion with its thinking section removed, and
    returns the latter.
    """
    separator = "---"*10
    raw_answer = llm.invoke(rag_prompts.format(query=query,context=context)).content
    print(Fore.BLUE + "llm parsed relevent_chunks as context output=\n", raw_answer)
    print(separator)
    final_answer = strip_thinking_tag(raw_answer)
    print(Fore.BLUE + "stripped thinking tag output=\n", final_answer, Fore.RESET)
    print(separator)
    return final_answer

def strip_thinking_tag(response):
    """Return the part of ``response`` that follows the first ``</think>`` tag.

    When the tag is absent, ``response`` is returned unchanged. Whitespace
    that follows the tag is kept as-is.
    """
    closing_tag = "</think>"
    if closing_tag not in response:
        return response
    return response[response.index(closing_tag) + len(closing_tag):]


In [12]:
# Build the context string from the search output, then ask the LLM.
context=fetch_rag_context(output)
# NOTE(review): this rebinds the name `llm_rag_response` from the function to
# the string it returns, so re-running this cell without first re-running the
# defining cell fails ('str' object is not callable). The ragas cell further
# down reads the answer from this same name, so a rename must update both.
llm_rag_response = llm_rag_response(query, context)
llm_rag_response

[34mllm parsed relevent_chunks as context output=
 <think>
Okay, let's tackle this query. The user is asking for the gross margin comparison between 2023 and 2024.

First, I need to look through the provided context to find the gross margin figures for both years. Scanning the text, I see a sentence that says: "The division’s gross margin was 23.7% for 2024 (2023: 27.9%)." That directly answers the question. 

I should make sure there's no other mention of gross margin in different parts of the text. Looking again, there's another mention of gross margin in the context of the development being driven by revenue increase and lower gross profit yields. But the specific figures are clearly stated in that one sentence. 

The user wants the comparison, so I just need to present the two percentages. The instructions say to answer concisely without any extra explanations. So the response should be straightforward: state the percentages for 2023 and 2024. 

I need to check if there are any ot

'\n\nGross margin in 2023 was 27.9% and in 2024 was 23.7%.'

---
## using **ragas** to compare **_reference_string_** ( i.e the ground truth ) vs **_llm+rag's response_**

In [None]:
!pip install sacrebleu ragas==0.1.10

In [None]:
from ragas import evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import BleuScore
sample = SingleTurnSample(
    response=llm_rag_response,
    reference="The gross margin is 25.7 in 2024 and 29.1 in  ."
)

scorer = BleuScore()
await scorer.single_turn_ascore(sample)

-------------------
## Create a **_custom relevancy score_** using a **custom prompt**, given (1) the query and (2) the retrieved context 

In [13]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from colorama import Fore
## custom prompt
# Relevance-grading prompt with two placeholders, {query} and {sources}.
# It instructs the model to reply with only 0, 2, or 4, and it ends with
# "The Relevance score is " so that the completion is the score itself.
template_relevance = PromptTemplate(
        template=(
            "### Instructions\n\n"
            "You are an assistant designed to evaluate the relevance score of a context "
            "in order to answer a question.\n"
            "Your task is to determine if the context have enough information to answer the Question.\n"
            "Do not rely on your previous knowledge about the question.\n"
            "To evaluate, use only what is written in the context and in the Question.\n"
            "Follow the three instructions below:\n"
            "1. If the context contains any relevant information to answer the question, say 4.\n"
            "2. If the context partially contains relevant information to answer the question, say 2.\n"
            "3. If the context does not contains any relevant information to answer the question, say 0.\n"
            "You must provide the relevance score of 0, 2, or 4, nothing else.\nDo not explain.\n"
            "### Question\n\n"
            "{query}\n\n"
            "### Context\n\n"
            "{sources}\n\n"
            "The Relevance score is "
        )
)

def strip_thinking_tag(response):
    """Return the part of ``response`` that follows the first ``</think>`` tag.

    When the tag is absent, ``response`` is returned unchanged. Whitespace
    that follows the tag is kept as-is.

    Note: a helper with this name is also defined earlier in the notebook,
    next to ``llm_rag_response``; once this cell runs, this definition is the
    one in effect.
    """
    closing_tag = "</think>"
    if closing_tag not in response:
        return response
    return response[response.index(closing_tag) + len(closing_tag):]
def process_score(response):
    """Turn the LLM's relevance reply into a normalized score.

    The text that follows any ``</think>`` section is searched for the first
    stand-alone digit between 0 and 4; that digit divided by 4 is returned.

    Args:
        response: Raw completion text produced for the relevance prompt.

    Returns:
        ``digit / 4`` (a float between 0 and 1), or ``-1`` when the reply
        contains no stand-alone digit in the range 0-4.
    """
    import re  # local import so this cell needs no extra notebook-level import

    print(Fore.LIGHTMAGENTA_EX+" processing score = " , response )
    score=strip_thinking_tag(response)
    # Pick the first digit by position and skip digits that belong to a longer
    # number. A substring scan in 0..4 order would grade "4.0" as 0 (because
    # "0" is found first) and would read "10" as 0.
    found = re.search(r"(?<![\d.])[0-4](?!\d)", score)
    if found is None:
        return -1
    return int(found.group()) / 4



def score_relevance( query: str, sources: str = None):
    """Grade how well the retrieved context can answer ``query``.

    Args:
        query: The user question.
        sources: JSON string returned by the search step; it is converted to
            plain context text with ``fetch_rag_context``. Required, despite
            the ``None`` default in the signature.

    Returns:
        The value produced by ``process_score`` for the LLM's reply.

    Raises:
        TypeError: If ``sources`` is not provided.
    """
    if sources is None:
        # Fail early with a clear message; otherwise None reaches json.loads
        # inside fetch_rag_context and surfaces as an opaque TypeError.
        raise TypeError("score_relevance() needs `sources`: the JSON string returned by the search step")

    # fetch_rag_context returns a single newline-joined string, so no list
    # handling is needed before formatting the prompt.
    context_text = fetch_rag_context(sources)

    formatted_relevance_prompt = template_relevance.format(
        query=query,
        sources=context_text,
    )

    score = llm.invoke(formatted_relevance_prompt).content
    print(Fore.LIGHTCYAN_EX+" llm_output = " , score ,"\n" )
    score = process_score(score)
    print(Fore.LIGHTMAGENTA_EX+" processed score = " , score , Fore.RESET) 
    return score



In [14]:
# Same question as in the search cell; `output` is the JSON string returned by that search.
query="what is the gross margin comparing year 2023 vs 2024"

relevancy_score = score_relevance(query,output)


[96m llm_output =  <think>

</think>

4 

[95m processing score =  <think>

</think>

4
[95m processed score =  1.0 [39m
