In [None]:
import json

# Evaluation log in JSON Lines format: one JSON object per line.
filename ="evaluate-rag-deepseek-r1-k30-chroma_2025-04-15_17-24-42.jsonl"
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(filename, 'r', encoding="utf-8") as f:
    lines = f.readlines()
    # Skip whitespace-only lines (e.g. a trailing blank line at end of file);
    # json.loads raises JSONDecodeError on them.
    data = [json.loads(line) for line in lines if line.strip()]

In [None]:
# Mean of the "retriever_match" values over every loaded record.
match_total = sum(record["retriever_match"] for record in data)
retriever_performance = match_total / len(data)
print(f"Retriever performance: {retriever_performance}")

In [None]:
from dotenv import load_dotenv
from openai import AzureOpenAI
from pathlib import Path
import os

# Load environment variables from the .env file located one directory above
# the current working directory.
dotenv_path = Path("..") / ".env"
load_dotenv(dotenv_path=dotenv_path)

def get_env_var(key: str) -> str:
    """Return the value of the environment variable named ``key``.

    Raises:
        KeyError: if ``key`` is not present in ``os.environ``; the message
            names the variable that needs to be set.
    """
    try:
        value = os.environ[key]
    except KeyError:
        raise KeyError(f"Please set the {key} environment variable.")
    return value

In [None]:
def make_prompt(symptoms_description: str, document_titles_and_scores: dict, document_text: str, k: int) -> str:
    """Build the re-ranking prompt sent to the chat model.

    Args:
        symptoms_description: The patient's symptom description.
        document_titles_and_scores: Mapping of document title to retriever
            score; it is interpolated using its default string form.
        document_text: Text of the candidate documents, already joined into
            a single string by the caller.
        k: Number of document titles the model is asked to return.

    Returns:
        The prompt text. It asks for the top ``k`` titles, comma separated.
    """
    # NOTE: the wording previously read "patent's symptoms"; fixed to
    # "patient's" so the model is not given a misleading word.
    prompt = f"""
    You are part of a retrieval system in a medical domain.
    Given a description of symptoms from a patient, our first retriever has narrowed it down to a few possible documents describing conditions.
    I will provide you the document title, the score returned by the retriever and the document text.
    This is the patient's symptoms:
    {symptoms_description}
    The document titles and their scores are:
    {document_titles_and_scores}
    The document text is:
    {document_text}
    Your task is to return the top {k} documents that are most relevant to the symptoms. Please return their titles, comma separated.
    """
    return prompt

In [None]:
# Request settings
MAX_TOKENS = 2048
AZURE_OPENAI_API_VERSION = "2024-12-01-preview"

# Model name passed along with each chat completion request.
model = "gpt-4o"

# Endpoint and API key come from the environment; get_env_var raises
# KeyError when either variable is unset.
endpoint = get_env_var("AZURE_OPENAI_ENDPOINT_gpt-4o")
key = get_env_var("AZURE_OPENAI_API_KEY")

client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=key,
    api_version=AZURE_OPENAI_API_VERSION,
)

def get_response(prompt: str, model: str) -> str:
    """Send ``prompt`` as the user turn of a chat completion and return the reply.

    The request uses the module-level ``client``, a fixed system message, and
    ``MAX_TOKENS`` as ``max_completion_tokens``.

    Args:
        prompt: Text placed in the user message.
        model: Model name forwarded to the completion request.

    Returns:
        The ``content`` of the first choice's message.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
        max_completion_tokens=MAX_TOKENS,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def re_rank_documents(line, k):
    """Ask the chat model to pick the ``k`` most relevant retrieved documents.

    Args:
        line: One evaluation record. The keys read are
            'symptoms_description', 'retrieved_documents_sources',
            'retrieved_documents_scores' and 'retrieved_documents'.
        k: Number of document titles to request from the model.

    Returns:
        The model's raw reply as returned by ``get_response``.
    """
    symptoms_description = line['symptoms_description']
    # Pair each source title with its score. Duplicate titles collapse to the
    # last score, as in a plain dict; zip stops at the shorter list.
    document_titles_and_scores = dict(
        zip(line['retrieved_documents_sources'], line['retrieved_documents_scores'])
    )
    # Use THIS record's documents. Reading data[0] here would send the first
    # record's documents with every query.
    document_text = "\n\n".join(line['retrieved_documents'])
    prompt = make_prompt(symptoms_description, document_titles_and_scores, document_text, k=k)
    response = get_response(prompt, model)
    return response

In [None]:
# Evaluation settings
k = 5          # number of titles requested from the re-ranker
cutoff = 100   # only the first `cutoff` records are evaluated

ct = 0         # records whose gold title appears in the re-ranked titles
failures = []  # (record index, error) for records that raised during evaluation


def _normalize_title(title):
    """Trim whitespace and drop single/double quote characters from a title."""
    return title.strip().replace('"',"").replace("'","")


subset = data[:cutoff]

for idx, line in enumerate(subset):
    try:
        reranked_documents = re_rank_documents(line, k).split(",")
        reranked_documents = [_normalize_title(x) for x in reranked_documents]
        # Normalize the gold title the same way as the model output;
        # otherwise a gold title containing an apostrophe can never match.
        gold_document = _normalize_title(line['conditions_title'])
        if gold_document in reranked_documents:
            ct += 1
    except Exception as e:
        # Best effort: a failing record stays in the denominator (counts as a
        # miss), but is recorded so failures are visible instead of silent.
        failures.append((idx, repr(e)))
        continue

if failures:
    print(f"{len(failures)} of {len(subset)} examples raised an error and were counted as misses; first: {failures[0]}")

if subset:
    print (f"Reranking accuracy: {ct/len(subset)}")
else:
    # Avoid ZeroDivisionError when there is nothing to evaluate.
    print("Reranking accuracy: n/a (no examples to evaluate)")


In [None]:
# Tested with cutoff = 100
# Reranking accuracy: 0.72 (k=10)
# Reranking accuracy: 0.67 (k=5)
# Reranking accuracy: 0.54 (k=3)