In [None]:
import json
# Evaluation run to analyse. The commented-out assignments are other runs that
# can be swapped in.
#filename ="evaluate-rag-deepseek-r1-k10-chroma_2025-04-16_04-27-00.jsonl"
filename ="evaluate-rag-deepseek-r1-k30-chroma_2025-04-15_17-24-42.jsonl"
#filename= "../data/evaluation/evaluation_vector_store_results_2025-04-18_18-59-38.jsonl"

# The file is read as JSON Lines: every non-blank line is parsed as one JSON value.
# An explicit encoding keeps decoding independent of the platform's locale default.
with open(filename, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    # Blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
    data = [json.loads(line) for line in lines if line.strip()]

In [None]:
# Show which fields the first parsed record provides.
data[0].keys()

In [None]:
# Mean of the per-record "retriever_match" values over all loaded records.
# Alternative kept from the original cell, reading the key "match" instead:
#retriever_performance = sum([x["match"] for x in data])/len(data)
retriever_performance = sum(record["retriever_match"] for record in data) / len(data)
print(f"Retriever performance: {retriever_performance}")

In [None]:
from dotenv import load_dotenv
from openai import AzureOpenAI
from pathlib import Path
import os

# Load environment variables from the .env file located one directory above
# the current working directory.
dotenv_path = Path("..") / ".env"
load_dotenv(dotenv_path=dotenv_path)

def get_env_var(key: str) -> str:
    """Return the value of the environment variable ``key``.

    Args:
        key: Name of the environment variable to read.

    Returns:
        The variable's value as stored in ``os.environ``.

    Raises:
        KeyError: If ``key`` is not set. The message names the missing variable.
    """
    try:
        return os.environ[key]
    except KeyError:
        # `from None` suppresses the original lookup error so the traceback
        # shows only the actionable message instead of a chained exception.
        raise KeyError(f"Please set the {key} environment variable.") from None

In [None]:
def make_prompt(symptoms_description, document_titles, document_text, k):
    """Build the re-ranking prompt sent to the language model.

    Args:
        symptoms_description: The patient's symptom description, inserted verbatim.
        document_titles: Titles of the candidate documents; formatted with ``str()``
            by the f-string, so a list appears in its Python list form.
        document_text: Text of the candidate documents, inserted verbatim.
        k: Number of documents the model is asked to return.

    Returns:
        The prompt string, asking for the top ``k`` titles as a comma-separated list.
    """
    # Fixed typo in the instruction text: "patent's" -> "patient's".
    prompt = f"""
    You are part of a retrieval system in a medical domain.
    Given a description of symptoms from a patient, our first retriever has narrowed it down to a few possible documents describing conditions.
    I will provide you the document title and the document text.
    This is the patient's symptoms:
    {symptoms_description}
    The document titles are:
    {document_titles}
    The document text is:
    {document_text}
    Your task is to return the top {k} documents that are most relevant to the symptoms. Please return their titles, comma separated.
    """
    return prompt

In [None]:
import ollama


# Minimal helper for prompting a model through the ollama client.
def get_response_from_ollama(prompt, model="gemma3:1b"):
    """Send ``prompt`` to ``model`` via ``ollama.generate`` and return the result's "response" entry."""
    return ollama.generate(model=model, prompt=prompt)["response"]

In [None]:
# --- Azure OpenAI configuration ---
MAX_TOKENS = 2048  # upper bound on completion tokens per request
AZURE_OPENAI_API_VERSION = "2024-12-01-preview"

# Model name passed along with each chat request.
model = "gpt-4o"

# Endpoint and key are read from the environment; a missing variable raises KeyError.
endpoint = get_env_var("AZURE_OPENAI_ENDPOINT_gpt-4o")
key = get_env_var("AZURE_OPENAI_API_KEY")

# Shared client used by the chat helper.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=key,
    api_version=AZURE_OPENAI_API_VERSION,
)

def get_response(prompt: str, model: str) -> str:
    """Send ``prompt`` as a single user message and return the model's text reply.

    The request uses the module-level ``client``, a fixed system message, and
    ``MAX_TOKENS`` as the completion-token limit.

    Args:
        prompt: Text placed in the user message.
        model: Model name passed to the chat completions call.

    Returns:
        The text content of the first choice in the response.

    Raises:
        ValueError: If the first choice carries no text content (``None``).
    """
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        max_completion_tokens=MAX_TOKENS,
        model=model,
    )
    content = response.choices[0].message.content
    if content is None:
        # Fail here with a clear message rather than returning None from a
        # function annotated `-> str` and breaking later on a string method.
        raise ValueError("The model returned no text content for this prompt.")
    return content

In [None]:
def re_rank_documents(line, k):
    """Ask the chat model for the ``k`` most relevant retrieved documents of one record.

    Args:
        line: Record providing 'symptoms_description', 'retrieved_documents_sources'
            and 'retrieved_documents' (the latter is joined with blank lines).
        k: Number of documents requested in the prompt.

    Returns:
        The raw text reply from ``get_response``.
    """
    prompt = make_prompt(
        line['symptoms_description'],
        line['retrieved_documents_sources'],
        "\n\n".join(line['retrieved_documents']),
        k=k,
    )
    # Alternative backend from the original cell:
    #   get_response_from_ollama(prompt, model="gemma3:4b")
    return get_response(prompt, model)

In [None]:
import tqdm

cutoff = 300  # evaluate only the first `cutoff` records of `data`


def _normalize_title(title):
    """Strip whitespace, drop single/double quotes, turn spaces into hyphens, lowercase."""
    return title.strip().replace('"', "").replace("'", "").replace(" ", "-").lower()


for k in [5]:
    ct = 0  # records whose gold title appears among the re-ranked titles
    tot = 0  # records that were scored
    skipped = 0  # records dropped because re-ranking or parsing raised
    last_error = None
    for line in tqdm.tqdm(data[:cutoff]):
        try:
            # The model is asked for comma-separated titles; normalise each one
            # before comparing it with the record's 'conditions_title'.
            reranked_documents = [
                _normalize_title(x) for x in re_rank_documents(line, k).split(",")
            ]
            gold_document = line['conditions_title']
        except Exception as e:
            # Best-effort evaluation: one failed record must not abort the run,
            # but it is counted so the skip total can be reported with the accuracy.
            skipped += 1
            last_error = repr(e)
            continue
        tot += 1
        if gold_document in reranked_documents:
            ct += 1
        if tot % 10 == 0:
            print (f"Current accuracy: {ct/tot}")
    if skipped:
        print (f"Skipped {skipped} records due to errors (last error: {last_error})")
    if tot:
        print (f"Reranking accuracy: {ct/tot}")
    else:
        # Avoid ZeroDivisionError when every record failed or `data` is empty.
        print ("Reranking accuracy: n/a (no records were scored)")


In [None]:
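# Results summary.
# NOTE(review): the "Summary k=..." value presumably is the number of documents
# retrieved per query (cf. "k10"/"k30" in the result filenames), and the k on each
# result line is the number of titles requested from the re-ranker - confirm.

# Summary k=50
# k=3 accuracy=0.57
# k=5 accuracy=0.67

# Summary k=30
# k=3 accuracy=0.63
# k=5 accuracy=0.70
# k=10 accuracy=0.73

# Summary k=10
# k=3 accuracy=0.56
# k=5 accuracy=0.62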

```
10%|█         | 100/1000 [03:05<29:09,  1.94s/it]
Current accuracy: 0.6
 20%|██        | 201/1000 [06:53<32:01,  2.40s/it]
Current accuracy: 0.61
 30%|███       | 301/1000 [12:40<8:14:54, 42.48s/it]
Current accuracy: 0.63
 40%|████      | 401/1000 [16:08<18:12,  1.82s/it]  
Current accuracy: 0.625
 50%|█████     | 502/1000 [19:25<15:19,  1.85s/it]
Current accuracy: 0.628
 60%|██████    | 602/1000 [22:33<12:14,  1.85s/it]
Current accuracy: 0.6333333333333333
 70%|███████   | 702/1000 [25:48<09:48,  1.97s/it]
Current accuracy: 0.62
 80%|████████  | 802/1000 [29:09<05:37,  1.71s/it]
Current accuracy: 0.63125
 90%|█████████ | 902/1000 [32:29<02:42,  1.65s/it]
Current accuracy: 0.6355555555555555
100%|██████████| 1000/1000 [35:33<00:00,  2.13s/it]
Reranking accuracy: 0.633901705115346
 10%|█         | 100/1000 [03:33<26:17,  1.75s/it] 
Current accuracy: 0.72
 20%|██        | 200/1000 [07:00<30:06,  2.26s/it]
Current accuracy: 0.695
 30%|███       | 300/1000 [10:31<25:16,  2.17s/it]
Current accuracy: 0.71
 40%|████      | 400/1000 [14:13<19:55,  1.99s/it]
Current accuracy: 0.6925
 50%|█████     | 502/1000 [17:47<21:12,  2.55s/it]
Current accuracy: 0.698
 60%|██████    | 602/1000 [21:18<12:17,  1.85s/it]
Current accuracy: 0.6983333333333334
 70%|███████   | 702/1000 [25:02<09:14,  1.86s/it]
Current accuracy: 0.6928571428571428
 80%|████████  | 803/1000 [28:38<06:37,  2.02s/it]
Current accuracy: 0.69375
 90%|█████████ | 903/1000 [32:15<03:40,  2.27s/it]
Current accuracy: 0.6955555555555556
100%|██████████| 1000/1000 [35:45<00:00,  2.15s/it]
Reranking accuracy: 0.6960882647943831
 10%|█         | 100/1000 [04:22<37:27,  2.50s/it]
Current accuracy: 0.75
 20%|██        | 200/1000 [08:17<29:03,  2.18s/it]
Current accuracy: 0.72
 30%|███       | 301/1000 [12:12<35:48,  3.07s/it]
Current accuracy: 0.73
 40%|████      | 402/1000 [16:17<22:20,  2.24s/it]
Current accuracy: 0.715
 50%|█████     | 502/1000 [20:16<19:14,  2.32s/it]
Current accuracy: 0.728
 60%|██████    | 603/1000 [24:13<14:10,  2.14s/it]
Current accuracy: 0.7333333333333333
 70%|███████   | 703/1000 [28:22<11:50,  2.39s/it]
Current accuracy: 0.7242857142857143
 80%|████████  | 803/1000 [32:20<08:36,  2.62s/it]
Current accuracy: 0.72375
 90%|█████████ | 904/1000 [36:11<03:18,  2.06s/it]
Current accuracy: 0.7288888888888889
100%|██████████| 1000/1000 [40:15<00:00,  2.42s/it]
Reranking accuracy: 0.7279116465863453
```

```
10%|█         | 100/1000 [02:54<21:30,  1.43s/it]
Current accuracy: 0.63
 20%|██        | 201/1000 [06:04<31:29,  2.37s/it]
Current accuracy: 0.59
 30%|███       | 302/1000 [09:11<34:29,  2.96s/it]
Current accuracy: 0.61
 40%|████      | 402/1000 [12:01<15:38,  1.57s/it]
Current accuracy: 0.575
 50%|█████     | 502/1000 [15:17<18:03,  2.18s/it]
Current accuracy: 0.578
 60%|██████    | 602/1000 [18:28<13:03,  1.97s/it]
Current accuracy: 0.5716666666666667
 70%|███████   | 702/1000 [21:33<08:11,  1.65s/it]
Current accuracy: 0.5628571428571428
 80%|████████  | 803/1000 [24:42<07:20,  2.24s/it]
Current accuracy: 0.555
 90%|█████████ | 904/1000 [27:47<03:20,  2.09s/it]
Current accuracy: 0.5577777777777778
100%|██████████| 1000/1000 [30:35<00:00,  1.84s/it]
Reranking accuracy: 0.5592369477911646
 10%|█         | 100/1000 [03:18<33:35,  2.24s/it]
Current accuracy: 0.69
 20%|██        | 200/1000 [06:30<29:01,  2.18s/it]
Current accuracy: 0.64
 30%|███       | 300/1000 [09:48<20:31,  1.76s/it]
Current accuracy: 0.6566666666666666
 40%|████      | 401/1000 [13:16<18:52,  1.89s/it]
Current accuracy: 0.6375
 50%|█████     | 501/1000 [16:31<14:45,  1.77s/it]
Current accuracy: 0.642
 60%|██████    | 601/1000 [19:44<16:19,  2.45s/it]
Current accuracy: 0.6283333333333333
 70%|███████   | 701/1000 [23:04<09:17,  1.86s/it]
Current accuracy: 0.6257142857142857
 80%|████████  | 801/1000 [26:28<05:14,  1.58s/it]
Current accuracy: 0.62375
 90%|█████████ | 901/1000 [29:42<02:43,  1.65s/it]
Current accuracy: 0.6222222222222222
100%|██████████| 1000/1000 [33:05<00:00,  1.99s/it]
Reranking accuracy: 0.6206206206206206
```

```
10%|█         | 101/1000 [02:46<21:33,  1.44s/it]
Current accuracy: 0.51
 20%|██        | 201/1000 [05:27<23:44,  1.78s/it]
Current accuracy: 0.52
 30%|███       | 301/1000 [08:13<17:09,  1.47s/it]
Current accuracy: 0.5566666666666666
 40%|████      | 401/1000 [11:02<20:52,  2.09s/it]
Current accuracy: 0.555
 50%|█████     | 502/1000 [13:56<12:51,  1.55s/it]
Current accuracy: 0.554
 60%|██████    | 603/1000 [16:48<10:36,  1.60s/it]
Current accuracy: 0.555
 70%|███████   | 703/1000 [19:30<08:14,  1.66s/it]
Current accuracy: 0.5528571428571428
 80%|████████  | 803/1000 [22:14<05:49,  1.77s/it]
Current accuracy: 0.56125
 90%|█████████ | 903/1000 [25:05<02:45,  1.70s/it]
Current accuracy: 0.5688888888888889
100%|██████████| 1000/1000 [27:58<00:00,  1.68s/it]
Reranking accuracy: 0.5692771084337349
 10%|█         | 100/1000 [03:07<23:05,  1.54s/it]
Current accuracy: 0.73
 20%|██        | 200/1000 [06:18<21:53,  1.64s/it]
Current accuracy: 0.69
 30%|███       | 300/1000 [09:15<20:33,  1.76s/it]
Current accuracy: 0.7066666666666667
 40%|████      | 401/1000 [12:27<15:43,  1.58s/it]
Current accuracy: 0.68
 50%|█████     | 503/1000 [15:28<13:31,  1.63s/it]
Current accuracy: 0.674
 60%|██████    | 603/1000 [18:31<13:27,  2.03s/it]
Current accuracy: 0.6766666666666666
 70%|███████   | 703/1000 [21:26<08:07,  1.64s/it]
Current accuracy: 0.6685714285714286
 80%|████████  | 803/1000 [24:17<07:18,  2.22s/it]
Current accuracy: 0.6675
 90%|█████████ | 903/1000 [27:10<02:23,  1.48s/it]
Current accuracy: 0.6711111111111111
100%|██████████| 1000/1000 [29:44<00:00,  1.78s/it]
Reranking accuracy: 0.6700100300902708
```