In [1]:
import json
# Retriever evaluation results to analyse: one JSON record per line.
# Swap the active line to switch between the k=30 and k=50 retriever runs.
filename = "../data/evaluation/evaluation_vector_store_results_2025-04-26_12-03-28.jsonl"  # k=30
#filename = "../data/evaluation/evaluation_vector_store_results_2025-04-26_12-11-06.jsonl" #k=50

# Explicit UTF-8 keeps decoding independent of the platform's default encoding.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# rather than being handed to json.loads, which would raise on them.
with open(filename, 'r', encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [2]:
# Inspect which fields the first evaluation record carries.
data[0].keys()

dict_keys(['general_demographics', 'symptoms_description', 'query_type', 'severity_level', 'conditions_title', 'query_field', 'target_document_field', 'k', 'retrieved_documents', 'retrieved_documents_scores', 'retrieved_documents_sources', 'match'])

In [3]:
# Overall retriever score: mean of the per-record `match` flag.
retriever_performance = sum(x["match"] for x in data) / len(data)
print(f"Retriever performance: {retriever_performance}")

# Per-query-type breakdown.
# - Iterate in sorted order so the report is stable across kernel restarts
#   (iteration order of a set of strings is not).
# - Use a separate name inside the loop so `retriever_performance` still holds
#   the overall score after this cell has run.
query_types = set(x["query_type"] for x in data)
for query_type in sorted(query_types):
    query_type_data = [x for x in data if x["query_type"] == query_type]
    query_type_performance = sum(x["match"] for x in query_type_data) / len(query_type_data)
    print(f"Retriever performance for {query_type}: {query_type_performance}")

Retriever performance: 0.88
Retriever performance for downplay: 0.8425925925925926
Retriever performance for basic: 0.8571428571428571
Retriever performance for hypochondriac: 0.9350282485875706


In [4]:
from dotenv import load_dotenv
from openai import AzureOpenAI
from pathlib import Path
import os

# Location of the .env file, one directory above the notebook's working directory.
dotenv_path = Path("..") / ".env"
# Read the variables declared in that file into the process environment.
load_dotenv(dotenv_path=dotenv_path)

def get_env_var(key: str) -> str:
    """Return the value of the environment variable named ``key``.

    Args:
        key: name of the environment variable to read.

    Returns:
        The variable's value as found in ``os.environ``.

    Raises:
        KeyError: if the variable is not set. The message names the missing
            variable so the user knows what to configure.
    """
    try:
        return os.environ[key]
    except KeyError:
        # `from None` drops the implicit chaining to the bare lookup KeyError,
        # so the traceback shows only the actionable message below.
        raise KeyError(f"Please set the {key} environment variable.") from None

In [5]:
def make_prompt(symptoms_description, document_titles, document_text, k):
    """Assemble the re-ranking prompt for one patient query.

    Args:
        symptoms_description: text inserted as the patient's symptom description.
        document_titles: shortlisted conditions with their retrieval scores;
            inserted using its default string formatting.
        document_text: condition descriptions, inserted as given.
        k: number of conditions the model is asked to select.

    Returns:
        The prompt string. It starts with an empty line, every following line
        is prefixed with four spaces, and it ends with a line holding only
        that four-space prefix.
    """
    indent = "    "
    # Template lines without their indent. Empty entries become indent-only
    # lines, and the trailing space after "symptoms." is part of the text.
    body = [
        "You are part of a retrieval system for a medical domain.",
        "Given a description of symptoms provided by a patient, an initial retriever has shortlisted several possible conditions, along with their distance scores (so lower is better) and the associated document content.",
        "",
        "Here is the patient's symptom description:",
        f"{symptoms_description}",
        "",
        "The shortlisted conditions and their retrieval scores are:",
        f"{document_titles}",
        "",
        "The corresponding condition descriptions are:",
        f"{document_text}",
        "",
        f"Your task is to select the {k} most likely conditions based on the symptoms. ",
        "Please return only the titles of the selected conditions, comma-separated.",
    ]
    pieces = [""]
    pieces.extend(indent + text for text in body)
    pieces.append(indent)
    return "\n".join(pieces)


In [6]:
import ollama


def get_response_from_ollama(prompt, model="gemma3:1b"):
    """Prompt a model through ``ollama.generate`` and return its text.

    Args:
        prompt: prompt text forwarded to the model.
        model: Ollama model tag to use.

    Returns:
        The ``"response"`` field of the generation result.
    """
    return ollama.generate(model=model, prompt=prompt)["response"]

In [7]:
# Constants
MAX_TOKENS = 2048  # passed as max_completion_tokens on every chat completion
AZURE_OPENAI_API_VERSION = "2024-12-01-preview"

# Model name passed to the chat completions call.
model = "gpt-4o"

# Connection settings are read from the environment.
endpoint = get_env_var("AZURE_OPENAI_ENDPOINT_gpt-4o")
key = get_env_var("AZURE_OPENAI_API_KEY")

# Single shared client used by get_response.
client = AzureOpenAI(
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=endpoint,
    api_key=key,
)

def get_response(prompt: str, model: str) -> str:
    """Send ``prompt`` to the Azure OpenAI chat endpoint and return the reply.

    The request has a fixed system message followed by ``prompt`` as the only
    user message, and is capped at ``MAX_TOKENS`` completion tokens.

    Args:
        prompt: content of the user message.
        model: model name passed to the chat completions call.

    Returns:
        The message content of the first returned choice.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        messages=[system_turn, user_turn],
        max_completion_tokens=MAX_TOKENS,
        model=model,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
# Map each condition title to its content, read from a JSONL file
# (one JSON record per line).
conditions = {}
# Explicit UTF-8 keeps decoding independent of the platform's default encoding.
with open("../data/nhs-conditions/v3/conditions.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline); json.loads would raise on them.
        if not line.strip():
            continue
        condition = json.loads(line)
        conditions[condition["condition_title"]] = condition['condition_content']

In [9]:
def re_rank_documents(line, k):
    """Ask the chat model to pick the ``k`` most likely conditions for one record.

    Args:
        line: evaluation record; reads ``symptoms_description``,
            ``retrieved_documents_sources`` and ``retrieved_documents_scores``.
        k: number of conditions the model is asked to select.

    Returns:
        The raw model response as returned by ``get_response``.

    Raises:
        KeyError: if a retrieved source has no entry in ``conditions``.
    """
    symptoms = line['symptoms_description']

    # A source can be retrieved more than once; keep its lowest score
    # (the prompt tells the model that lower distance is better).
    lowest_score = {}
    for src, dist in zip(line["retrieved_documents_sources"], line["retrieved_documents_scores"]):
        lowest_score[src] = min(lowest_score[src], dist) if src in lowest_score else dist

    # Unique (source, score) pairs ordered from lowest to highest score.
    ranked = sorted(lowest_score.items(), key=lambda pair: pair[1])

    # NOTE(review): the descriptions are concatenated without their titles, so
    # the model can only link a description to a shortlisted title if the
    # content itself names the condition. Confirm that it does.
    descriptions = [conditions[src] for src, _ in ranked]
    joined_descriptions = "\n\n".join(descriptions)

    prompt = make_prompt(symptoms, ranked, joined_descriptions, k=k)
    # Alternative backend: get_response_from_ollama(prompt, model="gemma3:4b")
    return get_response(prompt, model)

In [None]:
import tqdm

cutoff = 1000  # evaluate at most this many records per value of k

# k -> accuracy over the records that were re-ranked successfully.
reranking_accuracy = {}

for k in [5, 3]:
    ct = 0      # records whose gold condition appears in the re-ranked list
    tot = 0     # records re-ranked and scored successfully
    failed = 0  # records skipped because re-ranking or parsing raised
    for line in tqdm.tqdm(data[:cutoff]):
        try:
            reranked_documents = re_rank_documents(line, k).split(",")
            # Normalise the model's titles to lower-case, hyphen-separated form
            # before comparing them with the gold title.
            reranked_documents = [x.strip().replace('"',"").replace("'","").replace(" ","-").lower() for x in reranked_documents]
            gold_document = line['conditions_title']
            tot += 1
            if gold_document in reranked_documents:
                ct += 1
            if tot % 50 == 0:
                print (f"k={k}, Current accuracy: {ct/tot}")
        except Exception as e:
            # Still best-effort: a failing record is skipped, not fatal. It is
            # counted, and the first few errors are shown, so records missing
            # from the denominator are no longer invisible.
            failed += 1
            if failed <= 5:
                print (f"k={k}, skipped record ({type(e).__name__}): {e}")
            continue
    if tot:
        reranking_accuracy[k] = ct / tot
        print (f"k={k}, Reranking accuracy: {ct/tot}")
    else:
        # Avoid ZeroDivisionError when no record could be scored.
        print (f"k={k}, Reranking accuracy: n/a (no record could be scored)")
    print (f"k={k}, scored {tot} records, skipped {failed} because of errors")

  5%|▌         | 50/1000 [03:37<1:29:27,  5.65s/it]

k=5, Current accuracy: 0.86


 10%|█         | 101/1000 [07:06<56:15,  3.76s/it] 

k=5, Current accuracy: 0.84


 15%|█▌        | 152/1000 [10:25<49:41,  3.52s/it]  

k=5, Current accuracy: 0.8333333333333334


 20%|██        | 204/1000 [13:50<44:17,  3.34s/it]  

k=5, Current accuracy: 0.815


 25%|██▌       | 254/1000 [17:23<42:04,  3.38s/it]  

k=5, Current accuracy: 0.792


 30%|███       | 304/1000 [21:00<1:03:16,  5.45s/it]

k=5, Current accuracy: 0.8033333333333333


 36%|███▌      | 355/1000 [24:14<39:37,  3.69s/it]  

k=5, Current accuracy: 0.8028571428571428


 40%|████      | 405/1000 [27:34<32:35,  3.29s/it]  

k=5, Current accuracy: 0.7925


 46%|████▌     | 455/1000 [31:04<50:33,  5.57s/it]

k=5, Current accuracy: 0.7888888888888889


 50%|█████     | 505/1000 [34:13<18:08,  2.20s/it]  

k=5, Current accuracy: 0.794


 56%|█████▌    | 555/1000 [37:32<22:20,  3.01s/it]

k=5, Current accuracy: 0.7872727272727272


 61%|██████    | 607/1000 [40:24<17:08,  2.62s/it]

k=5, Current accuracy: 0.7883333333333333


 66%|██████▌   | 657/1000 [43:35<21:32,  3.77s/it]

k=5, Current accuracy: 0.7861538461538462


 71%|███████   | 708/1000 [47:08<16:24,  3.37s/it]

k=5, Current accuracy: 0.7785714285714286


 76%|███████▌  | 758/1000 [50:24<16:44,  4.15s/it]

k=5, Current accuracy: 0.7786666666666666


 81%|████████  | 808/1000 [53:55<13:19,  4.17s/it]

k=5, Current accuracy: 0.7825


 86%|████████▌ | 858/1000 [57:29<11:46,  4.97s/it]

k=5, Current accuracy: 0.7847058823529411


 91%|█████████ | 908/1000 [1:00:35<04:50,  3.16s/it]

k=5, Current accuracy: 0.7833333333333333


 96%|█████████▌| 958/1000 [1:04:09<02:50,  4.05s/it]

k=5, Current accuracy: 0.7863157894736842


100%|██████████| 1000/1000 [1:06:42<00:00,  4.00s/it]


k=5, Reranking accuracy: 0.782258064516129


  5%|▌         | 51/1000 [03:53<1:12:12,  4.57s/it]

k=3, Current accuracy: 0.8


 10%|█         | 102/1000 [06:58<44:31,  2.97s/it] 

k=3, Current accuracy: 0.81


 15%|█▌        | 152/1000 [10:09<43:53,  3.11s/it]  

k=3, Current accuracy: 0.78


 20%|██        | 203/1000 [13:45<1:07:03,  5.05s/it]

k=3, Current accuracy: 0.775


 25%|██▌       | 253/1000 [17:08<36:04,  2.90s/it]  

k=3, Current accuracy: 0.736


 30%|███       | 304/1000 [20:47<1:01:40,  5.32s/it]

k=3, Current accuracy: 0.7433333333333333


 35%|███▌      | 354/1000 [24:02<30:20,  2.82s/it]  

k=3, Current accuracy: 0.7428571428571429


 40%|████      | 404/1000 [27:07<31:18,  3.15s/it]  

k=3, Current accuracy: 0.73


 45%|████▌     | 454/1000 [30:14<38:42,  4.25s/it]

k=3, Current accuracy: 0.7333333333333333


 47%|████▋     | 468/1000 [31:25<40:24,  4.56s/it]

In [None]:
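# Summary: retriever k=50
# reranker k=3 accuracy=
# reranker k=5 accuracy=

# Summary: retriever k=30
# reranker k=3 accuracy= (run interrupted at 47%; about 0.73 over the first 450 scored records)
# reranker k=5 accuracy=0.78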
 
