In [1]:
import json
# Retriever evaluation output to analyse. Keep `initial_k` in sync with the k noted
# next to whichever file is active; it is only used to label the printed accuracies.
filename = "../data/evaluation/evaluation_vector_store_results_2025-04-26_12-03-28.jsonl"  # k=30
#filename = "../data/evaluation/evaluation_vector_store_results_2025-04-26_12-11-06.jsonl" #k=50
initial_k = 30

# One JSON object per line. Stream the file instead of materialising every line first,
# read it as UTF-8 regardless of the platform default, and skip blank lines (e.g. a
# trailing newline at the end of the file) that would otherwise crash json.loads.
with open(filename, 'r', encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [2]:
# Peek at the first record to see which fields each evaluation entry carries.
data[0].keys()

dict_keys(['general_demographics', 'symptoms_description', 'query_type', 'severity_level', 'conditions_title', 'query_field', 'target_document_field', 'k', 'retrieved_documents', 'retrieved_documents_scores', 'retrieved_documents_sources', 'match'])

In [None]:
# Fail with a clear message instead of a ZeroDivisionError if nothing was loaded.
assert data, "No evaluation records loaded"

# Overall hit rate of the initial retriever: the mean of the per-record `match` values.
retriever_performance = sum(x["match"] for x in data) / len(data)
print(f"Retriever performance: {retriever_performance}")

# Same metric broken down by query type. The per-type value gets its own name so the
# overall `retriever_performance` above is not overwritten by the last loop iteration,
# and the types are visited in sorted order so the printout is stable across runs.
query_types = set(x["query_type"] for x in data)
for query_type in sorted(query_types, key=str):
    query_type_data = [x for x in data if x["query_type"] == query_type]
    query_type_performance = sum(x["match"] for x in query_type_data) / len(query_type_data)
    print(f"Retriever performance for {query_type}: {query_type_performance}")

In [None]:
from dotenv import load_dotenv
from openai import AzureOpenAI
from pathlib import Path
import os

# Environment variables are read from the .env file one directory above the notebook.
dotenv_path = Path("..") / ".env"
load_dotenv(dotenv_path=dotenv_path)

def get_env_var(key: str) -> str:
    """Return the value of the environment variable ``key``.

    Args:
        key: Name of the environment variable to read.

    Returns:
        The variable's value as stored in ``os.environ``.

    Raises:
        KeyError: If the variable is not set. The message names the missing
            variable so the reader knows what to configure.
    """
    try:
        return os.environ[key]
    except KeyError:
        # `from None` drops the original bare KeyError from the traceback; it adds
        # nothing beyond the friendlier message raised here.
        raise KeyError(f"Please set the {key} environment variable.") from None

In [None]:
def make_prompt(symptoms_description, document_titles, document_text, k) -> str:
    """Build the re-ranking prompt sent to the language model.

    Each argument is interpolated verbatim into the template below (non-string
    values are rendered the way an f-string renders them).

    Args:
        symptoms_description: The patient's description of their symptoms.
        document_titles: The shortlisted conditions with their retrieval scores
            and snippet counts.
        document_text: The text of the shortlisted condition documents.
        k: How many conditions the model is asked to select.

    Returns:
        The complete prompt text.
    """
    return f"""
    You are part of a retrieval system for a medical domain.
    Given a description of symptoms provided by a patient, an initial retriever has shortlisted several possible conditions, along with the distance score of the most relevant snippet for that condition (so lower is better), the number of snippets retrieved and the entire content of the associated document.
    
    Here is the patient's symptom description:
    {symptoms_description}
    
    The shortlisted conditions and their retrieval scores and number of snippets are:
    {document_titles}
    
    The corresponding condition descriptions are:
    {document_text}
    
    Your task is to select the {k} most likely conditions based on the symptoms. 
    Please return only the titles of the selected conditions, comma-separated.
    """


In [None]:
# Constants
# Upper bound on generated tokens; passed as `max_completion_tokens` on every chat completion request.
MAX_TOKENS = 2048
# API version string handed to the AzureOpenAI client constructor.
AZURE_OPENAI_API_VERSION = "2024-12-01-preview"


# Clients already built in this session, keyed by (endpoint, api key). The evaluation
# loops call get_response_openai once per record; reusing one client per endpoint
# avoids constructing (and never closing) a fresh client for every prompt.
_AZURE_CLIENTS = {}


def _get_azure_client(endpoint: str, key: str):
    """Return the cached AzureOpenAI client for this endpoint/key, creating it on first use."""
    cache_key = (endpoint, key)
    if cache_key not in _AZURE_CLIENTS:
        _AZURE_CLIENTS[cache_key] = AzureOpenAI(
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=endpoint,
            api_key=key,
        )
    return _AZURE_CLIENTS[cache_key]


def get_response_openai(prompt: str, model: str) -> str:
    """Send ``prompt`` as a single user message to an Azure OpenAI chat model.

    The endpoint is read from the ``AZURE_OPENAI_ENDPOINT_<model>`` environment
    variable and the key from ``AZURE_OPENAI_API_KEY``.

    Args:
        prompt: Text sent as the user message.
        model: Model name; used both as the ``model`` request argument and as the
            suffix of the endpoint environment variable.

    Returns:
        The text of the first choice. An empty string is returned when the API
        reports no message content, so callers always receive a ``str``.

    Raises:
        KeyError: If a required environment variable is not set.
    """
    endpoint = get_env_var(f"AZURE_OPENAI_ENDPOINT_{model}")
    key = get_env_var("AZURE_OPENAI_API_KEY")
    client = _get_azure_client(endpoint, key)

    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        max_completion_tokens=MAX_TOKENS,
        model=model,
    )
    content = response.choices[0].message.content
    # `content` is optional in the API response; honour the declared `str` return type
    # so downstream `.split(",")` calls do not fail on None.
    return content if content is not None else ""

# Smoke test: one real request to the gpt-4o deployment to confirm that the endpoint and
# API key are configured. NOTE(review): this issues a live API call every time the cell runs.
get_response_openai("test", "gpt-4o")

In [None]:
# Lookup from condition title to that condition's `condition_content` text.
conditions = {}
with open("../data/nhs-conditions/v3/conditions.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing json.loads.
            continue
        condition = json.loads(line)
        conditions[condition["condition_title"]] = condition['condition_content']

In [None]:
def _summarise_sources(sources, scores):
    """Collapse per-snippet hits into one entry per source, lowest score first."""
    best_sources = {}
    for source, score in zip(sources, scores):
        entry = best_sources.get(source)
        if entry is None:
            # First snippet seen for this source.
            best_sources[source] = {'lowest_score': score, 'number_of_snippets': 1}
        else:
            entry['lowest_score'] = min(entry['lowest_score'], score)
            entry['number_of_snippets'] += 1
    # Scores are distances, so ascending order puts the closest match first.
    return sorted(best_sources.items(), key=lambda item: item[1]['lowest_score'])


def re_rank_documents(line, k, model="gpt-4o"):
    """Ask a chat model to pick the ``k`` most likely conditions for one record.

    The retrieved snippets are grouped by source so each condition appears once,
    annotated with its lowest score and the number of snippets retrieved for it.
    The text of every such condition is then added to the prompt.

    Args:
        line: Evaluation record; reads ``symptoms_description``,
            ``retrieved_documents_sources`` and ``retrieved_documents_scores``.
        k: Number of conditions the model is asked to select.
        model: Model name passed to ``get_response_openai``. Defaults to the
            previously hard-coded ``"gpt-4o"``.

    Returns:
        The raw model response.

    Raises:
        KeyError: If a retrieved source is not a key of ``conditions``.
    """
    symptoms_description = line['symptoms_description']
    document_titles_unique = _summarise_sources(
        line["retrieved_documents_sources"],
        line["retrieved_documents_scores"],
    )

    # Condition texts in the same order as the summarised titles.
    document_text = "\n\n".join(conditions[title] for title, _ in document_titles_unique)

    prompt = make_prompt(symptoms_description, document_titles_unique, document_text, k=k)
    return get_response_openai(prompt, model)


In [None]:
import tqdm

cutoff = 100  # number of leading evaluation records to re-rank

reranked_results = []

ct = 0   # records whose gold condition is among the re-ranked titles
tot = 0  # records that were re-ranked without raising

k = 5  # number of conditions the re-ranker is asked to keep

for line in tqdm.tqdm(data[:cutoff]):
    try:
        reranked_documents = re_rank_documents(line, k).split(",")
        # Normalise the model's titles: drop quotes, hyphenate spaces, lower-case.
        reranked_documents = [x.strip().replace('"',"").replace("'","").replace(" ","-").lower() for x in reranked_documents]
        gold_document = line['conditions_title']
        # NOTE: the record from `data` is annotated in place.
        line['reranked_documents'] = reranked_documents
        tot += 1
        # The candidates were lower-cased above, so compare against a lower-cased gold
        # title too (the final scoring cell lower-cases the gold title the same way).
        if gold_document.lower() in reranked_documents:
            line['correct_rerank'] = True
            ct += 1
        else:
            line['correct_rerank'] = False
        reranked_results.append(line)
        if tot % 50 == 0:
            print (f"Current accuracy: {ct/tot}, initial_k={initial_k}")
    except Exception as e:
        # Best effort: a failed record is reported and left out of the totals.
        print (f"Error: {e}")
        continue

# Every record may have failed (e.g. missing credentials); avoid dividing by zero.
if tot:
    print (f"Reranking accuracy: {ct/tot}, initial_k={initial_k}")
else:
    print (f"Reranking accuracy: n/a (no record was re-ranked successfully), initial_k={initial_k}")

In [None]:
# Prompt template for the final prediction step. Read it inside a context manager so
# the file handle is closed straight away, and decode it as UTF-8 on every platform.
with open("../templates/rag_evaluation_prompt.txt", encoding="utf-8") as template_file:
    template_eval_prompt = template_file.read()
def make_eval_prompt(symptoms_description, document_titles, demographics, context):
    """Fill the evaluation prompt template for one record.

    Args:
        symptoms_description: Substituted for the template's ``question`` field.
        document_titles: Substituted for the ``sources`` field.
        demographics: Substituted for the ``demographics`` field.
        context: Substituted for the ``context`` field.

    Returns:
        ``template_eval_prompt`` with the four fields filled in.
    """
    fields = {
        "question": symptoms_description,
        "sources": document_titles,
        "demographics": demographics,
        "context": context,
    }
    return template_eval_prompt.format(**fields)

In [None]:
# Ask the o3-mini deployment for a final prediction on every re-ranked record.
final_results = []

for result in tqdm.tqdm(reranked_results):
    document_titles = result['reranked_documents']
    # Only titles that are keys of `conditions` contribute text to the context.
    known_texts = [conditions[title] for title in document_titles if title in conditions]
    prompt = make_eval_prompt(
        result['symptoms_description'],
        document_titles,
        result['general_demographics'],
        "\n\n".join(known_texts),
    )
    # The prediction is stored on the record itself, which is then collected.
    result['prediction'] = get_response_openai(prompt, "o3-mini")
    final_results.append(result)

In [None]:
# Score each prediction against the gold condition and severity. The prediction is
# read as comma-separated text: first field is the condition, second the severity.
correct_condition_predictions = 0
correct_severity_predictions = 0
for result in final_results:
    gold_condition = result['conditions_title'].lower()
    gold_severity = result['severity_level'].lower()
    try:
        # Split once, then normalise both fields the same way (drop quotes, hyphenate
        # spaces, lower-case).
        fields = result['prediction'].split(",")
        predicted_condition = fields[0].strip().replace('"',"").replace("'","").replace(" ","-").lower()
        predicted_severity = fields[1].strip().replace('"',"").replace("'","").replace(" ","-").lower()
    except (AttributeError, IndexError):
        # Prediction is not a string or has no second field: show it and leave the
        # record unscored (no `correct_*` keys are set).
        # NOTE(review): a response containing only a condition is not scored at all.
        print (result['prediction'])
        continue
    result['correct_condition'] = gold_condition == predicted_condition
    result['correct_severity'] = gold_severity == predicted_severity
    correct_condition_predictions += result['correct_condition']
    correct_severity_predictions += result['correct_severity']

# Unscored records still count in the denominator. Guard against an empty result list.
if final_results:
    print (f"Condition prediction accuracy: {correct_condition_predictions/len(final_results)}")
    print (f"Severity prediction accuracy: {correct_severity_predictions/len(final_results)}")
else:
    print ("No final results to score.")

In [None]:
# Prediction accuracy per query type. `query_types` was collected from every loaded
# record, but `final_results` only holds the re-ranked subset, so a type can have no
# records here; such types are reported instead of raising ZeroDivisionError.
# Per-type counters use their own names so the overall counts are not overwritten.
for query_type in sorted(query_types, key=str):
    query_type_data = [x for x in final_results if x["query_type"] == query_type]
    if not query_type_data:
        print (f"No evaluated records for {query_type}")
        continue
    # Records without the `correct_*` keys (unparseable predictions) count as wrong.
    condition_hits = sum(x.get("correct_condition", False) for x in query_type_data)
    severity_hits = sum(x.get("correct_severity", False) for x in query_type_data)
    print (f"Condition prediction accuracy for {query_type}: {condition_hits/len(query_type_data)}")
    print (f"Severity prediction accuracy for {query_type}: {severity_hits/len(query_type_data)}")

In [None]:
# store the final results
import json
# Derive the output name from the input file, so switching `filename` to another run
# does not save its results under the first run's name. For the currently selected
# file this resolves to the same path that was hard-coded before.
output_path = Path(filename).with_name(f"{Path(filename).stem}_reranked_predicted.jsonl")
with open(output_path, 'w', encoding="utf-8") as f:
    for result in final_results:
        f.write(json.dumps(result) + "\n")