In [76]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
from dotenv import load_dotenv
import os
import json
from tqdm import tqdm
from t0_1.llm.client import get_azure_client
from t0_1.llm.reasoner import (
    generate_template,
    SEVERITY_LEVEL,
    CONDITION,
)

# Populate the process environment (python-dotenv) before reading settings.
load_dotenv()
# Root folder of the evaluation data; a missing variable raises KeyError here.
DATA = os.environ["T0_DATA_FOLDER"]
# Given the output of our RAG system: a JSONL file with one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and blank lines are skipped instead of crashing json.loads.
with open(
    f"{DATA}/evaluation/rag/results_2025-04-04_11-08-43.jsonl",
    "r",
    encoding="utf-8",
) as file:
    data = [json.loads(line) for line in file if line.strip()]

In [None]:
# Synchronous Azure client for the "4o" model. In the visible cells it is only
# referenced by the commented-out recommendation-generation call.
client = get_azure_client(
    model="4o",
    use_async=False,
)

In [None]:
# Given a patient with symptoms
query = data[0]

# Information about the patient
general_demographics = query["general_demographics"]
symptoms_description = query["symptoms_description"]

# Information about the chunks retrieved (to be used when reasoning)
k = query["k"]
retrieved_chunks = query["retrieved_documents"]
retrieved_documents_scores = query["retrieved_documents_scores"]
retrieved_documents_sources = query["retrieved_documents_sources"]

# Grouping information about the chunks retrieved by the potential condition.
# The dict is filled in first-seen order of the sources so that the printed key
# order is reproducible across kernel restarts (seeding the keys from a set of
# strings makes the order depend on per-process hash randomisation).
potential_conditions = {}
for i, source in enumerate(retrieved_documents_sources):
    # Chunk i is paired with source i; a shorter chunk list still raises IndexError.
    potential_conditions.setdefault(source, []).append(retrieved_chunks[i])

print(potential_conditions.keys())

# And also information for measuring performance (not to use when querying the model)
query_type = query["query_type"]
severity_level = query["severity_level"]
conditions_title = query["conditions_title"]

dict_keys(['testicle-lumps-and-swellings', 'undescended-testicles', 'how-to-check-your-testicles', 'testicle-pain', 'epididymitis', 'testicular-cancer'])


In [None]:
# Preview the template produced for the first patient record; as the cell's
# last expression, the returned value is displayed as the cell output.
generate_template(
    patient_json=data[0],
)

'\n    A patient has given the following description of their symptoms: "I’ve noticed that one of my testicles seems like it’s not in the right position in the scrotum. It feels like it\'s higher up, and I can\'t always locate it. There’s no pain, but it’s been this way for a while, and I’m worried about long-term issues like fertility or other complications. From what I’ve read, it seems like this might not resolve on its own. Should I get this checked out by a specialist?".\n    This is a summary of their demographics: {\'age\': 34, \'sex\': \'Male\', \'occupation\': \'Electrician\', \'social_support\': \'My wife is here to help me.\', \'medical_history\': \'No known chronic conditions.\'}.\n    \n    Based on this information, our system suggests the patient has one of the following conditions: testicle-lumps-and-swellings, undescended-testicles, how-to-check-your-testicles, testicle-pain, epididymitis, testicular-cancer.\n\n    For each potential condition, this is the most relevan

In [None]:
# Expected severity level and condition for every patient record, in the same
# order as `data`. Built with a comprehension so that no loop temporaries leak
# into the notebook namespace (the previous top-level loop overwrote the
# `severity_level` value extracted from the first query in an earlier cell).
ground_truths = [
    {
        SEVERITY_LEVEL: patient_json[SEVERITY_LEVEL],
        CONDITION: patient_json["conditions_title"],
    }
    for patient_json in tqdm(data)
]

# Model recommendations are not generated in this cell: the call
# `generate_recommendation(client, patient_json)` is intentionally left
# disabled, so `recommendations` stays empty here.
recommendations = []

100%|██████████| 79/79 [00:00<00:00, 136526.58it/s]


In [None]:
# Load previously saved model recommendations (JSONL: one JSON object per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and blank lines are skipped instead of crashing json.loads.
with open(
    f"{DATA}/evaluation/reasoning/o3_combined.jsonl", "r", encoding="utf-8"
) as file:
    o3_recommendations = [json.loads(line) for line in file if line.strip()]

with open(
    f"{DATA}/evaluation/reasoning/4o_2025-04-04_16-58-32.jsonl", "r", encoding="utf-8"
) as file:
    gpt4o_recommendations = [json.loads(line) for line in file if line.strip()]

In [None]:
def eval(recs):
    """Print condition and severity accuracy of ``recs`` against ``ground_truths``.

    Recommendations are paired positionally with the notebook-level
    ``ground_truths`` list using ``zip``, so surplus entries on either side are
    silently ignored. Every recommendation whose condition contains "none"
    (case-insensitive) is counted as unsure and echoed with its index.

    NOTE(review): the name shadows the built-in ``eval`` for the rest of the
    session; it is kept so that existing calls keep working.

    Args:
        recs: Iterable of recommendation dicts holding string values under the
            ``CONDITION`` and ``SEVERITY_LEVEL`` keys.

    Returns:
        None. All results are written to standard output. When no pair is
        evaluated, both accuracies are reported as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    correct = 0
    correct_severity = 0
    unsure = 0
    total = 0
    for i, (gt, rec) in enumerate(zip(ground_truths, recs)):
        predicted = rec[CONDITION]
        # Compute the "model declined to pick a condition" check only once.
        is_unsure = "none" in predicted.lower()
        correct += gt[CONDITION] == predicted
        # Severity labels are compared ignoring surrounding whitespace.
        correct_severity += gt[SEVERITY_LEVEL].strip() == rec[SEVERITY_LEVEL].strip()
        unsure += is_unsure
        if is_unsure:
            print(i, gt[CONDITION], predicted, rec[SEVERITY_LEVEL])
        total += 1

    # Guard the ratios so an empty evaluation set does not crash the cell.
    accuracy = correct / total if total else float("nan")
    severity_accuracy = correct_severity / total if total else float("nan")

    print(f"Correct: {correct}")
    print(f"Correct severity: {correct_severity}")
    print(f"Unsure: {unsure}")
    print(f"Total: {total}")
    print(f"Accuracy: {accuracy}")
    print(f"Severity accuracy: {severity_accuracy}")


# Report the metrics for each model's saved recommendations under a heading.
for model_label, model_recs in (
    ("O3", o3_recommendations),
    ("GPT-4o", gpt4o_recommendations),
):
    print(model_label)
    eval(model_recs)

O3
22 polymyalgia-rheumatica none-of-the-above Urgent
Correct: 44
Correct severity: 43
Unsure: 1
Total: 79
Accuracy: 0.5569620253164557
Severity accuracy: 0.5443037974683544
GPT-4o
14 cgm-and-hcl-for-diabetes none-of-the-above Medium
39 hernia none-of-the-above Medium
Correct: 42
Correct severity: 33
Unsure: 2
Total: 79
Accuracy: 0.5316455696202531
Severity accuracy: 0.4177215189873418


In [None]:
# Condition-only tally for the GPT-4o recommendations (no severity metric).
# Pairs are formed positionally with `ground_truths` via zip.
correct = 0
unsure = 0
total = 0
for i, (gt, rec) in enumerate(zip(ground_truths, gpt4o_recommendations)):
    # Evaluate the "model declined to pick a condition" check only once.
    is_unsure = "none" in rec[CONDITION].lower()
    correct += gt[CONDITION] == rec[CONDITION]
    unsure += is_unsure
    if is_unsure:
        print(i, gt[CONDITION], rec[CONDITION])
    total += 1
print(f"Correct: {correct}")
print(f"Unsure: {unsure}")
print(f"Total: {total}")
# Report nan rather than raising ZeroDivisionError when nothing was evaluated.
print(f"Accuracy: {correct / total if total else float('nan')}")

In [None]:
# Print the condition stored in each entry of `recommendations`, one per line.
# NOTE(review): in the visible cells `recommendations` is initialised to an
# empty list and the code that would fill it is commented out — confirm which
# run produced the saved output of this cell.
for rec in recommendations:
    chosen_condition = rec[CONDITION]
    print(chosen_condition)

undescended-testicles
hypoparathyroidism
insect-bites-and-stings
erythromelalgia
uveitis
craniosynostosis
hyperparathyroidism
bells-palsy
tonsillitis
gastritis
angina
bone-cancer
gastritis
pelvic-inflammatory-disease-pid
diabetic-ketoacidosis
None of the above
cirrhosis
red-eye
fungal-nail-infection
anal-pain
altitude-sickness
auditory-processing-disorder
transient-ischaemic-attack-tia
raynauds
trigeminal-neuralgia
alcohol-poisoning
diphtheria
genetic-and-genomic-testing
teeth-grinding
deep-vein-thrombosis-dvt
sepsis
jet-lag
erythromelalgia
food-poisoning
hand-tendon-repair
herpes-simplex-eye-infections
guillain-barre-syndrome
peripheral-neuropathy
itchy-anus
None of the above
acute-myeloid-leukaemia
retinal-migraine
noonan-syndrome
eye-injuries
genital-herpes
sciatica
mastoiditis
iron-deficiency-anaemia
uveitis
reactive-arthritis
polycystic-ovary-syndrome-pcos
lipoma
constipation
herpetic-whitlow
acute-respiratory-distress-syndrome
keloid-scars
constipation
hiccups
gum-disease
phimosi