In [1]:
import os
import json

from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np

med_qa_data = pd.read_csv('data/cleaned_MedQA_complete_graded_data_encoded.csv')

In [2]:
med_qa_data.head()

Unnamed: 0,question,answer,new_IR,IR Grade,new_AR,AR Grade,new_BR,BR Grade,new_DR,DR Grade,...,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,Thrombotic microangiopathy,N,Thrombotic microangiopathy.,N,Thrombotic microangiopathy.,N,Thrombotic microangiopathy,N,...,Y,Atheroembolic renal disease,"This patient has decreased urinary output, mal...",Y,Cholesterol embolism,The patient's recent history of cardiac cathet...,Y,Contrast-induced nephropathy,This patient's symptoms and clinical history s...,N
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,Alcohol intoxication,N,Alcohol intoxication,N,Alcohol intoxication,N,Alcohol intoxication,N,...,N,Wernicke's encephalopathy,This patient's symptoms of altered mental stat...,N,Wernicke's encephalopathy,"The patient's symptoms (somnolence, slurred sp...",N,Wernicke's encephalopathy,The prior probability of Wernicke's encephalop...,N
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,Valley Fever,N,Coccidioidomycosis,N,Histoplasmosis,Y,Fungal infection,N,...,Y,Histoplasmosis,"This patient presents with fever, cough, chest...",Y,Histoplasmosis,The patient's history of recent hiking in Miss...,Y,Histoplasma capsulatum,The prior probability of Histoplasmosis (a fun...,Y
3,A 67-year-old man who was diagnosed with arthr...,Psoriatic arthritis,Psoriatic arthritis,Y,Psoriatic arthritis,Y,Psoriatic arthritis,Y,Psoriatic arthritis,Y,...,Y,Psoriatic Arthritis,The patient's clinical presentation suggests p...,Y,Psoriatic Arthritis,The patient presents with several features sug...,Y,Psoriatic Arthritis,The prior probability of psoriatic arthritis i...,Y
4,A one-day-old male is evaluated in the hospita...,Duodenal atresia,Gastroschisis,N,Intestinal obstruction,N,Gastroschisis,N,Duodenal atresia,Y,...,Y,Meconium ileus,This neonate's clinical presentation of biliou...,N,Intestinal atresia,"The symptoms presented by this neonate, such a...",Y,Duodenal atresia in Down Syndrome,"In Bayesian reasoning, we begin with a prior p...",Y


In [3]:
med_qa_data.columns

Index(['question', 'answer', 'new_IR', 'IR Grade', 'new_AR', 'AR Grade',
       'new_BR', 'BR Grade', 'new_DR', 'DR Grade', 'CoT', 'CoT.1', 'GPT-4->',
       'CoT.2', 'Rationale', 'CoT Grade', 'DR', 'Rationale.1', 'DR Grade.1',
       'IR', 'Rationale.2', 'IR Grade.1', 'AR', 'Rationale.3', 'AR Grade.1',
       'BR', 'Rationale.4', 'BR Grade.1'],
      dtype='object')

# separate out the gpt 3.5 responses from the gpt4 responses - these are all listed after the 'GPT-4->' column

In [4]:

gpt4_cols = ['CoT.2', 'Rationale', 'CoT Grade', 'DR', 'Rationale.1', 'DR Grade.1',
       'IR', 'Rationale.2', 'IR Grade.1', 'AR', 'Rationale.3', 'AR Grade.1',
       'BR', 'Rationale.4', 'BR Grade.1']
gpt_4_responses = med_qa_data[['question','answer', *gpt4_cols]]

# the last row is a total row, so leave it out
gpt_4_responses = gpt_4_responses.iloc[0:len(gpt_4_responses) - 1]

In [5]:
# dropping the na answer rows from the gpt_4_responses

gpt_4_responses[np.any(gpt_4_responses[gpt4_cols].isna(), axis = 1)]

Unnamed: 0,question,answer,CoT.2,Rationale,CoT Grade,DR,Rationale.1,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
50,A 36-year-old woman comes to the physician bec...,Papillary carcinoma of the thyroid,Thyroid cancer,This patient presents with a painless lump on ...,Y,Thyroid cancer,The differential for this patient includes: be...,Y,Papillary thyroid carcinoma,This patient presents with a painless lump on ...,Y,Papillary thyroid carcinoma,"The presence of a painless, firm, and irregula...",Y,,The prior probability of thyroid cancer in a y...,
51,A 45-year-old man presents to the physician be...,Pseudomonas keratitis,Insufficient information,The question is missing the photograph of the ...,N,,,,,,,,,,,,
97,A 75-year-old man comes to the physician for t...,Aortic valve stenosis,,,,,,,,,,,,,,,
114,A patient presents to the emergency department...,Medication reaction,The patient presented to the emergency departm...,Ischemic stroke,,The differential diagnosis for this patient co...,Cerebrovascular Accident (Stroke),N,This patient presents with severe headache tha...,Medication-induced ischemic stroke,Y,This patient's presentation is consistent with...,Ischemic stroke,N,The prior probability of a stroke in a relativ...,Stroke,N
179,A 50-year-old man comes to the physician for a...,Epidermoid cyst,"Apologies, but there's a missing component in ...",,N,"In the absence of a provided photograph, we'll...",Sebaceous cyst,N,Without an actual photograph or detailed descr...,Lipoma,N,Since no photograph or further description of ...,Lipoma,N,The prior probability of metastatic disease in...,Metastatic colon cancer,N
189,A 63-year-old man comes to the physician for t...,Malignant melanoma,,,,,,,,,,,,,,,
192,A 7-month-old boy is brought to the pediatrici...,Aldolase B,,,,,,,,,,,,,,,
196,A 77-year-old woman is brought to her primary ...,Lewy body dementia,,,,,,,,,,,,,,,
208,A 24-year-old man is brought to the physician ...,Osteoclastoma,,,,,,,,,,,,,,,
279,A 32-year-old man comes to the physician becau...,Chlamydia trachomatis,,,,,,,,,,,,,,,


In [6]:
gpt_4_responses = gpt_4_responses[~np.any(gpt_4_responses[gpt4_cols].isna(), axis = 1)]

In [7]:
gpt_4_responses.shape

(494, 17)

In [8]:
gpt_35_responses = med_qa_data[['question','answer','new_IR', 'IR Grade', 'new_AR', 'AR Grade',
       'new_BR', 'BR Grade', 'new_DR', 'DR Grade', 'CoT', 'CoT.1']]

gpt_35_responses = gpt_35_responses.iloc[0:len(gpt_35_responses) - 1]

In [9]:
gpt_4_responses.head()

Unnamed: 0,question,answer,CoT.2,Rationale,CoT Grade,DR,Rationale.1,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,Cholesterol embolization syndrome,This patient's presentation of decreased urina...,Y,Cholesterol embolization syndrome,The differential for this patient includes acu...,Y,Atheroembolic renal disease,"This patient has decreased urinary output, mal...",Y,Cholesterol embolism,The patient's recent history of cardiac cathet...,Y,Contrast-induced nephropathy,This patient's symptoms and clinical history s...,N
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,Alcoholic neuropathy,This patient presents with altered mental stat...,N,Cerebellar disease,The differential for this patient includes alc...,N,Wernicke's encephalopathy,This patient's symptoms of altered mental stat...,N,Wernicke's encephalopathy,"The patient's symptoms (somnolence, slurred sp...",N,Wernicke's encephalopathy,The prior probability of Wernicke's encephalop...,N
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,Histoplasmosis,This patient has recently been hiking in Missi...,Y,Histoplasmosis,The differential diagnosis in this case includ...,Y,Histoplasmosis,"This patient presents with fever, cough, chest...",Y,Histoplasmosis,The patient's history of recent hiking in Miss...,Y,Histoplasma capsulatum,The prior probability of Histoplasmosis (a fun...,Y
3,A 67-year-old man who was diagnosed with arthr...,Psoriatic arthritis,Psoriatic Arthritis,This patient has a long history of arthritis a...,Y,Psoriatic Arthritis,The differential diagnosis for this patient in...,Y,Psoriatic Arthritis,The patient's clinical presentation suggests p...,Y,Psoriatic Arthritis,The patient presents with several features sug...,Y,Psoriatic Arthritis,The prior probability of psoriatic arthritis i...,Y
4,A one-day-old male is evaluated in the hospita...,Duodenal atresia,Duodenal atresia,The baby in question is displaying signs of in...,Y,Duodenal atresia,"This neonate presents with bilious vomiting, a...",Y,Meconium ileus,This neonate's clinical presentation of biliou...,N,Intestinal atresia,"The symptoms presented by this neonate, such a...",Y,Duodenal atresia in Down Syndrome,"In Bayesian reasoning, we begin with a prior p...",Y


## Below is the template that the researchers used to get their responses

Got this from the GPT3.5_API.ipynb notebook

In [10]:
qa_template = (
    "Diagnose the patients condition using the given passage with a short factoid answer.\n"
    "Question: {question}\n"
    "Rationale: a step-by-step deduction that identifies the correct response. {prompt}"
               )

In [11]:
from lib.llms import PromptTemplate

prompt_template = PromptTemplate(qa_template)

### example showing how to use the prompt template

In [12]:
print(prompt_template.populate(question = "this is my special question", prompt = "this is a COT prompt"))

Diagnose the patients condition using the given passage with a short factoid answer.
Question: this is my special question
Rationale: a step-by-step deduction that identifies the correct response. this is a COT prompt


In [13]:
# intuitive
ir_prompt = "Use symptom, signs and laboratory disease associations to step by step deduce the correct response"

# differential
dr_prompt = "Use step by step deduction to create a differential diagnosis and then use step by step deduction to determine the correct response."

# analytic
ar_prompt = "Use analytic reasoning to deduce the physiologic or biochemical pathophysiology of the patient and step by step identify the correct response."

# bayesian
br_prompt = "Use step by step Bayesian Inference to create a prior probability that is updated with new information in the history to produce a posterior probability and determine the final diagnosis."

## populate the prompt template with a real question and prompt from the dataset

In [14]:
to_send = prompt_template.populate( question = gpt_4_responses['question'][0], prompt = dr_prompt)
print(to_send)

Diagnose the patients condition using the given passage with a short factoid answer.
Question: Two weeks after undergoing an emergency cardiac catherization with stenting for unstable angina pectoris, a 61-year-old man has decreased urinary output and malaise. He has type 2 diabetes mellitus and osteoarthritis of the hips. Prior to admission, his medications were insulin and naproxen. He was also started on aspirin, clopidogrel, and metoprolol after the coronary intervention. His temperature is 38°C (100.4°F), pulse is 93/min, and blood pressure is 125/85 mm Hg. Examination shows mottled, reticulated purplish discoloration of the feet. Laboratory studies show:
Hemoglobin count 14 g/dL
Leukocyte count 16,400/mm3
Segmented neutrophils 56%
Eosinophils 11%
Lymphocytes 31%
Monocytes 2%
Platelet count 260,000/mm3
Erythrocyte sedimentation rate 68 mm/h
Serum
Urea nitrogen 25 mg/dL
Creatinine 4.2 mg/dL
Renal biopsy shows intravascular spindle-shaped vacuoles. What is the most likely cause of t

Lets see what GPT 4o gives back when we ask this question

In [15]:
from lib.llms.chats import ChatOpenAI

llm_4o = ChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4
)

In [16]:
res = llm_4o(to_send)

In [17]:
print(res.text)

The patient presents with several key symptoms and laboratory findings that must be considered in the context of his recent medical history. Let's go through the step-by-step deduction process:

1. **Recent Medical History**: 
   - The patient had an emergency cardiac catheterization with stenting for unstable angina two weeks ago.
   - Post-procedure, he started on aspirin, clopidogrel, and metoprolol.

2. **Current Symptoms**:
   - Decreased urinary output and malaise.
   - Mottled, reticulated purplish discoloration of the feet.
   - Fever (temperature of 38°C or 100.4°F).

3. **Laboratory Findings**:
   - Elevated leukocyte count (16,400/mm³) with eosinophilia (11% eosinophils).
   - An elevated erythrocyte sedimentation rate (68 mm/h).
   - Elevated serum creatinine (4.2 mg/dL) indicating renal impairment.
   - Renal biopsy showing intravascular spindle-shaped vacuoles.

4. **Analysis and Deduction**:
   - The combination of decreased urinary output and elevated creatinine suggest

# Before I go any further, lets create a rudimentary eval harness


#### This eval tests for substring inclusion. If the "golden answer" is included in the llm response, the answer is counted as correct

In [18]:
def eval_responses(response_df: pd.DataFrame, true_col:str, llm_response_arr: list):
    '''
    Given a dataframe of responses, a true column label that is in the dataframe and an array of
    responses from the llm
    test whether answer from the true column is contained within the corresponding llm_res element

    ex: question: What color is the sky? answer = blue, llm_res = "the sky is blue" would evaluate to True
    '''
    no_nulls = response_df[~response_df[true_col].isna()].reset_index()
    true_answers = no_nulls[true_col]
    
    correct = pd.Series([a.lower() in b.lower() for a, b in zip(true_answers, llm_response_arr)])

    print(f'# correct = {correct.sum()} / {len(correct)}')

    return correct.sum() / len(correct)


def get_incorrect(response_df: pd.DataFrame, true_col:str, llm_res: list):

    no_nulls = response_df[~response_df[true_col].isna()].reset_index()
    true_answers = no_nulls[true_col]
    
    correct = pd.Series([a.lower() in b.lower() for a, b in zip(true_answers, llm_res)])

    incorrect_df = no_nulls[~correct]

    incorrect_llm_responses = [llm_res[i] for i in incorrect_df.index]

    incorrect_df['llm_responses'] = incorrect_llm_responses

    return incorrect_df[['question', true_col, 'llm_responses']]

## lets test this out on the study responses

### We can see that that answers that were marked as correct don't *exactly* match the "golden answer" (adhd = "attention deficit hyperactivity disorder")

I don't expect this to replicate exactly because the answers from the study were graded by real human doctors

In [19]:
study_answer_cols = ['CoT.2', 'DR',
       'IR', 'AR','BR']

study_responses = [gpt_4_responses[col].values for col in study_answer_cols]

for i, col in enumerate(study_answer_cols):
       percent = eval_responses(gpt_4_responses, 'answer', study_responses[i])
       print(f'reasoning type {col} = {percent * 100}%')

# correct = 177 / 494
reasoning type CoT.2 = 35.82995951417004%
# correct = 201 / 494
reasoning type DR = 40.688259109311744%
# correct = 192 / 494
reasoning type IR = 38.8663967611336%
# correct = 191 / 494
reasoning type AR = 38.663967611336034%
# correct = 164 / 494
reasoning type BR = 33.198380566801625%


## As you can see from above, DR is still the best, and the results here don't match the study results. We could get a lot fancier with the evaluation, but we are really interested for this session in prompt engineering



#### Now I'm going to define a function that will take in an array of the MedQA questions and then hit an LLM with them and get the answers

The template that I'm going to use will have the keywords "context" and "question". The "context" variable is where we will put the real medical question (sorry confusing) and the "question" variable is where we will put our prompt

In [20]:
from json import JSONDecodeError
from collections import defaultdict
from lib.reasoning.pipelines.answer_with_context import AnswerWithContextPipeline
from lib.base import AIMessage, Document, HumanMessage, SystemMessage
from lib.llms.chats.base import ChatLLM
from itertools import islice
import time
import asyncio
from typing import Union
import numpy as np



def batch_tasks(iterable, batch_size):
    iterator = iter(iterable)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            break
        yield batch


def get_populated_prompts(questions: list[str], template: PromptTemplate):
    return [
        template.populate(question = question)
        for question in questions
    ]


def generic_get_populated_prompts(questions: list[dict], template: PromptTemplate):
    '''
    Use a list of dictionaries where the keys are the arguments that get passed to the template
    The keys of the dictionary should match the arguments that the template makes
    '''
    return [
        template.populate(**question)
        for question in questions
    ]

async def async_answer_from_question(llm: ChatLLM, prompt, examples: list[tuple] = [], system_prompt = ''):
    messages = []
    if system_prompt:
        messages.append(SystemMessage(content=system_prompt))

    for human, ai in examples:
        messages.append(HumanMessage(content=human))
        messages.append(AIMessage(content=ai))

    messages.append(HumanMessage(content=prompt))

    try:
        output = await llm.ainvoke(messages)

        return Document(text = output.text )
    except Exception as e:
        print("hit error async invoking in answer with context: ", e)
        raise e
    


async def async_answer(answer_with_llm: ChatLLM, questions: list[dict], template: PromptTemplate, examples: list[tuple[str]] = [], system_prompt = '', batch_size = 100):
    n_data = len(questions)
    
    populated_prompts = generic_get_populated_prompts(questions, template)
    tasks = []
    for i, pop_prompt in enumerate(populated_prompts):
        print(f'====== \n processing {i + 1}/{n_data} files \n')

        tasks.append(async_answer_from_question(answer_with_llm, pop_prompt, examples= examples, system_prompt=system_prompt))

    extracted_answers = []
    needs_reprocessing = []
    for batch_number, batch in enumerate(batch_tasks(tasks, batch_size)):
        print(f'processing batch {batch_number} of size {batch_size}')
        settled_tasks = await asyncio.gather(*batch)
        for i,initial_answer in enumerate(settled_tasks):
            idx = batch_number*batch_size + i
            if isinstance(initial_answer, dict):
                needs_reprocessing.append(initial_answer)
                extracted_answers.append(initial_answer)
            else:
                try:
                    parsed_initial_answer = json.loads(initial_answer.content)

                    extracted_answers.append(parsed_initial_answer)

                except JSONDecodeError as e:
                    needs_reprocessing.append({"answer": "Failed to parse", "idx": idx, 'reason': e})
                    extracted_answers.append(initial_answer.content)
        # avoid rate limit
        time.sleep(2)
        
    return extracted_answers, needs_reprocessing




In [29]:

qa_template = PromptTemplate(
    # "Diagnose the patients condition using the given passage with a short factoid answer.\n"
    "Question: {question}\n"
    "Rationale: a step-by-step deduction that identifies the correct response. {prompt}" # remember this is where the prompt goes
    )

# All of the questions that we want responded too
questions = [{'question': question, 'prompt': dr_prompt } for question in gpt_4_responses['question'].values]

llm_4o = ChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4
)



In [30]:
dr_answers, need_reprocess = await async_answer(llm_4o, questions, qa_template, batch_size = 100)

 processing 1/494 files 

 processing 2/494 files 

 processing 3/494 files 

 processing 4/494 files 

 processing 5/494 files 

 processing 6/494 files 

 processing 7/494 files 

 processing 8/494 files 

 processing 9/494 files 

 processing 10/494 files 

 processing 11/494 files 

 processing 12/494 files 

 processing 13/494 files 

 processing 14/494 files 

 processing 15/494 files 

 processing 16/494 files 

 processing 17/494 files 

 processing 18/494 files 

 processing 19/494 files 

 processing 20/494 files 

 processing 21/494 files 

 processing 22/494 files 

 processing 23/494 files 

 processing 24/494 files 

 processing 25/494 files 

 processing 26/494 files 

 processing 27/494 files 

 processing 28/494 files 

 processing 29/494 files 

 processing 30/494 files 

 processing 31/494 files 

 processing 32/494 files 

 processing 33/494 files 

 processing 34/494 files 

 processing 35/494 files 

 processing 36/494 files 

 processing 37/494 files 

 processin

In [31]:
print(dr_answers[50])

To determine the most likely diagnosis for this patient, we need to go through a step-by-step analysis of the clinical presentation, imaging findings, and biopsy results. 

1. **Clinical Presentation:**
   - Chronic cough, aching joints, malaise: These symptoms suggest a possible infectious or inflammatory process.
   - Recent trip to the desert near Phoenix, Arizona: This geographical clue directs us toward diseases endemic to specific regions.

2. **Symptoms Review:**
   - Fever (38.6°C), elevated heart rate (tachycardia), and mild respiratory distress (respiration rate 20/min) indicate a systemic inflammatory response, possibly from an infection.

3. **Chest Examination and Imaging:**
   - Focused chest exam shows mild fremitus and dullness on percussion right side, indicating localized lung pathology.
   - Chest X-ray and CT reveal right-sided lung consolidation, an opacity, pleural effusion, and hilar adenopathy. These are common signs seen in pneumonia, but given the context, cou

In [27]:
gpt_4_responses['answer'].iloc[50]

'Valley fever'

As you can see above the model respondes with **Coccidioidomycosis** but in the final step 7 of it's chain of thought says
> Spherules with endospores: This is a specific feature of Coccidioidomycosis (also known as "Valley Fever"), an endemic fungal infection found in the Southwestern United States, including Arizona.


In [32]:
eval_responses(gpt_4_responses, 'answer', dr_answers)

# correct = 252 / 494


0.5101214574898786

## So already after only upgrading the model to 4o istead of 4, there's about a 10% increase in accuracy


# assign a role and responsibility to the prompt

In [38]:


role_response_template = PromptTemplate("""
You are an expert medical doctor who carefully thinks through patients symptoms and diagnoses them.
Question: {question}
Rationale: a step-by-step deduction that identifies the correct response. {prompt}
""")

In [39]:
formatted_questions = [{'question': question, 'prompt': dr_prompt } for question in gpt_4_responses['question'].values]

populated_formatted_questions = generic_get_populated_prompts(formatted_questions, role_response_template)

In [40]:
print(populated_formatted_questions[0])


You are an expert medical doctor who carefully thinks through patients symptoms and diagnoses them.
Question: Two weeks after undergoing an emergency cardiac catherization with stenting for unstable angina pectoris, a 61-year-old man has decreased urinary output and malaise. He has type 2 diabetes mellitus and osteoarthritis of the hips. Prior to admission, his medications were insulin and naproxen. He was also started on aspirin, clopidogrel, and metoprolol after the coronary intervention. His temperature is 38°C (100.4°F), pulse is 93/min, and blood pressure is 125/85 mm Hg. Examination shows mottled, reticulated purplish discoloration of the feet. Laboratory studies show:
Hemoglobin count 14 g/dL
Leukocyte count 16,400/mm3
Segmented neutrophils 56%
Eosinophils 11%
Lymphocytes 31%
Monocytes 2%
Platelet count 260,000/mm3
Erythrocyte sedimentation rate 68 mm/h
Serum
Urea nitrogen 25 mg/dL
Creatinine 4.2 mg/dL
Renal biopsy shows intravascular spindle-shaped vacuoles. What is the most l

In [41]:
role_response_dr_answers, need_reprocess = await async_answer(llm_4o, questions, role_response_template, batch_size = 100)

 processing 1/494 files 

 processing 2/494 files 

 processing 3/494 files 

 processing 4/494 files 

 processing 5/494 files 

 processing 6/494 files 

 processing 7/494 files 

 processing 8/494 files 

 processing 9/494 files 

 processing 10/494 files 

 processing 11/494 files 

 processing 12/494 files 

 processing 13/494 files 

 processing 14/494 files 

 processing 15/494 files 

 processing 16/494 files 

 processing 17/494 files 

 processing 18/494 files 

 processing 19/494 files 

 processing 20/494 files 

 processing 21/494 files 

 processing 22/494 files 

 processing 23/494 files 

 processing 24/494 files 

 processing 25/494 files 

 processing 26/494 files 

 processing 27/494 files 

 processing 28/494 files 

 processing 29/494 files 

 processing 30/494 files 

 processing 31/494 files 

 processing 32/494 files 

 processing 33/494 files 

 processing 34/494 files 

 processing 35/494 files 

 processing 36/494 files 

 processing 37/494 files 

 processin

In [42]:
eval_responses(gpt_4_responses, 'answer', role_response_dr_answers)

# correct = 248 / 494


0.5020242914979757

Hmm so performance went down on adding the role and responsibility - I want to look at some of the "incorrect" answers to see why

In [43]:
incorrect = get_incorrect(gpt_4_responses, 'answer', role_response_dr_answers)
incorrect

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect_df['llm_responses'] = incorrect_llm_responses


Unnamed: 0,question,answer,llm_responses
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,To diagnose the cause of this patient's sympto...
5,A 55-year-old man comes to the physician becau...,Femoropopliteal artery stenosis,To determine the most likely diagnosis for thi...
6,A 56-year-old man with a history of hypertensi...,Aldosterone excess,To identify the underlying etiology of this pa...
9,A 3-week-old male newborn is brought to the ho...,Transplacental passage of TSH receptor antibodies,"To approach this case, let's analyze the situa..."
11,A 59-year-old man is evaluated for progressive...,Deposition of calcium pyrophosphate CPP crys...,To arrive at the correct diagnosis for this 59...
...,...,...,...
485,A 27-year-old G2P0A2 woman comes to the office...,Theca lutein cysts,To determine the most likely cause of the pati...
486,A 78-year-old man is brought to the emergency ...,Occlusion of the left middle cerebral artery,"To approach this case, we must carefully evalu..."
490,A previously healthy 21-year-old man comes to ...,Chronic cerebral hypoxia,"To diagnose this patient's condition, we can a..."
492,A 9-year-old boy is brought to a pediatric psy...,Attention deficit hyperactivity disorder,To arrive at the most likely diagnosis for thi...


In [46]:
print(role_response_dr_answers[9])

To approach this case, let's analyze the situation step by step:

### Step 1: Patient Demographics and Birth History
- **Age**: The patient is a 3-week-old male newborn.
- **Gestational Age and Birth Weight**: He was born at 38 weeks' gestation weighing 3005 g (6 lb, 10 oz), which is appropriate for gestational age.

### Step 2: Current Clinical Findings
- **Current Weight**: He now weighs 2835 g (6 lb, 4 oz), indicating significant weight loss.
- **Vital Signs**: Elevated temperature (38.9°C), tachycardia (pulse 176/min), and tachypnea (respirations 42/min).
- **Physical Exam**: Irritability, diaphoretic (sweating), poor subcutaneous fat, and neck swelling at the midline.

### Step 3: Maternal Medical History
- **Mother's Condition**: The mother has a history of Graves' disease and underwent near-total thyroidectomy during pregnancy, followed by L-thyroxine therapy.

### Step 4: Differential Diagnosis Considerations
Given these findings, differential diagnoses might include:
- **Conge

### so you can see above that really the correct answer is in there, it's just not an *exact* substring - could this be solved with some ....... **Structured output**????

In [47]:
from lib.llms.chats.openai import StructuredOutputChatOpenAI
from pydantic import BaseModel


class DiagnosesModel(BaseModel):
    chain_of_thought: str
    possible_diagnoses: list[str]
    final_diagnoses: str

structured_llm = StructuredOutputChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o-mini',
    temperature= 0.8,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4,
    response_schema=DiagnosesModel
)

In [48]:
structured_role_response_dr_answers, need_reprocess = await async_answer(structured_llm, questions, role_response_template, batch_size = 100)

 processing 1/494 files 

 processing 2/494 files 

 processing 3/494 files 

 processing 4/494 files 

 processing 5/494 files 

 processing 6/494 files 

 processing 7/494 files 

 processing 8/494 files 

 processing 9/494 files 

 processing 10/494 files 

 processing 11/494 files 

 processing 12/494 files 

 processing 13/494 files 

 processing 14/494 files 

 processing 15/494 files 

 processing 16/494 files 

 processing 17/494 files 

 processing 18/494 files 

 processing 19/494 files 

 processing 20/494 files 

 processing 21/494 files 

 processing 22/494 files 

 processing 23/494 files 

 processing 24/494 files 

 processing 25/494 files 

 processing 26/494 files 

 processing 27/494 files 

 processing 28/494 files 

 processing 29/494 files 

 processing 30/494 files 

 processing 31/494 files 

 processing 32/494 files 

 processing 33/494 files 

 processing 34/494 files 

 processing 35/494 files 

 processing 36/494 files 

 processing 37/494 files 

 processin

In [49]:
print(structured_role_response_dr_answers[0]['final_diagnoses'])

Atheroembolism due to recent cardiac catheterization.


In [50]:
from pprint import pprint
pprint(structured_role_response_dr_answers[0])

{'chain_of_thought': '1. **Patient Background**: The patient is a 61-year-old '
                     'man with a history of unstable angina, type 2 diabetes, '
                     'and osteoarthritis. He recently underwent cardiac '
                     'catheterization with stenting and was started on several '
                     'medications, including aspirin, clopidogrel, metoprolol, '
                     'and continued other medications (insulin, naproxen). \n'
                     '\n'
                     '2. **Current Symptoms**: He presents with decreased '
                     'urinary output, malaise, fever (38°C), and purplish '
                     'discoloration of the feet. These symptoms are concerning '
                     'for a possible adverse reaction related to the recent '
                     'cardiac procedure, particularly renal issues given the '
                     'elevated creatinine levels (4.2 mg/dL) and urea nitrogen '
                     '(25 mg

In [51]:
eval_responses(gpt_4_responses, 'answer', [ob['final_diagnoses'] for ob in structured_role_response_dr_answers])

# correct = 156 / 494


0.3157894736842105

#### lol no that made it worse.....


I'm going to start going off script here and see what I can do with some explicit diagnostic steps

In [52]:
diagnostic_template = PromptTemplate("""
You are an experienced medical diagnostician AI. Your task is to accurately diagnose a medical condition or sickness based on patient symptoms and laboratory data. You will use deductive reasoning from a differential diagnosis to rule out unlikely diagnoses and arrive at the correct one.

First, review the patient case:
<patient_case>
{question}
</patient_case>

To diagnose the patient's condition, follow these steps:

1. Create a list of potential diagnoses (differential diagnosis) based on the presented case.

2. For each potential diagnosis in your list:
   a. Evaluate how well it matches the patient's symptoms and lab results.
   b. Consider any inconsistencies or contradictions between the diagnosis and the available information.
   c. Assess the likelihood of the diagnosis based on prevalence and patient demographics (if available).

3. Rule out unlikely diagnoses by explaining why they don't fit the patient's presentation.

4. Narrow down your list to the most probable diagnosis or diagnoses.

5. If multiple diagnoses remain plausible, rank them in order of likelihood.

6. Provide a final diagnosis (or top differential diagnoses if a single diagnosis cannot be determined with high confidence).

7. Explain your reasoning for the final diagnosis, including:
   a. How it accounts for the patient's symptoms and lab results.
   b. Why it is more likely than the other potential diagnoses you considered.
   c. Any additional tests or information that could help confirm the diagnosis.

Present your analysis and final diagnosis in the following format:

differential_diagnosis: List your initial differential diagnosis here, with brief explanations for each potential condition.

ruled_out_diagnoses: Explain which diagnoses you ruled out and why, based on the available information.

final_diagnosis: State your final diagnosis (or top differential diagnoses if a single diagnosis cannot be determined with high confidence).

explanation: Provide a detailed explanation of your reasoning for the final diagnosis, addressing points 7a, 7b, and 7c from the instructions above.

Remember to use clear, logical reasoning throughout your analysis. Base your conclusions solely on the provided patient symptoms and lab data, and do not introduce any additional information or assumptions unless explicitly stated in the patient's presentation.
""")


class StructuredDiagnosesModel(BaseModel):
    differential_diagnosis: list[str]
    ruled_out_diagnoses: str
    final_diagnoses: str
    explanation: str
   

structured_llm = StructuredOutputChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o-mini',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4,
    response_schema=StructuredDiagnosesModel
)

In [53]:
diagnostic_answers, need_reprocess = await async_answer(structured_llm, questions, diagnostic_template, batch_size = 100)



 processing 1/494 files 

 processing 2/494 files 

 processing 3/494 files 

 processing 4/494 files 

 processing 5/494 files 

 processing 6/494 files 

 processing 7/494 files 

 processing 8/494 files 

 processing 9/494 files 

 processing 10/494 files 

 processing 11/494 files 

 processing 12/494 files 

 processing 13/494 files 

 processing 14/494 files 

 processing 15/494 files 

 processing 16/494 files 

 processing 17/494 files 

 processing 18/494 files 

 processing 19/494 files 

 processing 20/494 files 

 processing 21/494 files 

 processing 22/494 files 

 processing 23/494 files 

 processing 24/494 files 

 processing 25/494 files 

 processing 26/494 files 

 processing 27/494 files 

 processing 28/494 files 

 processing 29/494 files 

 processing 30/494 files 

 processing 31/494 files 

 processing 32/494 files 

 processing 33/494 files 

 processing 34/494 files 

 processing 35/494 files 

 processing 36/494 files 

 processing 37/494 files 

 processin

In [54]:
eval_responses(gpt_4_responses, 'answer', [ob['final_diagnoses'] for ob in diagnostic_answers])

# correct = 124 / 494


0.25101214574898784

In [55]:
gpt_4_responses.head()

Unnamed: 0,question,answer,CoT.2,Rationale,CoT Grade,DR,Rationale.1,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,Cholesterol embolization syndrome,This patient's presentation of decreased urina...,Y,Cholesterol embolization syndrome,The differential for this patient includes acu...,Y,Atheroembolic renal disease,"This patient has decreased urinary output, mal...",Y,Cholesterol embolism,The patient's recent history of cardiac cathet...,Y,Contrast-induced nephropathy,This patient's symptoms and clinical history s...,N
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,Alcoholic neuropathy,This patient presents with altered mental stat...,N,Cerebellar disease,The differential for this patient includes alc...,N,Wernicke's encephalopathy,This patient's symptoms of altered mental stat...,N,Wernicke's encephalopathy,"The patient's symptoms (somnolence, slurred sp...",N,Wernicke's encephalopathy,The prior probability of Wernicke's encephalop...,N
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,Histoplasmosis,This patient has recently been hiking in Missi...,Y,Histoplasmosis,The differential diagnosis in this case includ...,Y,Histoplasmosis,"This patient presents with fever, cough, chest...",Y,Histoplasmosis,The patient's history of recent hiking in Miss...,Y,Histoplasma capsulatum,The prior probability of Histoplasmosis (a fun...,Y
3,A 67-year-old man who was diagnosed with arthr...,Psoriatic arthritis,Psoriatic Arthritis,This patient has a long history of arthritis a...,Y,Psoriatic Arthritis,The differential diagnosis for this patient in...,Y,Psoriatic Arthritis,The patient's clinical presentation suggests p...,Y,Psoriatic Arthritis,The patient presents with several features sug...,Y,Psoriatic Arthritis,The prior probability of psoriatic arthritis i...,Y
4,A one-day-old male is evaluated in the hospita...,Duodenal atresia,Duodenal atresia,The baby in question is displaying signs of in...,Y,Duodenal atresia,"This neonate presents with bilious vomiting, a...",Y,Meconium ileus,This neonate's clinical presentation of biliou...,N,Intestinal atresia,"The symptoms presented by this neonate, such a...",Y,Duodenal atresia in Down Syndrome,"In Bayesian reasoning, we begin with a prior p...",Y


In [56]:
diagnostic_answers[0]

{'differential_diagnosis': ['Acute Kidney Injury (AKI) due to Contrast-Induced Nephropathy',
  'Acute Interstitial Nephritis (AIN)',
  'Vasculitis (e.g., ANCA-associated vasculitis)',
  'Thrombotic Microangiopathy (e.g., Thrombotic Thrombocytopenic Purpura)',
  'Sepsis with Acute Kidney Injury'],
 'ruled_out_diagnoses': '1. Acute Kidney Injury due to Contrast-Induced Nephropathy: While the patient underwent cardiac catheterization, the timeline is two weeks later and the renal biopsy results are more indicative of a different process than straightforward contrast nephropathy. 2. Acute Interstitial Nephritis: The presence of eosinophilia suggests it could be a drug reaction, but the specific findings on the biopsy were not consistent. 3. Sepsis: While fever and leukocytosis were present, the predominant findings on the renal biopsy and the severity of the creatinine elevation are more characteristic of vascular pathology than infection.',
 'final_diagnoses': 'Thrombotic Microangiopathy 

In [57]:
diagnostic_answers[3]

{'differential_diagnosis': ['Rheumatoid Arthritis (RA) - Symmetrical joint involvement, morning stiffness, and possible extra-articular manifestations; inflammation is common in RA cases.',
  'Psoriatic Arthritis (PsA) - Associated with skin conditions like psoriasis (dandruff may indicate seborrheic dermatitis), asymmetric joint involvement, nail changes such as pitting, and enthesitis could align with recent symptoms.',
  'Gout - Usually causes acute monoarthritis and is less likely based on the chronicity and bilateral knee involvement; typically presents as sudden pain, redness, and swelling in one joint.',
  'Osteoarthritis - Although joint pain and swelling are common, the age and history of sudden bilateral issues and significant back pain suggest inflammatory processes.',
  'Ankylosing Spondylitis - Common in older males, characterized by back pain and morning stiffness, also involves peripheral joints.'],
 'ruled_out_diagnoses': 'Gout was ruled out due to the chronic nature of

In [58]:
class EvalAnswer(BaseModel):
    reasoning: str
    answer: bool


eval_llm = StructuredOutputChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o-mini',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4,
    response_schema=EvalAnswer
)

In [59]:
eval_prompt_template = PromptTemplate("""
You are an expert medical grader who specializes in grading whether a diagnoses is correct or not.

If the correct diagnoses is {correct_answer}, could a diagnoses of {predicted_answer} also be considered correct?

provide a brief explanation and an answer that is TRUE or FALSE. TRUE means that the other diagnoses would be considered correct and functionally the same as the "correct" diagnoses.
""")


another_template = PromptTemplate("""
You are a medical AI assistant tasked with determining whether two given medical diagnoses are essentially the same or different. Your goal is to compare the diagnoses and decide if they refer to the same medical condition, even if they might use slightly different terminology.

Here are the two diagnoses you need to compare:

diagnoses1 = {correct_answer}
                                  
diagnoses2 = {predicted_answer}

Compare these two diagnoses carefully. Consider the following aspects:
1. The main condition or disease mentioned
2. Any specific subtypes or variations
3. The affected body part or system
4. Any mentioned symptoms or characteristics
5. The severity or stage of the condition (if specified)

Provide your reasoning and decision in the following format:
1. First, explain your thought process and analysis inside "reasoning" field.
2. Then, give your final decision as either True or False inside the "answer". Answer True if the diagnoses are the SAME or False if they are DIFFERENT.

Guidelines for comparison:
- Diagnoses should be considered the same if they refer to the same underlying condition, even if the exact wording is different.
- Minor differences in terminology (e.g., "heart attack" vs. "myocardial infarction") should not necessarily be considered different diagnoses.
- However, specific subtypes or variations of a condition should be considered different (e.g., "Type 1 diabetes" vs. "Type 2 diabetes").
- If one diagnosis is more specific than the other but still refers to the same general condition, they should be considered the same. (eg "Type 1 diabetes" and "diabetes")

Here are some examples:
- "Acute myocardial infarction" and "Heart attack" would be considered the SAME.
- "Bronchitis" and "Pneumonia" would be considered DIFFERENT.
- "Migraine with aura" and "Migraine without aura" would be considered DIFFERENT.
- "Osteoarthritis of the knee" and "Degenerative joint disease of the knee" would be considered the SAME.

Remember to consider all aspects of the diagnoses before making your decision. If you're unsure, explain your uncertainty in the reasoning section.

Begin your analysis now.
""")

async def llm_eval_responses(response_df: pd.DataFrame, eval_llm: ChatLLM, true_col:str, pred_col: str):
    '''
    Given a dataframe of responses, a true column label that is in the dataframe and an array of
    responses from the llm

    ex: question: What color is the sky? answer = blue, llm_res = "the sky is blue" would evaluate to True
    '''
    no_nulls = response_df[~response_df[true_col].isna()].reset_index()

    filled = [eval_prompt_template.populate(correct_answer = correct_answer, predicted_answer = predicted_answer) for (correct_answer, predicted_answer) in no_nulls[[true_col, pred_col]].values ]
    
    is_correct_answers = await asyncio.gather(*[ async_answer_from_question(eval_llm, p) for p in filled])

    answer_is_correct = []
    for is_correct in is_correct_answers:
        parsed = json.loads(is_correct.content)
        answer_is_correct.append(parsed['answer'])

    correct = pd.Series(answer_is_correct)
    print(f'# correct = {correct.sum()} / {len(correct)}')

    return correct.sum() / len(correct), correct





In [60]:
r, correct = await llm_eval_responses(gpt_4_responses, eval_llm, 'answer', 'DR')

# correct = 98 / 494


UFF dah This is not trivial

In [120]:
correct

0       True
1      False
2       True
3       True
4       True
       ...  
489    False
490     True
491    False
492    False
493    False
Length: 494, dtype: bool