In [1]:
import os
import json

from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np

med_qa_data = pd.read_csv('data/cleaned_MedQA_complete_graded_data_encoded.csv')

In [2]:
med_qa_data.head()

Unnamed: 0,question,answer,new_IR,IR Grade,new_AR,AR Grade,new_BR,BR Grade,new_DR,DR Grade,...,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,Thrombotic microangiopathy,N,Thrombotic microangiopathy.,N,Thrombotic microangiopathy.,N,Thrombotic microangiopathy,N,...,Y,Atheroembolic renal disease,"This patient has decreased urinary output, mal...",Y,Cholesterol embolism,The patient's recent history of cardiac cathet...,Y,Contrast-induced nephropathy,This patient's symptoms and clinical history s...,N
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,Alcohol intoxication,N,Alcohol intoxication,N,Alcohol intoxication,N,Alcohol intoxication,N,...,N,Wernicke's encephalopathy,This patient's symptoms of altered mental stat...,N,Wernicke's encephalopathy,"The patient's symptoms (somnolence, slurred sp...",N,Wernicke's encephalopathy,The prior probability of Wernicke's encephalop...,N
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,Valley Fever,N,Coccidioidomycosis,N,Histoplasmosis,Y,Fungal infection,N,...,Y,Histoplasmosis,"This patient presents with fever, cough, chest...",Y,Histoplasmosis,The patient's history of recent hiking in Miss...,Y,Histoplasma capsulatum,The prior probability of Histoplasmosis (a fun...,Y
3,A 67-year-old man who was diagnosed with arthr...,Psoriatic arthritis,Psoriatic arthritis,Y,Psoriatic arthritis,Y,Psoriatic arthritis,Y,Psoriatic arthritis,Y,...,Y,Psoriatic Arthritis,The patient's clinical presentation suggests p...,Y,Psoriatic Arthritis,The patient presents with several features sug...,Y,Psoriatic Arthritis,The prior probability of psoriatic arthritis i...,Y
4,A one-day-old male is evaluated in the hospita...,Duodenal atresia,Gastroschisis,N,Intestinal obstruction,N,Gastroschisis,N,Duodenal atresia,Y,...,Y,Meconium ileus,This neonate's clinical presentation of biliou...,N,Intestinal atresia,"The symptoms presented by this neonate, such a...",Y,Duodenal atresia in Down Syndrome,"In Bayesian reasoning, we begin with a prior p...",Y


In [3]:
med_qa_data.columns

Index(['question', 'answer', 'new_IR', 'IR Grade', 'new_AR', 'AR Grade',
       'new_BR', 'BR Grade', 'new_DR', 'DR Grade', 'CoT', 'CoT.1', 'GPT-4->',
       'CoT.2', 'Rationale', 'CoT Grade', 'DR', 'Rationale.1', 'DR Grade.1',
       'IR', 'Rationale.2', 'IR Grade.1', 'AR', 'Rationale.3', 'AR Grade.1',
       'BR', 'Rationale.4', 'BR Grade.1'],
      dtype='object')

In [4]:
# separate out the gpt 3.5 responses from the gpt4 responses - these are all listed after the 'GPT-4->' column
gpt4_cols = ['CoT.2', 'Rationale', 'CoT Grade', 'DR', 'Rationale.1', 'DR Grade.1',
       'IR', 'Rationale.2', 'IR Grade.1', 'AR', 'Rationale.3', 'AR Grade.1',
       'BR', 'Rationale.4', 'BR Grade.1']
gpt_4_responses = med_qa_data[['question','answer', *gpt4_cols]]

# the last row is a total row, so leave it out
gpt_4_responses = gpt_4_responses.iloc[0:len(gpt_4_responses) - 1]

In [5]:
# dropping the na answer rows from the gpt_4_responses

gpt_4_responses[np.any(gpt_4_responses[gpt4_cols].isna(), axis = 1)]

Unnamed: 0,question,answer,CoT.2,Rationale,CoT Grade,DR,Rationale.1,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
50,A 36-year-old woman comes to the physician bec...,Papillary carcinoma of the thyroid,Thyroid cancer,This patient presents with a painless lump on ...,Y,Thyroid cancer,The differential for this patient includes: be...,Y,Papillary thyroid carcinoma,This patient presents with a painless lump on ...,Y,Papillary thyroid carcinoma,"The presence of a painless, firm, and irregula...",Y,,The prior probability of thyroid cancer in a y...,
51,A 45-year-old man presents to the physician be...,Pseudomonas keratitis,Insufficient information,The question is missing the photograph of the ...,N,,,,,,,,,,,,
97,A 75-year-old man comes to the physician for t...,Aortic valve stenosis,,,,,,,,,,,,,,,
114,A patient presents to the emergency department...,Medication reaction,The patient presented to the emergency departm...,Ischemic stroke,,The differential diagnosis for this patient co...,Cerebrovascular Accident (Stroke),N,This patient presents with severe headache tha...,Medication-induced ischemic stroke,Y,This patient's presentation is consistent with...,Ischemic stroke,N,The prior probability of a stroke in a relativ...,Stroke,N
179,A 50-year-old man comes to the physician for a...,Epidermoid cyst,"Apologies, but there's a missing component in ...",,N,"In the absence of a provided photograph, we'll...",Sebaceous cyst,N,Without an actual photograph or detailed descr...,Lipoma,N,Since no photograph or further description of ...,Lipoma,N,The prior probability of metastatic disease in...,Metastatic colon cancer,N
189,A 63-year-old man comes to the physician for t...,Malignant melanoma,,,,,,,,,,,,,,,
192,A 7-month-old boy is brought to the pediatrici...,Aldolase B,,,,,,,,,,,,,,,
196,A 77-year-old woman is brought to her primary ...,Lewy body dementia,,,,,,,,,,,,,,,
208,A 24-year-old man is brought to the physician ...,Osteoclastoma,,,,,,,,,,,,,,,
279,A 32-year-old man comes to the physician becau...,Chlamydia trachomatis,,,,,,,,,,,,,,,


In [6]:
gpt_4_responses = gpt_4_responses[~np.any(gpt_4_responses[gpt4_cols].isna(), axis = 1)]

In [7]:
gpt_4_responses.shape

(494, 17)

In [8]:
gpt_35_responses = med_qa_data[['question','answer','new_IR', 'IR Grade', 'new_AR', 'AR Grade',
       'new_BR', 'BR Grade', 'new_DR', 'DR Grade', 'CoT', 'CoT.1']]

gpt_35_responses = gpt_35_responses.iloc[0:len(gpt_35_responses) - 1]

In [9]:
gpt_4_responses.head()

Unnamed: 0,question,answer,CoT.2,Rationale,CoT Grade,DR,Rationale.1,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,Cholesterol embolization syndrome,This patient's presentation of decreased urina...,Y,Cholesterol embolization syndrome,The differential for this patient includes acu...,Y,Atheroembolic renal disease,"This patient has decreased urinary output, mal...",Y,Cholesterol embolism,The patient's recent history of cardiac cathet...,Y,Contrast-induced nephropathy,This patient's symptoms and clinical history s...,N
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,Alcoholic neuropathy,This patient presents with altered mental stat...,N,Cerebellar disease,The differential for this patient includes alc...,N,Wernicke's encephalopathy,This patient's symptoms of altered mental stat...,N,Wernicke's encephalopathy,"The patient's symptoms (somnolence, slurred sp...",N,Wernicke's encephalopathy,The prior probability of Wernicke's encephalop...,N
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,Histoplasmosis,This patient has recently been hiking in Missi...,Y,Histoplasmosis,The differential diagnosis in this case includ...,Y,Histoplasmosis,"This patient presents with fever, cough, chest...",Y,Histoplasmosis,The patient's history of recent hiking in Miss...,Y,Histoplasma capsulatum,The prior probability of Histoplasmosis (a fun...,Y
3,A 67-year-old man who was diagnosed with arthr...,Psoriatic arthritis,Psoriatic Arthritis,This patient has a long history of arthritis a...,Y,Psoriatic Arthritis,The differential diagnosis for this patient in...,Y,Psoriatic Arthritis,The patient's clinical presentation suggests p...,Y,Psoriatic Arthritis,The patient presents with several features sug...,Y,Psoriatic Arthritis,The prior probability of psoriatic arthritis i...,Y
4,A one-day-old male is evaluated in the hospita...,Duodenal atresia,Duodenal atresia,The baby in question is displaying signs of in...,Y,Duodenal atresia,"This neonate presents with bilious vomiting, a...",Y,Meconium ileus,This neonate's clinical presentation of biliou...,N,Intestinal atresia,"The symptoms presented by this neonate, such a...",Y,Duodenal atresia in Down Syndrome,"In Bayesian reasoning, we begin with a prior p...",Y


## Below is the template that the researchers used to get their responses

Got this from the GPT3.5_API.ipynb notebook

In [10]:
qa_template = (
    "Diagnose the patients condition using the given passage with a short factoid answer.\n"
    "Question: {question}\n"
    "Rationale: a step-by-step deduction that identifies the correct response. {prompt}"
               )

In [11]:
from lib.llms import PromptTemplate

prompt_template = PromptTemplate(qa_template)

In [12]:
print(prompt_template.populate(question = "this is my special question", prompt = "this is a COT prompt"))

Diagnose the patients condition using the given passage with a short factoid answer.
Question: this is my special question
Rationale: a step-by-step deduction that identifies the correct response. this is a COT prompt


In [13]:
# intuitive
ir_prompt = "Use symptom, signs and laboratory disease associations to step by step deduce the correct response"

# differential
dr_prompt = "Use step by step deduction to create a differential diagnosis and then use step by step deduction to determine the correct response."

# analytic
ar_prompt = "Use analytic reasoning to deduce the physiologic or biochemical pathophysiology of the patient and step by step identify the correct response."

# bayesian
br_prompt = "Use step by step Bayesian Inference to create a prior probability that is updated with new information in the history to produce a posterior probability and determine the final diagnosis."

## populate the prompt template with a real question and prompt from the dataset

In [14]:
to_send = prompt_template.populate( question = gpt_4_responses['question'][0], prompt = dr_prompt)
print(to_send)

Diagnose the patients condition using the given passage with a short factoid answer.
Question: Two weeks after undergoing an emergency cardiac catherization with stenting for unstable angina pectoris, a 61-year-old man has decreased urinary output and malaise. He has type 2 diabetes mellitus and osteoarthritis of the hips. Prior to admission, his medications were insulin and naproxen. He was also started on aspirin, clopidogrel, and metoprolol after the coronary intervention. His temperature is 38°C (100.4°F), pulse is 93/min, and blood pressure is 125/85 mm Hg. Examination shows mottled, reticulated purplish discoloration of the feet. Laboratory studies show:
Hemoglobin count 14 g/dL
Leukocyte count 16,400/mm3
Segmented neutrophils 56%
Eosinophils 11%
Lymphocytes 31%
Monocytes 2%
Platelet count 260,000/mm3
Erythrocyte sedimentation rate 68 mm/h
Serum
Urea nitrogen 25 mg/dL
Creatinine 4.2 mg/dL
Renal biopsy shows intravascular spindle-shaped vacuoles. What is the most likely cause of t

Lets see what GPT 4o gives back when we ask this question

In [15]:
from lib.llms.chats import ChatOpenAI

llm_4o = ChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4
)

In [16]:
res = llm_4o(to_send)

In [17]:
print(res.text)

Based on the information provided in the passage, let's go through the step-by-step deduction process for diagnosing the patient's condition:

1. **Patient Background and Symptoms:** 
   - The patient is a 61-year-old man who underwent cardiac catheterization with stenting for unstable angina.
   - He has decreased urinary output and malaise.
   - He has a medical history of type 2 diabetes mellitus and osteoarthritis, and post-intervention medications include aspirin, clopidogrel, and metoprolol.
   - He presents with a temperature of 38°C, pulse 93/min, blood pressure 125/85 mmHg, and mottled, reticulated purplish discoloration of the feet.

2. **Laboratory Findings:**
   - Elevated leukocyte count with eosinophilia (11% eosinophils).
   - Elevated erythrocyte sedimentation rate (ESR).
   - Elevated serum creatinine (4.2 mg/dL) and urea nitrogen (25 mg/dL), indicating renal impairment.
   - Renal biopsy shows intravascular spindle-shaped vacuoles.

3. **Possible Diagnoses:**
   - **A

# Before I go any further, lets create a rudimentary eval harness

In [67]:
def eval_responses(response_df: pd.DataFrame, true_col:str, llm_res: list):
    '''
    Given a dataframe of responses, a true column label that is in the dataframe and an array of
    responses from the llm
    test whether answer from the true column is contained within the corresponding llm_res element

    ex: question: What color is the sky? answer = blue, llm_res = "the sky is blue" would evaluate to True
    '''
    no_nulls = response_df[~response_df[true_col].isna()].reset_index()
    true_answers = no_nulls[true_col]
    
    correct = pd.Series([a.lower() in b.lower() for a, b in zip(true_answers, llm_res)])

    print(f'# correct = {correct.sum()} / {len(correct)}')

    return correct.sum() / len(correct)


def get_incorrect(response_df: pd.DataFrame, true_col:str, llm_res: list):

    no_nulls = response_df[~response_df[true_col].isna()].reset_index()
    true_answers = no_nulls[true_col]
    
    correct = pd.Series([a.lower() in b.lower() for a, b in zip(true_answers, llm_res)])

    incorrect_df = no_nulls[~correct]

    incorrect_llm_responses = [llm_res[i] for i in incorrect_df.index]

    incorrect_df['llm_responses'] = incorrect_llm_responses

    return incorrect_df[['question', true_col, 'llm_responses']]

## lets test this out on the study responses

### We can see that that answers that were marked as correct don't *exactly* match the "golden answer" (adhd = "attention deficit hyperactivity disorder")

I don't expect this to replicate exactly because the answers from the study were graded by real human doctors

In [23]:
study_answer_cols = ['CoT.2', 'DR',
       'IR', 'AR','BR']

study_responses = [gpt_4_responses[col].values for col in study_answer_cols]

for i, col in enumerate(study_answer_cols):
       percent = eval_responses(gpt_4_responses, 'answer', study_responses[i])
       print(f'reasoning type {col} = {percent * 100}%')

# correct = 177 / 494
reasoning type CoT.2 = 35.82995951417004%
# correct = 201 / 494
reasoning type DR = 40.688259109311744%
# correct = 192 / 494
reasoning type IR = 38.8663967611336%
# correct = 191 / 494
reasoning type AR = 38.663967611336034%
# correct = 164 / 494
reasoning type BR = 33.198380566801625%


## As you can see from above, DR is still the best, and the results here don't match the study results. We could get a lot fancier with the evaluation, but we are really interested for this session in prompt engineering



#### Now I'm going to define a function that will take in an array of the MedQA questions and then hit an LLM with them and get the answers

The template that I'm going to use will have the keywords "context" and "question". The "context" variable is where we will put the real medical question (sorry confusing) and the "question" variable is where we will put our prompt

In [24]:
from json import JSONDecodeError
from collections import defaultdict
from lib.reasoning.pipelines.answer_with_context import AnswerWithContextPipeline
from itertools import islice
import time
import asyncio
from typing import Union
import numpy as np



def batch_tasks(iterable, batch_size):
    iterator = iter(iterable)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            break
        yield batch


async def async_get_answer(answer_with_context_pipeline: AnswerWithContextPipeline, prompt: str, evidence: str, history = []):
    try:
        extracted_answer = await answer_with_context_pipeline.ainvoke(question = prompt, evidence = evidence, history = history)

        return extracted_answer
    except Exception as e:
        print(e)
        return {"explanation": "Failed to parse",  'reason': e}
    

async def async_answer(answer_with_context_pipeline: AnswerWithContextPipeline, prompt, evidence_arr, batch_size = 100, history = []):
    n_data = len(evidence_arr)
    
    tasks = []
    for i, evidence in enumerate(evidence_arr):
        print(f'====== \n processing {i + 1}/{n_data} files \n')

        tasks.append(async_get_answer(answer_with_context_pipeline, prompt, evidence, history = history))

    extracted_answers = []
    needs_reprocessing = []
    for batch_number, batch in enumerate(batch_tasks(tasks, batch_size)):
        print(f'processing batch {batch_number} of size {batch_size}')
        settled_tasks = await asyncio.gather(*batch)
        for i,initial_extraction in enumerate(settled_tasks):
            idx = batch_number*batch_size + i
            if isinstance(initial_extraction, dict):
                needs_reprocessing.append(initial_extraction)
                extracted_answers.append(initial_extraction)
            else:
                try:
                    parsed_initial_extraction = json.loads(initial_extraction.content)

                    extracted_answers.append(parsed_initial_extraction)

                except JSONDecodeError as e:
                    needs_reprocessing.append({"answer": "Failed to parse", "idx": idx, 'reason': e})
                    extracted_answers.append(initial_extraction.content)
        # avoid rate limit
        time.sleep(6)
        
    return extracted_answers, needs_reprocessing

In [None]:

# All of the questions that we want responded too
questions = gpt_4_responses['question'].values

qa_template = (
    "Question: {context}\n"
    "Rationale: a step-by-step deduction that identifies the correct response. {question}" # remember this is where the prompt goes
    )

llm_4o = ChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4
)


answer_pipeline = AnswerWithContextPipeline(
    llm = llm_4o,
    qa_template = qa_template
    # system_prompt="you could put a system prompt here"
)

In [27]:
dr_answers, need_reprocess = await async_answer(answer_pipeline, dr_prompt, questions, batch_size = 100)

 processing 1/494 files 

 processing 2/494 files 

 processing 3/494 files 

 processing 4/494 files 

 processing 5/494 files 

 processing 6/494 files 

 processing 7/494 files 

 processing 8/494 files 

 processing 9/494 files 

 processing 10/494 files 

 processing 11/494 files 

 processing 12/494 files 

 processing 13/494 files 

 processing 14/494 files 

 processing 15/494 files 

 processing 16/494 files 

 processing 17/494 files 

 processing 18/494 files 

 processing 19/494 files 

 processing 20/494 files 

 processing 21/494 files 

 processing 22/494 files 

 processing 23/494 files 

 processing 24/494 files 

 processing 25/494 files 

 processing 26/494 files 

 processing 27/494 files 

 processing 28/494 files 

 processing 29/494 files 

 processing 30/494 files 

 processing 31/494 files 

 processing 32/494 files 

 processing 33/494 files 

 processing 34/494 files 

 processing 35/494 files 

 processing 36/494 files 

 processing 37/494 files 

 processin

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


processing batch 1 of size 100
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 images
Got 0 i

In [28]:
eval_responses(gpt_4_responses, 'answer', dr_answers)

# correct = 256 / 494


0.5182186234817814

## So already after only upgrading the model to 4o istead of 4, there's about a 10% increase in accuracy


# assign a role and responsibility to the prompt

In [53]:
from lib.base import AIMessage, Document, HumanMessage, SystemMessage
from lib.llms.chats.base import ChatLLM

def get_populated_prompts(questions: list[str], template: PromptTemplate):
    return [
        template.populate(question = question)
        for question in questions
    ]

async def async_answer_from_question(llm: ChatLLM, prompt, examples: list[tuple] = [], system_prompt = ''):
    messages = []
    if system_prompt:
        messages.append(SystemMessage(content=system_prompt))

    for human, ai in examples:
        messages.append(HumanMessage(content=human))
        messages.append(AIMessage(content=ai))

    messages.append(HumanMessage(content=prompt))

    try:
        output = await llm.ainvoke(messages)

        return Document(text = output.text )
    except Exception as e:
        print("hit error async invoking in answer with context: ", e)
        raise e
    


async def async_answer(answer_with_llm: ChatLLM, questions: list[str], template: PromptTemplate, examples: list[tuple[str]] = [], system_prompt = '', batch_size = 100):
    n_data = len(questions)
    
    populated_prompts = get_populated_prompts(questions, template)
    tasks = []
    for i, pop_prompt in enumerate(populated_prompts):
        print(f'====== \n processing {i + 1}/{n_data} files \n')

        tasks.append(async_answer_from_question(answer_with_llm, pop_prompt, examples= examples, system_prompt=system_prompt))

    extracted_answers = []
    needs_reprocessing = []
    for batch_number, batch in enumerate(batch_tasks(tasks, batch_size)):
        print(f'processing batch {batch_number} of size {batch_size}')
        settled_tasks = await asyncio.gather(*batch)
        for i,initial_answer in enumerate(settled_tasks):
            idx = batch_number*batch_size + i
            if isinstance(initial_answer, dict):
                needs_reprocessing.append(initial_answer)
                extracted_answers.append(initial_answer)
            else:
                try:
                    parsed_initial_answer = json.loads(initial_answer.content)

                    extracted_answers.append(parsed_initial_answer)

                except JSONDecodeError as e:
                    needs_reprocessing.append({"answer": "Failed to parse", "idx": idx, 'reason': e})
                    extracted_answers.append(initial_answer.content)
        # avoid rate limit
        time.sleep(2)
        
    return extracted_answers, needs_reprocessing




In [57]:

llm_4o = ChatOpenAI(
    base_url='https://api.openai.com/v1',
    model = 'gpt-4o-mini',
    temperature= 1,
    api_key = os.environ.get('OPENAI_API_KEY'),
    max_retries = 4
)


role_response_template = PromptTemplate("""
You are an expert medical doctor who carefully thinks through patients symptoms and diagnoses them.
Question: {question}
Rationale: a step-by-step deduction that identifies the correct response. Use step by step deduction to create a differential diagnosis and then use step by step deduction to determine the correct response.
""")

In [58]:
role_response_dr_answers, need_reprocess = await async_answer(llm_4o, questions, role_response_template, batch_size = 100)

 processing 1/494 files 

 processing 2/494 files 

 processing 3/494 files 

 processing 4/494 files 

 processing 5/494 files 

 processing 6/494 files 

 processing 7/494 files 

 processing 8/494 files 

 processing 9/494 files 

 processing 10/494 files 

 processing 11/494 files 

 processing 12/494 files 

 processing 13/494 files 

 processing 14/494 files 

 processing 15/494 files 

 processing 16/494 files 

 processing 17/494 files 

 processing 18/494 files 

 processing 19/494 files 

 processing 20/494 files 

 processing 21/494 files 

 processing 22/494 files 

 processing 23/494 files 

 processing 24/494 files 

 processing 25/494 files 

 processing 26/494 files 

 processing 27/494 files 

 processing 28/494 files 

 processing 29/494 files 

 processing 30/494 files 

 processing 31/494 files 

 processing 32/494 files 

 processing 33/494 files 

 processing 34/494 files 

 processing 35/494 files 

 processing 36/494 files 

 processing 37/494 files 

 processin

In [59]:
eval_responses(gpt_4_responses, 'answer', role_response_dr_answers)

# correct = 216 / 494


0.43724696356275305

Hmm so performance went down on adding the role and responsibility - I want to look at some of the "incorrect" answers to see why

In [70]:
incorrect = get_incorrect(gpt_4_responses, 'answer', role_response_dr_answers)
incorrect

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect_df['llm_responses'] = incorrect_llm_responses


Unnamed: 0,question,answer,llm_responses
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,To evaluate this 61-year-old man with decrease...
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,"To evaluate this patient effectively, let's fo..."
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,To evaluate the symptoms and make a differenti...
5,A 55-year-old man comes to the physician becau...,Femoropopliteal artery stenosis,To identify the most likely diagnosis in this ...
6,A 56-year-old man with a history of hypertensi...,Aldosterone excess,"To analyze this case of the 56-year-old man, w..."
...,...,...,...
486,A 78-year-old man is brought to the emergency ...,Occlusion of the left middle cerebral artery,To determine the most likely cause of this 78-...
490,A previously healthy 21-year-old man comes to ...,Chronic cerebral hypoxia,To arrive at a diagnosis for this 21-year-old ...
491,A 15-year-old boy is brought to the physician ...,Kallmann syndrome,To determine the most likely diagnosis for thi...
492,A 9-year-old boy is brought to a pediatric psy...,Attention deficit hyperactivity disorder,To arrive at a diagnosis for this 9-year-old b...


In [72]:
print(role_response_dr_answers[1])

To evaluate this patient effectively, let's follow the structured approach to differential diagnosis and deduction.

### Step 1: Review Symptoms and Findings

- **Altered Mental Status**: The patient is somnolent but can be aroused, indicating a level of consciousness disturbance.
- **Alcoholic Smell**: This suggests possible acute alcohol intoxication or withdrawal, although his measured blood alcohol concentration (BAC) is relatively low at 0.04%.
- **Vital Signs**: 
  - Pulse: 64/min (bradycardia)
  - Respiratory Rate: 15/min (normal)
  - Blood Pressure: 120/75 mm Hg (normal)
- **Neurological Exam**:
  - Diminished deep tendon reflexes (possible neuromuscular involvement)
  - Ataxic gait (indicates possible central nervous system involvement)
- **Pupils**: Normal, which rules out opioid overdose or other anticholinergic effects that typically see abnormal pupil sizes.
- **ECG**: No abnormalities noted.

### Step 2: Consider Possible Causes (Differential Diagnosis)
1. **Alcohol Intox

In [73]:
print(dr_answers[1])

To determine the most likely cause of this patient's symptoms, we need to consider the key clinical findings and work through a differential diagnosis systematically.

### Key Clinical Findings:
1. **Altered Mental Status**: The patient is somnolent but responsive when aroused.
2. **Vital Signs**: Pulse is 64/min, respiratory rate is 15/min, blood pressure is 120/75 mm Hg.
3. **Physical Examination**:
   - Alcoholic smell and slurred speech
   - Diminished deep tendon reflexes bilaterally
   - Ataxic gait
   - Normal pupils
4. **Laboratory and ECG Findings**: 
   - Blood alcohol concentration is 0.04% (considered low for intoxication)
   - ECG shows no abnormalities

### Differential Diagnosis:
1. **Alcohol Intoxication**: The presence of alcohol smell, slurred speech, and ataxic gait suggests intoxication. However, the blood alcohol concentration is 0.04%, which is relatively low and might not fully explain the symptoms.
   
2. **Wernicke's Encephalopathy**: This condition is associat

In [74]:
gpt_4_responses.head()

Unnamed: 0,question,answer,CoT.2,Rationale,CoT Grade,DR,Rationale.1,DR Grade.1,IR,Rationale.2,IR Grade.1,AR,Rationale.3,AR Grade.1,BR,Rationale.4,BR Grade.1
0,Two weeks after undergoing an emergency cardia...,Cholesterol embolization,Cholesterol embolization syndrome,This patient's presentation of decreased urina...,Y,Cholesterol embolization syndrome,The differential for this patient includes acu...,Y,Atheroembolic renal disease,"This patient has decreased urinary output, mal...",Y,Cholesterol embolism,The patient's recent history of cardiac cathet...,Y,Contrast-induced nephropathy,This patient's symptoms and clinical history s...,N
1,A 46-year-old man is brought to the emergency ...,Benzodiazepine intoxication,Alcoholic neuropathy,This patient presents with altered mental stat...,N,Cerebellar disease,The differential for this patient includes alc...,N,Wernicke's encephalopathy,This patient's symptoms of altered mental stat...,N,Wernicke's encephalopathy,"The patient's symptoms (somnolence, slurred sp...",N,Wernicke's encephalopathy,The prior probability of Wernicke's encephalop...,N
2,A 30-year-old African American woman comes to ...,Histoplasma capsulatum infection,Histoplasmosis,This patient has recently been hiking in Missi...,Y,Histoplasmosis,The differential diagnosis in this case includ...,Y,Histoplasmosis,"This patient presents with fever, cough, chest...",Y,Histoplasmosis,The patient's history of recent hiking in Miss...,Y,Histoplasma capsulatum,The prior probability of Histoplasmosis (a fun...,Y
3,A 67-year-old man who was diagnosed with arthr...,Psoriatic arthritis,Psoriatic Arthritis,This patient has a long history of arthritis a...,Y,Psoriatic Arthritis,The differential diagnosis for this patient in...,Y,Psoriatic Arthritis,The patient's clinical presentation suggests p...,Y,Psoriatic Arthritis,The patient presents with several features sug...,Y,Psoriatic Arthritis,The prior probability of psoriatic arthritis i...,Y
4,A one-day-old male is evaluated in the hospita...,Duodenal atresia,Duodenal atresia,The baby in question is displaying signs of in...,Y,Duodenal atresia,"This neonate presents with bilious vomiting, a...",Y,Meconium ileus,This neonate's clinical presentation of biliou...,N,Intestinal atresia,"The symptoms presented by this neonate, such a...",Y,Duodenal atresia in Down Syndrome,"In Bayesian reasoning, we begin with a prior p...",Y
