In [None]:
import os
import pandas as pd
import re
from tqdm.auto import tqdm

def load_csv_from_folder(folder_path):
    """Load every ``.csv`` file in ``folder_path`` into a single DataFrame.

    Files are read in sorted file-name order so that the row order (and hence
    the integer index of the result) is reproducible; ``os.listdir`` on its
    own returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan. Only direct entries whose name ends
            with ``.csv`` are read; sub-directories are not searched.

    Returns:
        A DataFrame with the rows of all CSV files stacked together under a
        fresh 0..n-1 index, or an empty DataFrame if no CSV file is found.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )

    # Read everything first and concatenate once: growing a DataFrame with
    # concat inside the loop re-copies all earlier rows on every iteration.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Relative path of the directory whose CSV files are loaded below.
folder = "../data/apixaban"
# Combined table of every CSV in that folder; the prompt-building cell reads
# its 'criterion', 'answer' and 'text' columns.
df = load_csv_from_folder(folder)



In [None]:
# Yes/No questions, keyed by the criterion name they are asked for.
# Each value is inserted verbatim into the prompt as the question text.
bool_question_definitions = {
    'afib': 'Does the note describe the patient as having atrial fibrillation (afib)? Answer "No" if the note describes the patient as having afib secondary to another reversible cause.',
    'mdd': 'Does the note describe the patient as ever being diagnosed with depression or major depressive disorder (MDD)? Answer "No" unless the note describes a diagnosis or history of depression.',
    'schizophrenia': 'Does the note describe the patient as ever being diagnosed with schizophrenia or any schizoaffective disorders? Answer "No" unless the note describes a diagnosis or history of a schizoaffective disorder.',
    'bipolar': 'Does the note describe the patient as ever being diagnosed with bipolar disorder?  Answer "No" unless the note describes a diagnosis or history of bipolar disorder',
    'hemorrhagic': 'Does the note describe the patient as ever having any hemorrhagic tendencies or blood dyscrasias? Answer "No" unless the note describes a diagnosis or history of hemorrhagic tendencies or blood dyscrasias.',
    'recent_stroke': 'Does the note describe the patient as having a stroke during this admission or within the last month? (Answer "Yes" for any recent stroke if the date is unclear, answer "No" if no stroke is mentioned or a prior stroke occurred but it was not recent)',
    'peptic_ulcer_disease': 'Does the note describe the patient as ever having peptic ulcer disease?',
    'bleeding': 'Does the note describe the patient as having a serious bleeding in the past 6 months? Answer "No" unless the note describes a serious recent bleeding issue.',
    'afib_ablation': 'Does the note describe the patient as having a planned or past ablation procedure for afib? Answer "No" unless the note includes information about a past or planned ablation for afib.',
    'surgical_valvular_disease': 'Does the note describe the patient as ever having valvular disease (stenosis) requiring surgery? Answer "No" if there is mention of stenosis without surgery.',
    'heart_failure': 'Does the note describe the patient as having heart failure?',
    't2d': 'Does the note describe the patient as having diabetes mellitus (DM1, DM2, T2D, T1DM, T2DM)?',
    'arterial_hypertension': 'Does the note describe the patient as having arterial hypertension (high bp e.g. >140, or HTN)? This includes pre-existing hypertension and treated hypertension.',
    'prior_stroke': 'Does the note describe the patient as ever having a stroke or transient ischemic attack (TIA)? Answer "No" unless the note includes information about the patient having a prior stroke or TIA',
    'med_decisions': 'Does the note describe the patient as being unable to make medical decisions upon discharge? Answer "No" unless there is evidence the patient cannot make their own medical decisions. Answer "Yes" if there is clear mention of dementia or patient is deceased.'
}

# Numeric questions, keyed by the criterion name they are asked for.
# Each value is inserted verbatim into the prompt (no %-formatting is applied),
# so a percent sign is written as a single '%'.
num_question_definitions = {
    'PLT': 'What is the lowest platelet count (PLT) recorded for the patient in the note? Answer "NA" if there is no platelet count (PLT) that can be found in the note.',
    'BILI': 'What is the highest total bilirubin (TotBili, Bili) mentioned in the note? Answer "NA" if no bilirubin value is available in the note.',
    'AST': 'What is the highest aspartate aminotransferase level (AST) mentioned in the note? Answer "NA" if no AST value is available in the note.',
    'CREAT': 'What is the highest serum creatinine (Creat) mentioned in the note? Answer "NA" if no creatinine value is available in the note.',
    'HGB': 'What is the lowest hemoglobin (HGB) mentioned in the note? Answer "NA" if no HGB value is available in the note.',
    'chads2': 'What is the highest CHADS2 score mentioned? Answer "NA" if no CHADS2 score is in the note.',
    'lvef': 'What is the lowest left ventricular ejection fraction (LVEF, ef, ejection fraction) mentioned in the note? Answer "NA" if no LVEF is in the note, Answer 55 if the lowest value is 55% or greater.',
    'blood_glucose': 'What is the highest blood glucose lab mentioned? Answer "NA" if no blood glucose score is in the note.',
}

In [None]:
def create_prompt(note, question, qtype):
    """Build the prompt that asks one question about one patient note.

    Args:
        note: Text of the patient note; inserted verbatim under the
            ``***PATIENT NOTE:`` heading.
        question: The question text to ask about the note.
        qtype: ``'yes_no'`` for a Yes/No question, or ``'numeric'`` for a
            question answered with a single number (or 'NA').

    Returns:
        The prompt string: the note, the question, a description of the JSON
        keys expected in the response, and one example response for ``qtype``.

    Raises:
        NotImplementedError: If ``qtype`` is neither ``'yes_no'`` nor
            ``'numeric'``.
    """
    # The examples are plain (non-f) strings: nothing is interpolated into
    # them, so the JSON braces can be written without doubling.
    if qtype == 'yes_no':
        type_str = "str - 'Yes/No'"
        answer_str = "str - 'Yes' or 'No'"
        json_example = """```json
{
    "question" : "Does the note state that the patient is breathing normally on room air?",
    "type": "Yes/No",
    "answer": "No",
    "section": "History of Present Illness",
    "difficulty": "2",
    "source": "She currently is dependent on oxygen and wears 1.5-2 liters around the clock",
    "explanation": "The note states that she relies on oxygen and provides the amount as 1.5-2 liters so she is not breathing room air. We can assume since she is receiving o2 supplmentation and dependent on it, she cannot breathe normally on room air."
}```"""
    elif qtype == 'numeric':
        type_str = "str - 'Numeric'"
        answer_str = "float - a single number (e.g., 92.5) or 'NA' if the answer is not in the note."
        # The line break inside "source" is written as the two characters
        # backslash + n: a raw newline inside a JSON string is invalid JSON,
        # and this example is meant to show parseable output.
        json_example = """```json
{
    "question": "What was the patient's highest creatinine measurement recorded in the note?",
    "type": "Numeric",
    "answer": "1.4",
    "section": "Pertinent Results",
    "difficulty": "4",
    "source": "12/03/2023: CREAT: 1.4 \\n 12/07/2023: CREAT: 1.1",
    "explanation": "The highest CREAT measurement was 1.4 because the value 1.4 on 12/03/2023 is higher than the 1.1 value on 12/07/2023."
}```"""
    else:
        raise NotImplementedError(
            f"Unsupported qtype: {qtype!r} (expected 'yes_no' or 'numeric')"
        )

    prompt = f"""***PATIENT NOTE:
    {note}

    Answer the following 
    *** QUESTION:
    {question}

    *** Format your response as a JSON object with the following keys: 
    * question: str - the question you were asked to answer
    * type: {type_str}
    * answer: {answer_str}
    * section: str - the specific section of the note which contains the answer to the question (Example Answers: 'History of Present Illness', 'Past Medical History', 'Social History', 'Family History', 'Physical Exam', 'Pertinent Results', 'Brief Hospital Course', 'Discharge Medications', 'Discharge Disposition', 'Discharge Condition', 'Discharge Instructions')
    * source: str - exact quote of content in the note that allowed you to answer the question, this should be a quote directly taken from the "***PATIENT NOTE". Copy and pasting this string should exactly match content in the Note.
    * difficulty: int - a score from 1-10 indicating how difficult this question is to answer based on the note
    * explanation: str - explanation of why the answer is correct and how the source in the note helped to answer the question

    An example of how your JSON response should be formatted is shown below (this is only an example of one question, others should be in a list, your answer should be based on the ***Patient Note provided above):
    ***EXAMPLE RESPONSE:
    {json_example}

    Provide a response that can be directly read by the Python JSON Parser library based on the ***PATIENT NOTE at the beginning of this message.
    """
    return prompt

In [None]:
def _build_examples(source_df, question_definitions, qtype):
    """Return one prompt record for every (question, matching note row) pair.

    Args:
        source_df: Table with 'criterion', 'answer' and 'text' columns.
        question_definitions: Mapping of question key -> question text.
        qtype: Question type forwarded to ``create_prompt``.

    Returns:
        A list of dicts with the keys 'id' (the row's index label in
        ``source_df``), 'label', 'answer', 'question_label' and 'prompt',
        ordered by question and then by row.
    """
    records = []
    for key, question in question_definitions.items():
        print(key, question)

        # Rows are matched on the part of the key before the first '-';
        # a key without a hyphen is compared as-is.
        crit_df = source_df.loc[source_df['criterion'] == key.split('-')[0]]
        for i, row in tqdm(crit_df.iterrows(), total=crit_df.shape[0]):
            records.append({
                'id': i,
                'label': row['criterion'],
                'answer': row['answer'],
                'question_label': key,
                'prompt': create_prompt(row['text'], question, qtype=qtype),
            })
    return records


# Yes/No questions first, then numeric ones, which fixes the row order of the
# output file. All records are collected before building the DataFrame once,
# rather than concatenating a new frame per question.
examples = (
    _build_examples(df, bool_question_definitions, 'yes_no')
    + _build_examples(df, num_question_definitions, 'numeric')
)
full_df = pd.DataFrame(examples)

output_path = '../eval-data/apixaban/prompts.csv'
# Create the target directory up front so the write cannot fail on a missing
# folder after all prompts have been built.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
full_df.to_csv(output_path, index=False)