In [None]:
import os
import pandas as pd
import re
from tqdm.auto import tqdm

def load_csv_from_folder(folder_path):
    """Load every ``.csv`` file in *folder_path* into a single DataFrame.

    Files are read in sorted file-name order so that the row order (and the
    resulting integer index) is reproducible; ``os.listdir`` on its own
    returns entries in arbitrary order. No column is added or altered: the
    rows of each file are stacked exactly as ``pd.read_csv`` returns them.

    Args:
        folder_path: Directory to scan (not recursive). Only entries whose
            name ends with ``.csv`` are read.

    Returns:
        A DataFrame holding the rows of all CSV files under a fresh
        ``0..n-1`` index, or an empty DataFrame when the folder contains no
        CSV file.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    # pd.concat raises ValueError on an empty list, so keep the
    # "no CSV files -> empty DataFrame" result explicit.
    if not frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated rows for every file.
    return pd.concat(frames, ignore_index=True)

# Directory scanned for CSV files; the relative path is resolved against the current working directory.
folder = "../data/annotated-synthetic"
# Combine every CSV found there into one DataFrame.
df = load_csv_from_folder(folder_path=folder)



In [None]:
def create_prompt(note, question, qtype):
    """Render the question-answering prompt for one note/question pair.

    The prompt embeds the note, the question, a description of the expected
    JSON keys, and a worked example whose wording depends on the question
    type.

    Args:
        note: Text of the patient note, inserted verbatim.
        question: The question to answer, inserted verbatim.
        qtype: Question type. ``'yes'`` / ``'na-bool'`` select the Yes/No
            variant; ``'numeric'`` / ``'na-numeric'`` select the numeric one.

    Returns:
        The complete prompt string.

    Raises:
        NotImplementedError: If *qtype* is not one of the supported types.
    """
    # NOTE(review): neither example below is strict JSON -- the "type" value is
    # single-quoted, and the "\n" escape in the numeric "source" becomes a raw
    # newline inside a JSON string. The text is left unchanged so generated
    # prompts stay identical; confirm whether the examples should be corrected.
    if qtype in ['yes', 'na-bool']:
        type_str = "str - 'Yes/No'"
        answer_str = "str - 'Yes', 'No' or 'NA' if the answer is not in the note."
        json_example = """```json
{
    "question" : "Does the note state that the patient is breathing normally on room air?",
    "type": 'Yes/No (answer "NA" if the answer cannot be found in the note)',
    "answer": "No",
    "section": "History of Present Illness",
    "difficulty": "2",
    "source": "She currently is dependent on oxygen and wears 1.5-2 liters around the clock",
    "explanation": "The note states that she relies on oxygen and provides the amount as 1.5-2 liters so she is not breathing room air. We can assume since she is receiving o2 supplmentation and dependent on it, she cannot breathe normally on room air."
}```"""
    elif qtype in ['numeric', 'na-numeric']:
        type_str = "str - 'Numeric'"
        answer_str = "float - a single number (e.g., 92.5) or 'NA' if the answer is not in the note."
        json_example = """```json
{
    "question": "What was the patient's highest creatinine measurement recorded in the note?",
    "type": 'Numeric (answer "NA" if the answer cannot be found in the note)',
    "answer": "1.4",
    "section": "Pertinent Results",
    "difficulty": "4",
    "source": "12/03/2023: CREAT: 1.4 \n 12/07/2023: CREAT: 1.1",
    "explanation": "The highest CREAT measurement was 1.4 because the value 1.4 on 12/03/2023 is higher than the 1.1 value on 12/07/2023."
}```"""
    else:
        # NotImplementedError is still an Exception subclass, so callers that
        # catch Exception keep working; the message now names the bad type.
        raise NotImplementedError(
            f"Not Implemented: unsupported question type {qtype!r}"
        )

    prompt = f"""***PATIENT NOTE:
    {note}

    Answer the following 
    *** QUESTION:
    {question}

    *** Format your response as a JSON object with the following keys: 
    * question: str - the question you were asked to answer
    * type: {type_str}
    * answer: {answer_str}
    * section: str - the specific section of the note which contains the answer to the question (Example Answers: 'History of Present Illness', 'Past Medical History', 'Social History', 'Family History', 'Physical Exam', 'Pertinent Results', 'Brief Hospital Course', 'Discharge Medications', 'Discharge Disposition', 'Discharge Condition', 'Discharge Instructions')
    * source: str - exact quote of content in the note that allowed you to answer the question, this should be a quote directly taken from the "***PATIENT NOTE". Copy and pasting this string should exactly match content in the Note.
    * difficulty: int - a score from 1-10 indicating how difficult this question is to answer based on the note
    * explanation: str - explanation of why the answer is correct and how the source in the note helped to answer the question

    An example of how your JSON response should be formatted is shown below (this is only an example of one question, others should be in a list, your answer should be based on the ***Patient Note provided above):
    ***EXAMPLE RESPONSE:
    {json_example}

    Provide a response that can be directly read by the Python JSON Parser library based on the ***PATIENT NOTE at the beginning of this message.
    """
    return prompt

In [None]:
# Build one record per row of df: selected columns plus the rendered prompt.
examples = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    example_dict = dict(
        id=i,  # index label of the row in df
        question=row['question'],
        label=row['answer'],
        difficulty=row['difficulty'],
        question_type=row['type'],
        prompt=create_prompt(row['text'], row['question'], qtype=row['type']),
    )
    examples.append(example_dict)

# Show the assembled table in the notebook, then write it to CSV without the index column.
save_df = pd.DataFrame(examples)
display(save_df)
save_df.to_csv('./eval-data/annotated-mimic/prompts.csv', index=False)