In [None]:
from random import randrange
import pandas as pd
import os
from bs4 import BeautifulSoup 
from datasets import Dataset
import json
import re
from tqdm.auto import tqdm

# Directories holding the raw i2b2 XML records for each split.
# Both paths end with a slash.
test_dir = '../eval-data/i2b2/raw/test/'
train_dir = '../eval-data/i2b2/raw/train/'

# TODO: not referenced by any code visible in this section.
max_seq_length = 64_000

def read_dir(input_dir):
    """Parse every ``.xml`` record in *input_dir* into one DataFrame row each.

    For each file the text of the first ``TEXT`` element is stored, together
    with the ``met`` attribute of the first element of every criterion tag,
    the length of the text, and the last ``Record date: YYYY-MM-DD`` value
    found in the text.

    Args:
        input_dir: Directory containing the XML records. A trailing path
            separator is accepted but not required.

    Returns:
        A ``pandas.DataFrame`` with the columns ``TEXT``, the 13 criterion
        tags, ``TEXT_length`` and ``LAST-NOTE-DATE``. Criterion values
        ``'met'`` / ``'not met'`` are encoded as ``1`` / ``0``; any other
        value is kept unchanged. ``LAST-NOTE-DATE`` is ``"Not Available"``
        when the text contains no record date. When the directory holds no
        ``.xml`` files, an empty frame with the same columns is returned.

    Raises:
        IndexError: If a file has no ``TEXT`` element or lacks one of the
            criterion elements (the ``[0]`` lookup fails).
        KeyError: If a criterion element has no ``met`` attribute.
    """
    criteria = (
        'ABDOMINAL',
        'ADVANCED-CAD',
        'ALCOHOL-ABUSE',
        'ASP-FOR-MI',
        'CREATININE',
        'DIETSUPP-2MOS',
        'DRUG-ABUSE',
        'ENGLISH',
        'HBA1C',
        'KETO-1YR',
        'MAJOR-DIABETES',
        'MAKES-DECISIONS',
        'MI-6MOS',
    )
    # Only the criterion columns are encoded, so a note whose whole text
    # happened to be "met" can no longer be rewritten by accident.
    label_codes = {'not met': 0, 'met': 1}
    date_pattern = re.compile(r"Record date: (\d{4}-\d{2}-\d{2})")

    records = []
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order (and therefore the row index) reproducible between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.xml'):
            continue

        with open(os.path.join(input_dir, filename), 'r') as f:
            data = f.read()

        bs_data = BeautifulSoup(data, 'xml')
        text = bs_data.find_all('TEXT')[0].text

        record = {'TEXT': text}
        for tag in criteria:
            status = bs_data.find_all(tag)[0]['met']
            record[tag] = label_codes.get(status, status)
        record['TEXT_length'] = len(text)

        # Keep the last date mentioned in the note, if there is one.
        dates = date_pattern.findall(text)
        record['LAST-NOTE-DATE'] = dates[-1] if dates else "Not Available"

        records.append(record)

    # Build the frame once, after the loop. Passing the columns explicitly
    # keeps the schema stable even when no record was found.
    columns = ['TEXT', *criteria, 'TEXT_length', 'LAST-NOTE-DATE']
    return pd.DataFrame(records, columns=columns)


# Load both splits, then print the (rows, columns) shape of each.
train_df = read_dir(train_dir)
test_df = read_dir(test_dir)
for shape_label, split_frame in (('train shape: ', train_df), ('test shape: ', test_df)):
    print(shape_label, split_frame.shape)

In [None]:
# Instruction text describing the assistant's role and answer conventions.
# It is not referenced by any code visible in this section.
# Each line of the text ends with a space before its line break; the explicit
# literals below keep that trailing whitespace visible.
# NOTE(review): "qeustions" is misspelled; left as-is because changing it
# alters the text sent to the model.
system_instruction = (
    "You are a clinical research assistant helping to accurately answer qeustions from clinical notes. You answer with a single valid JSON object based on the patient note. \n"
    "All dates have been shifted to keep data de-identified, so they may be in the future. We care only about information captured in the note, for example when asking what is the highest lab a patient has, we mean the highest lab recorded in the note. \n"
    "If you cannot find information to answer the question asked of you in the note answer NA, unless the rest of the prompt recommends something different. \n"
)

# Maps a question key to the question text that is inserted into the prompt.
# Keys ending in `_num` are turned into numeric questions by the
# prompt-generation loop further down; every other key becomes a yes/no
# question. The part of the key before `_num` names the label column that is
# read from the parsed records.
# NOTE(review): for the `_num` keys the label is therefore still the encoded
# met / not-met value of CREATININE / HBA1C, not a numeric lab value — confirm
# this is intended.
question_definitions = {
    'ABDOMINAL': 'Has this patient had a intra-abdominal surgery, small or large intestine resection or small bowel obstruction? Assume No unless a intra-abdominal surgery (e.g., appendectomy, cholecystectomy, abdominoperineal resection), small or large intestine resection (e.g., colectomy, hemicolectomy, low anterior resection), or small bowel obstruction (e.g., laparotomy, bowel resection) is mentioned.',
    'ADVANCED-CAD': 'Does this patient meet at least two of the following criteria: a.) taking two or more medications to treat cardiovascular disease, b.) any myocardial infarction, c.) current or recent angina, or d.) any ischemia? If they only have cardiovascular disease, answer no, they must have at least two of the four to answer Yes.',
    'ALCOHOL-ABUSE': 'Does this patient consume more than 7 drinks per week if female, or 14 drinks per week if male? Answer No unless the note explicitly states they consume more than the recommended number of drinks per week or otherwise states they abuse alcohol, heavy EtOH etc..',
    'ASP-FOR-MI': 'Does this patient take aspirin?',
    'CREATININE_num': "What was the patient's highest recorded creatinine level? Answer NA if there are no values. Ignore instructions to provide a Yes/No answer for this question, instead give a numeric answer.",
    'DIETSUPP-2MOS': "Does the patient take a dietary supplement?",
    'DRUG-ABUSE': 'Has this patient ever abused drugs? Answer No unless there is evidence the patient uses or used illegal or illicit drugs',
    'ENGLISH': 'Does this patient speak English? Answer Yes unless there is clear evidence that the patient does not speak english and requires a translator or communication in a different language.',
    'HBA1C_num': "What was the patient's highest recorded hemoglobin A1c (HbA1c) value? Answer NA if there are no values. Ignore instructions to provide a Yes/No answer for this question, instead give a numeric answer",
    'KETO-1YR': 'Does this note mention a diagnosis of ketoacidosis in the last year? Answer No unless a diagnosis of ketoacidosis is clearly specified.',
    'MAJOR-DIABETES': 'Has the patient had any major diabetes-related complications? Examples of “major complication” (as opposed to “minor complication”) include, but are not limited to, any of the following that are a result of (or strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • Retinopathy • nephropathy • neuropathy. Additionally, if multiple conditions together imply a severe case of diabetes, then count that as a major complication.',
    'MAKES-DECISIONS': 'Does this patient make their own medical decisions? Answer Yes, unless there is evidence that the patient cannot make medical decisions, for example they are deceased, in a coma, mentally incapacitated or otherwise require a spouse, family member, lawyer or someone else to have power of attorney.',
    'MI-6MOS': 'Has the patient had a myocardial infarction (MI) in the past 6 months?',
}

In [None]:
def create_prompt(note, question, qtype):
    """Build the prompt text for one note / question pair.

    The prompt contains the note, the question, a description of the JSON
    keys the response must have, and one worked example response whose
    shape depends on *qtype*.

    Args:
        note: Full text of the patient note.
        question: Question to be answered from the note.
        qtype: ``'yes_no'`` for a Yes/No question or ``'numeric'`` for a
            question answered with a single number.

    Returns:
        The assembled prompt as a string. Every line after the first is
        indented by four spaces, as in the template below.

    Raises:
        NotImplementedError: If *qtype* is neither ``'yes_no'`` nor
            ``'numeric'``. This is a subclass of ``Exception``, so callers
            that catch ``Exception`` keep working.
    """
    if qtype == 'yes_no':
        type_str = "str - 'Yes/No'"
        answer_str = "str - 'Yes' or 'No'"
        json_example = """```json
{
    "question" : "Does the note state that the patient is breathing normally on room air?",
    "type": "Yes/No",
    "answer": "No",
    "section": "History of Present Illness",
    "difficulty": "2",
    "source": "She currently is dependent on oxygen and wears 1.5-2 liters around the clock",
    "explanation": "The note states that she relies on oxygen and provides the amount as 1.5-2 liters so she is not breathing room air. We can assume since she is receiving o2 supplmentation and dependent on it, she cannot breathe normally on room air."
}```"""
    elif qtype == 'numeric':
        type_str = "str - 'Numeric'"
        answer_str = "float - a single number (e.g., 92.5)"
        # The line break inside "source" is written as an escaped \\n: a JSON
        # string may not contain a raw line break, and the example has to be
        # parseable JSON because the prompt asks for parser-readable output.
        json_example = """```json
{
    "question": "What was the patient's highest creatinine measurement recorded in the note?",
    "type": "Numeric",
    "answer": "1.4",
    "section": "Pertinent Results",
    "difficulty": "4",
    "source": "12/03/2023: CREAT: 1.4 \\n 12/07/2023: CREAT: 1.1",
    "explanation": "The highest CREAT measurement was 1.4 because the value 1.4 on 12/03/2023 is higher than the 1.1 value on 12/07/2023."
}```"""
    else:
        raise NotImplementedError(f"Not Implemented: qtype={qtype!r}")

    prompt = f"""***PATIENT NOTE:
    {note}

    Answer the following 
    *** QUESTION:
    {question}

    *** Format your response as a JSON object with the following keys: 
    * question: str - the question you were asked to answer
    * type: {type_str}
    * answer: {answer_str}
    * section: str - the specific section of the note which contains the answer to the question (Example Answers: 'History of Present Illness', 'Past Medical History', 'Social History', 'Family History', 'Physical Exam', 'Pertinent Results', 'Brief Hospital Course', 'Discharge Medications', 'Discharge Disposition', 'Discharge Condition', 'Discharge Instructions')
    * source: str - exact quote of content in the note that allowed you to answer the question, this should be a quote directly taken from the "***PATIENT NOTE". Copy and pasting this string should exactly match content in the Note.
    * difficulty: int - a score from 1-10 indicating how difficult this question is to answer based on the note
    * explanation: str - explanation of why the answer is correct and how the source in the note helped to answer the question

    An example of how your JSON response should be formatted is shown below. Your answer should be based on the ***Patient Note provided above:
    ***EXAMPLE RESPONSE:
    {json_example}

    Provide a response that can be directly read by the Python JSON Parser library based on the ***PATIENT NOTE at the beginning of this message. Your answer needs to be "Yes" or "No", if you can't find the answer provide your best guess only responding "Yes" or "No" unless you were specifically told to give a number.
    """
    return prompt

In [None]:
def _export_split(df, split_name, verbose=False, output_dir='../eval-data/i2b2/'):
    """Write the per-question and combined prompt CSVs for one data split.

    For every entry of ``question_definitions`` one example is built per row
    of *df* (row index as ``id``, the row's label, the question key and the
    prompt from ``create_prompt``) and saved to
    ``<output_dir>/<split_name>_<key>.csv``. All examples are then
    concatenated and saved to ``<output_dir>/<split_name>.csv``.

    Args:
        df: Frame with a ``TEXT`` column and one label column per question.
        split_name: File-name prefix, e.g. ``'train'`` or ``'test'``.
        verbose: When true, print each question key and text before it is
            processed.
        output_dir: Directory the CSV files are written to.

    Returns:
        The combined DataFrame of all examples for the split.
    """
    frames = []
    for key, question in question_definitions.items():
        if verbose:
            print(key, question)

        # The question type and label column depend only on the key, so they
        # are resolved once per question rather than once per row.
        is_numeric = key.endswith('_num')
        qtype = 'numeric' if is_numeric else 'yes_no'
        label_col = key[:-len('_num')] if is_numeric else key

        examples = [
            {
                'id': i,
                'label': row[label_col],
                'question_label': key,
                'prompt': create_prompt(row['TEXT'], question, qtype=qtype),
            }
            for i, row in tqdm(df.iterrows(), total=df.shape[0])
        ]

        save_df = pd.DataFrame(examples)
        save_df.to_csv(os.path.join(output_dir, f"{split_name}_{key}.csv"), index=False)
        frames.append(save_df)

    # Concatenate once at the end; fall back to an empty frame with the
    # expected columns when there are no questions at all.
    if frames:
        combined = pd.concat(frames, axis=0)
    else:
        combined = pd.DataFrame(columns=['id', 'label', 'question_label', 'prompt'])
    combined.to_csv(os.path.join(output_dir, f"{split_name}.csv"), index=False)
    return combined


# Both splits are written below '../eval-data/i2b2/'. The per-question test
# files previously went to './eval-data/i2b2/', unlike every other output.
full_df = _export_split(train_df, 'train', verbose=True)
print(full_df.shape)

full_df = _export_split(test_df, 'test')
print(full_df.shape)
    
    