In [None]:
import json
import os

import pandas as pd
from tqdm.auto import tqdm


# Run configuration: these two values select the raw file to read and are
# reused later when naming the output files.
input_file = "bool_num"
min_difficulty = 1
file_name = f"synth_{input_file}_{min_difficulty}_df"

# The raw export is read as gzip even though its name ends in .csv, so the
# codec is passed explicitly instead of being inferred from the extension.
raw_csv_path = f"../data/synthetic-raw/{file_name}.csv"
df = pd.read_csv(raw_csv_path, compression='gzip')

# Quick look at what was loaded: rows per question type, then overall shape.
display(df['question_type'].value_counts())
print(df.shape)

# balance df 
# df = df.iloc[:26415]


In [None]:
# Upper bound on the number of rows kept for each question type.
min_count = 50000

# Alternative kept for reference: a random sample per type instead of the hardest rows.
# new_df = df.groupby('question_type').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# For every question type keep the rows with the largest 'difficulty'
# (at most min_count of them), then flatten back to a 0..n-1 index.
# A type with fewer than min_count rows is kept whole, so the counts shown
# below are only equal when every type has at least min_count rows.
# NOTE(review): recent pandas releases warn when groupby.apply operates on the
# grouping column - confirm this still behaves as intended on the pinned version.
new_df = (
    df.groupby('question_type')
    .apply(lambda group: group.nlargest(min_count, 'difficulty'))
    .reset_index(drop=True)
)
display(new_df['question_type'].value_counts())

In [None]:
# System-level instruction text describing the assistant's role: answer
# questions about a clinical note with a single JSON object, treat shifted
# dates as expected, and fall back to "Not Available" when the note lacks the
# answer. The f-string has no placeholders, so the text is constant.
# It is not referenced by any other cell shown in this notebook.
system_instruction = f"""You are a clinical research assistant helping to accurately answer questions from clinical notes. You answer with a single valid JSON object based on the patient note. 
All dates have been shifted to keep data de-identified, so they may be in the future. We care only about information captured in the note, for example when asking what is the highest lab a patient has, we mean the highest lab recorded in the note. 
If you cannot find information to answer the question asked of you in the note answer Not Available, unless the rest of the prompt recommends something different. 
"""

def create_prompt(note, question, qtype):
    """Build the prompt asking a model to answer one question about a note.

    Args:
        note: Text of the patient note, placed under the "***PATIENT NOTE:" header.
        question: The question, placed under the "*** QUESTION:" header.
        qtype: Question type. 'yes' and 'na-bool' select the Yes/No wording
            and example; 'numeric' and 'na-numeric' select the numeric ones.

    Returns:
        str: The full prompt: note, question, the list of required JSON keys
        and one worked example response for the selected question family.

    Raises:
        NotImplementedError: If ``qtype`` is not one of the four supported values.
    """
    if qtype in ['yes', 'na-bool']:
        type_str = "str - 'Yes/No'"
        answer_str = "str - 'Yes' or 'No' if the question can be answered based on the note, if the question cannot be answered based on the content of the note answer 'NA'"
        json_example = """```json
{
    "question" : "Does the note state that the patient is breathing normally on room air?",
    "type": "Yes/No",
    "answer": "No",
    "section": "History of Present Illness",    
    "source": "She currently is dependent on oxygen and wears 1.5-2 liters around the clock",
    "explanation": "The note states that she relies on oxygen and provides the amount as 1.5-2 liters so she is not breathing room air. We can assume since she is receiving o2 supplmentation and dependent on it, she cannot breathe normally on room air."
}```"""
        # "difficulty": "2",
    elif qtype in ['numeric', 'na-numeric']:
        type_str = "str - 'Numeric'"
        answer_str = "float - a single number (e.g., 92.5) or NA if the answer is not in the note"
        # The line break inside "source" is written as the two characters \n
        # (hence the doubled backslash). A real newline inside a JSON string is
        # rejected by JSON parsers, which would make the example contradict the
        # prompt's own "directly read by the Python JSON Parser" requirement.
        json_example = """```json
{
    "question": "What was the patient's highest creatinine measurement recorded in the note?",
    "type": "Numeric",
    "answer": "1.4",
    "section": "Pertinent Results",    
    "source": "12/03/2023: CREAT: 1.4 \\n 12/07/2023: CREAT: 1.1",
    "explanation": "The highest CREAT measurement was 1.4 because 12/03/2023 is before 12/07/2023."
}```"""
        # "difficulty": "4",
    else:
        # Name the offending type in the error itself instead of printing it separately.
        raise NotImplementedError(f"Not Implemented: unsupported question type {qtype!r}")

    # The leading four spaces on each line below are part of the prompt text.
    prompt = f"""***PATIENT NOTE:
    {note}

    Answer the following 
    *** QUESTION:
    {question}

    *** Format your response as a JSON object with the following keys: 
    * question: str - the question you were asked to answer
    * type: {type_str}
    * answer: {answer_str}
    * section: str - the specific section of the note which contains the answer to the question (Example Answers: 'History of Present Illness', 'Past Medical History', 'Social History', 'Family History', 'Physical Exam', 'Pertinent Results', 'Brief Hospital Course', 'Discharge Medications', 'Discharge Disposition', 'Discharge Condition', 'Discharge Instructions')
    * source: str - exact quote of content in the note that allowed you to answer the question, this should be a quote directly taken from the "***PATIENT NOTE". Copy and pasting this string should exactly match content in the Note.
    * explanation: str - explanation of why the answer is correct and how the source in the note helped to answer the question
    
    An example of how your JSON response should be formatted is shown below (this is only an example of one question, others should be in a list, your answer should be based on the ***Patient Note provided above):
    ***EXAMPLE RESPONSE:
    {json_example}

    Provide a response that can be directly read by the Python JSON Parser library based on the ***PATIENT NOTE at the beginning of this message.
    """
    # * difficulty: int - a score from 1-10 indicating how difficult this question is to answer based on the note
    return prompt

In [None]:
import math

# string compatible isnan
def isNaN(num):
    """Return True when ``num`` is a NaN float and False for ordinary values.

    Uses the fact that NaN compares unequal to itself, so strings and other
    non-numeric values can be passed safely.
    """
    return num != num


def format_output(r):
    """Render one row as the fenced JSON answer string.

    Args:
        r: Mapping (for example a DataFrame row) providing 'question',
            'question_type', 'answer', 'section', 'source' and 'explanation'.

    Returns:
        str: A ```json fenced block holding one JSON object with the keys
        question, type, answer, section, source and explanation. Every value
        is emitted as a JSON string; a NaN answer is written as "NA".
    """
    answer = 'NA' if isNaN(r['answer']) else r['answer']

    def as_json_string(value):
        # str() reproduces how the value would render inside an f-string.
        # json.dumps then adds the surrounding quotes and escapes embedded
        # quotes, backslashes and newlines, which would otherwise make the
        # block unparseable. ensure_ascii=False keeps non-ASCII text unchanged.
        return json.dumps(str(value), ensure_ascii=False)

    # NOTE(review): "type" carries the raw question_type (e.g. 'yes', 'numeric'),
    # whereas the example responses in create_prompt use 'Yes/No' / 'Numeric' -
    # confirm which form consumers of this output expect.
    output_json = f"""```json
{{
    "question" : {as_json_string(r['question'])},
    "type": {as_json_string(r['question_type'])},
    "answer": {as_json_string(answer)},
    "section": {as_json_string(r['section'])},
    "source": {as_json_string(r['source'])},
    "explanation": {as_json_string(r['explanation'])}
}}```"""
    return output_json


# One record per selected row: the row's index label as id, the target
# answer block, and the prompt built from the note text and the question.
examples = []
for i, row in tqdm(new_df.iterrows(), total=len(new_df)):
    examples.append({
        'id': i,
        'output': format_output(row),
        'prompt': create_prompt(row['text'], row['question'], qtype=row['question_type']),
    })


    
    

In [None]:
from sklearn.model_selection import train_test_split

# Prompt/output records as a frame, one row per entry of `examples`.
save_df = pd.DataFrame(examples)
print(new_df.shape, save_df.shape)

# Shuffle and split new_df 90/10; the fixed seed makes the partition repeatable.
new_train_df, new_test_df = train_test_split(
    new_df,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

# Select the rows of save_df by the index labels of each partition so that
# both frames end up with the same split.
train_labels, test_labels = new_train_df.index, new_test_df.index
save_train_df = save_df.loc[train_labels]
save_test_df = save_df.loc[test_labels]

# Sanity check: shapes of the raw partitions, then of the prompt partitions.
for train_part, test_part in ((new_train_df, new_test_df), (save_train_df, save_test_df)):
    print(train_part.shape, test_part.shape)


In [None]:
# now save
from datasets import Dataset

folder_path = "../data/synthetic-raw"

# Output locations for the prompt/output datasets; the names encode the input
# file, min_difficulty and min_count used above.
train_file = f"{folder_path}/train_{input_file}_{min_difficulty}-{min_count}-hardest.arrow"
test_file = f"{folder_path}/test_{input_file}_{min_difficulty}-{min_count}-hardest.arrow"

# Prompt/output pairs: convert each partition to a Dataset and write it out.
save_train_ds = Dataset.from_pandas(save_train_df)
save_train_ds.save_to_disk(train_file)

save_test_ds = Dataset.from_pandas(save_test_df)
save_test_ds.save_to_disk(test_file)

# Source rows behind each partition, written gzip-compressed without the index column.
csv_options = dict(index=False, compression='gzip')
new_train_df.to_csv(f"{folder_path}/train_{input_file}_{min_difficulty}-{min_count}-hardest.csv", **csv_options)
new_test_df.to_csv(f"{folder_path}/test_{input_file}_{min_difficulty}-{min_count}-hardest.csv", **csv_options)


