In [None]:
# train on all - binary, numeric, and na-numeric
# train on all - binary only
# train on all - binary and numeric
# train on all - binary, numeric, and na-numeric, na-bool

In [None]:
# Alternative input location (commented out):
#   './outputs/100000/Meta-Llama-3.1-70B-Instruct/'
# Directory scanned for the per-question-type CSV files.
folder_path = "../synthetic_output/100000"

import os
import pandas as pd

# Function to load CSV files from a folder and add a 'question_type' column
def load_csv_with_question_type(folder_path):
    """Load every ``.csv`` file in a folder into one DataFrame tagged by question type.

    The question type is read from the file name: it is the text between the
    first and second underscore (e.g. ``synth_numeric_0.csv`` -> ``numeric``).

    Args:
        folder_path: Directory to scan. Only its direct entries are considered.

    Returns:
        A DataFrame holding the rows of all CSV files, with an added
        ``question_type`` column and a fresh 0..n-1 index. An empty DataFrame
        is returned when the folder contains no CSV files.

    Raises:
        IndexError: If a CSV file name contains no underscore.
    """
    frames = []

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is reproducible between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        frame = pd.read_csv(os.path.join(folder_path, file_name))

        # Parse the bare file name rather than splitting the joined path on
        # '/', which gives the wrong piece when the path separator is not '/'.
        frame['question_type'] = file_name.split('_')[1]
        frames.append(frame)

    # pd.concat refuses an empty list, so keep the "no files" result explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the accumulated frame
    # on every iteration.
    return pd.concat(frames, ignore_index=True)


In [None]:
# Read every CSV under folder_path into one frame tagged with its question type.
df = load_csv_with_question_type(folder_path)

# Sanity check: (rows, columns) of the 'yes' and 'numeric' subsets, then of
# the whole frame.
print(
    df.loc[df['question_type'].eq('yes')].shape,
    df.loc[df['question_type'].eq('numeric')].shape,
)
print('cur shape', df.shape)
df

In [None]:
import pandas as pd
import re
import json

def extract_attributes(df, column_name):
    """Parse LLM-generated question records out of a text column.

    For each row, the text in ``column_name`` is first treated as JSON: code
    fence markers (```json / ```) are stripped, newlines are collapsed to
    spaces, and the span from the first ``[`` to the last ``]`` is decoded.
    If that produces nothing, a regex looks for objects that contain the seven
    quoted string fields question, type, answer, section, source, explanation
    and difficulty, in that order.

    Args:
        df: DataFrame with the columns ``column_name``, ``subject_id``,
            ``hadm_id`` and ``question_type``.
        column_name: Name of the column holding the raw LLM response text.

    Returns:
        A ``(result_df, failure_list)`` tuple. ``result_df`` has one row per
        extracted record with the columns subject_id, hadm_id, question_type,
        llm_response, question, type, answer, section, source, explanation and
        difficulty (fields missing from a record are NaN). ``failure_list``
        holds one message per row or item that could not be used, each
        prefixed with the row's index label.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    result_columns = ['subject_id', 'hadm_id', 'question_type', 'llm_response', 'question', 'type', 'answer', 'section', 'source', 'explanation', 'difficulty']

    # Compiled once per call rather than once per row.
    newline_run = re.compile(r'\n+')
    record_pattern = re.compile(
        r'{"question":\s*"(?P<question>.*?)",\s*"type":\s*"(?P<type>.*?)",\s*"answer":\s*"(?P<answer>.*?)",\s*"section":\s*"(?P<section>.*?)",\s*"source":\s*"(?P<source>.*?)",\s*"explanation":\s*"(?P<explanation>.*?)",\s*"difficulty":\s*"(?P<difficulty>.*?)"}',
        re.DOTALL
    )

    def clean_text(text):
        """Return the ``[...]`` span of ``text`` after light cleanup, or None."""
        # Drop the code fence markers, then flatten newlines so that raw line
        # breaks do not get in the way of JSON decoding.
        text = text.replace('```json', '').replace('```', '')
        text = newline_run.sub(' ', text)

        json_start = text.find('[')
        json_end = text.rfind(']')
        if json_start == -1 or json_end == -1 or json_end < json_start:
            return None

        return text[json_start:json_end + 1]

    def extract_from_json_like(text):
        """Decode the bracketed span of ``text`` as JSON; None when not possible."""
        cleaned_text = clean_text(text)
        if not cleaned_text:
            return None
        try:
            return json.loads(cleaned_text)
        except (json.JSONDecodeError, TypeError):
            return None

    def extract_from_regex(text):
        """Fallback: collect every object matching the fixed seven-field layout."""
        return [match.groupdict() for match in record_pattern.finditer(text)]

    results = []
    failure_list = []

    # Walk the needed columns in lockstep; this avoids building a Series for
    # every row the way DataFrame.iterrows does.
    rows = zip(
        df.index,
        df[column_name],
        df['subject_id'],
        df['hadm_id'],
        df['question_type'],
    )
    for idx, text, subject_id, hadm_id, question_type in rows:
        # Missing values (NaN / None) are not strings, so one check covers both
        # "blank" and "wrong type".
        if not isinstance(text, str):
            failure_list.append(f"{idx} Blank - {text}")
            continue

        # An empty JSON list counts as "nothing found" and triggers the fallback.
        extracted_data = extract_from_json_like(text) or extract_from_regex(text)
        if not extracted_data:
            failure_list.append(f"{idx} Extraction Failed - {text}")
            continue

        for item in extracted_data:
            # A decoded JSON list may hold scalars or nested lists; only
            # objects can carry the question fields.
            if not isinstance(item, dict):
                failure_list.append(
                    f"{idx} Data Update Failed - expected a JSON object, got {type(item).__name__} - {text}"
                )
                continue

            # Row-level identifiers take precedence over same-named keys that
            # the LLM may have emitted.
            results.append({
                **item,
                'subject_id': subject_id,
                'hadm_id': hadm_id,
                'question_type': question_type,
                'llm_response': text,
            })

    result_df = pd.DataFrame(results, columns=result_columns)
    return result_df, failure_list


In [None]:
# Run the extractor once per question type. Each call returns the parsed
# question table for that subset of `df` plus its list of failure messages.
na_num_result_df, na_num_failures = extract_attributes(
    df.loc[df['question_type'].eq('na-numeric')], column_name='output'
)
yes_result_df, yes_failures = extract_attributes(
    df.loc[df['question_type'].eq('yes')], column_name='output'
)
numeric_result_df, numeric_failures = extract_attributes(
    df.loc[df['question_type'].eq('numeric')], column_name='output'
)
na_bool_result_df, na_bool_failures = extract_attributes(
    df.loc[df['question_type'].eq('na-bool')], column_name='output'
)

In [None]:
# Number of failure messages per question type, then the (rows, columns) of
# each parsed table, in the order: na-numeric, yes, numeric, na-bool.
print(*map(len, (na_num_failures, yes_failures, numeric_failures, na_bool_failures)))
print(*(table.shape for table in (na_num_result_df, yes_result_df, numeric_result_df, na_bool_result_df)))

# Output noted from a previous run:
# 26 97 323 18
# (106245, 10) (212133, 10) (209705, 10) (106288, 10)

In [None]:
# Convert 'difficulty' to a numeric dtype in each parsed table; values that
# cannot be interpreted as numbers become NaN (errors='coerce').
for result_table in (na_num_result_df, na_bool_result_df, numeric_result_df, yes_result_df):
    result_table['difficulty'] = pd.to_numeric(result_table['difficulty'], errors='coerce')

In [None]:
# Difficulty distribution for each question-type table, each preceded by a label.
for label, result_table in (
    ('na-num', na_num_result_df),
    ('na-bool', na_bool_result_df),
    ('numeric', numeric_result_df),
    ('bool', yes_result_df),
):
    print(label)
    display(result_table['difficulty'].value_counts())

In [None]:
# Row counts per question_type value in each parsed table.
for result_table in (na_num_result_df, na_bool_result_df, numeric_result_df, yes_result_df):
    display(result_table['question_type'].value_counts())


In [None]:
# Stack the four per-type question tables into a single frame.
full_df = pd.concat([numeric_result_df, yes_result_df, na_num_result_df, na_bool_result_df])
# sort_values returns a new frame rather than sorting in place, so the result
# has to be assigned back for the ordering to take effect.
full_df = full_df.sort_values(['subject_id', 'hadm_id'])

print(full_df.shape)

# Attach the notes table to each question by (subject_id, hadm_id). The raw
# llm_response column is dropped by the column selection before merging.
notes_df = pd.read_csv('../data/mimic/notes_100000.csv', compression='gzip')
full_df = full_df[['subject_id', 'hadm_id', 'question_type', 'question',
                   'type', 'answer', 'section', 'source', 'explanation', 'difficulty']].merge(notes_df, how='left', on=['subject_id', 'hadm_id'])

# A left merge keeps every question row. A row count larger than the one
# printed above would mean notes_df has repeated (subject_id, hadm_id) keys.
print(full_df.shape)


In [None]:
# Difficulty distribution over the combined, merged question table.
full_df.loc[:, 'difficulty'].value_counts()

In [None]:
# Cumulative difficulty tiers over all question types: the file for threshold
# i holds every row with difficulty >= i, written from threshold 9 down to 1.
# Rows with a NaN difficulty never satisfy the comparison and are left out.
# The files are gzip-compressed even though their names end in .csv.
for i in reversed(range(1, 10)):
    print(i)
    save_df = full_df[full_df['difficulty'].ge(i)]
    print(save_df.shape)
    save_df.to_csv(f"../data/synthetic-raw/synth_all_{i}_df.csv", index=False, compression='gzip')

# Same tiers, restricted to the 'yes' question type.
for i in reversed(range(1, 10)):
    print(i)
    save_df = full_df[full_df['difficulty'].ge(i) & full_df['question_type'].isin(['yes'])]
    print(save_df.shape)
    save_df.to_csv(f"../data/synthetic-raw/synth_bool_{i}_df.csv", index=False, compression='gzip')



In [None]:
# Cumulative difficulty tiers restricted to the 'yes' and 'numeric' question
# types: the file for threshold i holds every such row with difficulty >= i.
# The files are gzip-compressed even though their names end in .csv.
for i in reversed(range(1, 10)):
    print(i)
    save_df = full_df[full_df['difficulty'].ge(i) & full_df['question_type'].isin(['yes', 'numeric'])]
    print(save_df.shape)
    save_df.to_csv(f"../data/synthetic-raw/synth_bool_num_{i}_df.csv", index=False, compression='gzip')


In [None]:
# Write the complete merged table, without the index column. No compression
# argument is passed here, unlike the tiered exports.
full_df.to_csv(path_or_buf='../data/synthetic-raw/synth_all_df.csv', index=False)