In [None]:
import os
import pandas as pd
import re

# Names of the model runs to evaluate; each one is looked up as a sub-folder
# of ../outputs/i2b2 further down in this cell.
model_id_list = [
    'Llama-3.2-1B-Instruct',
    'Llama-3.2-1B_all_1_8_16',
    'Llama-3.2-3B-Instruct',
    'Llama-3.2-3B_all_1_8_16',
]

def load_csv_with_question_type(folder_path):
    """Load every ``.csv`` file in ``folder_path`` into one DataFrame.

    Each file's rows get a ``label_type`` column holding that file's full
    name (extension included); callers split it on ``_`` themselves.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with the rows of all CSV files stacked under a fresh
        0..n-1 index, or an empty DataFrame when the folder has no CSV files.

    Raises:
        OSError: if ``folder_path`` cannot be listed or a file cannot be read.
    """
    frames = []

    # os.listdir returns names in arbitrary order; sort so the row order of
    # the result is reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(folder_path, file_name))
        df['label_type'] = file_name
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

# Load the outputs of every model in `model_id_list` and stack them into one frame.
full_df = None
for model_id in model_id_list:
    # Keep only the last path component in case the id is given as "org/name".
    model_id = model_id.split('/')[-1]
    print(model_id)

    folder_path = f"../outputs/i2b2/{model_id}"
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue

    print(folder_path)
    df = load_csv_with_question_type(folder_path)
    df['model'] = model_id

    if full_df is None:
        full_df = df
    else:
        # ignore_index=True: every per-model frame is indexed 0..n-1, so keeping
        # those labels would give the combined frame duplicate index values and
        # make label-based and position-based row lookups disagree later on.
        full_df = pd.concat([full_df, df], axis=0, ignore_index=True)

    print(full_df.shape)

if full_df is None:
    # Fail with a clear message instead of an AttributeError on `None.shape`.
    raise FileNotFoundError(
        f"None of the model folders exist under ../outputs/i2b2: {model_id_list}"
    )

# full_df = full_df[~full_df['question_label'].isna()]
df = full_df
print(df.shape)


In [23]:
import json
def extract_answer_from_json(text, start_delimiter='```json', end_delimiter='```'):
    """Pull the value of the ``"answer"`` key out of a model output string.

    The whole ``text`` is searched with a regular expression; it does not have
    to be valid JSON. A double-quoted value is returned without its quotes, a
    bare non-negative number (``12`` or ``1.5``) is returned as its string form.
    A quoted value is matched on a single line only and ends at the first
    closing quote.

    Args:
        text: Raw model output. Anything that is not a ``str`` (e.g. ``None``
            or the NaN pandas uses for an empty CSV cell) yields ``None``.
        start_delimiter: Unused; kept so existing callers keep working.
        end_delimiter: Unused; kept so existing callers keep working.

    Returns:
        The answer as a ``str``, or ``None`` when no answer is found.
    """
    # Missing outputs arrive as float NaN / None; re.search would raise a
    # TypeError on them, so treat them as "no answer".
    if not isinstance(text, str):
        return None

    match = re.search(r'"answer"\s*:\s*(".*?"|\d+(\.\d+)?)', text)
    if match is None:
        return None

    value = match.group(1)
    # Remove the surrounding quotes if the value is a string
    if value.startswith('"') and value.endswith('"'):
        value = value[1:-1]
    return value
    
def extract_info_from_json(df, json_column):
    """Parse the model output stored in ``df[json_column]`` into structured fields.

    For each row the text is stripped of ``***``, triple-backtick fences and
    the substring ``json``, narrowed to the first single-line ``{...}`` span
    when there is one, and parsed with ``json.loads``. When parsing fails,
    every field is recovered individually with a regular expression.

    Args:
        df: DataFrame that has ``json_column`` plus the columns 'model',
            'label', 'question_label' and 'label_type'.
        json_column: Name of the column holding the raw model output.

    Returns:
        ``(extracted_df, failure_list)``: ``extracted_df`` has one row per row
        of ``df`` (same order) with the columns question, type, answer,
        section, source, explanation, model, label, question_label,
        label_type (text before the first ``_``) and format (text between the
        first and second ``_``, or ``None`` if there is no ``_``).
        ``failure_list`` holds the index labels of ``df`` for which no answer
        could be extracted.
    """
    fields = ("question", "type", "answer", "section", "source", "explanation")

    regex_patterns = {key: rf'"{key}"\s*:\s*"([^"]+)"' for key in fields}
    # The alternation is grouped so the numeric form is only accepted right
    # after the "answer" key; ungrouped, `|:\s*([\d\.]+)` matches the first
    # `: <number>` anywhere in the text.
    regex_patterns["answer"] = r'"answer"\s*:\s*(?:"([^"]+)"|([\d\.]+))'

    extracted_data = []
    failure_list = []

    for idx, row in df.iterrows():
        extracted_row = dict.fromkeys(fields)

        # Empty CSV cells come back as float NaN; treat them as empty text so
        # the row is reported as a failure instead of crashing on .strip().
        raw_output = row[json_column]
        json_string = raw_output if isinstance(raw_output, str) else ''

        # Handle leading/trailing unwanted characters.
        # NOTE(review): .replace('json', '') also removes "json" inside field values.
        json_string = json_string.strip().replace('***', '').replace('```', '').replace('json', '')

        # Handle cases where the string contains multiple JSON objects
        json_objects = re.findall(r'\{.*?\}', json_string)
        if json_objects:
            json_string = json_objects[0]

        try:
            data = json.loads(json_string)
        except json.JSONDecodeError:
            # Fall back to regex if JSON parsing fails
            for key, pattern in regex_patterns.items():
                match = re.search(pattern, json_string)
                if match:
                    extracted_row[key] = next((g for g in match.groups() if g), None)
        else:
            if isinstance(data, list):
                data = data[0] if data else None  # Take the first object from a list
            # Valid JSON that is not an object (number, string, null, ...) has
            # no fields to read; leave every field as None.
            if isinstance(data, dict):
                for key in fields:
                    extracted_row[key] = data.get(key)

        extracted_row['model'] = row['model']
        extracted_row['label'] = row['label']
        extracted_row['question_label'] = row['question_label']
        label_parts = row['label_type'].split('_')
        extracted_row['label_type'] = label_parts[0]
        extracted_row['format'] = label_parts[1] if len(label_parts) > 1 else None

        # Keep the row even without an answer, but remember it as a failure.
        if extracted_row['answer'] is None:
            failure_list.append(idx)
        extracted_data.append(extracted_row)

    extracted_df = pd.DataFrame(extracted_data)
    return extracted_df, failure_list


In [None]:
# Parse the 'output' column into structured fields; the second return value
# lists the rows for which no answer could be extracted.
info_df, info_failure_list = extract_info_from_json(df, 'output')
print(info_df.shape, len(info_failure_list))

# Show the parsed rows whose 'label' value is missing.
info_df.loc[info_df['label'].isna()]

In [None]:
# `info_failure_list` holds index *labels* (collected via iterrows), so it must
# not be passed to positional `.iloc`. `info_df` has one row per row of `df`,
# in the same order, so a positional boolean mask picks the rows without an
# extracted answer whatever `df`'s index looks like.
failed_mask = info_df['answer'].isna().to_numpy()
failed_outputs = df.loc[failed_mask, 'output']

# First raw output that could not be parsed (None when every row parsed).
failed_outputs.iloc[0] if len(failed_outputs) else None

In [28]:
def process_row(row):
    """Return ``row`` with an 'answer' entry extracted from its 'output' text."""
    parsed_answer = extract_answer_from_json(row['output'])
    row['answer'] = parsed_answer
    return row


# Run the extraction row by row to add the 'answer' column.
df = df.apply(process_row, axis=1)


In [None]:
# For both extraction approaches, report how many rows have no answer and
# how those rows are distributed over the models.
for frame in (df, info_df):
    unanswered = frame[frame['answer'].isna()]
    print(unanswered.shape)
    print(unanswered['model'].value_counts())

In [None]:
# Mean of 'label' within each question_label group.
df['label'].groupby(df['question_label']).mean()

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score

# Default only the extracted answer to 'No'. A frame-wide `df.fillna('No')`
# would also write the string 'No' into every other column that has gaps
# (e.g. 'label' or 'question_label') and corrupt the inputs to the metrics.
df['answer'] = df['answer'].fillna('No')

# Convert the 'answer' column to binary format: 1 only for a case-insensitive "yes".
df['answer_binary'] = df['answer'].apply(
    lambda x: 1 if isinstance(x, str) and x.lower() == 'yes' else 0
)

# Cut-offs that turn a numeric answer into a binary prediction.
# @TODO this would be better if including Sex (e.g., should be 1.1 for women, 1.3 for men)
CREATININE_UPPER_LIMIT = 1.3
HBA1C_LOWER_LIMIT = 6.5
HBA1C_UPPER_LIMIT = 9.5


def _numeric_answers(answers):
    """Parse answers as numbers; anything that is not a number becomes 0."""
    return pd.to_numeric(answers, errors='coerce').fillna(0)


# Group by question, source file ('label_type') and model, and calculate metrics
metrics = []

for (question_label, label_type, model), group in df.groupby(['question_label', 'label_type', 'model']):
    y_true = group['label']

    if question_label == 'CREATININE_num':
        y_pred = (_numeric_answers(group['answer']) > CREATININE_UPPER_LIMIT).astype(int)
    elif question_label == 'HBA1C_num':
        values = _numeric_answers(group['answer'])
        y_pred = ((values >= HBA1C_LOWER_LIMIT) & (values <= HBA1C_UPPER_LIMIT)).astype(int)
        # y_pred = ((values >= HBA1C_LOWER_LIMIT)).astype(int)
    else:
        y_pred = group['answer_binary']

    # AUC is undefined when only one class is present in the labels.
    auc = roc_auc_score(y_true, y_pred) if len(y_true.unique()) > 1 else None
    accuracy = accuracy_score(y_true, y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    micro_f1 = f1_score(y_true, y_pred, average='micro')

    # label_type is the source file name: "<train_test>_<format>_...".
    label_parts = label_type.split('_')

    metrics.append({
        'train_test': label_parts[0],
        'format': label_parts[1] if len(label_parts) > 1 else None,
        'question_label': question_label,
        'model': model,
        'auc': auc,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    })

# Creating a DataFrame for metrics
metrics_df = pd.DataFrame(metrics)
# Temporarily raise the row limit so the whole table is shown.
pd.set_option('display.max_rows', 400)
display(metrics_df.sort_values(by='question_label'))
pd.set_option('display.max_rows', 20)

In [None]:
# Mean of each metric within every (model, train_test) group.
summary_columns = ['auc', 'accuracy', 'balanced_accuracy', 'macro_f1', 'micro_f1']
metrics_df.groupby(['model', 'train_test'])[summary_columns].mean()

In [None]:
# Mean of each metric within every (model, train_test, question_label) group.
per_question_columns = ['accuracy', 'balanced_accuracy', 'macro_f1', 'micro_f1']
metrics_df.groupby(['model', 'train_test', 'question_label'])[per_question_columns].mean()  # .to_csv('../results/eval/performance.csv')
