In [7]:
import os
import pandas as pd
import re

# Load every CSV file in a folder into one DataFrame, tagging rows with a 'label_type' column
def load_csv_with_question_type(folder_path):
    """Read all ``*.csv`` files in ``folder_path`` and stack them into one DataFrame.

    Each loaded frame gets a ``label_type`` column holding the name of the file
    it came from (the full file name, extension included).

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only entries whose name ends with ``.csv`` are read.

    Returns
    -------
    pandas.DataFrame
        All rows from all CSV files with a fresh 0..n-1 index, or an empty
        DataFrame when the folder contains no CSV file.
    """
    frames = []

    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the result is the same on every run and every machine.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(folder_path, file_name))
        df['label_type'] = file_name
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the accumulated frame
    # on every iteration.
    return pd.concat(frames, ignore_index=True)

# Evaluation configuration: which dataset, splits, models and sampling
# settings to collect results for.
dataset = 'annotated-mimic'
# train_test = ['test', 'train']
train_test = ['prompts']

model_id_list = [
    'Llama-3.2-1B-Instruct',
    'Llama-3.2-1B_all_1_8_16',
    'Llama-3.2-3B-Instruct',
    'Llama-3.2-3B_all_1_8_16',
]

# model_id_list = ['Meta-Llama-3.1-8B-Instruct',
#                  'Meta-Llama-3.1-70B-Instruct', 'Meta-Llama-3.1-8B_all_1',
#                  'Meta-Llama-3.1-8B_all_1-25000-hardest',
#                  'Meta-Llama-3.1-8B_all_1-25000-nosupport',
#                  'Meta-Llama-3.1-8B_bool_num_1-10000-hardest'] #, "meta-llama/Meta-Llama-3.1-70B-Instruct"]

# temperature_list = [0, 0.3, 0.5, 0.7, 0.8, 1]
temperature_list = [0]

# One dict per (split, model, temperature, top_p) combination.
# top_p is swept over [0.5, 0.7, 0.9] only when temperature == 1; every other
# temperature is paired with top_p = 1.
param_folder_list = [
    {
        'train_test': tt,
        'model_id': model_id,
        'temperature': temp,
        'top_p': top_p,
    }
    for tt in train_test
    for model_id in model_id_list
    for temp in temperature_list
    for top_p in ([0.5, 0.7, 0.9] if temp == 1 else [1])
]

# model_id = 'Meta-Llama-3.1-8B_all_1-25000-hardest'
# temperature_list = [0, 0.2, 0.5, 0.7, 0.8]
# top_p_list = [0.5, 0.8, 0.9, 0.95]

# param_folder_list = []

# for temp in temperature_list:
#     for top_p in top_p_list:
#         param_folder_list.append({'temperature': temp,
#                                     'top_p': top_p
#                                     }
#                                 )
        



In [None]:
import os

# Gather the outputs of every parameter combination into a single DataFrame.
loaded_frames = []
for params in param_folder_list:
    folder_path = f"./outputs/{dataset}/params/{params['model_id']}/{params['temperature']}_{params['top_p']}"
    print(folder_path)

    # Skip parameter combinations that have no output folder
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue

    df = load_csv_with_question_type(folder_path)
    # Tag every row with the settings that produced it
    df['model'] = params['model_id']
    df['temperature'] = params['temperature']
    df['top_p'] = params['top_p']
    df['train_test'] = params['train_test']

    loaded_frames.append(df)
    print(df.shape)

if loaded_frames:
    # Concatenate once, and renumber the rows: without ignore_index each
    # folder's rows restart at 0, so index labels repeat and can no longer
    # identify a single row (extract_info_from_json reports failures by index).
    full_df = pd.concat(loaded_frames, axis=0, ignore_index=True)
    print(full_df.shape)
else:
    # Same sentinel as before when nothing could be loaded
    full_df = None
    print("No output folders were found; full_df is None")


In [None]:
import json
import re

def extract_answer_from_json(text, start_delimiter='```json', end_delimiter='```'):
    """Pull the value of the ``"answer"`` key out of a fenced JSON snippet.

    The snippet is the text between the first ``start_delimiter`` and the next
    ``end_delimiter``. The value is located with a regular expression rather
    than a JSON parser, so it is always returned as text.

    Parameters
    ----------
    text : object
        Raw model output. Anything that is not a ``str`` yields ``None``.
    start_delimiter, end_delimiter : str
        Markers that open and close the snippet.

    Returns
    -------
    str or None
        The answer as a string: quoted values lose their surrounding quotes
        (escape sequences inside are left as written), numbers (including
        negative ones) and the literals ``true``/``false`` are returned as
        written. ``None`` when a delimiter or the answer is missing.
    """
    if not isinstance(text, str):
        return None

    try:
        start_index = text.index(start_delimiter) + len(start_delimiter)
        end_index = text.index(end_delimiter, start_index)
    except ValueError:
        # str.index raises ValueError when a delimiter is absent
        return None

    json_string = text[start_index:end_index].strip()

    # Alternatives, in order:
    #   a quoted string that may contain escaped characters such as \" ,
    #   an optionally negative integer/decimal,
    #   a JSON boolean literal.
    match = re.search(
        r'"answer"\s*:\s*("(?:[^"\\]|\\.)*"|-?\d+(?:\.\d+)?|(?:true|false)\b)',
        json_string,
    )
    if match is None:
        return None

    value = match.group(1)
    # Remove the surrounding quotes if the value is a string
    if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
        value = value[1:-1]
    return value


def _first_json_object(text):
    """Return the first JSON object (dict) embedded in ``text``, or None if it cannot be decoded."""
    start = text.find('{')
    if start == -1:
        return None
    try:
        # raw_decode parses one complete value starting at `start` and ignores
        # whatever follows, so nested objects, multi-line objects and trailing
        # text are all handled.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def extract_info_from_json(df, json_column):
    """Extract the structured fields from the JSON text stored in ``df[json_column]``.

    For every row the first JSON object found in the text is decoded and the
    fields ``question``, ``type``, ``answer``, ``section``, ``source`` and
    ``explanation`` are read from it. If no object can be decoded, each field
    is searched for individually with a regular expression. The columns
    ``model``, ``label``, ``question_type``, ``label_type``, ``temperature``
    and ``top_p`` are copied from the input row unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``json_column`` and the six metadata columns listed above.
    json_column : str
        Name of the column holding the raw model output.

    Returns
    -------
    (pandas.DataFrame, list)
        One output row per input row (rows are kept even when nothing could be
        extracted), and the list of input index labels whose ``answer`` is
        ``None``.
    """
    json_fields = ("question", "type", "answer", "section", "source", "explanation")
    metadata_columns = ('model', 'label', 'question_type', 'label_type', 'temperature', 'top_p')

    regex_patterns = {
        "question": r'"question"\s*:\s*"([^"]+)"',
        "type": r'"type"\s*:\s*"([^"]+)"',
        # The alternation is grouped so that the numeric form is only accepted
        # directly after the "answer" key, not after any colon in the text.
        "answer": r'"answer"\s*:\s*(?:"([^"]+)"|(-?\d+(?:\.\d+)?))',
        "section": r'"section"\s*:\s*"([^"]+)"',
        "source": r'"source"\s*:\s*"([^"]+)"',
        "explanation": r'"explanation"\s*:\s*"([^"]+)"'
    }

    extracted_data = []
    failure_list = []

    for idx, row in df.iterrows():
        raw_output = row[json_column]
        extracted_row = dict.fromkeys(json_fields)

        # Non-string cells (e.g. missing values) cannot be parsed; they fall
        # through with every field left as None and are reported as failures.
        if isinstance(raw_output, str):
            # Drop emphasis markers and code fences. Only a "json" tag attached
            # to a fence is removed, so the word inside field values survives.
            cleaned = re.sub(r'```(?:json)?', '', raw_output.replace('***', '')).strip()

            data = _first_json_object(cleaned)
            if data is not None:
                for key in json_fields:
                    extracted_row[key] = data.get(key)
            else:
                # Fall back to regex if JSON parsing fails
                for key, pattern in regex_patterns.items():
                    match = re.search(pattern, cleaned)
                    if match:
                        extracted_row[key] = next(g for g in match.groups() if g is not None)

        for column in metadata_columns:
            extracted_row[column] = row[column]

        # Keep the row even if we don't have the 'answer'
        if extracted_row['answer'] is None:
            failure_list.append(idx)
        extracted_data.append(extracted_row)

    extracted_df = pd.DataFrame(extracted_data)
    return extracted_df, failure_list

In [5]:
# info_df['label'].value_counts()

In [None]:
# Parse the raw model output stored in the 'output' column of full_df.
# info_df holds one row per input row; info_failure_list holds the index
# labels of the rows for which no answer could be extracted.
info_df, info_failure_list  = extract_info_from_json(full_df, 'output')
# Report the shape of the parsed frame and how many rows have no answer
print(info_df.shape, len(info_failure_list))
# print('na label')
# display(info_df[info_df['label'].isna()])
display(info_df)

In [None]:
# Number of parsed rows per question type
info_df['question_type'].value_counts()

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.utils import resample

def preprocess_values(y_true, y_pred):
    """Normalise labels and predictions so that equivalent values compare equal.

    Each value is normalised independently:

    * strings are stripped and lower-cased; if the result reads as a finite
      number it is converted to ``float`` (so ``'188'``, ``188`` and ``188.0``
      all become ``188.0``);
    * Python and numpy integers/floats are converted to ``float``;
    * anything else is passed through unchanged.

    Parameters
    ----------
    y_true, y_pred : iterable
        Ground-truth values and predicted values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Two 1-D object arrays. ``dtype=object`` keeps floats and strings side
        by side; a default ``np.array`` call would turn a mixed list into
        strings and break numeric comparisons.
    """
    def normalize(val):
        if isinstance(val, str):
            text = val.strip().lower()
            try:
                number = float(text)
            except ValueError:
                return text
            # 'nan' / 'inf' parse as floats but are not usable answers
            # (nan never equals itself), so keep them as text.
            return number if np.isfinite(number) else text
        if isinstance(val, (int, float, np.integer, np.floating)):
            # If numeric, convert to float for comparison (handles 188 == 188.0)
            return float(val)
        return val

    y_true_normalized = [normalize(val) for val in y_true]
    y_pred_normalized = [normalize(val) for val in y_pred]

    return np.array(y_true_normalized, dtype=object), np.array(y_pred_normalized, dtype=object)

def bootstrap_metric(y_true, y_pred, metric_func, n=1000, random_state=None):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Parameters
    ----------
    y_true, y_pred : array-like
        Paired ground-truth and predicted values; they are resampled together.
    metric_func : callable
        Called as ``metric_func(y_true_resampled, y_pred_resampled)``; must
        return a number.
    n : int, default 1000
        Number of bootstrap resamples.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the resampling. ``None`` keeps the unseeded
        behaviour; pass an int to make the result reproducible.

    Returns
    -------
    (mean, lower_bound, upper_bound)
        Mean of the bootstrapped scores and their 2.5th / 97.5th percentiles.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # A single generator is shared by all iterations so that every resample
    # differs while the whole sequence is determined by the seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Resample with replacement n times and score each resample
    bootstrapped_scores = [
        metric_func(*resample(y_true, y_pred, random_state=rng))
        for _ in range(n)
    ]

    # Compute the 95% confidence interval (2.5th and 97.5th percentiles)
    lower_bound = np.percentile(bootstrapped_scores, 2.5)
    upper_bound = np.percentile(bootstrapped_scores, 97.5)
    return np.mean(bootstrapped_scores), lower_bound, upper_bound

# List to store metrics for each model and data type
metrics = []

# Fill missing values.
# The result is assigned back to the column: calling fillna(inplace=True) on
# info_df['col'] acts on an intermediate Series, which pandas does not
# guarantee to write through to info_df (and does not under Copy-on-Write).
info_df['label'] = info_df['label'].fillna('NA')
info_df['answer'] = info_df['answer'].fillna('NA')

def convert_to_binary(y_true, y_pred):
    """Return an array with 1 where the paired elements are equal and 0 elsewhere.

    The two inputs are walked in parallel with ``zip``, so the result has the
    length of the shorter input.
    """
    flags = []
    for expected, predicted in zip(y_true, y_pred):
        flags.append(1 if expected == predicted else 0)
    return np.array(flags)

# Metrics to bootstrap, in the order they are evaluated:
# (key for the mean, key for the confidence interval, scoring function)
scorers = (
    ('accuracy_mean', 'accuracy_ci', accuracy_score),
    ('balanced_acc_mean', 'balanced_acc_ci', balanced_accuracy_score),
    ('macro_f1_mean', 'macro_f1_ci', lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro')),
    ('micro_f1_mean', 'micro_f1_ci', lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro')),
)

# One metrics record per (question_type, model, temperature, top_p) group
for (question_type, model, temperature, top_p), group in info_df.groupby(['question_type', 'model', 'temperature', 'top_p']):
    # Normalise labels and answers before comparing them
    y_true, y_pred = preprocess_values(group['label'], group['answer'])

    # y_true_binary compares the labels with themselves, so it is all ones;
    # y_pred_binary is 1 where the prediction equals the label, else 0.
    y_true_binary = convert_to_binary(y_true, y_true)
    y_pred_binary = convert_to_binary(y_true, y_pred)

    record = {
        'model': model,
        'temperature': temperature,
        'top_p': top_p,
        'question_type': question_type,
    }
    for mean_key, ci_key, scorer in scorers:
        score_mean, score_lower, score_upper = bootstrap_metric(y_true_binary, y_pred_binary, scorer)
        record[mean_key] = score_mean
        record[ci_key] = (score_lower, score_upper)

    metrics.append(record)

# Convert the results into a DataFrame for display
metrics_info_df = pd.DataFrame(metrics)
# pd.set_option('display.max_rows', 400)
# display(metrics_info_df.sort_values(by='model'))
# pd.set_option('display.max_rows', 20)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.utils import resample
from tqdm import tqdm  # For progress bars

def preprocess_values(y_true, y_pred):
    """Normalise labels and predictions so that equivalent values compare equal.

    Each value is normalised independently:

    * strings are stripped and lower-cased; if the result reads as a finite
      number it is converted to ``float`` (so ``'188'``, ``188`` and ``188.0``
      all become ``188.0``);
    * Python and numpy integers/floats are converted to ``float``;
    * anything else is passed through unchanged.

    Parameters
    ----------
    y_true, y_pred : iterable
        Ground-truth values and predicted values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Two 1-D object arrays. ``dtype=object`` keeps floats and strings side
        by side; a default ``np.array`` call would turn a mixed list into
        strings and break numeric comparisons.
    """
    def normalize(val):
        if isinstance(val, str):
            text = val.strip().lower()
            try:
                number = float(text)
            except ValueError:
                return text
            # 'nan' / 'inf' parse as floats but are not usable answers
            # (nan never equals itself), so keep them as text.
            return number if np.isfinite(number) else text
        if isinstance(val, (int, float, np.integer, np.floating)):
            # If numeric, convert to float for comparison (handles 188 == 188.0)
            return float(val)
        return val

    y_true_normalized = [normalize(val) for val in y_true]
    y_pred_normalized = [normalize(val) for val in y_pred]

    return np.array(y_true_normalized, dtype=object), np.array(y_pred_normalized, dtype=object)

def bootstrap_metric(y_true, y_pred, metric_func, n=1000, random_state=None):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Parameters
    ----------
    y_true, y_pred : array-like
        Paired ground-truth and predicted values; they are resampled together.
    metric_func : callable
        Called as ``metric_func(y_true_resampled, y_pred_resampled)``; must
        return a number.
    n : int, default 1000
        Number of bootstrap resamples.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the resampling. ``None`` keeps the unseeded
        behaviour; pass an int to make the result reproducible.

    Returns
    -------
    (mean, lower_bound, upper_bound)
        Mean of the bootstrapped scores and their 2.5th / 97.5th percentiles.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # A single generator is shared by all iterations so that every resample
    # differs while the whole sequence is determined by the seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Resample with replacement n times and score each resample
    bootstrapped_scores = [
        metric_func(*resample(y_true, y_pred, random_state=rng))
        for _ in range(n)
    ]

    # Compute the 95% confidence interval (2.5th and 97.5th percentiles)
    lower_bound = np.percentile(bootstrapped_scores, 2.5)
    upper_bound = np.percentile(bootstrapped_scores, 97.5)
    return np.mean(bootstrapped_scores), lower_bound, upper_bound

# List to store metrics for each model and data type
metrics = []

# Fill missing values.
# The result is assigned back to the column: calling fillna(inplace=True) on
# info_df['col'] acts on an intermediate Series, which pandas does not
# guarantee to write through to info_df (and does not under Copy-on-Write).
info_df['label'] = info_df['label'].fillna('NA')
info_df['answer'] = info_df['answer'].fillna('NA')

def convert_to_binary(y_true, y_pred):
    """Return an array with 1 where the paired elements are equal and 0 elsewhere.

    The two inputs are walked in parallel with ``zip``, so the result has the
    length of the shorter input.
    """
    flags = []
    for expected, predicted in zip(y_true, y_pred):
        flags.append(1 if expected == predicted else 0)
    return np.array(flags)

# Metrics to bootstrap, in the order they are evaluated:
# (key for the mean, key for the confidence interval, scoring function)
scorers = (
    ('accuracy_mean', 'accuracy_ci', accuracy_score),
    ('balanced_acc_mean', 'balanced_acc_ci', balanced_accuracy_score),
    ('macro_f1_mean', 'macro_f1_ci', lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro')),
    ('micro_f1_mean', 'micro_f1_ci', lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro')),
)

# One group per (question_type, model, temperature, top_p) combination
grouped = info_df.groupby(['question_type', 'model', 'temperature', 'top_p'])

# tqdm wraps the group iterator to show progress
for (question_type, model, temperature, top_p), group in tqdm(grouped, desc="Processing groups"):
    # Normalise labels and answers before comparing them
    y_true, y_pred = preprocess_values(group['label'], group['answer'])

    # y_true_binary compares the labels with themselves, so it is all ones;
    # y_pred_binary is 1 where the prediction equals the label, else 0.
    y_true_binary = convert_to_binary(y_true, y_true)
    y_pred_binary = convert_to_binary(y_true, y_pred)

    record = {
        'model': model,
        'temperature': temperature,
        'top_p': top_p,
        'question_type': question_type,
    }
    for mean_key, ci_key, scorer in scorers:
        score_mean, score_lower, score_upper = bootstrap_metric(y_true_binary, y_pred_binary, scorer)
        record[mean_key] = score_mean
        record[ci_key] = (score_lower, score_upper)

    metrics.append(record)

# Convert the results into a DataFrame for display
metrics_info_df = pd.DataFrame(metrics)
# pd.set_option('display.max_rows', 400)
# display(metrics_info_df.sort_values(by='model'))
# pd.set_option('display.max_rows', 20)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.utils import resample

# Assuming metrics_info_df already has 'accuracy_mean' and 'accuracy_ci'

# Attach the number of samples behind every metrics row.
# The counts are grouped by the same four keys that produced metrics_info_df
# and joined on those keys. Grouping by ('model', 'question_type') and
# assigning the result positionally would pair counts with the wrong rows,
# because metrics_info_df is ordered by question_type first.
group_keys = ['question_type', 'model', 'temperature', 'top_p']
sample_counts = info_df.groupby(group_keys).size().reset_index(name='num_samples')
metrics_info_df = (
    metrics_info_df
    .drop(columns='num_samples', errors='ignore')  # keeps the cell re-runnable
    .merge(sample_counts, on=group_keys, how='left')
)

# Define a helper function to format the mean and 95% confidence interval as percentages
def format_mean_ci_percentage(row):
    """Format a row's accuracy as ``"mean% (lower%, upper%)"`` with one decimal.

    ``row['accuracy_mean']`` and the two bounds in ``row['accuracy_ci']`` are
    fractions; each is multiplied by 100 before formatting.
    """
    ci_low, ci_high = row['accuracy_ci']
    mean_pct = row['accuracy_mean'] * 100
    low_pct = ci_low * 100
    high_pct = ci_high * 100
    return f"{mean_pct:.1f}% ({low_pct:.1f}%, {high_pct:.1f}%)"

# Add a text column holding "mean% (lower%, upper%)" for every metrics row
metrics_info_df['accuracy_mean_ci'] = metrics_info_df.apply(format_mean_ci_percentage, axis=1)

# Lay the formatted values out as models (rows) x question types (columns).
# The cells are already-formatted strings, so 'first' just picks the existing
# value; if several rows share a (model, question_type) pair only the first
# one is shown.
pivot_table = pd.pivot_table(
    metrics_info_df,
    values='accuracy_mean_ci',
    index='model',
    columns='question_type',
    aggfunc='first',
)

# Perform bootstrapping to calculate the weighted average with CI
def bootstrap_weighted_avg(model_df, n=1000, random_state=None):
    """Bootstrap the sample-weighted average accuracy and its 95% interval.

    The rows of ``model_df`` are resampled with replacement; for each resample
    the average of ``accuracy_mean`` weighted by ``num_samples`` is computed.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Must contain the columns ``accuracy_mean`` and ``num_samples``.
    n : int, default 1000
        Number of bootstrap resamples.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the resampling. ``None`` keeps the unseeded
        behaviour; pass an int to make the result reproducible.

    Returns
    -------
    (mean_avg, lower_ci, upper_ci)
        Mean of the bootstrapped weighted averages and their 2.5th / 97.5th
        percentiles, all expressed as percentages.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # A single generator is shared by all iterations so that every resample
    # differs while the whole sequence is determined by the seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    bootstrapped_means = []
    for _ in range(n):
        # Resample rows with replacement
        resampled_df = resample(model_df, random_state=rng)
        # Weighted average of the resampled rows
        weighted_avg = np.average(resampled_df['accuracy_mean'], weights=resampled_df['num_samples'])
        bootstrapped_means.append(weighted_avg)

    # Mean and 95% confidence interval, converted to percentages
    mean_avg = np.mean(bootstrapped_means) * 100
    lower_ci = np.percentile(bootstrapped_means, 2.5) * 100
    upper_ci = np.percentile(bootstrapped_means, 97.5) * 100
    return mean_avg, lower_ci, upper_ci

# Bootstrap the sample-weighted average accuracy of every model
weighted_avgs_with_ci = []
for model, group in metrics_info_df.groupby('model'):
    mean_avg, lower_ci, upper_ci = bootstrap_weighted_avg(group)
    weighted_avgs_with_ci.append(
        dict(model=model, weighted_avg_mean=mean_avg, weighted_avg_ci=(lower_ci, upper_ci))
    )

# One row per model
weighted_avg_df = pd.DataFrame(weighted_avgs_with_ci)

# Text form "mean% (lower%, upper%)"; the values are already percentages
weighted_avg_df['weighted_avg_mean_ci'] = weighted_avg_df.apply(
    lambda row: f"{row['weighted_avg_mean']:.1f}% ({row['weighted_avg_ci'][0]:.1f}%, {row['weighted_avg_ci'][1]:.1f}%)", axis=1
)

# Add the per-model average as an extra pivot column; values are aligned on
# the model name, which is the pivot table's index
average_by_model = weighted_avg_df.set_index('model')['weighted_avg_mean_ci']
pivot_table['Average'] = average_by_model

# Convert the pivot table to a CSV format (with ',' separator)
# csv_output = pivot_table.to_csv()

# Print the CSV output so you can directly copy it and paste into Excel
# print(csv_output)


In [None]:
# pivot_table

In [None]:
# pd.set_option('display.max_rows', 400)
# display(metrics_info_df[['question_type', 'model', 'temperature', 'top_p', 'accuracy']].sort_values(by='question_label'))
# pd.set_option('display.max_rows', 20)

In [None]:
# pd.set_option('display.max_rows', 400)
# # display(metrics_info_df.groupby(['model', 'temperature', 'top_p', 'train_test'])[['auc', 'accuracy', 'balanced_accuracy', 'macro_f1', 'micro_f1']].mean())
# display(metrics_info_df.groupby(['model', 'temperature', 'top_p'])[['macro_f1', 'micro_f1']].mean())
# pd.set_option('display.max_rows', 50)