In [None]:
import os
import pandas as pd
import re

# Function to load CSV files from a folder and add a 'question_type' column
def load_csv_with_question_type(folder_path):
    """Load every ``*.csv`` file in ``folder_path`` into one DataFrame.

    Each file's rows get a ``label_type`` column holding the file name the row
    came from. Note that the *full* file name (including the ``.csv`` suffix)
    is stored, not a prefix of it.

    Parameters
    ----------
    folder_path : str
        Directory to scan (non-recursive).

    Returns
    -------
    pandas.DataFrame
        Rows of all CSV files stacked with a fresh ``0..n-1`` index, or an
        empty DataFrame when the folder contains no CSV files.
    """
    frames = []

    # Sort so the row order does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(folder_path, file_name))
        df['label_type'] = file_name
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame for every file.
    return pd.concat(frames, ignore_index=True)

# Name of the dataset under evaluation.
dataset = 'apixaban'

# Data splits to evaluate (alternative setting: ['test', 'train']).
train_test = ['prompts']

# Models whose outputs are evaluated.
model_id_list = [
    'Llama-3.2-1B-Instruct',
    'Llama-3.2-1B_all_1_8_16',
    'Llama-3.2-3B-Instruct',
    'Llama-3.2-3B_all_1_8_16',
]

# Sampling temperatures (full sweep: [0, 0.3, 0.5, 0.7, 0.8, 1]).
temperature_list = [0]

# One dict per (split, model, temperature, top_p) combination.
# top_p is swept over three values only at temperature 1; otherwise it is fixed at 1.
param_folder_list = [
    {
        'train_test': tt,
        'model_id': model_id,
        'temperature': temp,
        'top_p': top_p,
    }
    for tt in train_test
    for model_id in model_id_list
    for temp in temperature_list
    for top_p in ([0.5, 0.7, 0.9] if temp == 1 else [1])
]





In [None]:
import os

# Load the result folder of every parameter combination and stack the rows.
# Frames are collected in a list and concatenated once at the end: this avoids
# re-copying the growing frame on each iteration and gives the combined frame a
# unique 0..n-1 index instead of repeating each folder's own row numbers.
loaded_frames = []
rows_loaded = 0
for params in param_folder_list:
    # folder_path = f"./outputs/apixaban-manuscript-run/params/{params['model_id']}/{params['temperature']}_{params['top_p']}"
    folder_path = f"./outputs/apixaban/params/{params['model_id']}/{params['temperature']}_{params['top_p']}"
    print(folder_path)

    # Skip parameter combinations that have no output folder.
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue

    df = load_csv_with_question_type(folder_path)
    # Tag every row with the parameters that produced it.
    df['model'] = params['model_id']
    df['temperature'] = params['temperature']
    df['top_p'] = params['top_p']
    df['train_test'] = params['train_test']

    loaded_frames.append(df)
    rows_loaded += len(df)
    print(f"rows loaded so far: {rows_loaded}")

# full_df stays None when no folder was found, as before.
full_df = pd.concat(loaded_frames, axis=0, ignore_index=True) if loaded_frames else None
if full_df is not None:
    print(full_df.shape)


In [None]:
# Fail early with a readable message: the loading loop leaves full_df as None
# when none of the expected folders exist, and everything below needs a frame.
if full_df is None:
    raise RuntimeError("full_df is None: no output folder was found, nothing to evaluate")

print(full_df.shape)

# After this rename 'label_name' holds the CSV's original 'label' column and
# 'label' holds the CSV's original 'answer' column. The guard keeps the cell
# safe to re-run: renaming twice would produce two 'label_name' columns.
if 'label_name' not in full_df.columns:
    full_df = full_df.rename(columns={'label': 'label_name', 'answer': 'label'})

# Drop rows that have no label name.
full_df = full_df[~full_df['label_name'].isna()]
print(full_df.shape)

In [None]:


# Disabled alternative that copied the columns instead of renaming them:
# full_df['label_name'] = full_df['label']
# full_df['label'] = full_df['answer']

# Show the combined frame as the cell output.
full_df

In [None]:
import json
import re

def extract_answer_from_json(text, start_delimiter='```json', end_delimiter='```'):
    """Pull the ``"answer"`` value out of a fenced JSON snippet inside ``text``.

    The snippet is the text between the first ``start_delimiter`` and the next
    ``end_delimiter``. The value is located with a regular expression rather
    than a JSON parser, so the snippet does not have to be valid JSON.

    Parameters
    ----------
    text : object
        Raw model output; anything that is not a ``str`` yields ``None``.
    start_delimiter, end_delimiter : str
        Markers surrounding the JSON snippet.

    Returns
    -------
    str or None
        The answer as a string (surrounding quotes removed for string values,
        digits kept verbatim for numeric values, including a leading minus
        sign), or ``None`` when a delimiter or the answer field is missing.
    """
    if not isinstance(text, str):
        return None

    try:
        start_index = text.index(start_delimiter) + len(start_delimiter)
        end_index = text.index(end_delimiter, start_index)
    except ValueError:
        # str.index raises ValueError when a delimiter is not present.
        return None

    json_string = text[start_index:end_index].strip()

    # Quoted string, or an (optionally negative) integer/decimal number.
    match = re.search(r'"answer"\s*:\s*(".*?"|-?\d+(\.\d+)?)', json_string)
    if not match:
        return None

    value = match.group(1)
    if value.startswith('"') and value.endswith('"'):
        value = value[1:-1]
    return value


def extract_info_from_json(df, json_column):
    """Parse the model output in ``df[json_column]`` into one row of fields per input row.

    For every row the first ``{...}`` object found in the text is parsed with
    ``json.loads``; when that fails (or does not yield an object) the fields
    are recovered individually with regular expressions. The row's ``model``,
    ``label``, ``question_label``, ``label_type``, ``temperature`` and
    ``top_p`` values are copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``json_column`` and the six metadata columns listed above.
    json_column : str
        Name of the column holding the raw model output.

    Returns
    -------
    (pandas.DataFrame, list)
        The extracted rows (one per input row, including rows where nothing
        could be extracted), and the list of input index labels for which no
        ``answer`` was found.
    """
    json_fields = ("question", "type", "answer", "section", "source", "explanation")
    metadata_columns = ("model", "label", "question_label", "label_type", "temperature", "top_p")

    regex_patterns = {
        "question": r'"question"\s*:\s*"([^"]+)"',
        "type": r'"type"\s*:\s*"([^"]+)"',
        # The alternation is grouped so that the numeric form is only accepted
        # right after the "answer" key, not after any colon in the text.
        "answer": r'"answer"\s*:\s*(?:"([^"]+)"|([\d\.]+))',
        "section": r'"section"\s*:\s*"([^"]+)"',
        "source": r'"source"\s*:\s*"([^"]+)"',
        "explanation": r'"explanation"\s*:\s*"([^"]+)"'
    }

    extracted_data = []
    failure_list = []

    for idx, row in df.iterrows():
        raw_output = row[json_column]
        extracted_row = dict.fromkeys(json_fields)

        # Non-string outputs (e.g. NaN for a missing value) cannot be parsed;
        # they fall through with every field left as None.
        if isinstance(raw_output, str):
            # Remove emphasis markers and code-fence markers only; the word
            # "json" inside a field value must be left alone.
            json_string = re.sub(r'```(?:json)?', '', raw_output.strip().replace('***', ''))

            # When the text holds several JSON objects, keep the first one.
            json_objects = re.findall(r'\{.*?\}', json_string)
            if json_objects:
                json_string = json_objects[0]

            try:
                data = json.loads(json_string)
            except json.JSONDecodeError:
                data = None

            if isinstance(data, list) and data:
                data = data[0]  # Take the first object from a list

            if isinstance(data, dict):
                for key in json_fields:
                    extracted_row[key] = data.get(key)
            else:
                # Fall back to regex when the text is not a parseable JSON object.
                for key, pattern in regex_patterns.items():
                    match = re.search(pattern, json_string)
                    if match:
                        captured = [g for g in match.groups() if g]
                        extracted_row[key] = captured[0] if captured else None

        for column in metadata_columns:
            extracted_row[column] = row[column]

        # The row is kept even without an answer; it is only flagged as a failure.
        if extracted_row['answer'] is None:
            failure_list.append(idx)
        extracted_data.append(extracted_row)

    extracted_df = pd.DataFrame(extracted_data)
    return extracted_df, failure_list

In [None]:
# info_df['label'].value_counts()

In [None]:
# Parse the raw model output stored in the 'output' column of full_df.
# info_failure_list holds the full_df index labels of rows with no extracted answer.
info_df, info_failure_list  = extract_info_from_json(full_df, 'output')
# Frame shape followed by the number of rows without an answer.
print(info_df.shape, len(info_failure_list))
# print('na label')
# display(info_df[info_df['label'].isna()])
display(info_df)

In [None]:
# Show extracted answer next to the label for the 'PLT' question of one model.
# NOTE(review): this model name is not in model_id_list defined at the top of the
# notebook, so the selection is presumably empty for the current run -- confirm.
pd.set_option('display.max_rows', 100)
info_df[((info_df['question_label'] == 'PLT') &
         (info_df['model'] == 'Meta-Llama-3.1-8B_all_1-25000-hardest'))][['answer', 'label']]

In [None]:
# Show extracted answer next to the label for the 'hemorrhagic' question of one model.
# NOTE(review): this model name is not in model_id_list defined at the top of the
# notebook, so the selection is presumably empty for the current run -- confirm.
pd.set_option('display.max_rows', 100)
info_df[((info_df['question_label'] == 'hemorrhagic') &
         (info_df['model'] == 'Meta-Llama-3.1-8B_all_1'))][['answer', 'label']]

In [None]:
# Fill missing answers/labels with a type-specific default: 'No' for boolean
# questions, 'NA' for numeric ones.
# The result is assigned back through .loc: calling fillna(inplace=True) on the
# object returned by info_df.loc[...] only changes a temporary copy and leaves
# info_df untouched.
bool_rows = info_df['type'].isin(['yes', 'na-bool'])
numeric_rows = info_df['type'].isin(['numeric', 'na-numeric'])

for column in ['answer', 'label']:
    # Object dtype so the string defaults can be stored whatever the column held before.
    info_df[column] = info_df[column].astype(object)
    info_df.loc[bool_rows, column] = info_df.loc[bool_rows, column].fillna('No')
    info_df.loc[numeric_rows, column] = info_df.loc[numeric_rows, column].fillna('NA')

In [None]:
# Disabled: per-question distribution of the label values.
# pd.set_option('display.max_rows', 400)
# display(info_df.groupby('question_label')[['label']].value_counts())
# pd.set_option('display.max_rows', 40)

# Show the extracted frame as the cell output.
info_df

In [None]:
# pd.set_option('display.max_rows', 800)
# display(info_df.groupby('question_label')[['answer']].value_counts())
# pd.set_option('display.max_rows', 40)

# Yes/No questions, keyed by question label, with the question text as value.
# In the cells visible in this notebook only the keys are read: a question_label
# found here is scored as a boolean question.
# NOTE(review): several texts contain spelling slips ('thrompocytopenia', 'platelent',
# 'cabable'); they are left untouched in case they must match the prompts that were
# actually sent to the models -- confirm before correcting.
bool_question_definitions = {
    'afib': 'Does the note describe the patient as having atrial fibrillation (afib)? Answer "No" if the note describes the patient as having afib secondary to another reversible cause?', 
    'mdd': 'Does the note describe the patient as ever being diagnosed with depression or major depressive disorder (MDD)? Answer "No" unless the note describes a diagnosis or history of depression.',
    'schizophrenia': 'Does the note describe the patient as ever being diagnosed with schizophrenia or any schizoaffective disorders? Answer "No" unless the note describes a diagnosis or history of a schizoaffective disorder.',
    'bipolar': 'Does the note describe the patient as ever being diagnosed with bipolar disorder?  Answer "No" unless the note describes a diagnosis or history of bipolar disorder', 
    'hemorrhagic': 'Does the note describe the patient as ever having any hemorrhagic tendencies or blood dyscrasias (i.e., any disorder of the blood, bone marrow, clotting proteins, or lymph tissue)? Examples could include anemia (hemoglobin deficiency), leukopenia (low WBC count), thrompocytopenia (low platelent count),  any forms of leukemia, or clotting disorders.  Answer "No" unless the note describes a diagnosis or history that could be considered a blood or clotting disorder.',
    'recent_stroke': 'Does the note describe the patient as having a stroke during this admission or within the last month? (Answer "Yes" for a stroke within the last 30 days or if it was recent but the date is unclear, answer "No" if no stroke is mentioned or a prior stroke occurred but was not recent)',
    'peptic_ulcer_disease': 'Does the note describe the patient as ever having peptic ulcer disease?',
    'bleeding': 'Does the note describe the patient as having serious bleeding (e.g., hemorrhage) in the past 6 months? Answer "No" unless the note describes a serious recent bleeding issue.',
    'afib_ablation': 'Does the note describe the patient as having a planned or past ablation procedure for afib? Answer "No" unless the note includes information about a past or planned ablation for afib.',
    'surgical_valvular_disease': 'Does the note describe the patient as ever having valvular disease (stenosis) requiring surgery? Answer "No" if there is mention of stenosis without surgery.',
    'heart_failure': 'Does the note describe the patient as having heart failure?',
    't2d': 'Does the note describe the patient as ever having type 2 diabetes (T2D)? Answer "No" if the note does not include a diagnosis or history of Type 2 diabetes, T2D, Type II diabetes etc..',
    't2d-1': 'Does the note describe the patient as having diabetes mellitus (DM1, DM2, T2D, T1DM, T2DM)?',
    'arterial_hypertension': 'Does the note describe the patient as having arterial hypertension (high bp e.g. >140, or HTN)? This includes pre-existing hypertension and treated hypertension.',
    'prior_stroke': 'Does the note describe the patient as ever having a stroke or transient ischemic attack (TIA)? Answer "No" unless the note includes information about the patient having a prior stroke or TIA',
    'med_decisions': 'Does the note describe the patient as being unable to make medical decisions upon discharge? Answer "No" unless there is evidence the patient cannot make their own medical decisions because they are not cabable (for example if they are not mentally competent, not awake, not conscious, have dementia, or are deceased) or there is evidence someone else (e.g., a husband, wife, family member, or attorney) is designated to make their medical decisions.'
}

# Numeric questions, keyed by question label, with the question text as value.
# In the cells visible in this notebook only the keys are read: a question_label
# found here is scored by exact match between label and answer.
# NOTE(review): the 'lvef' text contains '55%%'; unless the string is later passed
# through %-formatting this reads as two percent signs -- confirm.
num_question_definitions = {
    'PLT': 'What is the lowest platelet count (PLT) recorded for the patient in the note? Answer "NA" if there is no platelet count (PLT) that can be found in the note.', 
    'BILI': 'What is the higest total bilirubin (TotBili, Bili) mentioned in the note? Answer "NA" unless there is a numeric bilirubin (or total bilirubin) count in the note.', 
    'AST': 'What is the higest aspartate aminotransferase level (AST) mentioned in the note? Answer "NA" if no AST value is available in the note.',
    'CREAT': 'What is the higest serum creatinine (Creat) mentioned in the note? Answer "NA" if no creatinine value is available in the note.',
    'HGB': 'What is the lowest hemoglobin (HGB) mentioned in the note? Answer "NA" if no HGB value is available in the note.',
    'chads2': 'What is the highest CHADS2 score mentioned? Answer "NA" if no CHADS2 score is in the note.',
    'lvef': 'What is the lowest left ventricular ejection (LVEF, ef, ejection fraction) fraction mentioned in the note? Answer "NA" if no LVEF is in the note, Answer 55 if the lowest value is 55%% or greater.',
    'blood_glucose': 'What is the highest blood glucose lab mentioned? Answer "NA" if no blood glucose score is in the note.',
}

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score
from sklearn.utils import resample

def preprocess_values(y_true, y_pred):
    """Normalize true and predicted values so they can be compared with ``==``.

    Strings are stripped and lower-cased; strings that spell a finite number
    and ints/floats are converted to ``float`` so that ``188``, ``188.0`` and
    ``'188'`` all compare equal. Other values are passed through unchanged.

    Parameters
    ----------
    y_true, y_pred : iterable
        Ground-truth and predicted values (any mix of strings and numbers).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Object-dtype arrays with the normalized values. Object dtype matters:
        a plain ``np.array`` of mixed floats and strings would turn every
        element into a string and make ``188.0`` differ from ``'188'``.
    """
    def normalize(val):
        if isinstance(val, str):
            text = val.strip().lower()
            try:
                number = float(text)
            except ValueError:
                return text
            # Keep 'nan'/'inf' as text: a float NaN would never equal itself.
            return number if np.isfinite(number) else text
        elif isinstance(val, (int, float)):
            return float(val)
        else:
            return val

    y_true_normalized = np.array([normalize(val) for val in y_true], dtype=object)
    y_pred_normalized = np.array([normalize(val) for val in y_pred], dtype=object)

    return y_true_normalized, y_pred_normalized

# Collects one dict of scores per evaluated group.
metrics = []

# Replace missing labels/answers with the placeholder 'NA'.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is chained assignment and is not guaranteed to update info_df itself.
info_df['label'] = info_df['label'].fillna('NA')
info_df['answer'] = info_df['answer'].fillna('NA')

def convert_to_binary(y_true, y_pred):
    """Return an array with 1 where the paired elements are equal and 0 elsewhere.

    Elements are paired with ``zip``, so the result is as long as the shorter input.
    """
    match_flags = []
    for truth, guess in zip(y_true, y_pred):
        if truth == guess:
            match_flags.append(1)
        else:
            match_flags.append(0)
    return np.array(match_flags)

def _is_yes(value):
    """Return 1 when ``value`` is the string 'yes' (any case, spaces ignored), else 0."""
    return 1 if isinstance(value, str) and value.strip().lower() == 'yes' else 0


# Score every (question, answer type, model, temperature, top_p) group.
# The second key is named question_type so the builtin `type` is not overwritten.
# NOTE(review): groupby drops rows whose key is missing by default, so rows for which
# no 'type' was extracted are not scored -- confirm that this is intended.
for (question_label, question_type, model, temperature, top_p), group in info_df.groupby(
        ['question_label', 'type', 'model', 'temperature', 'top_p']):
    if question_label in bool_question_definitions:
        # 'label' is the reference and 'answer' the prediction, as in the numeric
        # branch below. The order matters: balanced accuracy is not symmetric.
        y_true = group['label'].apply(_is_yes)
        y_pred = group['answer'].apply(_is_yes)

        accuracy = accuracy_score(y_true, y_pred)
        balanced_acc = balanced_accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        micro_f1 = f1_score(y_true, y_pred, average='micro')

    elif question_label in num_question_definitions:
        y_true, y_pred = preprocess_values(group['label'], group['answer'])
        # Exact-match scoring: the reference compares y_true with itself, the
        # prediction is 1 where the answer equals the label.
        y_true_binary = convert_to_binary(y_true, y_true)
        y_pred_binary = convert_to_binary(y_true, y_pred)
        accuracy = accuracy_score(y_true_binary, y_pred_binary)
        balanced_acc = balanced_accuracy_score(y_true_binary, y_pred_binary)
        macro_f1 = f1_score(y_true_binary, y_pred_binary, average='macro')
        micro_f1 = f1_score(y_true_binary, y_pred_binary, average='micro')
    else:
        # Question labels without a definition are not scored.
        continue

    metrics.append({
        'question_label': question_label,
        'model': model,
        'temperature': temperature,
        'top_p': top_p,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    })

metrics_info_df = pd.DataFrame(metrics)

# Questions as rows, (metric, model) as columns; groups sharing a cell are averaged.
pivot_table = metrics_info_df.pivot_table(
    index='question_label',
    columns='model',
    values=['balanced_accuracy', 'micro_f1'],
    aggfunc='mean'
)
pivot_table

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score
from sklearn.utils import resample

def preprocess_values(y_true, y_pred):
    """Normalize true and predicted values so they can be compared with ``==``.

    Strings are stripped and lower-cased; strings that spell a finite number
    and ints/floats are converted to ``float`` so that ``188``, ``188.0`` and
    ``'188'`` all compare equal. Other values are passed through unchanged.

    Parameters
    ----------
    y_true, y_pred : iterable
        Ground-truth and predicted values (any mix of strings and numbers).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Object-dtype arrays with the normalized values. Object dtype matters:
        a plain ``np.array`` of mixed floats and strings would turn every
        element into a string and make ``188.0`` differ from ``'188'``.
    """
    def normalize(val):
        if isinstance(val, str):
            text = val.strip().lower()
            try:
                number = float(text)
            except ValueError:
                return text
            # Keep 'nan'/'inf' as text: a float NaN would never equal itself.
            return number if np.isfinite(number) else text
        elif isinstance(val, (int, float)):
            return float(val)
        else:
            return val

    y_true_normalized = np.array([normalize(val) for val in y_true], dtype=object)
    y_pred_normalized = np.array([normalize(val) for val in y_pred], dtype=object)

    return y_true_normalized, y_pred_normalized

# Collects one dict of scores per evaluated group.
metrics = []

# Replace missing labels/answers with the placeholder 'NA'.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is chained assignment and is not guaranteed to update info_df itself.
info_df['label'] = info_df['label'].fillna('NA')
info_df['answer'] = info_df['answer'].fillna('NA')

def convert_to_binary(y_true, y_pred):
    """Return an array with 1 where the paired elements are equal and 0 elsewhere.

    Elements are paired with ``zip``, so the result is as long as the shorter input.
    """
    match_flags = []
    for truth, guess in zip(y_true, y_pred):
        if truth == guess:
            match_flags.append(1)
        else:
            match_flags.append(0)
    return np.array(match_flags)

def _is_yes(value):
    """Return 1 when ``value`` is the string 'yes' (any case, spaces ignored), else 0."""
    return 1 if isinstance(value, str) and value.strip().lower() == 'yes' else 0


# Score every (question, answer type, model, temperature, top_p) group.
# The second key is named question_type so the builtin `type` is not overwritten.
# NOTE(review): groupby drops rows whose key is missing by default, so rows for which
# no 'type' was extracted are not scored -- confirm that this is intended.
for (question_label, question_type, model, temperature, top_p), group in info_df.groupby(
        ['question_label', 'type', 'model', 'temperature', 'top_p']):
    if question_label in bool_question_definitions:
        # 'label' is the reference and 'answer' the prediction, as in the numeric
        # branch below. The order matters: balanced accuracy is not symmetric.
        y_true = group['label'].apply(_is_yes)
        y_pred = group['answer'].apply(_is_yes)

        accuracy = accuracy_score(y_true, y_pred)
        balanced_acc = balanced_accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        micro_f1 = f1_score(y_true, y_pred, average='micro')

    elif question_label in num_question_definitions:
        y_true, y_pred = preprocess_values(group['label'], group['answer'])
        # Exact-match scoring: the reference compares y_true with itself, the
        # prediction is 1 where the answer equals the label.
        y_true_binary = convert_to_binary(y_true, y_true)
        y_pred_binary = convert_to_binary(y_true, y_pred)
        accuracy = accuracy_score(y_true_binary, y_pred_binary)
        balanced_acc = balanced_accuracy_score(y_true_binary, y_pred_binary)
        macro_f1 = f1_score(y_true_binary, y_pred_binary, average='macro')
        micro_f1 = f1_score(y_true_binary, y_pred_binary, average='micro')
    else:
        # Question labels without a definition are not scored.
        continue

    metrics.append({
        'question_label': question_label,
        'model': model,
        'temperature': temperature,
        'top_p': top_p,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    })

metrics_info_df = pd.DataFrame(metrics)

# Questions as rows, (metric, model) as columns; groups sharing a cell are averaged.
pivot_table = metrics_info_df.pivot_table(
    index='question_label',
    columns='model',
    values=['balanced_accuracy', 'micro_f1'],
    aggfunc='mean'
)

# Fixed column order for the report: per metric, 3B models first, then 1B models.
# (An earlier run used the Meta-Llama-3.1 70B/8B model names here.)
column_order = [
    ('balanced_accuracy', 'Llama-3.2-3B-Instruct'),
    ('balanced_accuracy', 'Llama-3.2-3B_all_1_8_16'),
    ('balanced_accuracy', 'Llama-3.2-1B-Instruct'),
    ('balanced_accuracy', 'Llama-3.2-1B_all_1_8_16'),
    ('micro_f1', 'Llama-3.2-3B-Instruct'),
    ('micro_f1', 'Llama-3.2-3B_all_1_8_16'),
    ('micro_f1', 'Llama-3.2-1B-Instruct'),
    ('micro_f1', 'Llama-3.2-1B_all_1_8_16')
]
# Columns listed here but absent from the pivot come out as all-NaN columns.
pivot_table = pivot_table.reindex(columns=column_order)

# Perform bootstrapping to calculate 95% CI for each model and metric
def bootstrap_model_metric(df, metric, n=1000, random_state=None):
    """Bootstrap the mean of ``df[metric]`` and return it with a 95% percentile interval.

    Each of the ``n`` rounds draws ``len(df)`` rows with replacement and records
    the mean of ``metric`` for that draw.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; must contain the ``metric`` column.
    metric : str
        Name of the column to average.
    n : int, default 1000
        Number of bootstrap rounds.
    random_state : int or None, default None
        Seed for a private random generator. With ``None`` the draws come from
        numpy's global random state, so ``np.random.seed`` also makes the
        result repeatable.

    Returns
    -------
    (float, float, float)
        Mean of the bootstrap means, 2.5th percentile and 97.5th percentile,
        each multiplied by 100 (percent).

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    values = df[metric]
    n_rows = len(values)
    if n_rows == 0:
        raise ValueError("cannot bootstrap an empty DataFrame")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    bootstrapped_means = []
    for _ in range(n):
        # Row positions sampled with replacement, same size as the input.
        picks = rng.randint(0, n_rows, size=n_rows)
        bootstrapped_means.append(values.iloc[picks].mean())

    mean_avg = np.mean(bootstrapped_means) * 100  # Convert to percentage
    lower_ci = np.percentile(bootstrapped_means, 2.5) * 100
    upper_ci = np.percentile(bootstrapped_means, 97.5) * 100
    return mean_avg, lower_ci, upper_ci

# Bootstrapped 95% CI of the per-question average, for each model and metric
# (balanced_accuracy and micro_f1).
# Seed numpy's global random state first so the reported intervals do not change
# from run to run (the bootstrap helper is called without a seed of its own).
# NOTE(review): this assumes the bootstrap draws come from numpy's global random
# state, which is how sklearn's resample behaves when no random_state is given.
np.random.seed(0)

metrics_with_ci = {}
for metric in ['balanced_accuracy', 'micro_f1']:
    for model in metrics_info_df['model'].unique():
        model_df = metrics_info_df[metrics_info_df['model'] == model]
        mean_avg, lower_ci, upper_ci = bootstrap_model_metric(model_df, metric)
        metrics_with_ci[(metric, model)] = f"{mean_avg:.1f}% ({lower_ci:.1f}%, {upper_ci:.1f}%)"

# Format every cell as a percentage. Series.map is applied column by column
# because DataFrame.applymap is deprecated in recent pandas releases.
formatted_pivot_table = pivot_table.apply(lambda column: column.map(lambda x: f"{x*100:.2f}%"))

# Append an 'Average' row holding "mean% (lower%, upper%)" for each column.
for (metric, model), ci_value in metrics_with_ci.items():
    formatted_pivot_table.loc['Average', (metric, model)] = ci_value

# CSV text that can be pasted into a spreadsheet.
csv_output = formatted_pivot_table.to_csv()

print(csv_output)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score

def preprocess_values(y_true, y_pred):
    """Normalize true and predicted values so they can be compared with ``==``.

    Strings are stripped and lower-cased; strings that spell a finite number
    and ints/floats are converted to ``float`` so that ``188``, ``188.0`` and
    ``'188'`` all compare equal. Other values are passed through unchanged.

    Parameters
    ----------
    y_true, y_pred : iterable
        Ground-truth and predicted values (any mix of strings and numbers).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Object-dtype arrays with the normalized values. Object dtype matters:
        a plain ``np.array`` of mixed floats and strings would turn every
        element into a string and make ``188.0`` differ from ``'188'``.
    """
    def normalize(val):
        if isinstance(val, str):
            text = val.strip().lower()
            try:
                number = float(text)
            except ValueError:
                return text
            # Keep 'nan'/'inf' as text: a float NaN would never equal itself.
            return number if np.isfinite(number) else text
        elif isinstance(val, (int, float)):
            # Numeric values become float so that 188 == 188.0.
            return float(val)
        else:
            return val

    y_true_normalized = np.array([normalize(val) for val in y_true], dtype=object)
    y_pred_normalized = np.array([normalize(val) for val in y_pred], dtype=object)

    return y_true_normalized, y_pred_normalized

# Collects one dict of scores per evaluated group.
metrics = []

# Replace missing labels/answers with the placeholder 'NA'.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is chained assignment and is not guaranteed to update info_df itself.
info_df['label'] = info_df['label'].fillna('NA')
info_df['answer'] = info_df['answer'].fillna('NA')

def convert_to_binary(y_true, y_pred):
    """Return an array with 1 where the paired elements are equal and 0 elsewhere.

    Elements are paired with ``zip``, so the result is as long as the shorter input.
    """
    match_flags = []
    for truth, guess in zip(y_true, y_pred):
        if truth == guess:
            match_flags.append(1)
        else:
            match_flags.append(0)
    return np.array(match_flags)

def _is_yes(value):
    """Return 1 when ``value`` is the string 'yes' (any case, spaces ignored), else 0."""
    return 1 if isinstance(value, str) and value.strip().lower() == 'yes' else 0


# Score every (question, answer type, model, temperature, top_p) group, printing
# the question label and the branch taken as progress output.
# The second key is named question_type so the builtin `type` is not overwritten.
# NOTE(review): groupby drops rows whose key is missing by default, so rows for which
# no 'type' was extracted are not scored -- confirm that this is intended.
for (question_label, question_type, model, temperature, top_p), group in info_df.groupby(
        ['question_label', 'type', 'model', 'temperature', 'top_p']):
    print(question_label)
    if question_label in bool_question_definitions:
        print('Boolean')
        # 'label' is the reference and 'answer' the prediction, as in the numeric
        # branch below. The order matters: balanced accuracy is not symmetric.
        y_true = group['label'].apply(_is_yes)
        y_pred = group['answer'].apply(_is_yes)

        accuracy = accuracy_score(y_true, y_pred)
        balanced_acc = balanced_accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        micro_f1 = f1_score(y_true, y_pred, average='micro')

    elif question_label in num_question_definitions:
        print('Numeric')
        y_true, y_pred = preprocess_values(group['label'], group['answer'])
        # Exact-match scoring: the reference compares y_true with itself, the
        # prediction is 1 where the answer equals the label.
        y_true_binary = convert_to_binary(y_true, y_true)
        y_pred_binary = convert_to_binary(y_true, y_pred)

        accuracy = accuracy_score(y_true_binary, y_pred_binary)
        balanced_acc = balanced_accuracy_score(y_true_binary, y_pred_binary)
        macro_f1 = f1_score(y_true_binary, y_pred_binary, average='macro')
        micro_f1 = f1_score(y_true_binary, y_pred_binary, average='micro')
    else:
        # Question labels without a definition are not scored.
        continue

    # Store the scores of this group.
    metrics.append({
        'question_label': question_label,
        'model': model,
        'temperature': temperature,
        'top_p': top_p,
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    })

metrics_info_df = pd.DataFrame(metrics)
# pd.set_option('display.max_rows', 400)
# display(metrics_info_df.sort_values(by='question_label'))
# pd.set_option('display.max_rows', 20)


    

In [None]:
# Mean accuracy per question for one model.
# NOTE(review): this model name is not in model_id_list defined at the top of the
# notebook, so the selection is presumably empty for the current run -- confirm.
pd.set_option('display.max_rows', 50)
display(metrics_info_df[metrics_info_df['model']=='Meta-Llama-3.1-8B_all_1-25000-hardest'].groupby('question_label')['accuracy'].mean())

In [None]:
# Number of metric rows recorded per model.
metrics_info_df['model'].value_counts()

In [None]:
# Per-question difference in mean accuracy between two model selections.
# NOTE(review): both operands select the same model ('Meta-Llama-3.1-8B-Instruct'),
# so this cannot show a difference between models; the disabled line below
# compares two different models and looks like the intended form -- confirm.
display(metrics_info_df[metrics_info_df['model']=='Meta-Llama-3.1-8B-Instruct'].groupby('question_label')['accuracy'].mean()-metrics_info_df[metrics_info_df['model']=='Meta-Llama-3.1-8B-Instruct'].groupby('question_label')['accuracy'].mean())
# display(metrics_info_df[metrics_info_df['model']=='Meta-Llama-3.1-8B_all_1'].groupby('question_label')['accuracy'].mean()-metrics_info_df[metrics_info_df['model']=='Meta-Llama-3.1-8B-Instruct'].groupby('question_label')['accuracy'].mean())

In [None]:
# Raise the row limit to 400 to list balanced accuracy per question and model,
# then set the limit to 20 (not back to whatever it was before this cell).
pd.set_option('display.max_rows', 400)
display(metrics_info_df[['question_label', 'model', 'balanced_accuracy']])
pd.set_option('display.max_rows', 20)

In [None]:
# Exclude the 'PLT' question from everything computed after this cell.
# The name metrics_info_df is rebound, so the unfiltered rows are only available
# again after re-running the cell that builds the metrics.
metrics_info_df = metrics_info_df[metrics_info_df['question_label'] != 'PLT']

In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 50)

# Questions as rows, (metric, model) as columns, built from metrics_info_df.
pivot_table = metrics_info_df.pivot_table(
    index='question_label',
    columns='model',
    values=['balanced_accuracy', 'micro_f1'],
    aggfunc='mean'
)

# Fixed column order for the report.
# NOTE(review): these model names differ from model_id_list defined at the top of
# the notebook; reindex fills columns that are not present with NaN, so for the
# current model list the table presumably comes out empty -- confirm the names.
column_order = [
    ('balanced_accuracy', 'Meta-Llama-3.1-70B-Instruct'),
    ('balanced_accuracy', 'Meta-Llama-3.1-8B-Instruct'),
    ('balanced_accuracy', 'Meta-Llama-3.1-8B_all_1'),
    ('balanced_accuracy', 'Meta-Llama-3.1-8B_all_1-25000-hardest'),
    ('micro_f1', 'Meta-Llama-3.1-70B-Instruct'),
    ('micro_f1', 'Meta-Llama-3.1-8B-Instruct'),
    ('micro_f1', 'Meta-Llama-3.1-8B_all_1'),
    ('micro_f1', 'Meta-Llama-3.1-8B_all_1-25000-hardest')
]
pivot_table = pivot_table.reindex(columns=column_order)

# Two decimals for every cell. Series.map is applied column by column because
# DataFrame.applymap is deprecated in recent pandas releases.
formatted_pivot_table = pivot_table.apply(lambda column: column.map(lambda x: f"{x:.2f}"))
# 'Average' row: column means of the unformatted values, formatted the same way.
formatted_pivot_table.loc['Average'] = pivot_table.mean().apply(lambda x: f"{x:.2f}")

# Show the formatted table as the cell output.
formatted_pivot_table
