In [5]:
import os

import numpy as np
import pandas as pd

In [None]:
# Define a function to evaluate predictions
def evaluate_predictions(labels, submission):
    """Compare predicted T/N/M values with the ground truth and compute accuracies.

    Args:
        labels: DataFrame with columns 'id', 't', 'n' and 'm' holding the
            ground-truth values.
        submission: DataFrame with columns 'id', 't', 'n' and 'm' holding the
            predicted values.

    Returns:
        A ``(metrics, merged)`` tuple. ``metrics`` maps each metric name to a
        float accuracy. ``merged`` is the per-row comparison frame: one row per
        label row, with ``*_label`` / ``*_pred`` columns, their coarse
        variants, and boolean ``*_correct_*`` columns.

    Notes:
        "Fine" accuracy requires an exact match. "Coarse" accuracy first
        collapses T sub-categories ('Tis', 'T1mi', 'T1a', 'T1b', 'T1c' -> 'T1';
        'T2a', 'T2b' -> 'T2') and M sub-categories ('M1a', 'M1b', 'M1c' ->
        'M1'). N has no coarse mapping, so its coarse accuracy equals its fine
        accuracy.

        Every label row is scored. An id that is missing from ``submission``
        gets NaN predictions, which never compare equal to a label and are
        therefore counted as incorrect.
    """
    # Helper functions to determine coarse equivalence
    def map_to_coarse_T(value):
        if value in ('Tis', 'T1mi', 'T1a', 'T1b', 'T1c'):
            return 'T1'
        if value in ('T2a', 'T2b'):
            return 'T2'
        return value

    def map_to_coarse_M(value):
        if value in ('M1a', 'M1b', 'M1c'):
            return 'M1'
        return value

    # Left-merge on 'id' so that labels without a matching prediction stay in
    # the frame. An inner merge would drop them and inflate every accuracy.
    merged = pd.merge(
        labels,
        submission,
        on='id',
        how='left',
        suffixes=('_label', '_pred'),
    )

    # Fine-grained accuracies (exact string match)
    merged['T_correct_fine'] = merged['t_label'] == merged['t_pred']
    merged['N_correct_fine'] = merged['n_label'] == merged['n_pred']
    merged['M_correct_fine'] = merged['m_label'] == merged['m_pred']
    merged['Joint_correct_fine'] = (
        merged['T_correct_fine']
        & merged['N_correct_fine']
        & merged['M_correct_fine']
    )

    # Coarse-grained mappings
    merged['t_label_coarse'] = merged['t_label'].map(map_to_coarse_T)
    merged['t_pred_coarse'] = merged['t_pred'].map(map_to_coarse_T)
    merged['m_label_coarse'] = merged['m_label'].map(map_to_coarse_M)
    merged['m_pred_coarse'] = merged['m_pred'].map(map_to_coarse_M)

    # Coarse-grained accuracies (N reuses the fine-grained comparison)
    merged['T_correct_coarse'] = merged['t_label_coarse'] == merged['t_pred_coarse']
    merged['M_correct_coarse'] = merged['m_label_coarse'] == merged['m_pred_coarse']
    merged['Joint_correct_coarse'] = (
        merged['T_correct_coarse']
        & merged['N_correct_fine']
        & merged['M_correct_coarse']
    )

    # Calculate metrics: the mean of a boolean column is the fraction correct
    metrics = {
        'Joint accuracy (fine)': float(merged['Joint_correct_fine'].mean()),
        'T accuracy (fine)': float(merged['T_correct_fine'].mean()),
        'N accuracy (fine)': float(merged['N_correct_fine'].mean()),
        'M accuracy (fine)': float(merged['M_correct_fine'].mean()),
        'Joint accuracy (coarse)': float(merged['Joint_correct_coarse'].mean()),
        'T accuracy (coarse)': float(merged['T_correct_coarse'].mean()),
        'N accuracy (coarse)': float(merged['N_correct_fine'].mean()),  # Same as fine
        'M accuracy (coarse)': float(merged['M_correct_coarse'].mean())
    }

    return metrics, merged

def show_metrics(pred_path, gt_path):
    """Print the prediction file's base name and return its accuracy metrics.

    Args:
        pred_path: Path to the submission CSV (read with ``pd.read_csv``).
        gt_path: Path to the ground-truth label CSV (read with ``pd.read_csv``).

    Returns:
        The metrics dict produced by ``evaluate_predictions``.
    """
    # Print the file name so the output can be matched to a submission.
    print(os.path.basename(pred_path))

    # Load the ground truth and predictions
    gt = pd.read_csv(gt_path)
    pred = pd.read_csv(pred_path)

    # Evaluate the predictions; only the summary metrics are returned, so the
    # per-row comparison frame is discarded.
    metrics, _ = evaluate_predictions(gt, pred)
    return metrics


In [7]:
# Evaluate the gpt-4o-2024-05-13 submission against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_gpt-4o-2024-05-13.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.7037037037037037,
 'T accuracy (fine)': 0.8518518518518519,
 'N accuracy (fine)': 0.9259259259259259,
 'M accuracy (fine)': 0.8333333333333334,
 'Joint accuracy (coarse)': 0.7962962962962963,
 'T accuracy (coarse)': 0.9444444444444444,
 'N accuracy (coarse)': 0.9259259259259259,
 'M accuracy (coarse)': 0.8888888888888888}

In [8]:
# Evaluate the gpt-4o-mini-2024-07-18 submission against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_gpt-4o-mini-2024-07-18.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.35185185185185186,
 'T accuracy (fine)': 0.5925925925925926,
 'N accuracy (fine)': 0.7407407407407407,
 'M accuracy (fine)': 0.8703703703703703,
 'Joint accuracy (coarse)': 0.5185185185185185,
 'T accuracy (coarse)': 0.7592592592592593,
 'N accuracy (coarse)': 0.7407407407407407,
 'M accuracy (coarse)': 0.9444444444444444}

In [9]:
# Evaluate the o1-preview-2024-09-12 submission against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_o1-preview-2024-09-12.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.9074074074074074,
 'T accuracy (fine)': 0.9444444444444444,
 'N accuracy (fine)': 0.9629629629629629,
 'M accuracy (fine)': 1.0,
 'Joint accuracy (coarse)': 0.9259259259259259,
 'T accuracy (coarse)': 0.9629629629629629,
 'N accuracy (coarse)': 0.9629629629629629,
 'M accuracy (coarse)': 1.0}

In [10]:
# Evaluate the gemma-2-2b-jpn-it "pretrained" submission against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_gemma-2-2b-jpn-it_pretrained.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.07407407407407407,
 'T accuracy (fine)': 0.14814814814814814,
 'N accuracy (fine)': 0.5555555555555556,
 'M accuracy (fine)': 0.5370370370370371,
 'Joint accuracy (coarse)': 0.3888888888888889,
 'T accuracy (coarse)': 0.48148148148148145,
 'N accuracy (coarse)': 0.5555555555555556,
 'M accuracy (coarse)': 0.7777777777777778}

In [11]:
# Evaluate the gemma-2-2b-jpn-it "finetuned" submission against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_gemma-2-2b-jpn-it_finetuned.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.2777777777777778,
 'T accuracy (fine)': 0.3888888888888889,
 'N accuracy (fine)': 0.8148148148148148,
 'M accuracy (fine)': 0.8333333333333334,
 'Joint accuracy (coarse)': 0.48148148148148145,
 'T accuracy (coarse)': 0.6296296296296297,
 'N accuracy (coarse)': 0.8148148148148148,
 'M accuracy (coarse)': 0.9444444444444444}

In [12]:
# Evaluate the Llama3-Preferred-MedSwallow-70B "pretrained" submission against
# the validation label file.
# NOTE(review): the recorded output shows T accuracy of about 0.02 with
# identical fine and coarse scores -- possibly the T values in this submission
# are in an unexpected format; verify the file before trusting these numbers.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_Llama3-Preferred-MedSwallow-70B_pretrained.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.0,
 'T accuracy (fine)': 0.018518518518518517,
 'N accuracy (fine)': 0.48148148148148145,
 'M accuracy (fine)': 0.5,
 'Joint accuracy (coarse)': 0.0,
 'T accuracy (coarse)': 0.018518518518518517,
 'N accuracy (coarse)': 0.48148148148148145,
 'M accuracy (coarse)': 0.5}

In [13]:
# Evaluate the Llama-3.1-Swallow-70B-Instruct-v0.1 "pretrained" submission
# against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_pretrained.csv'

show_metrics(submission_file_path, label_file_path)


{'Joint accuracy (fine)': 0.2777777777777778,
 'T accuracy (fine)': 0.4074074074074074,
 'N accuracy (fine)': 0.8148148148148148,
 'M accuracy (fine)': 0.8888888888888888,
 'Joint accuracy (coarse)': 0.5370370370370371,
 'T accuracy (coarse)': 0.7222222222222222,
 'N accuracy (coarse)': 0.8148148148148148,
 'M accuracy (coarse)': 0.9814814814814815}

In [15]:
# Evaluate the Llama-3.1-Swallow-70B-Instruct-v0.1 "finetuned" submission
# against the validation label file.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_finetuned.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.4444444444444444,
 'T accuracy (fine)': 0.5925925925925926,
 'N accuracy (fine)': 0.8703703703703703,
 'M accuracy (fine)': 0.7592592592592593,
 'Joint accuracy (coarse)': 0.6851851851851852,
 'T accuracy (coarse)': 0.7407407407407407,
 'N accuracy (coarse)': 0.8703703703703703,
 'M accuracy (coarse)': 0.8518518518518519}

In [16]:
# Evaluate the Llama-3.1-Swallow-70B-Instruct-v0.1 "finetuned" submission
# against the validation label file.
# NOTE(review): this cell uses the same submission path as the preceding cell,
# yet the recorded output differs. Presumably the CSV was regenerated between
# runs, or this path was meant to point at a different submission -- confirm
# which file these numbers belong to.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_finetuned.csv'

show_metrics(submission_file_path, label_file_path)

{'Joint accuracy (fine)': 0.4074074074074074,
 'T accuracy (fine)': 0.5185185185185185,
 'N accuracy (fine)': 0.8333333333333334,
 'M accuracy (fine)': 0.7777777777777778,
 'Joint accuracy (coarse)': 0.6851851851851852,
 'T accuracy (coarse)': 0.7222222222222222,
 'N accuracy (coarse)': 0.8333333333333334,
 'M accuracy (coarse)': 0.8518518518518519}