In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Define a function to evaluate predictions
def evaluate_predictions(labels, submission):
    """Score T/N/M predictions against ground-truth labels.

    The two frames are inner-joined on ``id``. Both must carry ``t``, ``n``
    and ``m`` columns, which come out of the join as ``*_label`` (from
    ``labels``) and ``*_pred`` (from ``submission``).

    Fine-grained scoring is an exact string match per axis. Coarse scoring
    first collapses sub-stages (Tis/T1mi/T1a/T1b/T1c -> T1, T2a/T2b -> T2,
    M1a/M1b/M1c -> M1). N has no coarse mapping, so its coarse accuracy
    reuses the fine-grained flag.

    Args:
        labels: DataFrame with columns ``id``, ``t``, ``n``, ``m``.
        submission: DataFrame with columns ``id``, ``t``, ``n``, ``m``.

    Returns:
        ``(metrics, merged)``: ``metrics`` maps each metric name to a float
        accuracy; ``merged`` is the joined frame with the per-row
        correctness flags and coarse label columns appended.
    """
    # Fine label -> coarse label. Anything not listed passes through as-is.
    t_groups = dict.fromkeys(('Tis', 'T1mi', 'T1a', 'T1b', 'T1c'), 'T1')
    t_groups.update(dict.fromkeys(('T2a', 'T2b'), 'T2'))
    m_groups = dict.fromkeys(('M1a', 'M1b', 'M1c'), 'M1')

    def to_coarse_t(value):
        return t_groups.get(value, value)

    def to_coarse_m(value):
        return m_groups.get(value, value)

    # Keyword order below fixes the column order of the returned frame.
    scored = pd.merge(labels, submission, on='id', suffixes=('_label', '_pred')).assign(
        T_correct_fine=lambda df: df['t_label'].eq(df['t_pred']),
        N_correct_fine=lambda df: df['n_label'].eq(df['n_pred']),
        M_correct_fine=lambda df: df['m_label'].eq(df['m_pred']),
        Joint_correct_fine=lambda df: (
            df['T_correct_fine'] & df['N_correct_fine'] & df['M_correct_fine']
        ),
        t_label_coarse=lambda df: df['t_label'].map(to_coarse_t),
        t_pred_coarse=lambda df: df['t_pred'].map(to_coarse_t),
        m_label_coarse=lambda df: df['m_label'].map(to_coarse_m),
        m_pred_coarse=lambda df: df['m_pred'].map(to_coarse_m),
        T_correct_coarse=lambda df: df['t_label_coarse'].eq(df['t_pred_coarse']),
        M_correct_coarse=lambda df: df['m_label_coarse'].eq(df['m_pred_coarse']),
        Joint_correct_coarse=lambda df: (
            df['T_correct_coarse'] & df['N_correct_fine'] & df['M_correct_coarse']
        ),
    )

    # (metric name, flag column) pairs; the mean of a boolean column is its accuracy.
    report_layout = (
        ('Joint accuracy (fine)', 'Joint_correct_fine'),
        ('T accuracy (fine)', 'T_correct_fine'),
        ('N accuracy (fine)', 'N_correct_fine'),
        ('M accuracy (fine)', 'M_correct_fine'),
        ('Joint accuracy (coarse)', 'Joint_correct_coarse'),
        ('T accuracy (coarse)', 'T_correct_coarse'),
        ('N accuracy (coarse)', 'N_correct_fine'),  # N is identical at both granularities
        ('M accuracy (coarse)', 'M_correct_coarse'),
    )
    metrics = {title: float(scored[flag].mean()) for title, flag in report_layout}

    return metrics, scored

def show_metrics(pred_path, gt_path):
    """Print the submission file name and return its accuracy metrics.

    Args:
        pred_path: Path to the submission CSV (read with ``pd.read_csv``).
        gt_path: Path to the ground-truth label CSV (read with ``pd.read_csv``).

    Returns:
        The metrics dict produced by ``evaluate_predictions``.
    """
    print(os.path.basename(pred_path))

    # Load the ground truth and predictions
    gt = pd.read_csv(gt_path)
    pred = pd.read_csv(pred_path)

    # ``evaluate_predictions`` is defined twice in this notebook: the earlier
    # version returns ``(metrics, merged)`` and the later one returns
    # ``(metrics, merged, diff_rows)``. Unpacking exactly two values raises
    # ValueError once the later definition has been executed, so take only the
    # leading element, which is the metrics dict in both versions.
    metrics = evaluate_predictions(gt, pred)[0]
    return metrics


In [5]:
# Score the gpt-4o-2024-11-20 submission against the validation labels.
submission_file_path = '../model_outputs/submission_gpt-4o-2024-11-20.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_gpt-4o-2024-11-20.csv


{'Joint accuracy (fine)': 0.7592592592592593,
 'T accuracy (fine)': 0.8888888888888888,
 'N accuracy (fine)': 0.9629629629629629,
 'M accuracy (fine)': 0.8703703703703703,
 'Joint accuracy (coarse)': 0.7962962962962963,
 'T accuracy (coarse)': 0.9259259259259259,
 'N accuracy (coarse)': 0.9629629629629629,
 'M accuracy (coarse)': 0.9074074074074074}

In [37]:
# Score the few-shot gpt-4o-2024-11-20 submission against the validation labels.
submission_file_path = '../fewshot_submission_gpt-4o-2024-11-20.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

fewshot_submission_gpt-4o-2024-11-20.csv


{'Joint accuracy (fine)': 0.7592592592592593,
 'T accuracy (fine)': 0.8333333333333334,
 'N accuracy (fine)': 0.9259259259259259,
 'M accuracy (fine)': 1.0,
 'Joint accuracy (coarse)': 0.8888888888888888,
 'T accuracy (coarse)': 0.9629629629629629,
 'N accuracy (coarse)': 0.9259259259259259,
 'M accuracy (coarse)': 1.0}

In [8]:
# Score the gpt-4o-mini-2024-07-18 submission against the validation labels.
submission_file_path = '../model_outputs/submission_gpt-4o-mini-2024-07-18.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

{'Joint accuracy (fine)': 0.35185185185185186,
 'T accuracy (fine)': 0.5925925925925926,
 'N accuracy (fine)': 0.7407407407407407,
 'M accuracy (fine)': 0.8703703703703703,
 'Joint accuracy (coarse)': 0.5185185185185185,
 'T accuracy (coarse)': 0.7592592592592593,
 'N accuracy (coarse)': 0.7407407407407407,
 'M accuracy (coarse)': 0.9444444444444444}

In [9]:
# Score the o1-preview-2024-09-12 submission against the validation labels.
submission_file_path = '../model_outputs/submission_o1-preview-2024-09-12.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

{'Joint accuracy (fine)': 0.9074074074074074,
 'T accuracy (fine)': 0.9444444444444444,
 'N accuracy (fine)': 0.9629629629629629,
 'M accuracy (fine)': 1.0,
 'Joint accuracy (coarse)': 0.9259259259259259,
 'T accuracy (coarse)': 0.9629629629629629,
 'N accuracy (coarse)': 0.9629629629629629,
 'M accuracy (coarse)': 1.0}

In [7]:
# Score the gemma-2-2b-jpn-it (pretrained) submission against the validation labels.
submission_file_path = '../model_outputs/submission_gemma-2-2b-jpn-it_pretrained.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_gemma-2-2b-jpn-it_pretrained.csv


{'Joint accuracy (fine)': 0.07407407407407407,
 'T accuracy (fine)': 0.07407407407407407,
 'N accuracy (fine)': 0.5370370370370371,
 'M accuracy (fine)': 0.5370370370370371,
 'Joint accuracy (coarse)': 0.35185185185185186,
 'T accuracy (coarse)': 0.37037037037037035,
 'N accuracy (coarse)': 0.5370370370370371,
 'M accuracy (coarse)': 0.6666666666666666}

In [10]:
# Score the gemma-2-2b-jpn-it (finetuned) submission against the validation labels.
submission_file_path = '../model_outputs/submission_gemma-2-2b-jpn-it_finetuned.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_gemma-2-2b-jpn-it_finetuned.csv


{'Joint accuracy (fine)': 0.3148148148148148,
 'T accuracy (fine)': 0.48148148148148145,
 'N accuracy (fine)': 0.8888888888888888,
 'M accuracy (fine)': 0.8518518518518519,
 'Joint accuracy (coarse)': 0.46296296296296297,
 'T accuracy (coarse)': 0.6481481481481481,
 'N accuracy (coarse)': 0.8888888888888888,
 'M accuracy (coarse)': 0.8518518518518519}

In [16]:
# Score the Llama3-Preferred-MedSwallow-70B (pretrained) submission against the validation labels.
submission_file_path = '../model_outputs/submission_Llama3-Preferred-MedSwallow-70B_pretrained.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_Llama3-Preferred-MedSwallow-70B_pretrained.csv


{'Joint accuracy (fine)': 0.037037037037037035,
 'T accuracy (fine)': 0.05555555555555555,
 'N accuracy (fine)': 0.4444444444444444,
 'M accuracy (fine)': 0.5740740740740741,
 'Joint accuracy (coarse)': 0.07407407407407407,
 'T accuracy (coarse)': 0.09259259259259259,
 'N accuracy (coarse)': 0.4444444444444444,
 'M accuracy (coarse)': 0.5740740740740741}

In [17]:
# Score the Llama3-Preferred-MedSwallow-70B (finetuned) submission against the validation labels.
submission_file_path = '../model_outputs/submission_Llama3-Preferred-MedSwallow-70B_finetuned.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_Llama3-Preferred-MedSwallow-70B_finetuned.csv


{'Joint accuracy (fine)': 0.37037037037037035,
 'T accuracy (fine)': 0.5740740740740741,
 'N accuracy (fine)': 0.9259259259259259,
 'M accuracy (fine)': 0.8518518518518519,
 'Joint accuracy (coarse)': 0.6851851851851852,
 'T accuracy (coarse)': 0.7777777777777778,
 'N accuracy (coarse)': 0.9259259259259259,
 'M accuracy (coarse)': 0.9814814814814815}

In [13]:
# Score the Llama-3.1-Swallow-70B-Instruct-v0.1 (pretrained) submission against the validation labels.
# NOTE(review): this same submission path is scored in another cell of this notebook
# with different recorded metrics, so the file was presumably regenerated between
# runs. Confirm which run the file on disk reflects.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_pretrained.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)


{'Joint accuracy (fine)': 0.2777777777777778,
 'T accuracy (fine)': 0.4074074074074074,
 'N accuracy (fine)': 0.8148148148148148,
 'M accuracy (fine)': 0.8888888888888888,
 'Joint accuracy (coarse)': 0.5370370370370371,
 'T accuracy (coarse)': 0.7222222222222222,
 'N accuracy (coarse)': 0.8148148148148148,
 'M accuracy (coarse)': 0.9814814814814815}

In [15]:
# Score the Llama-3.1-Swallow-70B-Instruct-v0.1 (finetuned) submission against the validation labels.
# NOTE(review): this same submission path is scored in other cells of this notebook
# with different recorded metrics, so the file was presumably regenerated between
# runs. Confirm which run the file on disk reflects.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_finetuned.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

{'Joint accuracy (fine)': 0.4444444444444444,
 'T accuracy (fine)': 0.5925925925925926,
 'N accuracy (fine)': 0.8703703703703703,
 'M accuracy (fine)': 0.7592592592592593,
 'Joint accuracy (coarse)': 0.6851851851851852,
 'T accuracy (coarse)': 0.7407407407407407,
 'N accuracy (coarse)': 0.8703703703703703,
 'M accuracy (coarse)': 0.8518518518518519}

In [16]:
# Score the Llama-3.1-Swallow-70B-Instruct-v0.1 (finetuned) submission against the validation labels.
# NOTE(review): this same submission path is scored in other cells of this notebook
# with different recorded metrics, so the file was presumably regenerated between
# runs. Confirm which run the file on disk reflects.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_finetuned.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

{'Joint accuracy (fine)': 0.4074074074074074,
 'T accuracy (fine)': 0.5185185185185185,
 'N accuracy (fine)': 0.8333333333333334,
 'M accuracy (fine)': 0.7777777777777778,
 'Joint accuracy (coarse)': 0.6851851851851852,
 'T accuracy (coarse)': 0.7222222222222222,
 'N accuracy (coarse)': 0.8333333333333334,
 'M accuracy (coarse)': 0.8518518518518519}

In [12]:
# Score the Llama-3.1-Swallow-8B-Instruct-v0.1 (pretrained) submission against the validation labels.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-8B-Instruct-v0.1_pretrained.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_Llama-3.1-Swallow-8B-Instruct-v0.1_pretrained.csv


{'Joint accuracy (fine)': 0.0,
 'T accuracy (fine)': 0.018518518518518517,
 'N accuracy (fine)': 0.48148148148148145,
 'M accuracy (fine)': 0.5,
 'Joint accuracy (coarse)': 0.0,
 'T accuracy (coarse)': 0.018518518518518517,
 'N accuracy (coarse)': 0.48148148148148145,
 'M accuracy (coarse)': 0.5}

In [13]:
# Score the Llama-3.1-Swallow-8B-Instruct-v0.1 (finetuned) submission against the validation labels.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-8B-Instruct-v0.1_finetuned.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_Llama-3.1-Swallow-8B-Instruct-v0.1_finetuned.csv


{'Joint accuracy (fine)': 0.5185185185185185,
 'T accuracy (fine)': 0.7592592592592593,
 'N accuracy (fine)': 0.9259259259259259,
 'M accuracy (fine)': 0.7592592592592593,
 'Joint accuracy (coarse)': 0.7407407407407407,
 'T accuracy (coarse)': 0.9259259259259259,
 'N accuracy (coarse)': 0.9259259259259259,
 'M accuracy (coarse)': 0.8333333333333334}

In [14]:
# Score the Llama-3.1-Swallow-70B-Instruct-v0.1 (pretrained) submission against the validation labels.
# NOTE(review): this same submission path is scored in another cell of this notebook
# with different recorded metrics, so the file was presumably regenerated between
# runs. Confirm which run the file on disk reflects.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_pretrained.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_Llama-3.1-Swallow-70B-Instruct-v0.1_pretrained.csv


{'Joint accuracy (fine)': 0.2777777777777778,
 'T accuracy (fine)': 0.3888888888888889,
 'N accuracy (fine)': 0.8333333333333334,
 'M accuracy (fine)': 0.8888888888888888,
 'Joint accuracy (coarse)': 0.5370370370370371,
 'T accuracy (coarse)': 0.6851851851851852,
 'N accuracy (coarse)': 0.8333333333333334,
 'M accuracy (coarse)': 0.9814814814814815}

In [15]:
# Score the Llama-3.1-Swallow-70B-Instruct-v0.1 (finetuned) submission against the validation labels.
# NOTE(review): this same submission path is scored in other cells of this notebook
# with different recorded metrics, so the file was presumably regenerated between
# runs. Confirm which run the file on disk reflects.
submission_file_path = '../model_outputs/submission_Llama-3.1-Swallow-70B-Instruct-v0.1_finetuned.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_Llama-3.1-Swallow-70B-Instruct-v0.1_finetuned.csv


{'Joint accuracy (fine)': 0.5555555555555556,
 'T accuracy (fine)': 0.7037037037037037,
 'N accuracy (fine)': 0.9814814814814815,
 'M accuracy (fine)': 0.8703703703703703,
 'Joint accuracy (coarse)': 0.8518518518518519,
 'T accuracy (coarse)': 0.8888888888888888,
 'N accuracy (coarse)': 0.9814814814814815,
 'M accuracy (coarse)': 0.9814814814814815}

In [3]:
# Score the submission_debarta.csv predictions against the validation labels.
# NOTE(review): the submission path is absolute and machine-specific, unlike the
# relative paths used by the other cells. It will not resolve on another machine.
submission_file_path = '/home/jubuntu/pCloudDrive/radnlp2024/bert/model_outputs/submission_debarta.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_debarta.csv


{'Joint accuracy (fine)': 0.2037037037037037,
 'T accuracy (fine)': 0.3888888888888889,
 'N accuracy (fine)': 0.8333333333333334,
 'M accuracy (fine)': 0.7222222222222222,
 'Joint accuracy (coarse)': 0.4074074074074074,
 'T accuracy (coarse)': 0.5555555555555556,
 'N accuracy (coarse)': 0.8333333333333334,
 'M accuracy (coarse)': 0.7962962962962963}

In [4]:
# Score the submission_uth.csv predictions against the validation labels.
# NOTE(review): the submission path is absolute and machine-specific, unlike the
# relative paths used by the other cells. It will not resolve on another machine.
submission_file_path = '/home/jubuntu/pCloudDrive/radnlp2024/bert/model_outputs/submission_uth.csv'
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
show_metrics(pred_path=submission_file_path, gt_path=label_file_path)

submission_uth.csv


{'Joint accuracy (fine)': 0.2037037037037037,
 'T accuracy (fine)': 0.42592592592592593,
 'N accuracy (fine)': 0.8148148148148148,
 'M accuracy (fine)': 0.7592592592592593,
 'Joint accuracy (coarse)': 0.24074074074074073,
 'T accuracy (coarse)': 0.42592592592592593,
 'N accuracy (coarse)': 0.8148148148148148,
 'M accuracy (coarse)': 0.8148148148148148}

## 最終提出(submission)への準備
zero-shot で train / val データに対して誤った予測を抽出し、test データを予測する際の few-shot 例として利用する。

In [11]:
import pandas as pd

def evaluate_predictions(labels, submission):
    """Score T/N/M predictions and also return the rows that were wrong.

    The two frames are inner-joined on ``id``. Both must carry ``t``, ``n``
    and ``m`` columns, which come out of the join as ``*_label`` (from
    ``labels``) and ``*_pred`` (from ``submission``).

    Fine-grained scoring is an exact string match per axis. Coarse scoring
    first collapses sub-stages (Tis/T1mi/T1a/T1b/T1c -> T1, T2a/T2b -> T2,
    M1a/M1b/M1c -> M1). N has no coarse mapping, so its coarse accuracy
    reuses the fine-grained flag.

    Args:
        labels: DataFrame with columns ``id``, ``t``, ``n``, ``m``.
        submission: DataFrame with columns ``id``, ``t``, ``n``, ``m``.

    Returns:
        ``(metrics, merged, diff_rows)``: ``metrics`` maps each metric name
        to a float accuracy; ``merged`` is the joined frame with correctness
        flags and coarse labels appended; ``diff_rows`` is an independent
        copy of the rows of ``merged`` where at least one of T, N or M
        differs at the fine-grained level.
    """
    # Fine label -> coarse label. Anything not listed passes through as-is.
    t_groups = dict.fromkeys(('Tis', 'T1mi', 'T1a', 'T1b', 'T1c'), 'T1')
    t_groups.update(dict.fromkeys(('T2a', 'T2b'), 'T2'))
    m_groups = dict.fromkeys(('M1a', 'M1b', 'M1c'), 'M1')

    def to_coarse_t(value):
        return t_groups.get(value, value)

    def to_coarse_m(value):
        return m_groups.get(value, value)

    # Keyword order below fixes the column order of the returned frames.
    scored = pd.merge(labels, submission, on='id', suffixes=('_label', '_pred')).assign(
        T_correct_fine=lambda df: df['t_label'].eq(df['t_pred']),
        N_correct_fine=lambda df: df['n_label'].eq(df['n_pred']),
        M_correct_fine=lambda df: df['m_label'].eq(df['m_pred']),
        Joint_correct_fine=lambda df: (
            df['T_correct_fine'] & df['N_correct_fine'] & df['M_correct_fine']
        ),
        t_label_coarse=lambda df: df['t_label'].map(to_coarse_t),
        t_pred_coarse=lambda df: df['t_pred'].map(to_coarse_t),
        m_label_coarse=lambda df: df['m_label'].map(to_coarse_m),
        m_pred_coarse=lambda df: df['m_pred'].map(to_coarse_m),
        T_correct_coarse=lambda df: df['t_label_coarse'].eq(df['t_pred_coarse']),
        M_correct_coarse=lambda df: df['m_label_coarse'].eq(df['m_pred_coarse']),
        Joint_correct_coarse=lambda df: (
            df['T_correct_coarse'] & df['N_correct_fine'] & df['M_correct_coarse']
        ),
    )

    # (metric name, flag column) pairs; the mean of a boolean column is its accuracy.
    report_layout = (
        ('Joint accuracy (fine)', 'Joint_correct_fine'),
        ('T accuracy (fine)', 'T_correct_fine'),
        ('N accuracy (fine)', 'N_correct_fine'),
        ('M accuracy (fine)', 'M_correct_fine'),
        ('Joint accuracy (coarse)', 'Joint_correct_coarse'),
        ('T accuracy (coarse)', 'T_correct_coarse'),
        ('N accuracy (coarse)', 'N_correct_fine'),  # N is identical at both granularities
        ('M accuracy (coarse)', 'M_correct_coarse'),
    )
    metrics = {title: float(scored[flag].mean()) for title, flag in report_layout}

    # ---- Added part ----
    # Extract the mismatching rows: a row differs in T, N or M exactly when
    # the fine-grained joint flag is False.
    diff_rows = scored.loc[~scored['Joint_correct_fine']].copy()

    # If needed, extra columns explaining where each row differs can be added
    # to diff_rows, e.g. diff_rows['where_diff'] = diff_rows.apply(..., axis=1)

    return metrics, scored, diff_rows

In [29]:
# Collect the gpt-4o-2024-11-20 mismatches on the validation split. Unpacking
# three values requires the evaluate_predictions definition that also returns
# diff_rows (the later of the two definitions in this notebook).
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../model_outputs/submission_gpt-4o-2024-11-20.csv'

a,b,diff1 = evaluate_predictions(pd.read_csv(label_file_path), pd.read_csv(submission_file_path))

# Same for the training split. The path variables and a/b are reused, so after
# this cell they hold the training-split values.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/train/label.csv'
submission_file_path = '../submission_gpt-4o-2024-11-20.csv'

a,b,diff2 = evaluate_predictions(pd.read_csv(label_file_path), pd.read_csv(submission_file_path))

In [30]:
# Stack diff1 and diff2 into one frame with a fresh 0..n-1 index, then write
# it to CSV without the index column.
diff = pd.concat([diff1, diff2], ignore_index=True)
diff.to_csv('../model_outputs/4o_diff.csv', index=False)

In [32]:
# Collect the o1-preview-2024-09-12 mismatches on the validation split.
# Unpacking three values requires the evaluate_predictions definition that
# also returns diff_rows (the later of the two definitions in this notebook).
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/val/label.csv'
submission_file_path = '../submission_o1-preview-2024-09-12_val.csv'

a,b,diff1 = evaluate_predictions(pd.read_csv(label_file_path), pd.read_csv(submission_file_path))

# Same for the training split. The path variables and a/b are reused, so after
# this cell they hold the training-split values.
label_file_path = '../radnlp_2024_train_val_20240731/ja/main_task/train/label.csv'
submission_file_path = '../submission_o1-preview-2024-09-12.csv'

a,b,diff2 = evaluate_predictions(pd.read_csv(label_file_path), pd.read_csv(submission_file_path))

In [33]:
# Stack diff1 and diff2 into one frame with a fresh 0..n-1 index, then write
# it to CSV without the index column.
diff = pd.concat([diff1, diff2], ignore_index=True)
diff.to_csv('../model_outputs/o1preview_diff.csv', index=False)

In [34]:

# Display the combined mismatch frame currently held in `diff`.
diff

Unnamed: 0,id,t_label,n_label,m_label,t_pred,n_pred,m_pred,T_correct_fine,N_correct_fine,M_correct_fine,Joint_correct_fine,t_label_coarse,t_pred_coarse,m_label_coarse,m_pred_coarse,T_correct_coarse,M_correct_coarse,Joint_correct_coarse
0,2318717,T1mi,N0,M0,T1b,N0,M0,False,True,True,False,T1,T1,M0,M0,True,True,True
1,4644984,T0,N2,M1c,T1,N2,M1c,False,True,True,False,T0,T1,M1,M1,False,True,False
2,4734929,T2a,N0,M0,T2a,N2,M0,True,False,True,False,T2,T2,M0,M0,True,True,False
3,12667350,T2b,N2,M1c,T0,N0,M0,False,False,False,False,T2,T0,M1,M0,False,False,False
4,16572985,Tis,N0,M0,Tis,N0,M1b,True,True,False,False,T1,T1,M0,M1,True,False,False
5,133166,T1c,N0,M0,T1c,N1,M0,True,False,True,False,T1,T1,M0,M0,True,True,False
6,463397,T2b,N0,M0,T2b,N1,M0,True,False,True,False,T2,T2,M0,M0,True,True,False
7,1185427,T1c,N0,M0,T2a,N0,M1a,False,True,False,False,T1,T2,M0,M1,False,False,False
8,1538432,T2b,N1,M0,T0,N0,M0,False,False,True,False,T2,T0,M0,M0,False,True,False
9,1679413,T2b,N0,M0,T4,N0,M1a,False,True,False,False,T2,T4,M0,M1,False,False,False
