 * @ Author: Yohei Ohto
 * @ Create Time: 2025-12-08 11:35:29
 * @ Modified time: 2025-12-08 11:37:25
 * @ Description: 既存MLMへのsequence classificationタスクの追加実装

In [1]:
import glob
import re
import os

import evaluate
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import wandb
from datasets import load_dataset
from sklearn.manifold import TSNE
from tqdm.auto import tqdm
from transformers import (AutoModel, AutoModelForSequenceClassification,
                          AutoTokenizer, DataCollatorWithPadding, Trainer, DataCollatorForTokenClassification,
                          TrainingArguments)

import sys
sys.path.append('..')
from src import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
models = [
    "YoheiOhto/251225100096_model"
]

# get raw data

これを参考にして、BLURBのベンチマークを取る https://github.com/michiyasunaga/LinkBERT/tree/main/scripts    
hugging faceだと整備しきってない感  
dataはここから取れる https://nlp.stanford.edu/projects/myasu/LinkBERT/data.zip

In [4]:
zip_path = "../data/raw/data.zip"

import zipfile
os.makedirs("../data/defreeze", exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("../data/defreeze")

## HOC以外は実行できる  
 ---  
* seqcls/DDI_hf 'sentence' & 1 label prediction  
* seqcls/chemprot_hf 'sentence' & 1 label prediction  
* seqcls/GAD_hf 'sentence' & 1 label prediction  
 ---
* seqcls/bioasq_hf 'sentence1', 'sentence2' & 1 label prediction
* seqcls/pubmedqa_hf 'sentence1', 'sentence2' & 1 label prediction  
 ---
* seqcls/hoc_hf 'sentence' & multi-label prediction  
* seqcls/HoC_hf 'sentence' & multi-label prediction  
 ---
* seqcls/BIOSSES_hf 'sentence1', 'sentence2' & regression prediction  
 ---

# DDI, chemprot, GAD

In [6]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

In [7]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]
    precision = precision_metric.compute(predictions=predictions, references=labels, average="macro")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="macro")["recall"]
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [8]:
def preprocess_function(examples):
    result = tokenizer(examples['sentence'], truncation=True, max_length=256)
    result["label"] = [label2id[l] for l in examples["label"]]
    return result

In [9]:
for data_name in ["DDI", "chemprot", "GAD"]:
    data = load_dataset(f"../data/defreeze/data/seqcls/{data_name}_hf")

    label_list = sorted(data['train'].unique('label'))
    num_labels = len(label_list)
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}

    for i, name in enumerate(models):
        print("=== Model:", name, " Data:", data_name, " Training ===")

        tokenizer = AutoTokenizer.from_pretrained(name)
        tokenizer.pad_token = "[PAD]" 

        model = AutoModelForSequenceClassification.from_pretrained(
        name, num_labels=num_labels, id2label=id2label, label2id=label2id)

        model_name = name.split("/")[-1]

        wandb.init(
        entity="250502_ohto_research",
        project=data_name, name=model_name, 
        config={
            "model_name": model_name,
            "learning_rate": 2e-5,
            "batch_size": 16,
            "num_epochs": 10,
            "dataset": data_name,
        })

        tokenized_datasets = data.map(preprocess_function, batched=True)

        training_args = TrainingArguments(
            output_dir=f"../data/result/{data_name}/{model_name}",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=10,
            save_strategy="no",
            load_best_model_at_end=False,
            weight_decay=0.01,
            eval_strategy="epoch",
            report_to="wandb",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()
        trainer.save_metrics("all", metrics)

        df_results = result_output_seq_classification(
            trainer,
            tokenized_datasets,
            tokenizer,
            id2label,
            output_filename=f"../data/result/{data_name}/{model_name}/{data_name.lower()}_results.csv"
        )

=== Model: YoheiOhto/251225100096_model  Data: DDI  Training ===


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at YoheiOhto/251225100096_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2496/2496 [00:00<00:00, 11709.67 examples/s]
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None, 'pad_token_id': 0}.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1312,0.223788,0.94992,0.831689,0.901655,0.808193
2,0.053,0.246672,0.953125,0.81162,0.929172,0.763854
3,0.0318,0.285444,0.953926,0.796742,0.844644,0.768714
4,0.0178,0.342526,0.953526,0.82917,0.831729,0.83035
5,0.0124,0.3767,0.950721,0.782726,0.809041,0.760905
6,0.0049,0.438698,0.947917,0.807048,0.809626,0.808306
7,0.0053,0.421532,0.951122,0.790325,0.818376,0.768967
8,0.0027,0.439227,0.952324,0.798873,0.809257,0.790769
9,0.0026,0.467989,0.951522,0.803925,0.840055,0.77444
10,0.0019,0.456337,0.952724,0.80019,0.819478,0.784847


FileNotFoundError: [Errno 2] No such file or directory: '../data/result/DDI/251225100096_model/all_results.json'

# bioasq, pubmedqa

In [27]:
def preprocess_function(examples):
    if "sentence1" in examples and "sentence2" in examples:
        result = tokenizer(
            examples["sentence1"], 
            examples["sentence2"], 
            truncation=True, 
            max_length=256
        )
    elif "sentence" in examples:
        result = tokenizer(
            examples["sentence"], 
            truncation=True, 
            max_length=256
        )
    else:
        raise ValueError("Input text fields (sentence or sentence1/2) not found.")

    if "label" in examples and len(examples["label"]) > 0:
        first_label = examples["label"][0]

        if isinstance(first_label, float):
            result["label"] = examples["label"]

        elif isinstance(first_label, list):
            result["label"] = [[float(l) for l in labels] for labels in examples["label"]]

        else:
            processed_labels = []
            for l in examples["label"]:
                if 'label2id' in globals() and l in label2id:
                    processed_labels.append(label2id[l])
                elif isinstance(l, str) and l.isdigit():
                    processed_labels.append(int(l))
                elif isinstance(l, int):
                    processed_labels.append(l)
                else:
                    processed_labels.append(l)
            
            result["label"] = processed_labels

    return result

In [28]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, r2_score
from scipy.stats import pearsonr

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    if labels.dtype == np.float32 or labels.dtype == np.float64:
        predictions = np.squeeze(predictions)
        labels = np.squeeze(labels)
        
        mse = mean_squared_error(labels, predictions)
        pearson_corr, _ = pearsonr(labels, predictions)
        
        return {
            "mse": mse,
            "pearson": pearson_corr
        }

    elif len(labels.shape) > 1 and labels.shape[1] > 1:
        probs = 1 / (1 + np.exp(-predictions))
        preds = (probs > 0.5).astype(int)
        
        f1_micro = f1_score(y_true=labels, y_pred=preds, average='micro')
        f1_macro = f1_score(y_true=labels, y_pred=preds, average='macro')
        accuracy = accuracy_score(y_true=labels, y_pred=preds)
        
        return {
            "accuracy": accuracy,
            "f1_micro": f1_micro,
            "f1_macro": f1_macro
        }

    else:
        preds = np.argmax(predictions, axis=1)
        
        accuracy = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average="macro")
        precision = precision_score(labels, preds, average="macro", zero_division=0)
        recall = recall_score(labels, preds, average="macro", zero_division=0)
        
        return {
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        }

In [29]:
import torch
import numpy as np
import pandas as pd
import os

def result_output_unified(
    trainer,
    tokenized_datasets,
    tokenizer,
    id2label=None,
    target_split="validation",
    output_filename='prediction_results.csv'
):
    print(f"Generating predictions for {target_split} set...")
    
    predictions, labels, _ = trainer.predict(tokenized_datasets[target_split])

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    is_regression = (id2label is None) or (labels.dtype == np.float32) or (labels.dtype == np.float64)
    
    is_multilabel = (not is_regression) and (len(labels.shape) > 1 and labels.shape[1] > 1)

    is_singlelabel = (not is_regression) and (not is_multilabel)

    all_results_list = []
    num_samples = len(tokenized_datasets[target_split])

    if is_regression:
        pred_scores = np.squeeze(predictions)
        true_scores = np.squeeze(labels)
    elif is_multilabel:
        probs = torch.sigmoid(torch.from_numpy(predictions)).numpy()
    else:
        probs = torch.nn.functional.softmax(torch.from_numpy(predictions), dim=1).numpy()
        pred_ids = np.argmax(predictions, axis=1)

    print(f"Task Type Detected: {'Regression' if is_regression else 'Multi-label' if is_multilabel else 'Single-label'}")

    for i in range(num_samples):
        input_ids = tokenized_datasets[target_split][i]["input_ids"]
        text = tokenizer.decode(input_ids, skip_special_tokens=True)
        
        row_data = {'Text': text}

        if is_regression:
            pred_val = pred_scores[i]
            true_val = true_scores[i]
            row_data['Predicted'] = float(pred_val)
            row_data['True_Label'] = float(true_val)
            row_data['Diff'] = abs(pred_val - true_val) 

        elif is_multilabel:
            active_preds = np.where(probs[i] > 0.5)[0]
            active_trues = np.where(labels[i] == 1)[0]
            
            pred_names = [id2label[idx] for idx in active_preds]
            true_names = [id2label[idx] for idx in active_trues]
            
            row_data['Predicted'] = "; ".join(pred_names)
            row_data['True_Label'] = "; ".join(true_names)
            
            row_data['Is_Perfect_Match'] = (set(pred_names) == set(true_names))
            
            for idx, label_name in id2label.items():
                row_data[f'P({label_name})'] = probs[i][idx]

        else:
            true_id = labels[i]
            pred_id = pred_ids[i]
            
            true_name = id2label.get(true_id, str(true_id))
            pred_name = id2label.get(pred_id, str(pred_id))
            
            row_data['Predicted'] = pred_name
            row_data['True_Label'] = true_name
            row_data['Is_Correct'] = (pred_name == true_name)
            
            row_data['Confidence'] = probs[i][pred_id] 

            for label_id, label_name in id2label.items():
               row_data[f'P({label_name})'] = probs[i][label_id]

        all_results_list.append(row_data)

    df = pd.DataFrame(all_results_list)
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    df.to_csv(output_filename, index=False, encoding='utf-8-sig')
    print(f"Results saved to {output_filename}")
    
    return df

In [None]:
for data_name in ["pubmedqa"]:
    data = load_dataset(f"../data/defreeze/data/seqcls/{data_name}_hf")

    label_list = sorted(data['train'].unique('label'))
    num_labels = len(label_list)
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}

    for i, name in enumerate(models):
        print("=== Model:", name, " Data:", data_name, " Training ===")

        tokenizer = AutoTokenizer.from_pretrained(name)
        tokenizer.pad_token = "[PAD]" 

        model = AutoModelForSequenceClassification.from_pretrained(
        name, num_labels=num_labels, id2label=id2label, label2id=label2id)

        model_name = name.split("/")[-1]
        wandb.init(
        entity="250502_ohto_research",
        project=data_name, name=model_name, 
        config={
            "model_name": model_name,
            "learning_rate": 2e-5,
            "batch_size": 16,
            "num_epochs": 10,
            "dataset": data_name,
        })
        tokenized_datasets = data.map(preprocess_function, batched=True)

        training_args = TrainingArguments(
            output_dir=f"../data/result/{data_name}/{model_name}",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=10,
            save_strategy="no",
            load_best_model_at_end=False,
            weight_decay=0.01,
            eval_strategy="epoch",
            report_to="wandb",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()
        trainer.save_metrics("all", metrics)

        df_results = result_output_seq_classification(
            trainer,
            tokenized_datasets,
            tokenizer,
            id2label,
            output_filename=f"../data/result/{data_name}/{model_name}/{data_name.lower()}_results.csv"
        )

Generating train split: 450 examples [00:00, 258872.14 examples/s]
Generating validation split: 50 examples [00:00, 63148.21 examples/s]
Generating test split: 500 examples [00:00, 316169.46 examples/s]




=== Model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext  Data: pubmedqa  Training ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 450/450 [00:00<00:00, 1801.36 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 1465.12 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2310.51 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.94813,0.54,0.233766,0.18,0.333333
2,No log,0.973132,0.54,0.233766,0.18,0.333333
3,No log,0.995163,0.38,0.269697,0.256039,0.299927
4,No log,1.013896,0.36,0.242137,0.227778,0.258533
5,No log,1.05624,0.5,0.353865,0.34352,0.410312
6,No log,1.019204,0.46,0.306536,0.289364,0.327524
7,No log,1.128141,0.48,0.340477,0.321581,0.37618
8,No log,1.147235,0.44,0.312623,0.297619,0.351489
9,No log,1.181477,0.46,0.326667,0.309716,0.363834
10,No log,1.193448,0.46,0.326667,0.309716,0.363834


Generating predictions for validation set...
Results saved to ../data/result/pubmedqa/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext/pubmedqa_results.csv
=== Model: dmis-lab/biobert-v1.1  Data: pubmedqa  Training ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 450/450 [00:00<00:00, 2708.34 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 2054.38 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 1473.11 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.955573,0.54,0.233766,0.18,0.333333
2,No log,0.960872,0.54,0.233766,0.18,0.333333
3,No log,0.991807,0.48,0.244948,0.229798,0.303558
4,No log,1.057189,0.4,0.256209,0.240048,0.275962
5,No log,1.210974,0.3,0.210909,0.204509,0.228758
6,No log,1.377561,0.54,0.266199,0.347222,0.340595
7,No log,1.477126,0.38,0.245682,0.230324,0.263617
8,No log,1.672839,0.34,0.215335,0.201389,0.231663
9,No log,1.75434,0.4,0.231944,0.213444,0.261438
10,No log,1.778852,0.38,0.222905,0.205026,0.249092


Generating predictions for validation set...
Results saved to ../data/result/pubmedqa/biobert-v1.1/pubmedqa_results.csv
=== Model: google-bert/bert-base-cased  Data: pubmedqa  Training ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 450/450 [00:00<00:00, 2778.68 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 1258.00 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2458.68 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.947539,0.54,0.233766,0.18,0.333333
2,No log,0.956671,0.54,0.233766,0.18,0.333333
3,No log,0.962657,0.54,0.233766,0.18,0.333333
4,No log,0.9978,0.52,0.22807,0.176871,0.320988
5,No log,1.055697,0.36,0.255435,0.254669,0.287582
6,No log,1.288177,0.54,0.233766,0.18,0.333333
7,No log,1.288671,0.44,0.248388,0.233333,0.286129
8,No log,1.48187,0.46,0.235558,0.222222,0.291213
9,No log,1.521832,0.42,0.241302,0.22906,0.273784
10,No log,1.581801,0.46,0.256232,0.25,0.298475


Generating predictions for validation set...
Results saved to ../data/result/pubmedqa/bert-base-cased/pubmedqa_results.csv
=== Model: answerdotai/ModernBERT-base  Data: pubmedqa  Training ===


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 450/450 [00:00<00:00, 2010.24 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 2398.36 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 1640.74 examples/s]
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.984101,0.52,0.260057,0.264493,0.32825
2,No log,1.058503,0.54,0.233766,0.18,0.333333
3,No log,1.172644,0.36,0.252186,0.268519,0.302106
4,No log,1.158278,0.52,0.260057,0.264493,0.32825
5,No log,1.465195,0.4,0.374074,0.378195,0.484749
6,No log,1.421444,0.52,0.405861,0.638756,0.393246
7,No log,1.786125,0.54,0.315217,0.333333,0.35512
8,No log,2.114757,0.54,0.315217,0.333333,0.35512
9,No log,2.439806,0.54,0.315528,0.352713,0.35512
10,No log,2.431877,0.48,0.292308,0.295322,0.318083


Generating predictions for validation set...
Results saved to ../data/result/pubmedqa/ModernBERT-base/pubmedqa_results.csv
=== Model: Simonlee711/Clinical_ModernBERT  Data: pubmedqa  Training ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Simonlee711/Clinical_ModernBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 450/450 [00:00<00:00, 2195.48 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 2284.13 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2472.76 examples/s]
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.980652,0.54,0.233766,0.18,0.333333
2,No log,1.004583,0.56,0.273879,0.517007,0.352941
3,No log,1.220083,0.34,0.240967,0.231602,0.267974
4,No log,1.314234,0.4,0.265006,0.24901,0.283224
5,No log,1.672227,0.46,0.393432,0.616487,0.370733
6,No log,1.999494,0.42,0.33879,0.366487,0.331518
7,No log,2.073219,0.44,0.365281,0.436749,0.351126
8,No log,2.143933,0.44,0.365281,0.436749,0.351126
9,No log,2.191538,0.44,0.365281,0.436749,0.351126
10,No log,2.207464,0.44,0.365281,0.436749,0.351126


Generating predictions for validation set...
Results saved to ../data/result/pubmedqa/Clinical_ModernBERT/pubmedqa_results.csv
=== Model: thomas-sounack/BioClinical-ModernBERT-base  Data: pubmedqa  Training ===


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at thomas-sounack/BioClinical-ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 450/450 [00:00<00:00, 1572.62 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 487.97 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 1839.69 examples/s]
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.980602,0.52,0.22807,0.176871,0.320988
2,No log,1.05993,0.54,0.233766,0.18,0.333333
3,No log,1.202001,0.42,0.297808,0.299769,0.339143
4,No log,1.119106,0.48,0.270531,0.285714,0.310821
5,No log,1.2128,0.5,0.386667,0.388889,0.409586
6,No log,1.36412,0.52,0.374062,0.603659,0.378722
7,No log,1.595955,0.54,0.438672,0.660354,0.420116
8,No log,1.835409,0.54,0.340584,0.331871,0.369644
9,No log,1.833256,0.54,0.340584,0.331871,0.369644
10,No log,1.892775,0.54,0.340584,0.331871,0.369644


Generating predictions for validation set...
Results saved to ../data/result/pubmedqa/BioClinical-ModernBERT-base/pubmedqa_results.csv


In [31]:
data_name

'HoC'