# `print_and_log_results`: Print and log results.

In [6]:
import os
from datetime import datetime

def print_and_log_results(trainer, log_path='./bioasq_results.txt'):
    """Print a summary of a training run and append it to a log file.

    Args:
        trainer: Object exposing ``state.log_history`` (a list of metric
            dicts), ``state.max_steps``, ``args.num_train_epochs``,
            ``args.to_dict()`` and ``model.name_or_path``.
        log_path: Path of the text file the report is appended to.

    Metrics missing from the log history are reported as 0.
    """
    history = trainer.state.log_history

    # Search from the end: positions in the history are not fixed, so with
    # more than one evaluation the first record is not the latest one, and
    # the record carrying 'train_loss' is not necessarily at index 1.
    eval_log = next(
        (
            entry for entry in reversed(history)
            if any(str(key).startswith("eval_") for key in entry)
        ),
        {},
    )
    train_log = next(
        (entry for entry in reversed(history) if "train_loss" in entry),
        {},
    )

    # Get metrics from the latest evaluation.
    accuracy        = round(eval_log.get("eval_accuracy", 0), 4)
    f1_yes          = round(eval_log.get("eval_f1_yes", 0), 4)
    f1_no           = round(eval_log.get("eval_f1_no", 0), 4)
    macro_f1        = round(eval_log.get("eval_macro_f1", 0), 4)
    train_loss      = round(train_log.get("train_loss", 0), 4)
    validation_loss = round(eval_log.get("eval_loss", 0), 4)

    # Calculate total training time.
    total_train_time = sum(log.get("train_runtime", 0) for log in history)
    epochs_run       = trainer.args.num_train_epochs
    steps_run        = trainer.state.max_steps
    # Guard the divisions so a run with no epochs/steps still yields a report.
    epoch_time       = round(total_train_time / epochs_run, 2) if epochs_run else 0.0
    time_per_step    = round(total_train_time / steps_run, 4) if steps_run else 0.0

    # Get hyperparameters.
    hyperparameters  = trainer.args.to_dict()
    learning_rate    = hyperparameters.get("learning_rate", "N/A")
    batch_size       = hyperparameters.get("per_device_train_batch_size", "N/A")
    num_epochs       = hyperparameters.get("num_train_epochs", "N/A")

    # Get model checkpoint.
    checkpoint       = trainer.model.name_or_path

    # Create log content.
    log_content = (
        f"# {datetime.now().strftime('%Y.%m.%d.')}\n"
        f"<Hyperparameters>\n"
        f"- Model        : {checkpoint}\n"
        f"- Learning Rate: {learning_rate}\n"
        f"- Batch Size   : {batch_size}\n"
        f"- Epochs       : {num_epochs}\n\n"
        f"<Results>\n"
        f"- Accuracy     : {accuracy}\n"
        f"- F1-yes       : {f1_yes}\n"
        f"- F1-no        : {f1_no}\n"
        f"- Macro-F1     : {macro_f1}\n"
        f"- Train Loss   : {train_loss}\n"
        f"- Validation Loss: {validation_loss}\n\n"
        f"<Training Time>\n"
        f"- Total Time   : {round(total_train_time, 2)} seconds\n"
        f"- Time per Epoch: {epoch_time} seconds\n"
        f"- Time per Step : {time_per_step} seconds\n"
    )

    # Print results.
    print(log_content)

    # Append to the log file; explicit encoding keeps the output
    # independent of the platform default.
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(log_content + '\n')

# `print_trial`: Print a trial of Optuna.

In [None]:
def print_trial(trial):
    """Print the hyperparameters and results stored in a trial's user attrs.

    Reads ``trial.user_attrs`` keys 'model_name', 'best_params', 'results'
    and 'training_time'. The 'eval_runtime' entry of 'results' is skipped.
    """
    def fmt(value):
        # The '.4' precision spec is only valid for floats: it raises for
        # ints/bools and would cut strings down to four characters.
        if isinstance(value, str):
            return value
        try:
            return f'{value:.4}'
        except (TypeError, ValueError):
            return str(value)

    attrs = trial.user_attrs

    print('<Hyperparameters>')
    print(f'- Model: {attrs["model_name"]}')
    for k, v in attrs['best_params'].items():
        print(f'- {k:<15} : {fmt(v)}')
    print()

    print('<Results>')
    for k, v in attrs['results'].items():
        if k == 'eval_runtime':
            continue
        print(f'- {k:<15} : {fmt(v)}')
    print(f'- {"Training time":<15}: {fmt(attrs["training_time"])}')


In [None]:
# Sample metric records: one evaluation record followed by one training
# summary record.
# NOTE(review): this looks like `trainer.state.log_history` after a single
# epoch (the keys match what `print_and_log_results` reads) — confirm.
[{'eval_loss': 0.5662793517112732,
  'eval_accuracy': 0.7389705882352942,
  'eval_f1_yes': 0.8498942917547568,
  'eval_f1_no': 0.0,
  'eval_macro_f1': 0.4249471458773784,
  'eval_runtime': 1.5888,
  'eval_samples_per_second': 171.196,
  'eval_steps_per_second': 21.399,
  'epoch': 1.0,
  'step': 136},
 {'train_runtime': 9.0454,
  'train_samples_per_second': 119.951,
  'train_steps_per_second': 15.035,
  'total_flos': 143727127541760.0,
  'train_loss': 0.5790046243106618,
  'epoch': 1.0,
  'step': 136}]

# `compute_metrics`: Custom function that computes the metrics of the BioASQ-12b 'yesno' task.
- Accuracy.
- F1-yes.
- F1-no.
- Macro-F1.
- http://participants-area.bioasq.org/results/12b/phaseB/

In [1]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy, per-class F1 and macro-F1 for the yes/no task.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` must
            support ``.argmax(axis=1)`` to yield the predicted class ids.

    Returns:
        Dict with keys 'accuracy', 'f1_yes' (positive label 1),
        'f1_no' (positive label 0) and 'macro_f1'.
    """
    scores, y_true = eval_pred
    # Predicted class = index of the highest score in each row.
    y_pred = scores.argmax(axis=1)

    return {
        "accuracy" : accuracy_score(y_true, y_pred),
        "f1_yes"   : f1_score(y_true, y_pred, pos_label=1),
        "f1_no"    : f1_score(y_true, y_pred, pos_label=0),
        "macro_f1" : f1_score(y_true, y_pred, average="macro"),
    }

# `load_datasets`: Load the train and test datasets.

In [8]:
import json
from datasets import Dataset
import pandas as pd

def encode_label(ds):
    """Turn the 'answer_exact' column into an integer 'labels' column.

    'yes' is encoded as 1; every other value is encoded as 0. The column is
    then renamed from 'answer_exact' to 'labels'.
    """
    def to_binary(example):
        # Anything that is not exactly 'yes' counts as the negative class.
        is_yes = example['answer_exact'] == 'yes'
        return {'answer_exact': 1 if is_yes else 0}

    encoded = ds.map(to_binary)
    return encoded.rename_column('answer_exact', 'labels')

def _load_yesno_frame(path):
    """Read one BioASQ JSON file into a DataFrame of its 'yesno' questions."""
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Keep only samples with type = 'yesno'.
    rows = [
        {
            "question": question['body'],
            "snippets": "\n".join(s['text'] for s in question['snippets']),
            "documents": "\n".join(question['documents']),
            "answer_exact": question.get('exact_answer', ''),
            "answer_ideal": question.get('ideal_answer', ''),
        }
        for question in data['questions']
        if question['type'] == 'yesno'
    ]

    def first_ideal(answer):
        # 'ideal_answer' may be a list of alternatives or a plain string
        # (including the '' default). Indexing a string would keep only its
        # first character, and indexing an empty value would raise.
        if isinstance(answer, list):
            return answer[0] if answer else ''
        return answer

    # Explicit columns keep the frame well-formed even when a file holds no
    # 'yesno' question at all.
    df = pd.DataFrame(
        rows,
        columns=["question", "snippets", "documents",
                 "answer_exact", "answer_ideal"],
    )
    df['answer_ideal'] = df['answer_ideal'].apply(first_ideal)
    return df


def load_datasets_all():
    """Load the training file and the four test splits.

    Returns:
        ``(train_ds, valid_ds, test_ds)``. ``train_ds`` and ``valid_ds``
        come from an 80/20 split (seed 42) of the training file;
        ``test_ds`` is a list with one dataset per test file, in path
        order. All of them have the label column encoded by
        ``encode_label``.
    """
    # Data path.
    path_list = [
        './datasets/training_12b.json',    # train set.
        './datasets/12B1_golden.json',     # test set, split 1 ~ 4.
        './datasets/12B2_golden.json',
        './datasets/12B3_golden.json',
        './datasets/12B4_golden.json'
    ]

    # Load.
    df_list = [_load_yesno_frame(path) for path in path_list]

    # Split train and test set.
    train_df = df_list[0]
    test_df  = df_list[1:]

    # Null check.
    for df in df_list:
        if df.isnull().values.any():
            print("Null found!")

    # df -> ds, then encode label.
    train_ds = encode_label(Dataset.from_pandas(train_df))
    test_ds  = [encode_label(Dataset.from_pandas(df)) for df in test_df]

    # Train-Valid split. The split itself is seeded; the follow-up shuffles
    # are not, so row order within each part varies between runs.
    split    = train_ds.train_test_split(test_size=0.2, shuffle=True, seed=42)
    train_ds = split['train'].shuffle()
    valid_ds = split['test'].shuffle()

    return train_ds, valid_ds, test_ds


In [None]:
def load_datasets(ds_type='train'):
    """Load the dataset(s) selected by ``ds_type``.

    Args:
        ds_type: 'train', 'test', or 'test-1' ~ 'test-4'.

    Returns:
        - 'train': ``(train_ds, valid_ds)`` from an 80/20 split of the
          training file.
        - 'test': list with one dataset per test file, in path order.
        - 'test-N': the single dataset of test split N.

    Raises:
        ValueError: If ``ds_type`` is not one of the values above.
    """
    # Data paths.
    path_list = [
        './datasets/training_12b.json',    # train set.
        './datasets/12B1_golden.json',     # test set, split 1 ~ 4.
        './datasets/12B2_golden.json',
        './datasets/12B3_golden.json',
        './datasets/12B4_golden.json'
    ]

    # Validate before doing anything else. Parsing the split number blindly
    # would map 'test-0' to index 0 (the training file) and let
    # out-of-range numbers surface as a bare IndexError.
    num_test_splits = len(path_list) - 1
    valid_types = ['train', 'test'] + [
        f'test-{n}' for n in range(1, num_test_splits + 1)
    ]
    if ds_type not in valid_types:
        raise ValueError(f"Invalid ds_type: {ds_type}. Choose from 'train', 'test', 'test-1', 'test-2', 'test-3', 'test-4'.")

    # Local imports, placed after validation so invalid arguments fail fast.
    import json
    import pandas as pd
    from datasets import Dataset

    # Handle dataset type argument.
    if ds_type == 'train':
        selected_paths = [path_list[0]]  # Only training dataset.
    elif ds_type == 'test':
        selected_paths = path_list[1:]   # All test datasets.
    else:
        split_no = int(ds_type.split('-')[1])
        selected_paths = [path_list[split_no]]  # Specific test split.

    # Load datasets.
    columns = ["question", "snippets", "documents",
               "answer_exact", "answer_ideal"]
    df_list = []
    for path in selected_paths:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Load only 'yesno' samples.
        rows = [
            {
                "question": question['body'],
                "snippets": "\n".join(s['text'] for s in question['snippets']),
                "documents": "\n".join(question['documents']),
                "answer_exact": question.get('exact_answer', ''),
                "answer_ideal": question.get('ideal_answer', ''),
            }
            for question in data['questions']
            if question['type'] == 'yesno'
        ]

        # Explicit columns keep the frame well-formed even when a file holds
        # no 'yesno' question at all.
        df = pd.DataFrame(rows, columns=columns)
        # 'ideal_answer' may be a list of alternatives; keep the first one.
        df['answer_ideal'] = df['answer_ideal'].apply(lambda x: x[0] if isinstance(x, list) else x)
        df_list.append(df)

    # Check for null values.
    for df in df_list:
        if df.isnull().values.any():
            print("Null found!")

    # Convert DataFrame to Dataset.
    if ds_type == 'train':
        train_ds = Dataset.from_pandas(df_list[0])

        # Train-Validation split (no seed is passed, so the split differs
        # between runs).
        split = train_ds.train_test_split(test_size=0.2)
        train_ds = split['train'].shuffle()
        valid_ds = split['test'].shuffle()
        return train_ds, valid_ds

    if ds_type == 'test':
        return [Dataset.from_pandas(df) for df in df_list]

    # Single test split.
    return Dataset.from_pandas(df_list[0])


# `get_logits_for_ensemble`: Calculate logits of each model for ensemble.

In [7]:
from datasets import Dataset
import torch
import numpy as np
from datasets import disable_progress_bar


# for soft voting, i.e. mean of logits.
def get_logits_for_ensemble(ds, model, tokenizer, batch_size):
    """Compute one logit vector per sample of ``ds`` with ``model``.

    Args:
        ds: Dataset with an 'input' text column; must support ``len`` and
            slice indexing that returns a dict of column lists.
        model: Classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of samples per forward pass.

    Returns:
        List with one numpy array per sample, in dataset order, holding
        that sample's logits.
    """
    # Disable progress bar (kept so the call has the same global effect on
    # `datasets` progress bars as before).
    disable_progress_bar()

    # Send inputs to the device the model's weights are on, so inputs and
    # weights can never end up on different devices. Fall back to the plain
    # cuda/cpu choice for a model without parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Inference must not run with dropout active; the previous mode is
    # restored afterwards so the caller's model state is untouched.
    was_training = model.training
    model.eval()

    model_logits = []
    try:
        for start in range(0, len(ds), batch_size):
            # Slicing yields a dict of columns; take the raw texts.
            texts = ds[start:start + batch_size]['input']

            # Tokenize the whole batch at once. Every sequence is padded to
            # the same fixed length, so batching does not change the input
            # each sample receives.
            encoded = tokenizer(texts,
                                truncation     = True,
                                padding        = 'max_length',
                                max_length     = 512,
                                return_tensors = 'pt')

            with torch.no_grad():
                outputs = model(input_ids=encoded['input_ids'].to(device),
                                attention_mask=encoded['attention_mask'].to(device))

            # One row of the batch logits per sample.
            model_logits.extend(outputs.logits.cpu().numpy())
    finally:
        model.train(was_training)

    return model_logits

# `split_by_snippets_docs`: For each sample, build two new samples: one from the question and its snippets, and one from the question and its retrieved documents.

In [6]:
def split_by_snippets_docs(ds, only_no):
    """Split samples into a snippets-only and a documents-only variant.

    Each split sample yields two inputs with the original label: the
    question plus its snippets section (followed by "\nAnswer:\n"), and the
    question plus the retrieved-chunks text placed under a "Snippets:"
    header.

    Args:
        ds: Iterable of samples with 'input' and 'labels' fields.
        only_no: If truthy, only samples with label 0 are split; all other
            samples are copied through unchanged.

    Returns:
        A new Dataset with columns 'input' and 'labels'.
    """
    snippet_marker = '\nSnippets:'
    chunk_marker   = '\nRetrieved Chunks:'
    chunk_skip     = len('\nRetrieved Chunks:\n')

    new_inputs = []
    new_labels = []

    for row in ds:
        text   = row['input']
        target = row['labels']

        # In only_no mode, samples with a non-zero label pass through as-is.
        if only_no and target != 0:
            new_inputs.append(text)
            new_labels.append(target)
            continue

        snippet_at = text.find(snippet_marker)
        chunk_at   = text.find(chunk_marker)

        head         = text[:snippet_at]
        snippet_part = text[snippet_at:chunk_at]
        chunk_part   = text[chunk_at + chunk_skip:]

        # Snippets variant first, then documents variant.
        new_inputs.extend([
            head + snippet_part + "\nAnswer:\n",
            head + "\nSnippets:\n" + chunk_part,
        ])
        new_labels.extend([target, target])

    return Dataset.from_dict({"input": new_inputs, "labels": new_labels})