In [None]:
!pip install -qU transformers sentence-transformers datasets

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.training_args import MultiDatasetBatchSamplers
from sentence_transformers.losses import SoftmaxLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import TripletEvaluator, BinaryClassificationEvaluator, SequentialEvaluator
from transformers import EarlyStoppingCallback

import pandas as pd
import numpy as np
import random

from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

In [None]:
# Authenticate with the Hugging Face Hub using a token stored as a Kaggle secret,
# so the token itself never appears in the notebook.
user_secrets = UserSecretsClient()
secret_token = user_secrets.get_secret("UIT_21520296_DATASET")
login(secret_token)

In [None]:
# "pair-class-segmented" configuration of UIT-R2GQA; the evaluators below read its
# `question`, `context` and `label` columns.
pair_class_dataset = load_dataset(path="KhoaUIT/UIT-R2GQA", name="pair-class-segmented")
pair_class_dataset

In [None]:
# "triplet-segmented" configuration of UIT-R2GQA; the evaluators below read its
# `anchor`, `positive` and `negative` columns.
triplet_dataset = load_dataset(path="KhoaUIT/UIT-R2GQA", name="triplet-segmented")
triplet_dataset

In [None]:
# Build one {"pair-class": ..., "triplet": ...} mapping per split.
# The keys match those of the `losses` mapping, which is how each dataset is
# paired with its loss when both are handed to the trainer.
train_dataset, eval_dataset, test_dataset = (
    {
        "pair-class": pair_class_dataset[split_name],
        "triplet": triplet_dataset[split_name],
    }
    for split_name in ("train", "valid", "test")
)

In [None]:
# Load model
#
# Documentation:
#   - Inputs longer than `max_seq_length` are truncated automatically, see:
#     https://sbert.net/docs/package_reference/sentence_transformer/models.html
#     Notice:
#       + `max_seq_length` should be set explicitly so that the SentenceTransformer
#         model works properly and its behaviour is easy to understand.
#       + The original PhoBERT-base-v2 from VinAI expects inputs of at most 256 tokens,
#         which is its maximum sequence length.
#   - There are two ways to create a new SentenceTransformer object, see:
#     https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models

## First way: let SentenceTransformer wrap the checkpoint directly.
model = SentenceTransformer(model_name_or_path="vinai/phobert-base-v2")

# By default `max_seq_length` does not match the model's maximum sequence length,
# so it is set explicitly here.
model.max_seq_length = 256

## Second way: assemble the modules explicitly (kept for reference).
# from sentence_transformers import models, SentenceTransformer

# # Define Transformer model with max_seq_length=256
# transformer = models.Transformer("vinai/phobert-base-v2", max_seq_length=256)

# # Define pooling layer
# pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="mean")

# # Create SentenceTransformer model with both modules
# model = SentenceTransformer(modules=[transformer, pooling])

In [None]:
# Check whether the model truncates inputs to max_seq_length; if you get an error, recheck the model initialization step.
# paragraph = pair_class_dataset['train'][0]['context']
# model.encode(paragraph)

In [None]:
# Loss functions: one per training dataset.

# Pair-Class: two-label classifier over the sentence embeddings.
pair_class_loss = SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2,
)

# Triplet
triplet_loss = MultipleNegativesRankingLoss(model=model)

# Map each dataset key to its loss (keys must match those of `train_dataset`).
losses = {"pair-class": pair_class_loss, "triplet": triplet_loss}

In [None]:
def _triplet_evaluator(split, name):
    """Build a TripletEvaluator from the `split` split of `triplet_dataset`."""
    rows = triplet_dataset[split]
    return TripletEvaluator(
        anchors=rows["anchor"],
        positives=rows["positive"],
        negatives=rows["negative"],
        name=name,
    )


def _pair_class_evaluator(split, name):
    """Build a BinaryClassificationEvaluator from the `split` split of `pair_class_dataset`."""
    rows = pair_class_dataset[split]
    return BinaryClassificationEvaluator(
        sentences1=rows["question"],
        sentences2=rows["context"],
        labels=rows["label"],
        name=name,
    )


def _sequential(triplet_evaluator, pair_class_evaluator):
    """Chain both evaluators; the main score is the average of their individual scores."""
    return SequentialEvaluator(
        [triplet_evaluator, pair_class_evaluator],
        main_score_function=lambda scores: np.average(scores),
    )


# Evaluators for Triplet
dev_triplet_evaluator = _triplet_evaluator("valid", "triplet-dev")
test_triplet_evaluator = _triplet_evaluator("test", "triplet-test")

# Evaluators for Pair-Class
dev_pair_class_evaluator = _pair_class_evaluator("valid", "pair-class-dev")
test_pair_class_evaluator = _pair_class_evaluator("test", "pair-class-test")

# Combined evaluators (triplet first, then pair-class)
dev_evaluator = _sequential(dev_triplet_evaluator, dev_pair_class_evaluator)
test_evaluator = _sequential(test_triplet_evaluator, test_pair_class_evaluator)

# Baseline: score the model on the validation split before any training
dev_evaluator(model)

In [None]:
# Training arguments
#
# Documentation:
#   1. SentenceTransformerTrainingArguments, see:
#      https://sbert.net/docs/package_reference/sentence_transformer/training_args.html#
#      Note: SentenceTransformerTrainingArguments extends TrainingArguments with
#      additional arguments specific to Sentence Transformers.
#   2. Making a BatchSamplers/MultiDatasetBatchSamplers, see:
#      https://sbert.net/docs/package_reference/sentence_transformer/sampler.html#
#   3. Examples, see: https://sbert.net/docs/sentence_transformer/training_overview.html

args = SentenceTransformerTrainingArguments(
    output_dir="finetuned model",

    # Optimisation
    num_train_epochs=10,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    fp16=True,

    # Batching across the two training datasets
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,

    # Evaluate, checkpoint and log once per epoch
    # (step-based alternative: "steps" strategies with eval_steps / save_steps / logging_steps = 100)
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="logs",
    report_to="none",

    # Best-model selection, driven by the combined score of the dev evaluator
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_sequential_score",
    greater_is_better=True,
)

In [None]:
# 7. Create a trainer & train
#
# Notice:
#   An evaluator can be used with or without an eval_dataset, and vice versa
#   (see the documentation).

# Known issue: EarlyStoppingCallback cannot find and track any metrics here,
# so it is left disabled.
# early_stop = EarlyStoppingCallback(2)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=losses,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
    # callbacks=[early_stop]
)

trainer.train()

In [None]:
import glob
import os

# Path to the folder that holds the training checkpoints
path = '/kaggle/working/finetuned model/'


def _checkpoint_step(checkpoint_dir):
    """Return the step number from a `checkpoint-<step>` directory name, or -1 if there is none."""
    suffix = os.path.basename(os.path.normpath(checkpoint_dir)).rpartition('-')[2]
    return int(suffix) if suffix.isdecimal() else -1


# Get all directories starting with 'checkpoint', ordered by training step.
# `glob.glob` returns matches in arbitrary order, and with `save_total_limit=1` plus
# `load_best_model_at_end=True` both the best and the last checkpoint may be present,
# so the order must be made explicit before picking one.
checkpoint_dirs = sorted(
    (d for d in glob.glob(os.path.join(path, 'checkpoint*')) if os.path.isdir(d)),
    key=_checkpoint_step,
)

if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoint directories found in {path!r}; run the training cell first."
    )

# Use the most recent checkpoint (highest step number) as the source of the trainer state.
trainer_state_dir = os.path.join(checkpoint_dirs[-1], 'trainer_state.json')
print(trainer_state_dir)

In [None]:
import json

# Parse the trainer state file located in the previous cell and display it.
with open(trainer_state_dir) as state_file:
    trainer_state = json.loads(state_file.read())

trainer_state

In [None]:
# Score `model` with the validation evaluators again, now that `trainer.train()` has run.
dev_evaluator(model)

In [None]:
# Score `model` with the test-split evaluators (triplet, then pair-class).
test_evaluator(model)

**Several mistakes**

1. **with eval_dataset and dev evaluator**, KeyError: "The metric_for_best_model training argument is set to 'eval_loss', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_pair-class_loss', 'eval_triplet-dev_cosine_accuracy', 'eval_pair-class-dev_cosine_accuracy', 'eval_pair-class-dev_cosine_accuracy_threshold', 'eval_pair-class-dev_cosine_f1', 'eval_pair-class-dev_cosine_f1_threshold', 'eval_pair-class-dev_cosine_precision', 'eval_pair-class-dev_cosine_recall', 'eval_pair-class-dev_cosine_ap', 'eval_sequential_score', 'eval_triplet_loss']. Consider changing the metric_for_best_model via the TrainingArguments."

2. **with eval_dataset and without dev evaluator**, KeyError: "The `metric_for_best_model` training argument is set to 'eval_sequential_score', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_pair-class_loss', 'eval_triplet_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments."

   After training, the reported losses are only `Pair-class Loss` and `Triplet Loss` (there is no `val_loss`).

3. Early stopping requires `metric_for_best_model`, but it did not find `eval_sequential_score`, so early stopping is disabled.

   *This is a bug: `EarlyStoppingCallback` cannot find and track any metrics, even if you implement both cases above (1 and 2).*

4. ValueError: You have set `args.eval_strategy` to epoch but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`.

