# Workflow for Fine-Tuning an LLM for Multiple Choice | DeBERTaV3 model
**Main Steps:**
* Prepare training & validation data.
* Preprocess the data using the tokenizer of the pretrained model (DeBERTaV3)
* Train the (pretrained) model - incl. the evaluation strategy and metrics
* Generate predictions with the newly fine-tuned model

Thanks to the [Hugging Face guide for fine-tuning](https://huggingface.co/docs/transformers/tasks/multiple_choice) (which I tried to replicate here)!

# This script trains DeBERTa on the full training set, with the best hyperparameters selected on the validation set.

# Imports

In [None]:
# Install the notebook's dependencies with shell-level pip (Colab style).
# peft, trl and langchain are version-pinned/installed here but are not imported
# by any cell visible in this notebook -- NOTE(review): confirm they are still needed.
!pip install -q accelerate peft==0.5.0 bitsandbytes trl==0.4.7 langchain
# transformers and sentencepiece are installed unpinned, bypassing the pip cache.
!pip install -q --no-cache-dir transformers sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/265.7 kB[0m [31m952.8 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/265.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m256.0/265.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m3.8 MB/s[0m eta 

In [None]:
import pandas as pd
import numpy as np
import random

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice, BitsAndBytesConfig, TrainingArguments, Trainer

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

from sklearn.metrics import accuracy_score

In [None]:
# Log in to the Hugging Face Hub so fine-tuned models can be loaded from / pushed to it
# (the training arguments further down set push_to_hub=True).
from huggingface_hub import notebook_login

# Renders an interactive login widget in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Global Variables

In [None]:
# Pretrained checkpoint to fine-tune.
# Alternatives tried: 'microsoft/deberta-v3-large', 'roberta-base'.
checkpoint = 'bert-base-uncased'

# Answer letters of every multiple-choice question, in column order.
options = list('ABCDE')

# Letter <-> integer label lookups, both derived from `options` so they cannot drift apart.
option_to_index = {letter: position for position, letter in enumerate(options)}
index_to_option = dict(enumerate(options))

# 1. Load Datasets

## 1.1. Select Training Data

In [None]:
# Seed every RNG this notebook relies on. `random.seed` alone is not enough:
# pandas' `DataFrame.sample` draws from NumPy's global RNG, and the multiple-choice
# head created later by `from_pretrained` is randomly initialised through PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the training data, drop the stored 'index' column and use a clean 0..n-1 row index.
data = (
    pd.read_csv('/content/thesis_questions_train.csv')
    .drop(columns="index")
    .reset_index(drop=True)
)

# This run trains on the full training set. To hold out 10% for validation instead:
# validation_data = data.sample(frac=0.10)
# train_data = data.drop(validation_data.index)
train_data = data

# print training (& validation) sizes
# print(f"Train data: {train_data.shape}, Validation data: {validation_data.shape}")
print(f"Train data: {train_data.shape}")

Train data: (780, 7)


In [None]:
# examples in train data
train_data.head(5)

Unnamed: 0,prompt,A,B,C,D,E,answer
0,A potential relationship between a given disea...,cross-sectional study.,case-control study.,ecological study.,correlational study.,descriptive study.,B
1,Which of the following is not an indication fo...,post-inflammatory liver cirrhosis.,Budd-Chiari syndrome.,Wilson’s disease.,primary and secondary biliary liver cirrhosis.,disseminated hepatocellular carcinoma.,E
2,A patient presents to the gynaecological outpa...,overactive bladder.,exercise-induced urinary incontinence caused b...,mixed urinary incontinence.,evacuation of accumulated lymph fluid through ...,vesicovaginal fistula.,E
3,Superior vena cava syndrome:,most commonly occurs in the course of lung can...,may be associated with vessel thrombosis.,is a contraindication to the use of corticoste...,A and B are correct.,"A, B and C are correct.",D
4,The treatment of acute hyperkalemia includes t...,156.,245.,356.,236.,136.,D


## 1.2. Create datasets for training

In [None]:
# Convert the pandas DataFrame into a Hugging Face Dataset.
train_ds = Dataset.from_pandas(train_data.reset_index()) # reset_index() (without drop=True) turns the row index into an 'index' column of the Dataset
# validation_ds = Dataset.from_pandas(validation_data.reset_index())

# Print the dataset's features and one example row as a sanity check.
print(train_ds)
print("An example:\n", train_ds[3])

Dataset({
    features: ['index', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 780
})
An example:
 {'index': 3, 'prompt': 'Superior vena cava syndrome:', 'A': 'most commonly occurs in the course of lung cancer.', 'B': 'may be associated with vessel thrombosis.', 'C': 'is a contraindication to the use of corticosteroids.', 'D': 'A and B are correct.', 'E': 'A, B and C are correct.', 'answer': 'D'}


# 2. Preprocess
Process the datasets into a format suitable as the model's input. According to the documentation, DeBERTa expects pairs of tokenized sentences and the following fields: 'input_ids', 'token_type_ids', 'attention_mask', 'label'

In [None]:
# Create the tokenizer that matches the chosen model checkpoint
# (its files are downloaded from the Hugging Face Hub on first use).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Quick manual check of sentence-pair tokenization (disabled):
# tokenizer("This is the first sentence.", "This is the second one.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess(example):
    """Tokenize one multiple-choice question as one (prompt, option) pair per option.

    Args:
        example: A single dataset row (mapping) with the question text under
            'prompt', one answer text per letter listed in the global `options`,
            and the correct answer letter under 'answer'.

    Returns:
        The tokenizer output for all pairs of this question, with an added integer
        'label' entry: the index of the correct option according to `option_to_index`.

    Raises:
        Exception: Any error raised by the tokenizer is re-raised unchanged after
            the offending inputs have been printed.
        KeyError: If a required column is missing or 'answer' is not a known option letter.
    """
    # AutoModelForMultipleChoice expects a set of question/answer pairs,
    # so the prompt is repeated once per answer option before tokenizing.
    second_sentences = [example[option] for option in options]
    first_sentence = [example['prompt']] * len(second_sentences)

    try:
        # truncation=True (as in the Hugging Face multiple-choice guide) clips pairs that
        # exceed the tokenizer's maximum length instead of passing over-long inputs to the model.
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=True)
    except Exception as e:
        # Print error message and the data causing the error
        print(f"Error during tokenization: {e}")
        print(f"First sentence: {first_sentence}")
        print(f"Second sentences: {second_sentences}")
        # Bare raise keeps the original exception and traceback intact.
        raise

    # Kept outside the try block so a bad answer letter is not reported as a tokenization error.
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example


* **Note:** the preprocess function above takes a single sample of the dataset. Data processing can be sped up by implementing a preprocess function that handles multiple samples at once, which allows the use of **batched=True** in the map function.
* Here is the code on Hugging Face: https://huggingface.co/docs/transformers/tasks/multiple_choice

In [None]:
# Tokenize every training row. All raw columns are dropped afterwards so that only
# the tokenizer outputs and the integer label remain as model inputs.
_raw_columns = ['index', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
tokenized_train_ds = train_ds.map(preprocess, remove_columns=_raw_columns)
# tokenized_validation_ds = validation_ds.map(preprocess, remove_columns=_raw_columns)

# Show the resulting features as a sanity check.
print(tokenized_train_ds)
# print(tokenized_validation_ds)

Map:   0%|          | 0/780 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 780
})


# 3. Model Training

* Create a custom data collator for multiple-choice training. It pads each batch dynamically (only up to the longest sequence in that batch), which speeds up training and is more memory efficient.
* Instruction: https://huggingface.co/docs/transformers/tasks/multiple_choice#train

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples into padded tensors.

    Each incoming feature is a dict whose tokenizer fields (e.g. 'input_ids') hold one
    sequence per answer choice, plus an optional integer 'label' / 'labels' entry.
    Calling the collator returns a dict of tensors reshaped to
    [batch_size, num_choices, sequence_length]; when labels are present an int64
    'labels' tensor is added as well.

    The input feature dicts are left unmodified.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True  # Padding strategy forwarded to tokenizer.pad
    max_length: Optional[int] = None  # Maximum length forwarded to tokenizer.pad
    pad_to_multiple_of: Optional[int] = None  # If set, forwarded to tokenizer.pad to pad to a multiple of this value

    def __call__(self, features):
        """Build one padded batch from a list of per-question feature dicts."""
        # Determine which key (if any) carries the label.
        label_name = "label" if "label" in features[0] else "labels"
        has_labels = label_name in features[0]

        # Number of questions in the batch and choices per question
        # (all questions are assumed to have the same number of choices).
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (question, choice) pair. The label is skipped here rather
        # than popped, so the caller's dicts are not mutated, and the flat list is built
        # directly instead of concatenating nested lists with sum(..., []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        # Pad all pairs to a common length and convert them to tensors.
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Regroup the flat pairs into [batch_size, num_choices, sequence_length].
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}

        # Labels are optional so the collator also works for unlabeled inference batches.
        if has_labels:
            labels = [feature[label_name] for feature in features]
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)

        return batch


* Define the evaluation metric for training - simple accuracy is used here.

In [None]:
def compute_metrics(eval_pred):
    """Compute top-1 accuracy from an evaluation prediction object.

    Args:
        eval_pred: Object exposing `predictions` (one score per answer choice for
            each example) and `label_ids` (the true choice indices).

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores = eval_pred.predictions
    gold = eval_pred.label_ids
    # The predicted choice is the index of the highest score in each row.
    predicted = scores.argmax(axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

* Train the model using the Trainer API. In memory-constrained settings like this one, a smaller batch size requires less memory at the cost of longer training time.
* **Note:** choosing training hyperparameters is something of an art. One option is to reuse the parameters from [Radek's notebook](https://www.kaggle.com/code/radek1/new-dataset-deberta-v3-large-training).

In [None]:
# Load the pretrained checkpoint with a multiple-choice classification head.
# As the load-time warning below shows, the head ('classifier.weight', 'classifier.bias')
# is newly initialised and only becomes useful after fine-tuning.
model = AutoModelForMultipleChoice.from_pretrained(checkpoint) # Initialise the pretrained model with a multiple choice classification head

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Output directory passed to TrainingArguments below. Change the name when
# switching checkpoints or runs, e.g.:
# output_dir='./roberta-lek-full-train-4epochs'
output_dir='./bert-lek-full-train-4epochs-run2'

# Hyperparameters for the full-training-set run (no validation set, hence no
# evaluation/save strategy is configured).
training_args = TrainingArguments(
    warmup_ratio=0.8,  # NOTE(review): warm-up covers 80% of the training steps, which is unusually long -- confirm this is intended
    learning_rate=5e-6,
    # evaluation_strategy="epoch", # Comment out if no evaluation on validation set
    # save_strategy="epoch",
    # save_total_limit=1,  # The number of saved checkpoints, can be increased
    # optim='adamw_torch'  # This is the default optimizer
    per_device_train_batch_size=2, # TRY INCREASE THIS - 2 almost exceeds the RAM limit
    per_device_eval_batch_size=2,
    # gradient_accumulation_steps=2, # Set to 2 to increase the batch size without increasing the memory requirement
    num_train_epochs=4,
    report_to='none',  # do not report to any experiment-tracking integration
    push_to_hub=True, # the trained model is pushed to hub. Yayy!
    # output_dir='./deberta-v3-lek-full-train-correct-4epochs',
    output_dir=output_dir,
    # logging_steps=50,
    # logging_strategy="steps"
)

In [None]:
# Wire the model, training arguments, tokenizer, multiple-choice collator and
# training data into a Trainer. No eval_dataset is passed, so there is no
# validation data for evaluation during training.
# NOTE(review): compute_metrics is presumably only used when the trainer evaluates or
# predicts on labeled data -- confirm against the Trainer documentation.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_train_ds,
    # eval_dataset=tokenized_validation_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned TrainOutput (global step, training loss, runtime metrics) is displayed below.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.6267
1000,1.6119
1500,1.5584


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/toke

TrainOutput(global_step=1560, training_loss=1.5946768296070588, metrics={'train_runtime': 277.7897, 'train_samples_per_second': 11.232, 'train_steps_per_second': 5.616, 'total_flos': 573828753963900.0, 'train_loss': 1.5946768296070588, 'epoch': 4.0})

* More training may help - since we can see that loss and accuracy continue to improve after each epoch (on the validation set).
* More data may help - provided it is relevant to the task (i.e. medical exam questions).

# 4. Predict with new Model
The data needs to be processed in the same way as in training before it can be fed into the model for predictions.

In [None]:
# Build the test set with the same preprocessing function used for training.
test_df = pd.read_csv('/content/thesis_questions_test.csv')
_test_ds = Dataset.from_pandas(test_df.drop(columns=['index']))
# 'answer' is not listed in remove_columns, so it stays in the tokenized dataset
# next to the derived integer 'label'.
tokenized_test_ds = _test_ds.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E'])

Map:   0%|          | 0/195 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized test dataset's features and row count.
tokenized_test_ds

Dataset({
    features: ['answer', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 195
})

In [None]:
# Reload the trained BERT/deberta model
# model_trained = AutoModelForMultipleChoice.from_pretrained("yy0514/deberta-v3-lek-train-new-4epochs")

In [None]:
# reloaded_trainer = Trainer(
#                     model = model_trained,
#                     args=training_args,
#                     tokenizer=tokenizer,
#                     data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
#                     train_dataset=tokenized_train_ds, # Is this one right?
#                     # eval_dataset=tokenized_validation_ds,
#                     compute_metrics=compute_metrics,
#                     )

# # predict on test data
# test_preds = reloaded_trainer.predict(tokenized_test_ds)

In [None]:
# Names of the result columns. Rename when switching checkpoints, e.g.
# 'deberta_v3_train_top1' / 'deberta_v3_train_top3' or
# 'roberta_train_top1' / 'roberta_train_top3'.
top1_col = 'bert_train_top1'
top3_col = 'bert_train_top3'

# Score every test question with the fine-tuned model.
test_preds = trainer.predict(tokenized_test_ds)

# Order the choice indices of each question from highest to lowest score
# (argsort on the negated scores), then translate the indices into answer letters.
preds_as_ids = np.argsort(-test_preds.predictions, axis=1)
preds_as_answer_letters = np.array(list('ABCDE'))[preds_as_ids]

# Best answer per question, stored as a string column of the test frame.
top_1_preds_as_string = [' '.join(ranked[:1]) for ranked in preds_as_answer_letters]
test_df[top1_col] = top_1_preds_as_string

# Three best answers per question, space-separated in ranking order.
top_3_preds_as_string = [' '.join(ranked[:3]) for ranked in preds_as_answer_letters]
test_df[top3_col] = top_3_preds_as_string

# Keep only the identifying and result columns, then persist them.
test_df = test_df[['index', 'answer', top1_col, top3_col]]
test_df.to_csv('./test_results.csv', index=False)

# Preview the first rows of the saved results.
test_df.head(3)

Unnamed: 0,index,answer,bert_train_top1,bert_train_top3
0,199,B,C,C D B
1,789,E,E,E A C
2,174,B,B,B D A


In [None]:
# Sense checks: for the first three test questions, show the raw model scores,
# the choice indices sorted by descending score, and the resulting top-3 letters.
print("Model outputs for 3 test data points:\n ", test_preds.predictions[:3])
print("Sorted outputs as IDs:\n", preds_as_ids[:3])
print("Top 3 outputs as label:\n", top_3_preds_as_string[:3] )

Model outputs for 3 test data points:
  [[-0.7332759  -0.7035437  -0.10445204 -0.5317845  -0.73758334]
 [ 0.60486174  0.4184134   0.45390385  0.44373143  0.66629535]
 [-1.318665   -1.2489964  -1.3421845  -1.2863371  -1.3623228 ]]
Sorted outputs as IDs:
 [[2 3 1 0 4]
 [4 0 2 3 1]
 [1 3 0 2 4]]
Top 3 outputs as label:
 ['C D B', 'E A C', 'B D A']


# Evaluation

In [None]:
# Reload the saved predictions. The path is the same relative path the prediction
# step wrote with to_csv('./test_results.csv'), so this also works when the
# working directory is not /content.
test_df = pd.read_csv("./test_results.csv")
test_df.head()

Unnamed: 0,index,answer,bert_train_top1,bert_train_top3
0,199,B,C,C D B
1,789,E,E,E A C
2,174,B,B,B D A
3,467,A,A,A B D
4,66,D,E,E B D


In [None]:
# Compute the Mean Average Precision (MAP) for the top 3 choices

def precision_at_k(relevance, k):
    """Return the fraction of relevant items among the first `k` entries.

    Args:
        relevance: Sequence of 0/1 flags, one per ranked prediction.
        k: Number of leading entries to consider; must not exceed len(relevance).

    Returns:
        sum(relevance[:k]) / k as a float.
    """
    assert len(relevance) >= k, "k is larger than the length of the list of predictions"
    hits_in_top_k = sum(relevance[:k])
    return hits_in_top_k / k

def MAP_at_3(df, true_col, prediction_col):
    """Compute MAP@3 for single-answer questions.

    Every question has exactly one correct answer, so its average precision is the
    reciprocal rank of the first correct prediction within the top three
    (1, 1/2 or 1/3), or 0 when the correct answer is not among them.

    Args:
        df: DataFrame with one row per question.
        true_col: Name of the column holding the correct answer letter.
        prediction_col: Name of the column holding the ranked predictions as a
            whitespace-separated string, best first (e.g. "C D B").

    Returns:
        The mean score over all rows as a float; 0.0 for an empty DataFrame
        (instead of failing with a division by zero).
    """
    n_rows = len(df)
    if n_rows == 0:
        return 0.0

    map_score = 0.0
    # zip over the two columns avoids building a Series per row as iterrows() does.
    for true_answer, ranked in zip(df[true_col], df[prediction_col]):
        for rank, pred in enumerate(ranked.split()[:3], start=1):
            if pred == true_answer:
                # Only the first correct prediction counts towards the score.
                map_score += 1 / rank
                break
    return map_score / n_rows


In [None]:
def calculate_accuracy(df, true_col, pred_col):
    """Return the share of rows whose prediction equals the true answer.

    Args:
        df: DataFrame with one row per question.
        true_col: Name of the column holding the correct answers.
        pred_col: Name of the column holding the top-1 predictions.

    Returns:
        Number of matching rows divided by the total number of rows.
    """
    # Element-wise comparison of the two columns gives one boolean per row.
    is_correct = df[true_col].eq(df[pred_col])
    n_correct = is_correct.sum()
    return n_correct / len(df)

In [None]:
# Top-1 accuracy: share of test questions whose best-ranked option equals the true answer.
accuracy = calculate_accuracy(test_df, 'answer', top1_col)
print(f"Accuracy: {accuracy}")

# MAP@3: also gives partial credit when the true answer is ranked second or third.
map_at_3_score = MAP_at_3(test_df, 'answer', top3_col)
print(f"MAP at 3: {map_at_3_score}")

Accuracy: 0.37435897435897436
MAP at 3: 0.5435897435897432
