# Training an Open Book Model for Q & A

For training a model for use with Open Book Q&A need a csv that contains: `prompt` ("questions"), `A,B,C,D,E` (answer choices), and also a column with the `context`.

In [2]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = "0.1"

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer, EarlyStoppingCallback, AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

VER = 2
# Train with subset of 60K
NUM_TRAIN_SAMPLES = 1_024
# Parameter efficient fine tuning 

USE_PEFT = False

FREEZE_LAYERS = 18

FREEZE_EMBEDDINGS = True

MAX_INPUT = 256

MODEL = 'microsoft/deberta-v3-large'

In [3]:
df_valid = pd.read_csv("/kaggle/input/60k-data-with-context-v2/train_with_context2.csv")
print("df shape", df_valid.shape)

df_valid.head()

df shape (200, 8)


Unnamed: 0,prompt,context,A,B,C,D,E,answer
0,Which of the following statements accurately d...,The presence of a clustered thick disk-like co...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Many of these systems evolve in a self-similar...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Renormalization is distinct from regularizatio...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,Several qualitative observations can be made o...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [4]:
df_train = pd.read_csv("/kaggle/input/60k-data-with-context-v2/all_12_with_context2.csv")
df_train = df_train.drop(columns="source")
df_train = df_train.fllna("").sample(NUM_TRAIN_SAMPLES)
print("Train data size: ", df_train.shape)
df_train.head()

Train data size:  (60347, 9)


Unnamed: 0,prompt,context,A,B,C,D,E,answer,source
0,"In relation to Eunice Fay McKenzie's career, w...","Eunice Fay McKenzie (February 19, 1918 – April...",McKenzie showcased her singing talents in nume...,McKenzie is primarily remembered for her starr...,McKenzie gained recognition for her role as a ...,McKenzie's collaborations with director Blake ...,McKenzie's successful career in sound films co...,B,1
1,How does Modified Newtonian Dynamics (MOND) im...,The presence of a clustered thick disk-like co...,MOND is a theory that increases the discrepanc...,MOND explains the missing baryonic mass in gal...,MOND is a theory that reduces the observed mis...,MOND is a theory that eliminates the observed ...,MOND's impact on the observed missing baryonic...,E,1
2,Which of the following statements accurately d...,Woody Hartman is a retired American soccer goa...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B,1
3,What is the significance of the Museum of the ...,The Museum of the Occupation of Latvia () is a...,The Museum of the Occupation of Latvia is a me...,The Museum of the Occupation of Latvia showcas...,The Museum of the Occupation of Latvia was est...,The Museum of the Occupation of Latvia primari...,The Museum of the Occupation of Latvia is a mu...,C,1
4,What was the previous name of the Christian Sc...,It was named the Evangelical School for the De...,The Christian School for the Deaf (CSD),The Christian School for the Blind (CSB),The Evangelical School and Chapel for the Deaf...,The Evangelical School for the Deaf (ESD),The Evangelical School for the Blind (ESB),D,1


### Data Loader

In [23]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
index_to_option = {v: k for k,v in option_to_index.items()}

def preprocess(example):
    first_sentence = ['[CLS]' + example['context'] ] * 5
    second_sentences = [" ####" + example['prompt'] + " [SEP] " + example[option] if example[option] else ' ' + " [SEP]" for option in "ABCDE"]
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation = "only_first", max_length=MAX_INPUT,
                                 add_special_tokens = False)
    tokenized_example['label'] = option_to_index[example['answer']]
    
    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k,v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        batch = self.tokenizer.pad(
            flattened_features,
            padding = self.padding,
            max_length = self.max_length,
            pad_to_multiple_of = self.pad_to_multiple_of,
            return_tensors = 'pt',
        )
        batch = {k:v.view(batch_size, num_choices, -1) for k,v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
dataset_valid = Dataset.from_pandas(df_valid)
dataset = Dataset.from_pandas(df_train)
#dataset = dataset.remove_columns(['__index_level_0__'])
dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer', 'source'],
    num_rows: 60347
})

In [21]:
tokenized_dataset_valid = dataset_valid.map(preprocess, remove_columns=['prompt','context','A','B','C','D','E', 'answer'])
tokenized_dataset = dataset.map(preprocess, remove_columns=['prompt','context','A','B','C','D','E','answer'])
tokenized_dataset

  0%|          | 0/200 [00:00<?, ?ex/s]

  0%|          | 0/60347 [00:00<?, ?ex/s]

Dataset({
    features: ['source', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 60347
})

### Build Model

using Hugging Face AutoModelForMultipleChoice. 
optionally can also use PEFT to accelerate training and uses less memory. However it has noticed that validation accuracy is lower. 
Can also feeze layers to accelerate training and lower memory use but this can also result in worse validation accuracy. 

In [13]:
model = AutoModelForMultipleChoice.from_pretrained(MODEL)

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
if FREEZE_EMBEDDINGS:
    print('Freezing Embeddings --------------------------------------------')
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
if FREEZE_LAYERS > 0:
    print(f"Freezing {FREEZE_LAYERS} layers. ---------------------------------------")
    for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        for param in layer.parameters():
            param.requires_grad = False

Freezing Embeddings --------------------------------
Freezing 18 layers. ---------------------------------------


### MAP@3 Metric 

In [25]:
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions), axis = 1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y == j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions.tolist()
    labels = p.label_ids.tolist()
    return {"map@3" : map_at_3(predictions, labels)}

### Train and Save

Tricks to train model more efficiently when low RAM

- use fp16 (speeds up T4 not p100)
- use `gradient_accumulation_steps` (this simulates large batch sizes)
- use `gradient_checkpointing` (this uses disk to save RAM)
- freeze model embeddings (this reduces weights to train)
- freeze some model layers (this reduces weights to train)
- use PEFT (reduce weights to train)
- increase LR and decrease epochs (this reduces work)
- use smaller model (this reduces weights to train)

In [16]:
training_args = TrainingArguments(
    warmup_ratio = 0.1, 
    learning_rate = 2e-5,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 2,
    num_train_epochs = 2,
    report_to = 'none',
    output_dir = f'./checkpoints_{VER}',
    overwrite_output_dir = True,
    fp16 = True,
    gradient_accumulation_steps = 8,
    logging_steps = 25,
    evaluation_strategy = 'steps',
    eval_steps = 25,
    save_strategy = 'steps',
    save_steps = 25,
    load_best_model_at_end = False,
    metric_for_best_model = 'map@3',
    lr_scheduler_type = 'cosine',
    weight_decay = 0.01,
    save_total_limit = 2,
)

In [26]:
trainer = Trainer(
    model = model,
    args = training_args,
    tokenizer = tokenizer,
    data_collator = DataCollatorForMultipleChoice(tokenizer = tokenizer),
    train_dataset = tokenized_dataset,
    eval_dataset = tokenized_dataset_valid,
    compute_metrics = compute_metrics,
)

trainer.train()
trainer.save_model(f'model_v{VER}')

Step,Training Loss,Validation Loss,Map@3
25,1.604,1.609468,0.354167
50,1.625,1.609424,0.354167
75,1.619,1.609385,0.395833
100,1.6268,1.609214,0.439167
125,1.6114,1.609014,0.493333
150,1.6143,1.608638,0.551667
175,1.6173,1.608003,0.615
200,1.6192,1.606157,0.6875
225,1.6056,1.601934,0.731667
250,1.596,1.577207,0.769167


KeyboardInterrupt: 

In [28]:
trainer.save_model(f"model_v{VER}")

### Verifying saved Model

Checking to see if the model saved correctly

In [29]:
load_model = AutoModelForMultipleChoice.from_pretrained(f'model_v{VER}')
trainer = Trainer(model = load_model)

In [30]:
test_df = pd.read_csv("/kaggle/input/60k-data-with-context-v2/train_with_context2.csv")
tokenized_test_dataset = Dataset.from_pandas(test_df).map(
preprocess, remove_columns = ['prompt','context','A','B','C','D','E'])

test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
predictions_as_string = test_df['prediction'] = [' '.join(row) for row in predictions_as_answer_letters[:,:3]]


  0%|          | 0/200 [00:00<?, ?ex/s]

### Compute Validation Score

In [33]:
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k

def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_3 = 0.0
    for u in range(U):
        user_preds = predictions[u].split()
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), 3)):
            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]
    return map_at_3 / U

In [34]:
m = MAP_at_3(test_df.prediction.values, test_df.answer.values)
print("CV MAP@3 = ", m)

CV MAP@3 =  0.8208333333333334
