# Training an Open Book Model for Q & A

In [None]:
import os

# Expose GPUs 0 and 1 to this process. CUDA_VISIBLE_DEVICES takes a
# comma-separated list of device indices ("0.1" is not a valid list), and it
# is set before torch is imported so the setting is in place when CUDA starts.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
# NOTE: the class is `TrainingArguments`; the previous spelling
# (`TrainingmArguments`) raised ImportError at cell execution.
from transformers import (
    AutoModelForMultipleChoice,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Presumably a version tag for this training run -- usage is not in view.
VER = 2

# Model checkpoint identifier (the loading code is outside this cell).
MODEL = "microsoft/deberta-v3-large"

# Passed as `max_length` to the tokenizer in `preprocess`.
MAX_INPUT = 256

# Size of the subset to train on, drawn from the ~60K-row training data.
NUM_TRAIN_SAMPLES = 1024

# Toggle for parameter-efficient fine-tuning.
USE_PEFT = False

# Freezing options; presumably the number of encoder layers to freeze and
# whether the embedding weights are frozen too -- TODO confirm where used.
FREEZE_LAYERS = 18
FREEZE_EMBEDDINGS = True

In [None]:
# Load the frame this notebook keeps as `df_valid`. Note that the file on
# disk is named train_with_context2.csv even though it is bound to the
# validation variable here.
df_valid = pd.read_csv("/kaggle/input/60k-data-with-context-v2/train_with_context2.csv")
# Report (rows, columns) as a quick sanity check.
print("df shape", df_valid.shape)

# Trailing expression: the notebook renders the first rows of the frame.
df_valid.head()

In [None]:
# Load the training frame from the 60k-with-context dataset.
df_train = pd.read_csv("/kaggle/input/60k-data-with-context-v2/all_12_with_context2.csv")
# Optional preprocessing, currently disabled: drop the "source" column,
# replace missing values with "" and down-sample to NUM_TRAIN_SAMPLES rows.
#df_train = df_train.drop(columns="source")
#df_train = df_train.fillna("").sample(NUM_TRAIN_SAMPLES)
print("Train data size: ", df_train.shape)
# Trailing expression: the notebook renders the first rows of the frame.
df_train.head()

### Data Loader

In [None]:
# Answer letters 'A'..'E' mapped to class indices 0..4.
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
# Inverse lookup (class index -> answer letter). Built with dict.items();
# the previous `.item()` call raised AttributeError because dict has no
# such method.
index_to_option = {idx: option for option, idx in option_to_index.items()}

def preprocess(example):
    """Tokenize one multiple-choice example into five (context, question+option) pairs.

    Reads the 'context', 'prompt', 'A'..'E' and 'answer' entries of
    ``example``. The context (prefixed with '[CLS]') is repeated once per
    option and paired with the prompt followed by that option's text, with
    '[SEP]' markers written directly into the strings -- which is why the
    module-level ``tokenizer`` is called with ``add_special_tokens=False``.
    Only the first segment (the context) is truncated so each pair fits in
    ``MAX_INPUT`` tokens.

    Returns the tokenizer output with an added 'label' entry holding the
    index of the correct option (looked up in ``option_to_index``).
    """
    choices = "ABCDE"

    # Same context string for every candidate answer.
    context_text = '[CLS]' + example['context']
    contexts = [context_text for _ in choices]

    # Prompt followed by each candidate answer.
    candidates = []
    for letter in choices:
        candidates.append(" ####" + example['prompt'] + " [SEP]" + example[letter] + " [SEP]")

    encoded = tokenizer(
        contexts,
        candidates,
        truncation="only_first",
        max_length=MAX_INPUT,
        add_special_tokens=False,
    )
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k,v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        batch = {k:v.view(batch_size, num_choices, -1) for k,v in batch.items()}