# Load CSV
We will load a 60k-row CSV with `prompt`, `A,B,C,D,E`, and `context` columns from my Kaggle dataset [here][1]. This dataset is all publicly shared datasets concatenated and then processed with Mgoksu's notebook [here][2] to create a `context` column. (To learn more about the datasets within, read my discussion post.) This Kaggle dataset also contains the competition `train.csv` with an added `context` column (to be used as a validation dataset).

In this train notebook, we have internet turned on and can choose whatever model we wish to download and train. After we finetune this model, we will create a second notebook with the Open Book Q&A technique and load the finetuned model from the output of this notebook. The second notebook will have internet turned off so that it can be submitted to Kaggle's competition.

[1]: https://www.kaggle.com/datasets/cdeotte/60k-data-with-context-v2
[2]: https://www.kaggle.com/code/mgoksu/0-807-sharing-my-trained-with-context-model

In [1]:
# Copy the checkpoints_2 directory from the attached input dataset into /kaggle/working/.
cp -r /kaggle/input/longformer-ep1/checkpoints_2/  /kaggle/working/

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# RUN VERSION; USED TO NAME THE CHECKPOINT OUTPUT DIRECTORY
VER=2
# TRAIN WITH SUBSET OF 60K
# NOTE(review): NOT REFERENCED ANYWHERE ELSE IN THE VISIBLE NOTEBOOK - CONFIRM IT IS STILL NEEDED
NUM_TRAIN_SAMPLES = 1_024
# PARAMETER EFFICIENT FINE TUNING
# PEFT REQUIRES 1XP100 GPU NOT 2XT4
USE_PEFT = False
# NUMBER OF ENCODER LAYERS TO FREEZE (THE FIRST FREEZE_LAYERS LAYERS ARE FROZEN)
# NOTE(review): THE ORIGINAL COMMENT SAID "DEBERTA LARGE HAS TOTAL OF 24 LAYERS", BUT MODEL
# BELOW IS A LONGFORMER CHECKPOINT - CONFIRM ITS LAYER COUNT BEFORE CHANGING THIS VALUE
FREEZE_LAYERS = 20
# BOOLEAN TO FREEZE EMBEDDINGS
FREEZE_EMBEDDINGS = True
# LENGTH OF CONTEXT PLUS QUESTION ANSWER (PASSED TO THE TOKENIZER AS max_length)
MAX_INPUT = 480
# HUGGING FACE MODEL (LOCAL PATH TO THE PRETRAINED CHECKPOINT)
MODEL = '/kaggle/input/longformer-race-model/longformer_qa_model'



In [3]:
# Training split: missing cells are replaced with the placeholder string 'Hi'
# (rows are kept), then only the modelling columns are retained.
df_train = (
    pd.read_csv('/kaggle/input/021023-new-data/longformer_train_35.csv')
    .fillna('Hi')
    [['prompt', 'context', 'A', 'B', 'C', 'D','E' ,'answer']]
)

# Validation split: rows with any missing value are dropped instead.
df_valid = (
    pd.read_csv('/kaggle/input/021023-new-data/longformer_val_495.csv')
    .dropna()
    [['prompt', 'context', 'A', 'B', 'C', 'D','E' ,'answer']]
)

# Ground-truth answer letters of the validation split, as a NumPy array.
valid_label = df_valid['answer'].values

print('Val data size:', df_valid.shape)

Val data size: (495, 8)


In [4]:
# items = []
# for i in val_sample.index:
#     items.append(i)
    

In [5]:
# file = open('items.txt','w')
# file.write(str(items))
# file.close()

# Data Loader
Code is from Radek's notebook [here][1] with modifications to the tokenization process.

[1]: https://www.kaggle.com/code/radek1/new-dataset-deberta-v3-large-training

In [6]:
# Two-way lookup between answer letters 'A'..'E' and their integer class ids 0..4.
index_to_option = dict(enumerate('ABCDE'))
option_to_index = {letter: position for position, letter in index_to_option.items()}

def preprocess(example):
    """Tokenize one multiple-choice row into five (context, question + option) pairs.

    The context is paired with each of the options A-E, using the literal
    marker strings below. Special tokens are written by hand, so the
    tokenizer is told not to add its own. Only the context (first segment)
    is truncated to fit within MAX_INPUT tokens.

    Returns the tokenizer output with an extra 'label' entry holding the
    integer index of the correct option.
    """
    context_text = "[CLS] " + example['context']
    question_prefix = " #### <s>" + example['prompt'] + " </s></s> "

    first_sentence = [context_text for _ in 'ABCDE']
    second_sentences = []
    for option in 'ABCDE':
        second_sentences.append(question_prefix + example[option] + " </s>")

    tokenized_example = tokenizer(
        first_sentence,
        second_sentences,
        truncation='only_first',
        max_length=MAX_INPUT,
        add_special_tokens=False,
    )
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features and batch them as (batch, num_choices, seq_len) tensors.

    Each incoming feature is a mapping whose values (other than the label)
    are lists with one entry per answer choice. The choices of all features
    are flattened into one list, padded together by ``tokenizer.pad``, and
    then reshaped back so every tensor has a leading
    ``(batch_size, num_choices)`` shape. The label is read from the key
    'label' (or 'labels' if 'label' is absent) and returned under 'labels'
    as an int64 tensor.

    The input ``features`` are not modified.
    """
    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read (rather than pop) the labels so the caller's dicts stay intact
        # and the same features can be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One dict per (example, choice) pair, built in a single pass; the
        # label entry is skipped because it is not a per-choice list.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Undo the flattening: (batch * choices, seq) -> (batch, choices, seq).
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [7]:
from transformers import LongformerTokenizer, LongformerForMultipleChoice

# Load the tokenizer and the weights from the single MODEL path defined in the
# configuration cell, so the checkpoint location is not repeated here.
tokenizer = LongformerTokenizer.from_pretrained(MODEL)
model = LongformerForMultipleChoice.from_pretrained(MODEL).cuda()


In [8]:
# preserve_index=False keeps the pandas index out of the resulting dataset, so
# a non-contiguous index left behind by df_valid.dropna() cannot surface as an
# extra `__index_level_0__` column.
dataset_valid = Dataset.from_pandas(df_valid, preserve_index=False)
dataset = Dataset.from_pandas(df_train, preserve_index=False)
dataset

Dataset({
    features: ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 39001
})

In [9]:
# dataset_valid

In [10]:
# Tokenize the validation split first, then the training split, dropping the
# raw text columns so only the tokenizer outputs and 'label' remain.
tokenized_dataset_valid, tokenized_dataset = (
    split.map(
        preprocess,
        remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
    )
    for split in (dataset_valid, dataset)
)
tokenized_dataset

  0%|          | 0/495 [00:00<?, ?ex/s]

  0%|          | 0/39001 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 39001
})

In [None]:
# torch.save(tokenized_dataset, 'tokenized_dataset.pt')
# torch.save(tokenized_dataset_valid, 'tokenized_dataset_valid.pt')


In [None]:
tokenized_dataset_valid

# Build Model
We will use a Hugging Face AutoModelForMultipleChoice. For the list of possible models, see Hugging Face's repository [here][1]. We can optionally use PEFT to accelerate training and use less memory. However, I have noticed that validation accuracy is lower with PEFT. (Note that PEFT requires us to use 1xP100, not 2xT4 GPUs. I'm not sure why.) We can also optionally freeze layers. This also accelerates training and uses less memory. However, validation accuracy may decrease.

[1]: https://huggingface.co/models

In [11]:
# NOTE PEFT REQUIRES US TO USE 1XP100 NOT 2XT4. I'M NOT SURE WHY.
if USE_PEFT:
    !pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl

In [12]:
if USE_PEFT:
    print('We are using PEFT.')
    from peft import LoraConfig, get_peft_model, TaskType
    peft_config = LoraConfig(
        r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1,
        bias="none", inference_mode=False,
        # The model loaded in this notebook is a Longformer, whose attention
        # projections are named `query` / `value`. The previous
        # `query_proj` / `value_proj` are DeBERTa names and match no module here.
        target_modules=["query", "value"],
        # Train these modules fully rather than through LoRA adapters.
        modules_to_save=['classifier','pooler'],
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

In [13]:
# Collect every sub-module that should stop receiving gradient updates, then
# switch off requires_grad for all of their parameters in one pass.
modules_to_freeze = []
if FREEZE_EMBEDDINGS:
    print('Freezing embeddings.')
    modules_to_freeze.append(model.longformer.embeddings)
if FREEZE_LAYERS>0:
    print(f'Freezing {FREEZE_LAYERS} layers.')
    # Only the first FREEZE_LAYERS encoder layers are frozen.
    modules_to_freeze.extend(model.longformer.encoder.layer[:FREEZE_LAYERS])

for frozen_module in modules_to_freeze:
    for weight in frozen_module.parameters():
        weight.requires_grad = False

Freezing embeddings.
Freezing 20 layers.


# MAP@3 Metric
The competition metric is MAP@3; therefore, we will write a custom metric function to plug into Hugging Face's trainer. Discussion [here][1].

[1]: https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/435602

In [None]:
# def map_at_3(predictions, labels):
#     map_sum = 0
#     pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
#     for x,y in zip(pred,labels):
#         z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
#         map_sum += np.sum(z)
#     return map_sum / len(predictions)

# def compute_metrics(p):
#     predictions = p.predictions.tolist()
#     labels = p.label_ids.tolist()
#     return {"map@3": map_at_3(predictions, labels)}

In [14]:
def map3(y_true, y_pred):
    """Mean reciprocal rank of the true label within each row of ranked predictions.

    y_true: 1-D array with one true label per sample.
    y_pred: 2-D array with one row of ranked predictions per sample.

    A sample scores 1/rank of the first column matching its true label, or 0
    when no column matches.
    """
    hits = (y_true.reshape((-1,1)) == y_pred)
    first_hit_rank = hits.argmax(axis=1) + 1
    has_hit = hits.any(axis=1)
    # Rows without a hit get an infinite rank, whose reciprocal is 0.
    ranks = np.where(has_hit, first_hit_rank, np.inf)
    return np.mean(ranks ** (-1))

In [15]:
def compute_metrics(eval_pred):
    """Compute top-1 accuracy, MAP@3 and their sum ('RMZ') for one evaluation pass.

    eval_pred unpacks into (logits, labels): `logits` holds one score per
    answer option for each example, and `labels` holds the integer index of
    the correct option. Scoring uses the labels that come with the
    predictions rather than a module-level array, so the result stays correct
    for whichever dataset is being evaluated.

    Returns a dict with the keys 'accuracy', 'map3' and 'RMZ'; the three
    values are also printed.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).reshape(-1)
    # Option indices sorted from highest to lowest score for every example.
    ranked_option_ids = np.argsort(-logits, 1)
    accuracy = (labels == ranked_option_ids[:, 0]).sum() / len(labels)
    map_3 = map3(labels, ranked_option_ids[:, :3])
    RMZ = accuracy + map_3
    print(RMZ , accuracy , map_3 )
    return { 'accuracy': accuracy , 'map3': map_3 , 'RMZ': RMZ}

In [16]:
# def compute_metrics(eval_pred):
#   logits, labels = eval_pred
#   valid_pred_ids = np.argsort(-logits, 1)
#   valid_pred_letters = np.array(list('ABCDE'))[valid_pred_ids][:, :3]
#   return {'eval_Map3':map3(valid_label, valid_pred_letters)}

# Train and Save 
We will now train and save our model using Hugging Face's easy to use trainer. By adjusting the parameters in this notebook, we can achieve `CV MAP@3 = 0.915+` and corresponding single model `LB MAP@3 = 0.830+` wow!

If we run this notebook outside of Kaggle, then we can train longer and with more RAM. If we run this notebook on Kaggle, then we need to use tricks to train models efficiently. Here are some ideas:
* use fp16 (this speeds up T4 not P100)
* use gradient_accumulation_steps (this simulates larger batch sizes)
* use gradient_checkpointing (this recomputes activations during the backward pass instead of storing them, trading extra compute for lower GPU memory use)
* use 2xT4 instead of 1xP100 (this doubles GPUs)
* freeze model embeddings (this reduces weights to train)
* freeze some model layers (this reduces weights to train)
* use PEFT (this reduces weights to train)
* increase LR and decrease epochs (this reduces work)
* use smaller models (this reduces weights to train)

In [17]:
training_args = TrainingArguments(
    # --- where checkpoints go ---
    output_dir = f'./checkpoints_{VER}',
    overwrite_output_dir=True,
    save_total_limit=2,
    report_to='none',

    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    # NOTE(review): warming up over 60% of training is unusually long - confirm this is intended.
    warmup_ratio=0.6,
    weight_decay=0.01,

    # --- batch sizes and memory ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    fp16=True,

    # --- logging, evaluation and checkpointing cadence ---
    logging_steps=25,
    evaluation_strategy='steps',
    eval_steps=90,
    save_strategy="steps",
    save_steps=90,

    # --- model selection: 'RMZ' is one of the keys returned by compute_metrics ---
    load_best_model_at_end=True,
    metric_for_best_model='RMZ',
)

In [18]:
# Wire the model, data, padding collator and metric function into the trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)



In [None]:
# Continue training from the checkpoint saved by an earlier run.
trainer.train(resume_from_checkpoint='/kaggle/input/longformer-ep1/checkpoints_2/checkpoint-1710')

Step,Training Loss,Validation Loss,Accuracy,Map3,Rmz
1800,1.0242,0.921213,0.705051,0.796296,1.501347


1.5013468013468012 0.705050505050505 0.7962962962962962


In [None]:

# 0.5592592592592592 0.2101010101010101 0.3491582491582491
# 0.8983164983164984 0.3696969696969697 0.5286195286195287
# 1.305050505050505 0.593939393939394 0.7111111111111111
# 1.336026936026936 0.604040404040404 0.731986531986532
# 1.4053872053872056 0.6444444444444445 0.760942760942761
# 1.4309764309764312 0.6585858585858586 0.7723905723905725
# 1.5134680134680134 0.7090909090909091 0.8043771043771044
# 1.5 0.6989898989898989 0.8010101010101011
# 1.54983164983165 0.7292929292929293 0.8205387205387206


# 1.5646464646464646 0.7393939393939394 0.8252525252525252
# 1.5646464646464646 0.7393939393939394 0.8252525252525252
# 1.582154882154882 0.7434343434343434 0.8387205387205388
# 1.5683501683501684 0.7393939393939394 0.828956228956229
# 1.5565656565656565 0.7313131313131314 0.8252525252525252
# 1.5841750841750843 0.7535353535353535 0.8306397306397307
# 1.567003367003367 0.7393939393939394 0.8276094276094277
# 1.5804713804713806 0.7474747474747475 0.8329966329966331
# 1.595959595959596 0.7535353535353535 0.8424242424242424
# 1.6121212121212123 0.7656565656565657 0.8464646464646465


# 4460.9s	13	1.6191919191919193 0.7696969696969697 0.8494949494949495
# 8843.4s	14	1.6218855218855217 0.7696969696969697 0.8521885521885522
# 13256.3s	15	1.5932659932659932 0.7555555555555555 0.8377104377104377
# 17644.9s	16	1.6205387205387205 0.7717171717171717 0.8488215488215487
# 22097.9s	17	1.5932659932659932 0.7535353535353535 0.8397306397306398
# 26503.5s	18	1.5996632996632998 0.7575757575757576 0.8420875420875422
# 30854.7s	19	1.6313131313131313 0.7777777777777778 0.8535353535353535 #83 LB
# 35249.7s	20	1.6107744107744106 0.7636363636363637 0.8471380471380471
# 39625.0s	21	1.6265993265993264 0.7737373737373737 0.8528619528619528 #81.7 LB


# 4482.6s	13	1.6444444444444444 0.7838383838383839 0.8606060606060606
# 8920.1s	14	1.639057239057239 0.7818181818181819 0.8572390572390574
# 13275.9s	15	1.6228956228956228 0.7717171717171717 0.8511784511784511
# 17697.2s	16	1.5973063973063972 0.7595959595959596 0.8377104377104376
# 22073.6s	17	1.625925925925926 0.7757575757575758 0.8501683501683502
# 26494.8s	18	1.6074074074074076 0.7656565656565657 0.8417508417508418
# 30887.9s	19	1.6343434343434344 0.7838383838383839 0.8505050505050505
# 35239.0s	20	1.6232323232323234 0.7737373737373737 0.8494949494949495
# 39662.6s	21	1.6303030303030304 0.7818181818181819 0.8484848484848485

In [None]:
# trainer.train('/kaggle/input/zak-46k-epoch1/checkpoints_2/checkpoint-2400')

In [None]:
# Write the trainer's current model to a fixed output directory.
trainer.save_model(output_dir='./RMZ_140k_epoch1')

# Verify Saved Model
During training, we see the MAP@3 validation score above. Let's load the saved model and compute it again here to verify that our model is saved correctly.

In [None]:
# Directory written by trainer.save_model(...) earlier in this notebook. The
# reload must read from the same place; previously it pointed at
# './RMZ_40k_epoch1' (and 'model_v{VER}' for PEFT), which this run never writes.
saved_model_dir = './RMZ_140k_epoch1'

del model, trainer
if USE_PEFT:
    model = AutoModelForMultipleChoice.from_pretrained(MODEL)
    model = get_peft_model(model, peft_config)
    checkpoint = torch.load(f'{saved_model_dir}/pytorch_model.bin')
    model.load_state_dict(checkpoint)
else:
    model = AutoModelForMultipleChoice.from_pretrained(saved_model_dir)

# Use the same padding collator as in training, so examples that tokenize to
# different lengths can still be batched during prediction.
trainer = Trainer(
    model=model,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [None]:
# Load the hold-out file and tokenize it the same way as the training data.
test_df = pd.read_csv('/kaggle/input/60k-data-with-context-v2/train_with_context2.csv')
tokenized_test_dataset = Dataset.from_pandas(test_df).map(
    preprocess,
    remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E'],
)

# Score every option, then rank the options from best to worst per question.
test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]

# Keep the top three letters per question as a space-separated string.
predictions_as_string = []
for row in predictions_as_answer_letters[:, :3]:
    predictions_as_string.append(' '.join(row))
test_df['prediction'] = predictions_as_string

# Compute Validation Score

In [None]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
import numpy as np
def precision_at_k(r, k):
    """Return the fraction of the first k entries of r that are relevant.

    r is a sequence of relevance flags (anything int() accepts); k must be
    non-zero and no larger than len(r).
    """
    assert k <= len(r)
    assert k != 0
    relevant = 0
    for flag in r[:k]:
        relevant += int(flag)
    return relevant / k

def MAP_at_3(predictions, true_items):
    """Return the mean average precision at 3.

    predictions: sequence of strings, each holding whitespace-separated
        answer labels ordered from most to least likely.
    true_items: sequence with the single correct label for each prediction.

    Each question has exactly one correct label, so its score is 1/rank of
    the first correct guess among the top three, or 0 if there is none.
    Scoring stops at the first hit, so a repeated correct label is not counted
    twice and no per-question score exceeds 1. Empty input scores 0.0.

    Raises IndexError if true_items is shorter than predictions.
    """
    num_questions = len(predictions)
    if num_questions == 0:
        return 0.0
    if len(true_items) < num_questions:
        raise IndexError('true_items has fewer entries than predictions')

    total = 0.0
    for guess_string, truth in zip(predictions, true_items):
        for rank, guess in enumerate(guess_string.split()[:3], start=1):
            if guess == truth:
                # With no earlier hit, precision at this rank is exactly 1/rank.
                total += 1.0 / rank
                break
    return total / num_questions

In [None]:
# Score the top-3 prediction strings against the true answer letters.
m = MAP_at_3(test_df['prediction'].values, test_df['answer'].values)
print('CV MAP@3 =', m)