**Importing Necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
from colorama import Fore, Back, Style
from datasets import Dataset
from transformers import AutoTokenizer, EarlyStoppingCallback
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
import gc

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
from sklearn.model_selection import KFold



**LOAD DATA**

In [None]:
# Build one training frame from the competition file plus the two extra sets.
# `ignore_index=True` gives the combined rows a fresh 0..n-1 index, and the
# 'id' column is dropped because it is not used as a model input.
train_df = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in (
            '/content/drive/MyDrive/kaggle-llm-science-exam/train.csv',
            '/content/drive/MyDrive/kaggle-llm-science-exam/6000_train_examples.csv',
            '/content/drive/MyDrive/kaggle-llm-science-exam/extra_train_set.csv',
        )
    ],
    ignore_index=True,
).drop(columns='id')

train_df.head()

Unnamed: 0,prompt,A,B,C,D,E,answer
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [None]:
# Restrict training to the first 1,000 rows of the combined frame.
train_df = train_df.head(1000)
train_df

Unnamed: 0,prompt,A,B,C,D,E,answer
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D
...,...,...,...,...,...,...,...
995,What is Ferenc András Kalmár known for?,Ferenc András Kalmár is known for his contribu...,Ferenc András Kalmár is known for his career a...,Ferenc András Kalmár is known for his involvem...,Ferenc András Kalmár is known for his research...,Ferenc András Kalmár is known for his innovati...,B
996,What literary prize was Ákoz Kertész awarded i...,The Kossuth Prize,The Nobel Prize in Literature,The PEN/Faulkner Award,The Booker Prize,The Pulitzer Prize,A
997,What role did Daniel George Belisle assume dur...,Daniel George Belisle was a successful NHL pla...,Daniel George Belisle was known for his philan...,Daniel George Belisle was an influential figur...,Daniel George Belisle primarily served as a co...,Daniel George Belisle was a pioneer in ice hoc...,D
998,What is Gilbert Alfred Franklin best known for?,Gilbert Alfred Franklin is best known for his ...,Gilbert Alfred Franklin is best known for his ...,Gilbert Alfred Franklin is best known for his ...,Gilbert Alfred Franklin is best known for his ...,Gilbert Alfred Franklin is best known for his ...,A


**LOADING PRE-TRAINED MODEL**

In [None]:
# Loading the pre-trained model and tokenizer
# `deberta_v3_large` is the local checkpoint directory. Only the tokenizer is
# loaded in this cell; the same path is passed to
# AutoModelForMultipleChoice.from_pretrained inside the cross-validation loop.
# NOTE(review): this path lives under /kaggle/input while the CSV files are read
# from /content/drive — confirm both locations exist in the target environment.
deberta_v3_large = '/kaggle/input/deberta-v3-large-hf-weights'
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**DATA PREPROCESSING**

In [None]:
# Lookup tables between answer letters (A-E) and their integer class ids (0-4),
# in both directions.
options = 'ABCDE'
indices = list(range(5))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))


# Preprocessing function that turns one dataset row into model inputs.
def preprocess(example):
    """Tokenize one multiple-choice question as five (prompt, option) pairs.

    Args:
        example: Mapping with the keys 'prompt', 'A', 'B', 'C', 'D', 'E'
            and 'answer' (the letter of the correct option).

    Returns:
        The tokenizer output for the five pairs, with an extra 'label' entry
        holding the integer index of the correct option.
    """
    # AutoModelForMultipleChoice scores question/answer pairs, so the prompt
    # is paired once with each of the five candidate answers.
    question = example['prompt']
    candidates = [example[letter] for letter in 'ABCDE']

    # NOTE(review): the tokenizer warning logged by this notebook says that,
    # with no maximum length available, `truncation=True` defaults to no truncation.
    encoded = tokenizer([question] * 5, candidates, truncation=True)
    encoded['label'] = option_to_index[example['answer']]
    return encoded

In [None]:
# Sanity check: show the class-id -> answer-letter mapping built above.
print(index_to_option)

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}


In [None]:
# The following data collator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# dynamically pads our questions at batch time, so we don't have to pad every question to the length
# of our longest question.

#  Define a data collator class for multiple choice tasks using @dataclass.
@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples into padded tensors.

    Each incoming feature is a mapping whose tokenizer fields (e.g.
    'input_ids') hold one sequence per answer choice, plus a 'label' or
    'labels' entry with the index of the correct choice.

    Calling the collator returns a dict in which every tokenizer field is
    reshaped with ``view(batch_size, num_choices, -1)`` and 'labels' is an
    int64 tensor with one entry per feature.

    The input features are not modified, so the same list can be collated
    more than once.
    """

    # Tokenizer whose `pad` method performs the actual padding.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # The label may be stored under either 'label' or 'labels'.
        label_name = "label" if 'label' in features[0].keys() else 'labels'

        # Read (do not pop) the labels so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # Flatten to one dict per (example, choice) pair, examples outermost,
        # so the tokenizer can pad every sequence to a common length. The
        # label is a scalar, not per-choice, so it is left out here.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        # Pad the flattened features using the tokenizer.
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )

        # Restore the (batch, choice, sequence) layout.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}

        # Add the labels to the batch as a tensor.
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)

        return batch

In [None]:
# Evaluation metric: MAP@3

K = 3  # number of ranked predictions scored per question


def apk(y_i_true, y_i_pred):
    """Average precision at K for a single question.

    Args:
        y_i_true: The correct answer(s). Either a single string label
            (e.g. 'D') or a collection of labels.
        y_i_pred: Ranked predictions, best first. Must contain at most K
            entries and no duplicates.

    Returns:
        Sum of precision-at-hit over the ranked predictions, divided by
        min(number of correct answers, K). Returns 0.0 when there are no
        correct answers.

    Raises:
        AssertionError: If more than K predictions are given or they are
            not unique.
    """
    # A bare string is one label, not a collection of characters. Wrapping it
    # avoids relying on substring matching and on len() of the string.
    if isinstance(y_i_true, (str, bytes)):
        y_i_true = [y_i_true]

    assert len(y_i_pred) <= K
    assert len(np.unique(y_i_pred)) == len(y_i_pred)

    # No ground truth: nothing can be hit, and dividing by zero must be avoided.
    if len(y_i_true) == 0:
        return 0.0

    sum_precision = 0.0
    num_hits = 0.0

    for i, p in enumerate(y_i_pred):
        # A prediction counts only if it is one of the correct answers.
        if p in y_i_true:
            num_hits += 1
            precision = num_hits / (i + 1)
            sum_precision += precision

    return sum_precision / min(len(y_i_true), K)


def mapk(y_true, y_pred):
    """Mean of `apk` over paired ground-truth / prediction entries.

    Returns 0.0 when there are no pairs to score.
    """
    scores = [apk(y_i_true, y_i_pred) for y_i_true, y_i_pred in zip(y_true, y_pred)]
    if not scores:
        return 0.0
    return np.mean(scores)


In [None]:
# Load the test questions. A placeholder answer 'A' is attached to every row so
# the test dataset can go through the same `preprocess` function as the
# training data.
test_df = pd.read_csv('/content/drive/MyDrive/kaggle-llm-science-exam/test.csv').assign(answer='A')

# Wrap the frame in a Hugging Face Dataset and tokenize it row by row, dropping
# the raw text columns once they have been encoded.
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(
    preprocess,
    batched=False,
    remove_columns=['prompt', *'ABCDE', 'answer'],
)

  0%|          | 0/200 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# K-Fold cross-validation: for each of the 5 folds, fine-tune a fresh model on
# the training split, score MAP@3 on the held-out split, and store that fold's
# test-set logits. After the loop, `final_dfs` holds one column per
# (option, fold) pair, named e.g. 'A0', 'B0', ..., 'E4'.

feature_cols = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
kf = KFold(n_splits=5, shuffle=True, random_state=71)
cv_list = []          # MAP@3 of every fold
fold_test_frames = [] # per-fold test logits, concatenated once after the loop

for fold, (tr_idx, va_idx) in enumerate(kf.split(train_df)):
    # KFold.split yields positional indices, so select rows with .iloc. This
    # stays correct even if train_df does not carry a 0..n-1 index.
    train_rows = train_df.iloc[tr_idx]
    valid_rows = train_df.iloc[va_idx]

    # Convert to Hugging Face's 'Dataset' format and preprocess.
    train_set = Dataset.from_pandas(train_rows[feature_cols])
    tokenized_train = train_set.map(preprocess, remove_columns=feature_cols)
    valid_set = Dataset.from_pandas(valid_rows[feature_cols])
    tokenized_valid = valid_set.map(preprocess, remove_columns=feature_cols)

    # Ground-truth letters of the validation split, used for MAP@3 below.
    valid_label = valid_rows['answer'].values

    # Training arguments.
    # NOTE(review): warmup_ratio=0.8 means warmup spans 80% of the training
    # steps, and the learning rate is 1e-6 — confirm both are intentional.
    training_args = TrainingArguments(
        output_dir='./',
        overwrite_output_dir=True,
        load_best_model_at_end=True,
        save_total_limit=1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        warmup_ratio=0.8,
        learning_rate=1e-6,

        per_device_train_batch_size=1,
        per_device_eval_batch_size=2,

        num_train_epochs=10,

        report_to='none',
        seed=422
    )

    # Initialize a fresh model and trainer for this fold.
    model = AutoModelForMultipleChoice.from_pretrained(deberta_v3_large)
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # training
    trainer.train()

    # validation: rank the options by descending logit and keep the top 3 letters
    valid_pred = trainer.predict(tokenized_valid).predictions
    valid_pred_ids = np.argsort(-valid_pred, axis=1)
    valid_pred_letters = np.array(list('ABCDE'))[valid_pred_ids][:, :3]

    # Compute MAP@3 score
    valid_map3 = mapk(valid_label, valid_pred_letters)

    print(f"{Fore.RED}{Style.BRIGHT}Fold {fold}: MAP@3 = {valid_map3:.5f}{Style.RESET_ALL}")
    cv_list.append(valid_map3)

    # Test-set logits of this fold, one column per option, suffixed with the fold number.
    test_predictions = trainer.predict(tokenized_test_ds).predictions
    fold_predict_df = pd.DataFrame(test_predictions, columns=[f'{x}{fold}' for x in ['A','B','C','D','E']])
    fold_test_frames.append(fold_predict_df)

    # Clean up to avoid running out of memory. gc.collect() alone does not
    # return cached GPU blocks, so the CUDA cache is emptied as well.
    del model, trainer, tokenized_train, tokenized_valid, train_set, valid_set
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Combine the per-fold logits side by side in a single concat.
final_dfs = pd.concat(fold_test_frames, axis=1)

  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.6113,1.609449
2,1.6206,1.610504
3,1.6167,1.593135
4,1.6083,1.600539
5,1.574,1.555467
6,1.5332,1.446381
7,1.3744,1.307687
8,1.2048,1.223704
9,0.9242,1.296124
10,0.7823,1.347473


[31m[1mFold 0: MAP@3 = 0.67833[0m


  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6131,1.608853
2,1.6098,1.608821
3,1.6107,1.609304
4,1.604,1.586339
5,1.5696,1.525717
6,1.5246,1.435965
7,1.3334,1.35358
8,1.2256,1.29817
9,1.0137,1.371726
10,0.8613,1.509892


[31m[1mFold 1: MAP@3 = 0.67333[0m


  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6093,1.609239
2,1.6104,1.608782
3,1.6195,1.605416
4,1.6046,1.585162
5,1.5545,1.506296
6,1.4818,1.42109
7,1.2886,1.365332
8,1.1695,1.439716
9,0.9905,1.578342


[31m[1mFold 2: MAP@3 = 0.63417[0m


  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6141,1.608989
2,1.6105,1.608855
3,1.6134,1.599948
4,1.603,1.59554
5,1.5553,1.511646
6,1.4858,1.29238
7,1.143,1.190665
8,0.9526,1.274834
9,0.7084,1.46639


[31m[1mFold 3: MAP@3 = 0.65833[0m


  0%|          | 0/800 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6167,1.608765
2,1.6156,1.609131
3,1.616,1.60213
4,1.608,1.58103
5,1.5847,1.543132
6,1.5458,1.432844
7,1.2457,1.279293
8,1.1003,1.337485
9,0.7637,1.591478


[31m[1mFold 4: MAP@3 = 0.64917[0m


In [None]:
# Overall cross-validation score: the mean of the per-fold MAP@3 values.
cv = np.asarray(cv_list).mean()
print(f"{Fore.RED}{Style.BRIGHT}Global MAP@3 = {cv:.5f}{Style.RESET_ALL}")

[31m[1mGlobal MAP@3 = 0.65867[0m


In [None]:
# Average each option's logits over ALL folds. The per-fold columns are named
# '<letter><fold>' (e.g. 'A0' ... 'A4'). Selecting them by pattern rather than
# by a hard-coded list keeps every fold in the mean; the previous list stopped
# at fold 3, so the fifth fold was left out.
for letter in 'ABCDE':
    # `name[1:].isdigit()` excludes the averaged column itself ('A'), so this
    # cell can be re-run safely.
    fold_cols = [
        name for name in final_dfs.columns
        if name[:1] == letter and name[1:].isdigit()
    ]
    final_dfs[letter] = final_dfs[fold_cols].mean(axis=1)

final_dfs[['A', 'B', 'C', 'D', 'E']].head()

Unnamed: 0,A,B,C,D,E
0,-2.553663,-1.515964,-2.539781,0.134061,-2.477152
1,-2.774576,-2.955051,-3.052081,-3.093159,-3.282637
2,0.600896,-2.402701,-0.27139,-3.179917,-1.798219
3,-2.107654,-2.341429,-1.240922,-2.639822,-3.033341
4,-0.814792,-1.063014,-1.505676,0.037207,-1.512424


## Prediction

In [None]:
# Rank the options with the fold-averaged logits stored in final_dfs['A'..'E'].
# `test_predictions` holds only the logits of the last fold trained, so ranking
# it would discard the cross-validation ensemble.
ensemble_logits = final_dfs[['A', 'B', 'C', 'D', 'E']].to_numpy()

# Option indices sorted by descending averaged logit (best first).
predictions_as_ids = np.argsort(-ensemble_logits, 1)

# Map the indices back to answer letters.
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]

# The submission format is the top-3 letters separated by spaces.
test_df['prediction'] = [' '.join(row) for row in predictions_as_answer_letters[:, :3]]

In [None]:
# Write the submission file (question id plus the top-3 predicted letters),
# then read it back from disk to preview its first and last rows.
submission = test_df.loc[:, ['id', 'prediction']]
submission.to_csv('LLM_Output.csv', index=False)

display(
    pd.read_csv('LLM_Output.csv').head(),
    pd.read_csv('LLM_Output.csv').tail(),
)

Unnamed: 0,id,prediction
0,0,D B E
1,1,A D C
2,2,A C E
3,3,C A B
4,4,D A B


Unnamed: 0,id,prediction
195,195,C A E
196,196,B C A
197,197,B A D
198,198,D C A
199,199,D A C
