In [None]:
!pip install torch accelerate transformers datasets evaluate



In [None]:
import torch, transformers, datasets, accelerate, evaluate
import numpy as np
from transformers import AutoTokenizer, DataCollatorForMultipleChoice, AutoModelForSequenceClassification, AutoModelForMultipleChoice, TrainingArguments, Trainer
from torch.utils.data import DataLoader
from accelerate.utils.memory import clear_device_cache
from datasets import get_dataset_split_names, load_dataset, get_dataset_config_names
from google.colab import userdata
import random

# Data Loading

In [None]:
# Draw fixed-seed random subsets of SciQ: 2500 training and 1000 validation questions.
train_dataset = (
    load_dataset("allenai/sciq", split="train")
    .shuffle(seed=1)
    .select(range(2500))
)
eval_dataset = (
    load_dataset("allenai/sciq", split="validation")
    .shuffle(seed=1)
    .select(range(1000))
)

# Show one raw record so the available fields are visible.
train_dataset[0]

{'question': 'What layer of soil, essential for farming, has the highest proportion of organic material?',
 'distractor3': 'subsoil',
 'distractor1': 'bedrock',
 'distractor2': 'humus',
 'correct_answer': 'topsoil',
 'support': 'Topsoil has the highest proportion of organic material. Topsoil is essential for farming.'}

# Preprocess

In [None]:
def preprocess_function(examples, seed=0):
    """Turn a batch of SciQ rows into shuffled four-way multiple-choice features.

    For every question the three distractors and the correct answer are placed in
    a pseudo-random order, each (question, choice) pair is tokenized with the
    module-level ``tokenizer``, and the encodings are regrouped four per question.

    The order is drawn from a private ``random.Random`` seeded with ``seed`` and
    the question text. The same question therefore always receives the same
    option order, whichever tokenizer is active and however rows are batched,
    and the global ``random`` state is left untouched.

    Args:
        examples: Batch in column format (dict of equal-length lists) with the
            keys ``question``, ``distractor1``, ``distractor2``, ``distractor3``
            and ``correct_answer``.
        seed: Salt mixed into each per-question shuffle seed; change it to get
            a different but still reproducible option order.

    Returns:
        dict: every key returned by the tokenizer, regrouped into lists of four
        per-choice encodings per question, plus ``labels`` holding the position
        (0-3) of the correct answer after shuffling.
    """
    num_choices = 4
    # Before shuffling, the correct answer is always the last option.
    correct_slot = num_choices - 1

    flat_questions = []
    flat_choices = []
    labels = []

    for question, wrong1, wrong2, wrong3, answer in zip(
        examples["question"],
        examples["distractor1"],
        examples["distractor2"],
        examples["distractor3"],
        examples["correct_answer"],
    ):
        options = [wrong1, wrong2, wrong3, answer]

        # String seeds are hashed deterministically by `random`, so this order
        # is stable across runs and processes.
        order = list(range(num_choices))
        random.Random(f"{seed}:{question}").shuffle(order)

        # order[new_position] == original_position, so the label is wherever
        # the original correct slot ended up.
        labels.append(order.index(correct_slot))

        # Build the flat, pair-aligned inputs directly (linear time) instead of
        # concatenating nested lists with sum(..., []).
        flat_questions.extend([question] * num_choices)
        flat_choices.extend(options[slot] for slot in order)

    tokenized_examples = tokenizer(
        flat_questions,
        flat_choices,
        truncation=True,
        max_length=512
    )

    # Regroup the flat encodings: four consecutive entries belong to one question.
    unflattened = {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

    unflattened["labels"] = labels

    return unflattened

In [None]:
# Load the "accuracy" metric through `evaluate`; compute_metrics reads this module-level object.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Two-element iterable ``(predictions, labels)``; the highest
            score along axis 1 of ``predictions`` is taken as the predicted choice.

    Returns:
        Whatever ``metric.compute`` returns for the predicted choices and labels.
    """
    scores, references = eval_pred
    chosen = np.asarray(scores).argmax(axis=1)
    return metric.compute(references=references, predictions=chosen)

# Model Loading (BERT)

In [None]:
# Load the tokenizer and the multiple-choice model from the same BERT checkpoint
# (tokenizer first, then model).
tokenizer, model = (
    auto_cls.from_pretrained("google-bert/bert-base-uncased")
    for auto_cls in (AutoTokenizer, AutoModelForMultipleChoice)
)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize both splits with the tokenizer currently bound to `tokenizer` (BERT here).
# The raw text columns are dropped, leaving only what preprocess_function returns.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration for the BERT run.
training_args = TrainingArguments(
    # Write this run's artifacts to a model-specific directory so they cannot
    # collide with the other fine-tuning runs in this notebook.
    output_dir="./best_model/bert-base-uncased",
    report_to="none",
    num_train_epochs=3,
    learning_rate=2e-5,
    # 8 examples per device x 2 accumulation steps = 16 examples per optimizer step.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    weight_decay=0.01,
    warmup_ratio=0.1,

    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate on the validation subset at the end of every epoch.
    eval_strategy="epoch"
)


In [None]:
# Wire the BERT model, both tokenized splits and the accuracy hook into a Trainer.
# Tokenization above does not pad, so batching is left to DataCollatorForMultipleChoice.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Fine-tune BERT with training_args; evaluation runs after each epoch (eval_strategy="epoch").
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2572,1.112209,0.548
2,0.8904,1.025735,0.584
3,0.7162,1.064429,0.577


TrainOutput(global_step=471, training_loss=1.021293390202168, metrics={'train_runtime': 225.6364, 'train_samples_per_second': 33.239, 'train_steps_per_second': 2.087, 'total_flos': 499413187885440.0, 'train_loss': 1.021293390202168, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned BERT model on the Trainer's eval_dataset and display the metrics.
trainer.evaluate()

{'eval_loss': 1.0644294023513794,
 'eval_accuracy': 0.577,
 'eval_runtime': 8.6825,
 'eval_samples_per_second': 115.175,
 'eval_steps_per_second': 14.397,
 'epoch': 3.0}

# Model Loading (DeBERTa)

In [None]:
# Rebind `tokenizer` and `model` to DeBERTa-v3: both come from the same checkpoint
# (tokenizer first, then model).
tokenizer, model = (
    auto_cls.from_pretrained("microsoft/deberta-v3-base")
    for auto_cls in (AutoTokenizer, AutoModelForMultipleChoice)
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Re-tokenize both splits, now with the DeBERTa tokenizer bound to `tokenizer`.
# The raw text columns are dropped, leaving only what preprocess_function returns.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, eval_dataset)
)

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration for the DeBERTa run.
training_args = TrainingArguments(
    # Write this run's artifacts to a model-specific directory so they cannot
    # collide with the other fine-tuning runs in this notebook.
    output_dir="./best_model/deberta-v3-base",
    report_to="none",
    num_train_epochs=3,
    learning_rate=2e-5,
    # 8 examples per device x 2 accumulation steps = 16 examples per optimizer step.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    weight_decay=0.01,
    warmup_ratio=0.1,

    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate on the validation subset at the end of every epoch.
    eval_strategy="epoch"
)


In [None]:
# Wire the DeBERTa model, both tokenized splits and the accuracy hook into a Trainer.
# Tokenization above does not pad, so batching is left to DataCollatorForMultipleChoice.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Fine-tune DeBERTa with training_args; evaluation runs after each epoch (eval_strategy="epoch").
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8574,0.63572,0.764
2,0.4687,0.60082,0.774
3,0.4119,0.62221,0.777


TrainOutput(global_step=471, training_loss=0.6323553383856569, metrics={'train_runtime': 342.7733, 'train_samples_per_second': 21.88, 'train_steps_per_second': 1.374, 'total_flos': 459387794331552.0, 'train_loss': 0.6323553383856569, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned DeBERTa model on the Trainer's eval_dataset and display the metrics.
trainer.evaluate()

{'eval_loss': 0.6222095489501953,
 'eval_accuracy': 0.777,
 'eval_runtime': 10.9395,
 'eval_samples_per_second': 91.412,
 'eval_steps_per_second': 11.427,
 'epoch': 3.0}

# Model Loading (SciBERT: allenai/scibert_scivocab_uncased)

In [None]:
# Rebind `tokenizer` and `model` to SciBERT: both come from the same checkpoint
# (tokenizer first, then model).
tokenizer, model = (
    auto_cls.from_pretrained("allenai/scibert_scivocab_uncased")
    for auto_cls in (AutoTokenizer, AutoModelForMultipleChoice)
)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Re-tokenize both splits, now with the SciBERT tokenizer bound to `tokenizer`.
# The raw text columns are dropped, leaving only what preprocess_function returns.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration for the SciBERT run.
training_args = TrainingArguments(
    # Write this run's artifacts to a model-specific directory so they cannot
    # collide with the other fine-tuning runs in this notebook.
    output_dir="./best_model/scibert_scivocab_uncased",
    report_to="none",
    num_train_epochs=3,
    learning_rate=2e-5,
    # 8 examples per device x 2 accumulation steps = 16 examples per optimizer step.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    weight_decay=0.01,
    warmup_ratio=0.1,

    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate on the validation subset at the end of every epoch.
    eval_strategy="epoch"
)

In [None]:
# Wire the SciBERT model, both tokenized splits and the accuracy hook into a Trainer.
# Tokenization above does not pad, so batching is left to DataCollatorForMultipleChoice.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Fine-tune SciBERT with training_args; evaluation runs after each epoch (eval_strategy="epoch").
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9571,0.884322,0.646
2,0.6376,0.887412,0.656
3,0.5259,0.941939,0.665


TrainOutput(global_step=471, training_loss=0.7428857708179774, metrics={'train_runtime': 237.2982, 'train_samples_per_second': 31.606, 'train_steps_per_second': 1.985, 'total_flos': 471786775121280.0, 'train_loss': 0.7428857708179774, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned SciBERT model on the Trainer's eval_dataset and display the metrics.
trainer.evaluate()

{'eval_loss': 0.9419394731521606,
 'eval_accuracy': 0.665,
 'eval_runtime': 7.4285,
 'eval_samples_per_second': 134.617,
 'eval_steps_per_second': 16.827,
 'epoch': 3.0}