In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from transformers import TrainingArguments, Trainer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np
import evaluate

In [None]:
# Fetch the ARC dataset; the second argument selects the "ARC-Easy" configuration
ARC_REPO_ID = "allenai/ai2_arc"
ARC_CONFIG_NAME = "ARC-Easy"
arc_dataset = load_dataset(ARC_REPO_ID, ARC_CONFIG_NAME)

In [None]:
# Initialize tokenizer and model from the same checkpoint.
# No explicit `token` is passed: an empty-string placeholder invites pasting a
# real credential into the notebook, and it overrides the login performed via
# notebook_login(). Leaving the argument out lets the library use the stored login.
MODEL_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForMultipleChoice.from_pretrained(MODEL_CHECKPOINT)

In [None]:
def convert_label(label):
    """Convert an answer key into a zero-based choice index.

    Numeric keys are treated as one-based ("1" -> 0, "2" -> 1, ...) and
    single-letter keys as alphabetical positions ("A" -> 0, "B" -> 1, ...).
    Surrounding whitespace is ignored and letters are matched
    case-insensitively.

    Args:
        label: The answer key, normally a string such as "A" or "3".

    Returns:
        The zero-based index of the referenced choice.

    Raises:
        ValueError: If the key is empty, is a number smaller than 1, or is
            anything other than a decimal number or a single letter A-Z.
    """
    key = str(label).strip()

    if key.isdecimal():
        index = int(key) - 1
        if index < 0:
            # "0" would otherwise silently become -1, which is not a valid class index.
            raise ValueError(f"Numeric answer keys are one-based, got {label!r}")
        return index

    key = key.upper()
    if len(key) == 1 and 'A' <= key <= 'Z':
        return ord(key) - ord('A')

    raise ValueError(f"Unrecognized answer key: {label!r}")

In [None]:
def preprocess_arc_function(examples):
    """Tokenize a batch of questions as (question, choice) pairs grouped per question.

    Every question is paired with each of its answer choices, the pairs are
    tokenized in one call, and the flat result is regrouped so that each output
    row holds one entry per choice of the corresponding question. The number of
    choices may differ between questions.

    Sequences are intentionally left unpadded here: padding to the longest pair
    of the whole map batch would be stored with the dataset and would defeat
    per-batch padding done later at collation time.

    Args:
        examples: A batch (dict of columns) providing "question", "choices"
            (each a mapping with a "text" list) and "answerKey".

    Returns:
        A dict with one list per tokenizer output key, where element i is the
        list of encodings for the choices of question i, plus a "labels" list
        holding the zero-based index of the correct choice for each question.
    """
    questions = examples["question"]
    choices = examples['choices']
    answer_keys = examples['answerKey']

    # Flattened sentence pairs: the question repeated once per choice / the choice texts
    first_sentences = []
    second_sentences = []

    labels = []

    # Number of choices can vary, so remember it to regroup the flat encodings
    num_choices_per_question = []

    for question, choice_dict, answer_key in zip(questions, choices, answer_keys):
        choice_texts = choice_dict['text']
        num_choices_per_question.append(len(choice_texts))

        first_sentences.extend([question] * len(choice_texts))
        second_sentences.extend(choice_texts)

        # Prefer the position of the key within this question's own label list;
        # fall back to the generic key-to-index conversion when that list is
        # missing or does not contain the key.
        choice_labels = choice_dict.get('label') or []
        if answer_key in choice_labels:
            labels.append(list(choice_labels).index(answer_key))
        else:
            labels.append(convert_label(answer_key))

    # Tokenize without padding or tensor conversion; plain lists are returned
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten the tokenized outputs to [number of examples, number of choices per example]
    tokenized_outputs = {key: [] for key in tokenized_examples.keys()}
    index = 0
    for count in num_choices_per_question:
        for key in tokenized_examples.keys():
            tokenized_outputs[key].append(tokenized_examples[key][index:index + count])
        index += count

    tokenized_outputs['labels'] = labels

    return tokenized_outputs


In [None]:
# Apply the preprocessing function to the dataset, one batch of examples per call
tokenized_arc = arc_dataset.map(
    function=preprocess_arc_function,
    batched=True,
)

In [None]:
# Display the tokenized training split as a quick sanity check.
tokenized_arc['train']

In [None]:
# Display the whole tokenized dataset object.
tokenized_arc

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice features whose number of choices may differ.

    Each feature holds one tokenized sequence per answer choice. The collator
    flattens all choices of all features, pads them together with the
    tokenizer, regroups them per feature and finally stacks the groups with
    ``pad_sequence`` so that features with fewer choices are filled with
    zero-valued rows.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Build one batch from a list of per-question feature dicts.

        Args:
            features: List of dicts containing the tokenizer outputs (one entry
                per choice under each key) and a "label" or "labels" entry.
                The label entry is popped from each dict.

        Returns:
            A dict mapping each padded tokenizer key to a tensor produced by
            ``pad_sequence(..., batch_first=True)``, plus "labels" as an int64
            tensor.
        """
        # The label column name is decided from the first feature only
        label_key = "label" if "label" in features[0].keys() else "labels"
        targets = [example.pop(label_key) for example in features]

        # Columns that are not per-choice tokenizer outputs and must not be padded
        skipped_keys = ('labels', 'id', 'question', 'choices', 'answerKey')

        # 'input_ids' has one entry per choice, so its length is the choice count
        choice_counts = [len(example["input_ids"]) for example in features]

        # One dict per (feature, choice) pair, in feature order
        flat_choices = [
            {
                name: example[name][slot]
                for name in example
                if name not in skipped_keys
            }
            for example, n_choices in zip(features, choice_counts)
            for slot in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            flat_choices,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        collated = {}
        for name in padded.keys():
            # Cut the flat padded tensor back into one chunk per feature
            per_feature = []
            start = 0
            for n_choices in choice_counts:
                per_feature.append(padded[name][start:start + n_choices])
                start += n_choices
            # Chunks can have different numbers of rows; pad_sequence evens them out
            collated[name] = torch.nn.utils.rnn.pad_sequence(per_feature, batch_first=True)

        collated["labels"] = torch.tensor(targets, dtype=torch.int64)
        return collated


In [None]:
# accepted_keys = ["input_ids", "attention_mask", "labels"]
# features = [{k: v for k, v in tokenized_arc["train"][i].items() if k in accepted_keys} for i in range(10)]
# batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [None]:
# model.eval()  # Set the model to evaluation mode

# # Prepare the input as shown earlier
# input_ids = batch['input_ids'].to(model.device)
# attention_mask = batch['attention_mask'].to(model.device)
# labels = batch['labels'].to(model.device)

# # Inference
# with torch.no_grad():
#     outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# # Get predictions
# logits = outputs.logits
# predictions = torch.argmax(logits, dim=-1)

# # Evaluate
# accuracy = (predictions == labels).float().mean()
# print("Predictions:", predictions)
# print("True Labels:", labels)
# print("Accuracy of predictions:", accuracy.item())

In [None]:
# [tokenizer.decode(batch["input_ids"][0][i].tolist()) for i in range(batch["input_ids"].shape[1])]

In [None]:
# def show_one(example):
#     print(f"Context: {example['question']}")
#     print(f"  A - {example['choices']['text'][0]}")
#     print(f"  B - {example['choices']['text'][1]}")
#     print(f"  C - {example['choices']['text'][2]}")
#     print(f"  D - {example['choices']['text'][3]}")
#     print(f"\nGround truth: option {example['answerKey']}")

In [None]:
# show_one(arc_dataset["train"][8])

In [None]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels); the
            predictions hold one score per choice along axis 1.

    Returns:
        The result of the loaded accuracy metric for the highest-scoring
        choice of each row compared against the labels.
    """
    scores, references = eval_pred
    # Pick the index of the best-scoring choice for every row
    predicted = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted, references=references)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch so the best
# checkpoint can be reloaded when training finishes.
# No `hub_token` is set: nothing is pushed to the Hub (push_to_hub=False), and
# credentials should not be written into the notebook.
training_args = TrainingArguments(
    output_dir="fine-tuned-bert-base-uncased-arceasy",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arc["train"],
    eval_dataset=tokenized_arc["validation"],
    tokenizer=tokenizer,
    # Pads each batch dynamically and handles a varying number of choices
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Evaluate on the test split and print the resulting metrics
test_results = trainer.evaluate(eval_dataset=tokenized_arc["test"])
print(test_results)

# Optional: uncomment to save the model
#trainer.save_model("./arc-trained-model")