In [6]:
# Install the libraries this notebook imports. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install transformers datasets evaluate



In [7]:
from datasets import load_dataset

# Load the "ARC-Challenge" configuration of the "ai2_arc" dataset.
raw_data = load_dataset("ai2_arc", "ARC-Challenge")

In [8]:
from transformers import AutoTokenizer

# BERT tokenizer used for initial exploration.
# NOTE(review): a later cell rebinds `tokenizer` to "roberta-base" before the
# preprocessing is actually run, so this instance is not the one used for training.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [9]:
# Show the train split's column names and row count.
print(raw_data["train"])

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 1119
})


In [10]:
def _answer_key_to_index(answer_key):
    """Convert an answer key such as 'A'..'E' or '1'..'5' to a 0-based choice index.

    Raises:
        ValueError: if the key is neither a decimal number nor a single letter.
    """
    key = str(answer_key).strip()
    if key.isdecimal():
        return int(key) - 1  # numeric keys are 1-based
    if len(key) == 1 and key.isalpha():
        return ord(key.upper()) - ord("A")
    raise ValueError(f"Unrecognized answerKey: {answer_key!r}")


def preprocess_function(examples, num_choices=5, max_length=128):
    """Tokenize a batch of multiple-choice questions as (question, choice) pairs.

    Relies on the module-level ``tokenizer`` that is in scope when the function
    is called.

    Args:
        examples: batch mapping with parallel lists under ``"question"``,
            ``"choices"`` (each a mapping with a ``"text"`` list) and
            ``"answerKey"``.
        num_choices: number of choice slots per question. Questions with fewer
            choices are padded with empty-string dummy choices.
        max_length: length every (question, choice) encoding is padded or
            truncated to.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` (one list of
        ``num_choices`` encodings per question) and ``"labels"`` (one 0-based
        index of the correct choice per question).

    Raises:
        ValueError: if a question has more than ``num_choices`` choices, or its
            answer key cannot be mapped to an index in ``[0, num_choices)``.
    """
    input_ids, attention_mask, labels = [], [], []

    for question, choices, answer_key in zip(
        examples["question"], examples["choices"], examples["answerKey"]
    ):
        # Work on a copy so the caller's batch is never modified by the padding.
        choice_texts = list(choices["text"])
        if len(choice_texts) > num_choices:
            raise ValueError(
                f"Question has {len(choice_texts)} choices but num_choices={num_choices}"
            )
        choice_texts.extend([""] * (num_choices - len(choice_texts)))

        correct_choice_index = _answer_key_to_index(answer_key)
        if not 0 <= correct_choice_index < num_choices:
            raise ValueError(
                f"answerKey {answer_key!r} is outside the {num_choices} available choices"
            )

        # Encode the question once per choice; grouping per question here keeps
        # the features aligned with the labels by construction.
        question_ids, question_mask = [], []
        for choice in choice_texts:
            encoded = tokenizer(
                question,
                choice,
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )
            question_ids.append(encoded["input_ids"])
            question_mask.append(encoded["attention_mask"])

        input_ids.append(question_ids)
        attention_mask.append(question_mask)
        labels.append(correct_choice_index)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice, PreTrainedTokenizerBase
from torch.utils.data import DataLoader
from dataclasses import dataclass
from transformers.tokenization_utils_base import PaddingStrategy
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from typing import Optional, Union
import torch

# Load ARC Dataset (same "ai2_arc" / "ARC-Challenge" configuration as `raw_data` in an earlier cell)
arc_dataset = load_dataset("ai2_arc", "ARC-Challenge")

# Tokenizer and Model
# The RoBERTa tokenizer below replaces the earlier BERT one (kept commented out for reference):
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads inputs for multiple choice tasks.

    Each incoming feature holds, per key, one entry per answer choice. The
    collator flattens these to one row per (example, choice), pads them with
    ``tokenizer.pad``, and reshapes every tensor back to
    ``(batch_size, num_choices, -1)``.
    """
    # Tokenizer whose `pad` method is used to pad the flattened rows.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one batch of tensors.

        Args:
            features: non-empty list of dicts. Each dict carries a "label" or
                "labels" entry plus, for every other key, a sequence with one
                item per choice.

        Returns:
            dict of tensors viewed as (batch_size, num_choices, -1), plus an
            int64 "labels" tensor with one entry per example.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's feature dicts
        # are left intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One row per (example, choice), in example-major order. A single
        # nested comprehension avoids repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the per-example grouping of the choices.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch


# Apply preprocessing
print(arc_dataset["train"][0].keys())  # sanity check: column names of one raw example
# Tokenize every split in batches; the raw columns are removed so only the keys
# returned by `preprocess_function` remain.
tokenized_arc = arc_dataset.map(preprocess_function, batched=True, remove_columns=arc_dataset["train"].column_names)

# Create DataLoader with DataCollator
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
train_dataloader = DataLoader(tokenized_arc["train"], batch_size=32, collate_fn=data_collator)

# Example usage of DataLoader: inspect the first batch only, then stop
for batch in train_dataloader:
    print(batch["input_ids"].shape)  # Shape: (batch_size, num_choices, seq_len)
    print(batch["labels"].shape)    # Shape: (batch_size)
    break


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

dict_keys(['id', 'question', 'choices', 'answerKey'])


Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

Map:   0%|          | 0/1172 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([32, 5, 128])
torch.Size([32])


In [12]:
import evaluate

# Load the "accuracy" metric; `compute_metrics` in the next cell calls its `.compute()`.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
import numpy as np

def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: pair of (per-choice scores, reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the argmax-over-axis-1
        predictions against the references.
    """
    scores, references = eval_pred
    # The predicted choice is the position of the largest score along axis 1.
    chosen = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=chosen, references=references)

In [14]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Multiple-choice model on top of RoBERTa (earlier BERT variant kept commented out for reference):
#model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
# NOTE: per the warning in this cell's output, the classifier and pooler weights
# are newly initialized, so the model needs fine-tuning before its predictions mean anything.
model = AutoModelForMultipleChoice.from_pretrained("roberta-base")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_arc["train"],
#     eval_dataset=tokenized_arc["validation"],
#     processing_class=tokenizer,
#     data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
#     compute_metrics=compute_metrics,
# )

# trainer.train()

from transformers import TrainerCallback

# Informational only: `device` is printed here but not passed to anything else in this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Training arguments with GPU-specific configurations
training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",  # Evaluate once per epoch. NOTE(review): recent transformers releases name this argument `eval_strategy` — confirm which spelling the installed version expects
    save_strategy="epoch",  # Checkpoint on the same schedule as evaluation
    load_best_model_at_end=True,  # NOTE(review): no `metric_for_best_model` is set, so "best" is presumably judged by eval loss, not accuracy — confirm
    learning_rate=1e-5,
    per_device_train_batch_size=32,  # Adjust batch size based on GPU memory
    per_device_eval_batch_size=32,
    num_train_epochs=10,  # NOTE(review): the saved output below reports epochs up to 30.0, so it does not match a single 10-epoch run of this cell — re-run to confirm
    weight_decay=0.01,
    logging_dir="./logs",  # Directory for logs
    logging_steps=50,
    save_total_limit=2,  # Keep only 2 checkpoints to save disk space
    fp16=torch.cuda.is_available(),  # Enable mixed precision if on GPU
    report_to="wandb",  # Report metrics to W&B. NOTE(review): `wandb` is not in the install cell at the top — presumably preinstalled; verify
)

class AccuracyLoggerCallback(TrainerCallback):
    """Trainer callback that keeps a history of evaluation accuracy.

    Every evaluation that reports an ``"eval_accuracy"`` metric is printed and
    remembered; when training ends, the full history and its maximum are printed.
    """

    def __init__(self):
        super().__init__()
        # Accuracy values in the order the evaluations were run.
        self.accuracies = []

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record and print the accuracy of one evaluation pass, if it reported one."""
        if metrics is None:
            return
        accuracy = metrics.get("eval_accuracy")
        if accuracy is None:
            return
        self.accuracies.append(accuracy)
        print(f"Epoch {state.epoch}: Accuracy = {accuracy:.4f}")

    def on_train_end(self, args, state, control, **kwargs):
        """Print the recorded accuracy history and the best value, if any were recorded."""
        if not self.accuracies:
            return
        print("\nAll Accuracies:", self.accuracies)
        print(f"Best Accuracy: {max(self.accuracies):.4f}")

# Initialize and train the model
# NOTE(review): `model` is the object created in an earlier cell, so re-running this
# cell presumably continues from its current weights rather than from the
# pretrained checkpoint — reload the model first for a clean run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arc["train"],
    eval_dataset=tokenized_arc["validation"],  # split scored at each evaluation
    tokenizer=tokenizer,  # NOTE(review): the commented-out setup above passes `processing_class=` instead — confirm which keyword the installed transformers version prefers
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[AccuracyLoggerCallback()]  # Add the custom callback
)

trainer.train()


Using device: cuda


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.436844,0.287625
2,1.098500,1.458699,0.280936
3,1.156700,1.458337,0.274247
4,1.156700,1.454096,0.274247
5,1.227000,1.43882,0.284281
6,1.233000,1.437993,0.297659
7,1.233000,1.449301,0.29097
8,1.223400,1.456953,0.280936
9,1.193100,1.456951,0.270903
10,1.213900,1.457934,0.274247


Epoch 1.0: Accuracy = 0.2876
Epoch 2.0: Accuracy = 0.2809
Epoch 3.0: Accuracy = 0.2742
Epoch 4.0: Accuracy = 0.2742
Epoch 5.0: Accuracy = 0.2843
Epoch 6.0: Accuracy = 0.2977
Epoch 7.0: Accuracy = 0.2910
Epoch 8.0: Accuracy = 0.2809
Epoch 9.0: Accuracy = 0.2709
Epoch 10.0: Accuracy = 0.2742
Epoch 11.0: Accuracy = 0.2742
Epoch 12.0: Accuracy = 0.2809
Epoch 13.0: Accuracy = 0.2609
Epoch 14.0: Accuracy = 0.2642
Epoch 15.0: Accuracy = 0.2542
Epoch 16.0: Accuracy = 0.2609
Epoch 17.0: Accuracy = 0.2609
Epoch 18.0: Accuracy = 0.2809
Epoch 19.0: Accuracy = 0.2809
Epoch 20.0: Accuracy = 0.2776
Epoch 21.0: Accuracy = 0.2742
Epoch 22.0: Accuracy = 0.2910
Epoch 23.0: Accuracy = 0.2776
Epoch 24.0: Accuracy = 0.2742
Epoch 25.0: Accuracy = 0.2776
Epoch 26.0: Accuracy = 0.2742
Epoch 27.0: Accuracy = 0.2843
Epoch 28.0: Accuracy = 0.2843
Epoch 29.0: Accuracy = 0.2843
Epoch 30.0: Accuracy = 0.2843

All Accuracies: [0.28762541806020064, 0.2809364548494983, 0.27424749163879597, 0.27424749163879597, 0.284280

TrainOutput(global_step=1050, training_loss=1.1917757016136532, metrics={'train_runtime': 1721.1936, 'train_samples_per_second': 19.504, 'train_steps_per_second': 0.61, 'total_flos': 1.10406985296768e+16, 'train_loss': 1.1917757016136532, 'epoch': 30.0})

In [18]:
# Evaluate model on the validation set
# (no dataset is passed here, so this presumably scores the Trainer's own `eval_dataset`)
results = trainer.evaluate()

print(f"Validation Loss: {results['eval_loss']:.4f}")
print(f"Validation Accuracy: {results['eval_accuracy']:.4f}")

# Get predictions
predictions = trainer.predict(tokenized_arc["validation"])
# Predicted choice = position of the largest score along axis 1, as in `compute_metrics`.
predicted_labels = predictions.predictions.argmax(axis=1)
# Reference labels for the same rows; the next cell compares them to `predicted_labels`.
true_labels = predictions.label_ids


Epoch 30.0: Accuracy = 0.2977
Validation Loss: 1.3966
Validation Accuracy: 0.2977


In [29]:
from collections import Counter

# Letter shown for each 0-based choice index ('A'..'E': preprocessing pads every
# question to five choices, so index 4 can occur).
label_mapping = dict(enumerate("ABCDE"))

# Positions in the validation split where prediction and reference disagree
incorrect_indices = [
    position
    for position, (guess, truth) in enumerate(zip(predicted_labels, true_labels))
    if guess != truth
]

# The (wrong) choice index predicted at each of those positions
incorrect_predictions = [predicted_labels[position] for position in incorrect_indices]

# Same predictions as option letters, then tallied per letter
mapped_predictions = [label_mapping[guess] for guess in incorrect_predictions]
incorrect_counts = Counter(mapped_predictions)

# Show a window of ten misclassified questions (slice 20:30 of the error list)
print("Detailed Examples of Errors (Indexes 20-30):\n")
for idx in incorrect_indices[20:30]:
    sample = arc_dataset["validation"][idx]
    print(f"Question ID: {sample['id']}")
    print(f"Question: {sample['question']}")
    print(f"Choices: {sample['choices']}")
    print(f"True Label: {label_mapping[true_labels[idx]]}, Predicted: {label_mapping[predicted_labels[idx]]}")
    print("\n---\n")

# How often each option letter was the wrong prediction
print("\nIncorrect Predictions Breakdown:")
for option in label_mapping.values():
    print(f"{option}: {incorrect_counts.get(option, 0)}")


Detailed Examples of Errors (Indexes 20-30):

Question ID: Mercury_7115063
Question: Which of the following is most likely to disrupt a wetland ecosystem?
Choices: {'text': ['construction of a housing development', 'planting native wildflowers', 'a period of heavy rainfall', 'a lightning strike'], 'label': ['A', 'B', 'C', 'D']}
True Label: A, Predicted: C

---

Question ID: MCAS_2015_8_7
Question: A student heats two pans of water on a stove using the highest setting. One pan contains 1 L of water and the other pan contains 3 L of water. The student heats each pan until the water boils. Which of the following statements best describes what happens to the water in the pans?
Choices: {'text': ['The water in both pans boils at the same time.', 'The water in both pans boils at the same temperature.', 'The 3 L of water gets hotter than the 1 L of water before boiling.', 'The 3 L of water absorbs heat more quickly than the 1 L of water.'], 'label': ['A', 'B', 'C', 'D']}
True Label: B, Predic