In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub (prompts for an access token).
notebook_login()

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from transformers import TrainingArguments, Trainer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np
import evaluate

In [None]:
# Load the "ARC-Easy" configuration of the AI2 ARC dataset.
# Later cells use its "train" and "validation" splits and read the
# "question", "choices" and "answerKey" columns.
arc_dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")

In [None]:
# Initialize tokenizer and model from the same fine-tuned multiple-choice checkpoint.
# No explicit `token=` is passed: an access token must never be hard-coded in a
# notebook, and the credentials stored by `notebook_login()` are picked up by default.
MODEL_CHECKPOINT = 'amritpuhan/fine-tuned-bert-base-uncased-swag'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForMultipleChoice.from_pretrained(MODEL_CHECKPOINT)

In [None]:
def convert_label(label):
    """Convert an answer key into a zero-based choice index.

    Numeric keys are treated as 1-based ("1" -> 0, "4" -> 3) and single-letter
    keys as alphabetical ("A" -> 0, "D" -> 3). Surrounding whitespace is ignored
    and letters are matched case-insensitively, so "a" and " A " both map to 0.

    Args:
        label: Answer key string, e.g. "A", "c" or "3".

    Returns:
        int: Zero-based index of the answer among the choices.

    Raises:
        ValueError: If the key is empty, is not a positive number, or is not a
            single ASCII letter. Failing here keeps a malformed key from
            becoming an out-of-range class index later on.
    """
    key = label.strip()
    if key.isdigit():
        index = int(key) - 1
    elif len(key) == 1 and key.isascii() and key.isalpha():
        # Upper-case first so 'a' and 'A' land on the same index.
        index = ord(key.upper()) - ord('A')
    else:
        raise ValueError(f"Unsupported answer key: {label!r}")

    if index < 0:
        # Only reachable for the numeric key "0", which has no 1-based meaning.
        raise ValueError(f"Unsupported answer key: {label!r}")
    return index

In [None]:
def preprocess_arc_function(examples):
    """Tokenize a batch of ARC examples as (question, choice) sentence pairs.

    Each question is paired with every one of its answer choices. All pairs of
    the batch are tokenized in one call and then regrouped per question, so a
    question with k choices yields k token sequences under every tokenizer
    output key. The number of choices may differ between questions.

    Sequences are deliberately left unpadded and are returned as plain Python
    lists: padding to the longest sequence happens per training batch in the
    data collator, which avoids storing pad tokens sized for the longest
    sequence of the whole preprocessing batch.

    Args:
        examples: Batch dict with "question" (list of str), "choices" (list of
            dicts holding a "text" list) and "answerKey" (list of str).

    Returns:
        dict: One entry per tokenizer output key, each a list (one item per
        question) of per-choice token lists, plus "labels" holding the
        zero-based index of the correct choice for every question.
    """
    questions = examples["question"]
    choices = examples['choices']
    answer_keys = examples['answerKey']

    first_sentences = []            # question text, repeated once per choice
    second_sentences = []           # the choice texts, flattened across questions
    labels = []                     # zero-based index of the correct choice
    num_choices_per_question = []   # needed to undo the flattening afterwards

    for question, choice_dict, answer_key in zip(questions, choices, answer_keys):
        choice_texts = choice_dict['text']
        num_choices_per_question.append(len(choice_texts))
        first_sentences.extend([question] * len(choice_texts))
        second_sentences.extend(choice_texts)
        labels.append(convert_label(answer_key))

    # Truncate only; no padding and no tensor conversion here (see docstring).
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: slice each output back into [num questions][num choices of that question].
    tokenized_outputs = {key: [] for key in tokenized_examples.keys()}
    index = 0
    for count in num_choices_per_question:
        for key, values in tokenized_examples.items():
            tokenized_outputs[key].append(values[index:index + count])
        index += count

    tokenized_outputs['labels'] = labels

    return tokenized_outputs


In [None]:
# Tokenize and prepare dataset.
# `batched=True` hands the preprocessing function a dict of column lists, which is
# what it expects (it pairs every question with all of its choices in one call).
tokenized_arc = arc_dataset.map(preprocess_arc_function, batched=True)

In [None]:
# Sanity check: display a summary of the tokenized training split.
tokenized_arc['train']

In [None]:
# Sanity check: display a summary of every tokenized split.
tokenized_arc


In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples with a varying number of choices.

    Every feature is expected to hold, under each tokenizer key, one token
    sequence per answer choice. The collator flattens all choices of the batch,
    pads them together with ``tokenizer.pad``, regroups them per question and
    finally pads along the choice axis so the batch can be stacked.

    Attributes:
        tokenizer: Tokenizer whose ``pad`` method pads the flattened choices.
        padding: Padding strategy forwarded to ``tokenizer.pad``.
        max_length: Optional maximum length forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: Optional multiple forwarded to ``tokenizer.pad``.
        ignored_columns: Feature keys that are not per-choice model inputs
            (raw dataset columns) and are therefore dropped before padding.

    NOTE(review): questions with fewer choices than the largest question in the
    batch receive extra choice rows filled entirely with zeros (the default fill
    value of ``pad_sequence``), including a zero attention mask. The model still
    produces a logit for those rows — confirm this is acceptable, or mask them.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    ignored_columns: tuple = ("id", "question", "choices", "answerKey")

    def __call__(self, features):
        """Build one padded batch from a list of feature dicts.

        Args:
            features: List of dicts; each has "input_ids" (one sequence per
                choice), any other per-choice tokenizer outputs, and the target
                under "label" or "labels". The dicts are not modified.

        Returns:
            dict: Every key returned by ``tokenizer.pad`` mapped to a tensor
            stacked over the batch (presumably batch x choices x sequence
            length), plus "labels" as an int64 tensor.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping so the caller's dicts stay intact
        # and can safely be collated again.
        labels = [feature[label_name] for feature in features]

        # Neither label key nor any raw dataset column is a per-choice model input.
        skipped_keys = {"label", "labels", *self.ignored_columns}

        flattened_features = []
        num_choices_per_feature = []
        for feature in features:
            # "input_ids" carries one entry per choice, so its length is the choice count.
            num_choices = len(feature["input_ids"])
            num_choices_per_feature.append(num_choices)

            per_choice_inputs = {
                key: value for key, value in feature.items() if key not in skipped_keys
            }
            # One flat dict per (question, choice) pair, in question order.
            flattened_features.extend(
                {key: value[i] for key, value in per_choice_inputs.items()}
                for i in range(num_choices)
            )

        # Pad all choices of the batch together.
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Regroup the padded rows into one tensor per question.
        new_batch = {key: [] for key in batch.keys()}
        current_index = 0
        for num_choices in num_choices_per_feature:
            for key in batch.keys():
                new_batch[key].append(batch[key][current_index:current_index + num_choices])
            current_index += num_choices

        # Questions may differ in choice count; pad the choice axis (with zeros)
        # so the per-question tensors can be stacked into one batch tensor.
        for key in new_batch.keys():
            new_batch[key] = torch.nn.utils.rnn.pad_sequence(new_batch[key], batch_first=True)

        # Add back labels
        new_batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return new_batch


In [None]:
# Full fine-tuning: explicitly mark every weight of the checkpoint as trainable.
for parameter in model.parameters():
    parameter.requires_grad_(True)

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Relies on `model`, `tokenizer`, `tokenized_arc` and DataCollatorForMultipleChoice
# defined in the cells above.

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model = model.to(device)

# Data loaders: only the training split is shuffled. The collator keeps no state
# between calls, so a single instance serves both loaders.
multiple_choice_collator = DataCollatorForMultipleChoice(tokenizer)
loader_options = dict(batch_size=8, collate_fn=multiple_choice_collator)
train_loader = DataLoader(tokenized_arc["train"], shuffle=True, **loader_options)
val_loader = DataLoader(tokenized_arc["validation"], shuffle=False, **loader_options)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Accuracy helper shared by the training and evaluation loops.
def compute_accuracy(predictions, labels):
    """Return the fraction of entries where `predictions` equals `labels`.

    Args:
        predictions: Tensor of predicted class indices.
        labels: Tensor of reference class indices, comparable element-wise
            with `predictions`.

    Returns:
        A scalar float tensor holding the mean of the element-wise matches.
    """
    matches = torch.eq(predictions, labels)
    return matches.to(torch.float32).mean()

from torch.nn.utils import clip_grad_norm_

def train_epoch(model, dataloader, optimizer):
    """Run one optimization pass over `dataloader` and report mean loss/accuracy.

    For every batch the gradients are reset, the model is run with labels, the
    loss is back-propagated, gradients are clipped to a norm of 1.0 and the
    optimizer takes a step. Tensors are moved to the notebook-level `device`.

    The returned metrics are averaged over examples, not over batches, so a
    smaller final batch does not get the same weight as a full one.

    Args:
        model: Model returning an object with `.loss` and `.logits` when called
            with the batch inputs and `labels`.
        dataloader: Iterable of batches; each batch is a dict of tensors that
            includes a "labels" entry.
        optimizer: Optimizer updating the model's parameters.

    Returns:
        tuple: (mean loss per example, mean accuracy per example) as floats.

    Raises:
        ZeroDivisionError: If the dataloader yields no examples.
    """
    model.train()
    total_loss, total_accuracy, total_examples = 0.0, 0.0, 0
    for batch in tqdm(dataloader, desc="Training", leave=False):
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)  # Clip gradients to avoid explosion
        optimizer.step()

        predictions = torch.argmax(outputs.logits, dim=-1)
        accuracy = compute_accuracy(predictions, labels)

        # Loss and accuracy are batch means; weight them by the batch size so
        # the totals can be turned into exact per-example averages.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        total_accuracy += accuracy.item() * batch_size
        total_examples += batch_size
    return total_loss / total_examples, total_accuracy / total_examples


# Evaluation loop
def evaluate(model, dataloader):
    """Compute mean loss and accuracy of `model` over `dataloader` without training.

    The model is switched to eval mode and run under `torch.no_grad()`. Tensors
    are moved to the notebook-level `device`. The progress bar shows the loss
    and accuracy of the most recent batch.

    The returned metrics are averaged over examples, not over batches, so a
    smaller final batch does not get the same weight as a full one.

    NOTE: this function shares its name with the `evaluate` package imported at
    the top of the notebook; once this cell has run, the name `evaluate` refers
    to this function.

    Args:
        model: Model returning an object with `.loss` and `.logits` when called
            with the batch inputs and `labels`.
        dataloader: Iterable of batches; each batch is a dict of tensors that
            includes a "labels" entry.

    Returns:
        tuple: (mean loss per example, mean accuracy per example) as floats.

    Raises:
        ZeroDivisionError: If the dataloader yields no examples.
    """
    model.eval()
    total_loss, total_accuracy, total_examples = 0.0, 0.0, 0
    # Wrap dataloader with tqdm for a progress bar
    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in progress_bar:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

            predictions = torch.argmax(outputs.logits, dim=-1)
            accuracy = compute_accuracy(predictions, labels)

            # Loss and accuracy are batch means; weight them by the batch size
            # so the totals can be turned into exact per-example averages.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            total_accuracy += accuracy.item() * batch_size
            total_examples += batch_size
            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item(), 'acc': accuracy.item()})

    return total_loss / total_examples, total_accuracy / total_examples

# Release cached GPU memory that is no longer in use before training starts.
torch.cuda.empty_cache()

# Run the training and validation cycles: one pass of each per epoch,
# printing the metrics of both once the epoch is finished.
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")
    train_loss, train_acc = train_epoch(model, train_loader, optimizer)
    val_loss, val_acc = evaluate(model, val_loader)
    print(f"Train Loss = {train_loss:.4f}, Train Acc = {train_acc:.4f}, Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")
