In [8]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer, RobertaForQuestionAnswering, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from dataclasses import dataclass
from typing import Optional, Union
from datasets import Dataset
from huggingface_hub import login
from utils import keys
import numpy as np
import torch
import json
import evaluate
import json

login(token=keys.HF)
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
model.train()

# Test data & labels
with open("../data/new_test_data_nolabel/SP_new_test_corrected.json") as f:
    testdata = f.read()
testdata = json.loads(testdata)
with open("../data/new_test_data_nolabel/sp_choices.txt", "r") as f:
    testlabs = f.readlines()
for i in range(len(testlabs)):
    testlabs[i] = int(testlabs[i].strip())

# Training data & labels
with open("../data/new_test_data_nolabel/SP_train.json") as f:
    data = f.read()
data = json.loads(data)
valid = Dataset.from_list(data[-101:])
train = Dataset.from_list(data[:-101])

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/alvinchen/.cache/huggingface/token
Login successful


In [15]:
# Preprocess for multiple choice task
def preprocess_function(examples, ending_names=["ending0", "ending1", "ending2", "ending3"]):
    first_sentences = [[context] * 4 for context in examples["sent1"]]
    question_headers = examples["sent2"]
    second_sentences = [
        [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)]
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

tokens_train = train.map(preprocess_function, batched=True)
tokens_valid = train.map(preprocess_function, batched=True)


Map:   0%|          | 0/406 [00:00<?, ? examples/s]

KeyError: 'sent1'

In [11]:
# Collate data
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [12]:
# Load evaluator
accuracy = evaluate.load("accuracy")

def compute_metrics(evals):
    predictions, labels = evals
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
model_path = "models/robertaQA/"
# Train model
training_args = TrainingArguments(
    output_dir=f"models/{model_path}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokens_train,
    eval_dataset=tokens_valid,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/78 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TypeError: RobertaForQuestionAnswering.forward() got an unexpected keyword argument 'labels'