In [None]:
# Checkpoint to fine-tune
# (alternative checkpoint: "bert-base-multilingual-cased")
model_ckpt = "xlm-roberta-base"

# Dataset loading script and the JSON files for the train / dev splits
loading_script = "loading_script.py"
dev_data = "../../data/squad-sr/squad-sr-dev-latin.json"
train_data = "../../data/retrieve/squad-sr.json"

In [None]:
# Define additional params
# Token limit per question/context pair; passed to the dataset filtering
# and preprocessing functions.
max_length = 512

In [None]:
# Keyword arguments that are later unpacked into TrainingArguments
training_args = {
    # Output location
    "output_dir": "out",
    "overwrite_output_dir": False,
    "push_to_hub": False,
    # Phases to run
    "do_train": True,
    "do_eval": True,
    # Optimisation
    "learning_rate": 3e-5,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "optim": "adamw_hf",
    "optim_args": None,
    # Evaluation, logging and checkpointing cadence
    "evaluation_strategy": "steps",
    "eval_steps": 2000,
    "logging_strategy": "steps",
    "logging_steps": 2000,
    "save_strategy": "epoch",
    "save_total_limit": 1,
    # Hardware and reproducibility
    "no_cuda": False,
    "seed": 42,
}

In [None]:
# Import modules
import os
from datasets import load_dataset
from transformers import AutoTokenizer, DefaultDataCollator, \
    AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm
import collections
import numpy as np
from evaluate import load

In [None]:
# Collect the data files that were actually configured (None means "skip")
# NOTE(review): the dev file is registered under the key "dev", while later
# cells index dataset["validation"] -- presumably the loading script maps it
# to a validation split; confirm against loading_script.py.
data_files = {
    split: path
    for split, path in (("train", train_data), ("dev", dev_data))
    if path is not None
}

# Build the dataset through the custom loading script
dataset = load_dataset(loading_script, data_files=data_files)

In [None]:
# Load tokenizer and model
# Both are restored from the same pretrained checkpoint (`model_ckpt`);
# the model is instantiated through AutoModelForQuestionAnswering.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

In [None]:
# Define filter function
def filter_dataset(example, max_length):
    """Tell whether a sample fits within ``max_length`` tokens.

    Meant to be used as the predicate of ``Dataset.filter``: samples for
    which it returns ``False`` are dropped. The question/context pair is
    tokenized with the module-level ``tokenizer`` without truncation and
    without padding, so the length of ``input_ids`` is the real tokenized
    length of the pair.

    Args:
        example (dict): Single sample with "question" and "context" entries
        max_length (int): Maximum allowed number of tokens

    Returns:
        bool: True if the tokenized pair has at most ``max_length`` tokens
    """
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation=False,
        padding=False,
    )
    # Compare against the requested limit rather than
    # tokenizer.model_max_length, so the filter stays correct when
    # max_length differs from the model's own maximum.
    return len(inputs["input_ids"]) <= max_length

In [None]:
# Filter training dataset
# `filter_dataset` is evaluated per sample with the configured `max_length`.
filtered_train = dataset["train"].filter(filter_dataset, fn_kwargs={"max_length": max_length})

In [None]:
# Define map function
def preprocess_train(examples, tokenizer, max_length):
    """Preprocessing function for train split.
    Convert loaded SQuAD dataset samples to representation suitable for model finetuning.

    Each question/context pair is tokenized (context truncated and padded to
    ``max_length``) and the character span of the first listed answer is
    converted into start/end token indices.

    Args:
        examples (dict[str, list]): Batch of samples with "question",
            "context" and "answers" columns; each answer is a dict with
            "answer_start" and "text" lists
        tokenizer (transformers.Tokenizer): Tokenizer
        max_length (int): Maximum number of tokens

    Returns:
        Tokenizer output for the batch (without "offset_mapping"), extended
        with "start_positions" and "end_positions" lists
    """

    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed here to derive the labels
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, (offsets, answer) in enumerate(zip(offset_mapping, examples["answers"])):
        # Character span of the first listed answer
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # First and last token index belonging to the context (sequence id 1)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # First context token that starts after the answer start; the token
        # right before it holds the answer start. The fallback is one past
        # the context so an answer starting in the last context token maps
        # to context_end.
        after_start = next(
            (idx for idx in range(context_start, context_end + 1)
             if offsets[idx][0] > start_char),
            context_end + 1,
        )
        start_positions.append(after_start - 1)

        # Last context token that ends before the answer end; the token
        # right after it holds the answer end. The fallback is one before
        # the context so an answer ending in the first context token maps
        # to context_start.
        before_end = next(
            (idx for idx in range(context_end, context_start - 1, -1)
             if offsets[idx][1] < end_char),
            context_start - 1,
        )
        end_positions.append(before_end + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [None]:
# Apply map function to train dataset
# batched=True hands `preprocess_train` a batch of samples per call; the
# original train columns are listed in `remove_columns`, and the tokenizer
# and length limit are forwarded through `fn_kwargs`.
train_dataset = filtered_train.map(
    preprocess_train,
    batched=True,
    remove_columns=dataset["train"].column_names,
    fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

In [None]:
# Filter validation dataset
# Same predicate and `max_length` as used for the training split.
filtered_validation = dataset["validation"].filter(filter_dataset, fn_kwargs={"max_length": max_length})

In [None]:
def preprocess_validation(examples, tokenizer, max_length):
    """Preprocess samples in validation dataset.
    Differs from `preprocess_train` because processed samples contain `offset_mapping` and `example_id` values

    Questions are stripped of surrounding whitespace before tokenization.
    The character span of the first listed answer is converted into
    start/end token indices, as in `preprocess_train`.

    Args:
        examples (dict[str, list]): Batch of samples with "id", "question",
            "context" and "answers" columns; each answer is a dict with
            "answer_start" and "text" lists
        tokenizer (transformers.Tokenizer): Tokenizer
        max_length (int): Maximum tokenized sequence length

    Returns:
        Tokenizer output for the batch (including "offset_mapping"),
        extended with "start_positions", "end_positions" and "example_id"
    """

    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        # Explicit truncation strategy, consistent with preprocess_train,
        # so every row is exactly max_length tokens long.
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are kept in the output (not popped), unlike the train variant
    offset_mapping = inputs["offset_mapping"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, (offsets, answer) in enumerate(zip(offset_mapping, examples["answers"])):
        example_ids.append(examples["id"][i])

        # Character span of the first listed answer
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # First and last token index belonging to the context (sequence id 1)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # First context token that starts after the answer start; the token
        # right before it holds the answer start. The fallback is one past
        # the context so an answer starting in the last context token maps
        # to context_end.
        after_start = next(
            (idx for idx in range(context_start, context_end + 1)
             if offsets[idx][0] > start_char),
            context_end + 1,
        )
        start_positions.append(after_start - 1)

        # Last context token that ends before the answer end; the token
        # right after it holds the answer end. The fallback is one before
        # the context so an answer ending in the first context token maps
        # to context_start.
        before_end = next(
            (idx for idx in range(context_end, context_start - 1, -1)
             if offsets[idx][1] < end_char),
            context_start - 1,
        )
        end_positions.append(before_end + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["example_id"] = example_ids
    return inputs

In [None]:
# Apply function to validation dataset
# batched=True hands `preprocess_validation` a batch of samples per call;
# the original validation columns are listed in `remove_columns`, and the
# tokenizer and length limit are forwarded through `fn_kwargs`.
validation_dataset = filtered_validation.map(
    preprocess_validation,
    batched=True,
    remove_columns=dataset["validation"].column_names,
    fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
    )

In [None]:
# Define Trainer
# Turn off the Weights & Biases integration for this run
os.environ["WANDB_DISABLED"] = "true"
collator = DefaultDataCollator()

# Build TrainingArguments from the plain dict only once: this cell rebinds
# `training_args`, so re-running it would otherwise fail when trying to
# unpack an already-constructed TrainingArguments object with `**`.
if isinstance(training_args, dict):
    training_args = TrainingArguments(**training_args)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=collator,
)

In [None]:
# Show the training arguments held by the trainer
print(trainer.args)

In [None]:
# Start training with the trainer configured above
trainer.train()