# Preliminary steps

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Python libraries used by this notebook (transformers is installed with its
# sentencepiece extra).
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases
# than the ones this notebook was written against — consider pinning.
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# git-lfs is installed through the system package manager
!apt install git-lfs

In [None]:
import pandas as pd
import json
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, Trainer, TrainingArguments, default_data_collator, DataCollatorForLanguageModeling
from huggingface_hub import notebook_login
import torch
import math
import collections
import numpy as np
from datasets import Dataset

You will need to set up git: adapt the email and name in the following cell to your own.

In [None]:
# Sets the global git identity; replace the e-mail and user name with your own.
!git config --global user.email "luca.zunino@epfl.ch"
!git config --global user.name "lzunino"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
# Log in to the Hugging Face Hub; enter your credentials when prompted.
notebook_login()

# Dataset creation

In [None]:
# Load the JSON data from file.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale encoding, which can corrupt or reject non-ASCII text.
with open("solutions_v1.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Separate the entries into three groups:
#   - open questions: no "choices" key, or "choices" is null
#   - MCQs without explanation: choices present, "explanation" missing or null
#   - MCQs with explanation: both present
open_questions = []
mcqs_no_explanation = []
mcqs_with_explanation = []

for entry in data:
    # .get() returns None both for a missing key and for an explicit null
    if entry.get("choices") is None:
        open_questions.append(entry)
    elif entry.get("explanation") is None:
        mcqs_no_explanation.append(entry)
    else:
        mcqs_with_explanation.append(entry)

In [None]:
# MCQs (with or without explanation) get the same treatment: the text is the
# question followed by all of its choices, separated by single spaces.
for mcq_entry in mcqs_no_explanation + mcqs_with_explanation:
    joined_choices = ' '.join(mcq_entry["choices"])
    mcq_entry["text"] = mcq_entry["question"] + ' ' + joined_choices

# Open questions have no choices: the text is the question itself.
for question_entry in open_questions:
    question_entry["text"] = question_entry["question"]

In [None]:
# Print the text of the first two MCQs that have an explanation
count = 0

for count, mcq in enumerate(mcqs_with_explanation, start=1):
    print(mcq['text'])
    if count == 2:
        break

In [None]:
# Print the text of the first two MCQs that have no explanation
count = 0

for count, mcq in enumerate(mcqs_no_explanation, start=1):
    print(mcq['text'])
    if count == 2:
        break

In [None]:
# Print the text of the first two open questions
count = 0

for count, oq in enumerate(open_questions, start=1):
    print(oq['text'])
    if count == 2:
        break

In [None]:
# Gather every entry: open questions first, then the two MCQ groups
all_entries = [*open_questions, *mcqs_no_explanation, *mcqs_with_explanation]

# Build a DataFrame from the entries and keep only its 'text' column,
# still as a (single-column) DataFrame
data = pd.DataFrame(all_entries)[["text"]]

# Fine-tuning a masked language model (PyTorch)

In [None]:
# Checkpoint from which both the masked-LM model and its tokenizer are loaded
model_checkpoint = "xlm-roberta-base"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Report the model size in millions of parameters
xlm_roberta_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> XLM RoBERTa number of parameters: {round(xlm_roberta_num_parameters)}M'")

In [None]:
# Example sentence for the fill-mask demos below. The placeholder is replaced
# via tokenizer.mask_token later on, so it presumably has to match that token.
text = "This is a great <mask>."
# Alternative example in French:
# text = "C'est une bonne <mask>."

In [None]:
inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable autograd so no computation graph is built or kept
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of <mask> and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the <mask> candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Show the sentence with the mask replaced by each candidate in turn
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

In [None]:
# Convert the single-column ('text') DataFrame into a datasets.Dataset
questions_dataset = Dataset.from_pandas(data)
questions_dataset

In [None]:
# Shuffle with a fixed seed and look at the first three questions
shuffled_questions = questions_dataset.shuffle(seed=42)
sample = shuffled_questions.select(range(3))

for row in sample:
    question_text = row['text']
    print(f"\n'>>> Question: {question_text}'")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    When the notebook-level ``tokenizer`` is a fast tokenizer, a ``"word_ids"``
    entry is added holding, for each example in the batch, the result of
    ``word_ids(i)`` on the encoding.

    Args:
        examples: Batch with a ``"text"`` entry.

    Returns:
        The tokenizer output, with ``"word_ids"`` added for fast tokenizers.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded

    batch_length = len(encoded["input_ids"])
    encoded["word_ids"] = [
        encoded.word_ids(example_index) for example_index in range(batch_length)
    ]
    return encoded

# Tokenize the whole dataset batch-wise; the raw "text" column is dropped so
# that only the tokenizer outputs remain.
tokenized_datasets = questions_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

In [None]:
# Display the tokenizer's model_max_length for reference when picking a chunk size
tokenizer.model_max_length

In [None]:
# Number of tokens per chunk; read by the chunking demo and by group_texts below
chunk_size = 128

In [None]:
# Slicing a dataset yields, for each feature, a list with one list per example
tokenized_samples = tokenized_datasets[:3]

# Report how many tokens each of the three sampled questions has
question_lengths = [len(token_ids) for token_ids in tokenized_samples["input_ids"]]
for idx, question_length in enumerate(question_lengths):
    print(f"'>>> Question {idx} length: {question_length}'")

In [None]:
# Flatten every feature of the sampled questions into one long list
concatenated_examples = {}
for feature_name in tokenized_samples.keys():
    concatenated_examples[feature_name] = [
        item for sequence in tokenized_samples[feature_name] for item in sequence
    ]

total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated questions length: {total_length}'")

In [None]:
# Cut every flattened feature into consecutive slices of chunk_size items;
# the last slice may be shorter than chunk_size.
chunks = {}
for feature_name, flat_values in concatenated_examples.items():
    chunks[feature_name] = [
        flat_values[start : start + chunk_size]
        for start in range(0, total_length, chunk_size)
    ]

for chunk_length in map(len, chunks["input_ids"]):
    print(f"'>>> Chunk length: {chunk_length}'")

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into equal chunks.

    Args:
        examples: Mapping from feature name to a list of per-example lists.
            Must contain an ``"input_ids"`` entry.
        block_size: Number of items per chunk. When omitted, the notebook-level
            ``chunk_size`` is used, so ``Dataset.map(group_texts, batched=True)``
            keeps working unchanged.

    Returns:
        A dict with the same keys as ``examples`` plus ``"labels"``. Every value
        is a list of chunks holding exactly ``block_size`` items; the trailing
        remainder that does not fill a whole chunk is dropped. ``"labels"``
        contains independent copies of the ``"input_ids"`` chunks.
    """
    size = chunk_size if block_size is None else block_size
    # Concatenate all texts in one linear pass per feature. sum(lists, [])
    # would re-copy the accumulated list on every addition (quadratic).
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # Compute length of concatenated texts, measured on the first feature
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than the chunk size
    total_length = (total_length // size) * size
    # Split into chunks of `size` items
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied so that editing
    # input_ids in place later can never silently change the labels.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [None]:
# Apply group_texts batch-wise: examples are concatenated and re-split into
# fixed-size chunks, and a "labels" column is added.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [None]:
# Decode the chunk at index 1 back to text as a sanity check
tokenizer.decode(lm_datasets[1]["input_ids"])

In [None]:
# Collator for masked language modeling, configured with mlm_probability=0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Take the first two chunks and strip "word_ids" before handing them to the
# collator.
samples = []
for i in range(2):
    sample = lm_datasets[i]
    del sample["word_ids"]
    samples.append(sample)

# Decode the collated input_ids to see what the collator produced
collated_batch = data_collator(samples)
for chunk in collated_batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Probability with which each whole word is selected for masking
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate features while masking whole words instead of single tokens.

    Each word is selected independently with probability ``wwm_probability``.
    Every token of a selected word is replaced by ``tokenizer.mask_token_id``
    in ``input_ids`` and keeps its original id in ``labels``; all other label
    positions are set to -100.

    The input features are not modified: masking is applied to copies.

    Args:
        features: Iterable of dicts, each with ``"input_ids"``, ``"labels"``
            and ``"word_ids"`` entries of equal length. ``word_ids`` holds
            ``None`` at positions that belong to no word.

    Returns:
        The result of ``default_data_collator`` on the masked features, with
        the ``"word_ids"`` entry removed.
    """
    masked_features = []
    for feature in features:
        # Shallow-copy the dict so the pop and the reassignments below do not
        # touch the caller's object.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words (numbered in order of appearance) and
        # the corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A position that belongs to no word ends the current word.
                # Without this reset, two examples concatenated in one chunk
                # could have words with the same word id merged into one group.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy input_ids so the caller's list is left intact
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_index in np.where(mask)[0]:
            for idx in mapping[word_index.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [None]:
# Run the whole-word-masking collator on the first two chunks and decode the
# masked inputs
samples = [lm_datasets[0], lm_datasets[1]]
batch = whole_word_masking_data_collator(samples)

for decoded_chunk in map(tokenizer.decode, batch["input_ids"]):
    print(f"\n'>>> {decoded_chunk}'")

In [None]:
# Requested number of training chunks; the test split is 10% of the train split
train_size = 10_000
# Cap the request by what is actually available: train_test_split raises when
# train_size + test_size exceeds the number of rows. With test = train / 10,
# the largest admissible train size is floor(10 * n / 11). Datasets with at
# least 11_000 chunks are unaffected.
train_size = min(train_size, len(lm_datasets) * 10 // 11)
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

In [None]:
batch_size = 16
# Show the training loss with every epoch. Clamped to at least 1: a training
# set smaller than one batch would otherwise give logging_steps=0, which
# TrainingArguments rejects.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): recent transformers releases name `evaluation_strategy`
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-questions",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    # Mixed precision needs a CUDA device; fall back to full precision when
    # none is available instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    logging_steps=logging_steps,
)

In [None]:
# Trainer wired with the token-level masking collator.
# NOTE(review): switching to the whole-word-masking collator below presumably
# also requires remove_unused_columns=False in TrainingArguments, so that the
# "word_ids" column it pops still reaches the collator — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    # data_collator=whole_word_masking_data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Perplexity before fine-tuning: exp of the evaluation loss
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
# Fine-tune the model on the training split
trainer.train()

In [None]:
# Perplexity after fine-tuning: exp of the evaluation loss
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
# Upload the trained model to the Hugging Face Hub
trainer.push_to_hub()

In [None]:
# Fill-mask pipeline loaded from a Hub repository.
# NOTE(review): the repository id is hard-coded; presumably it must be changed
# to the account that ran push_to_hub above — confirm.
mask_filler = pipeline(
    "fill-mask", model="lucazed/xlm-roberta-base-finetuned-questions"
)

In [None]:
# Run the fine-tuned model on the example sentence and print each completion
preds = mask_filler(text)

for sequence in (pred['sequence'] for pred in preds):
    print(f">>> {sequence}")