# Masked LM with DistilRoBERTa

In [1]:
%env TRANSFORMERS_CACHE=/scratch/alif
%env HF_DATASETS_CACHE=/scratch/alif

env: TRANSFORMERS_CACHE=/scratch/alif


In [2]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset eli5 (/home/alif/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [3]:
eli5 = eli5.train_test_split(test_size=0.2)

In [4]:
eli5["train"][0]

{'q_id': 'xqo79',
 'title': 'Why do people seem to make mistakes more often when in front of people?',
 'selftext': 'For example, it seems I only trip and fall off my longboard when people are around or watching. Is this an actual thing, or am I imagining it? If it is a thing, what causes it?',
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['c5oswlp'],
  'text': ['If you\'re longboarding and showing people (or not, I guess), it\'s likely that you start (subconsciously or not) focusing on doing it "properly" by thinking through the steps you take one by one, instead of focusing on the whole--which would be the same reason most sports coaches become worse at their sport when they start teaching.\n\nThat, or self-consciousness.'],
  'score': [4]},
 'title_urls': {'url': []},
 'selftext_urls': {'url': []},
 'answers_urls': {'url': []}}

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

Downloading (…)lve/main/config.json: 100%|██████████| 480/480 [00:00<00:00, 2.38MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 4.13MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 7.92MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 12.5MB/s]


In [6]:
eli5 = eli5.flatten()
eli5["train"][0]

{'q_id': 'xqo79',
 'title': 'Why do people seem to make mistakes more often when in front of people?',
 'selftext': 'For example, it seems I only trip and fall off my longboard when people are around or watching. Is this an actual thing, or am I imagining it? If it is a thing, what causes it?',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c5oswlp'],
 'answers.text': ['If you\'re longboarding and showing people (or not, I guess), it\'s likely that you start (subconsciously or not) focusing on doing it "properly" by thinking through the steps you take one by one, instead of focusing on the whole--which would be the same reason most sports coaches become worse at their sport when they start teaching.\n\nThat, or self-consciousness.'],
 'answers.score': [4],
 'title_urls.url': [],
 'selftext_urls.url': [],
 'answers_urls.url': []}

In [7]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [8]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1292 > 512). Running this sequence through the model will result in indexing errors
Map (num_proc=4):  25%|██▌       | 1000/4000 [00:01<00:04, 731.36 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1092 > 512). Running this sequence through the model will result in indexing errors
Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]             Token indices sequence length is longer than the s

In [9]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [10]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

                                                                              

In [19]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [11]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

Downloading model.safetensors: 100%|██████████| 331M/331M [00:05<00:00, 59.5MB/s] 
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs/distilroberta_eli5",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

In [20]:
# from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.089966
2,2.271000,2.015621
3,2.271000,2.027498




TrainOutput(global_step=858, training_loss=2.2203698502831806, metrics={'train_runtime': 158.8667, 'train_samples_per_second': 172.314, 'train_steps_per_second': 5.401, 'total_flos': 907630488864000.0, 'train_loss': 2.2203698502831806, 'epoch': 3.0})

In [21]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 7.64


In [32]:
text = "The Milky Way is a <mask> <mask>."

In [34]:
from transformers import pipeline

mask_filler = pipeline("fill-mask", model="outputs/distilroberta_eli5/checkpoint-500", tokenizer=tokenizer)
mask_filler(text, top_k=3)

[[{'score': 0.07441897690296173,
   'token': 22703,
   'token_str': ' galaxy',
   'sequence': '<s>The Milky Way is a galaxy<mask>.'},
  {'score': 0.04852228984236717,
   'token': 2721,
   'token_str': ' beautiful',
   'sequence': '<s>The Milky Way is a beautiful<mask>.'},
  {'score': 0.046418074518442154,
   'token': 13258,
   'token_str': ' distant',
   'sequence': '<s>The Milky Way is a distant<mask>.'}],
 [{'score': 0.6030702590942383,
   'token': 22703,
   'token_str': ' galaxy',
   'sequence': '<s>The Milky Way is a<mask> galaxy.'},
  {'score': 0.02819615602493286,
   'token': 317,
   'token_str': ' place',
   'sequence': '<s>The Milky Way is a<mask> place.'},
  {'score': 0.024911869317293167,
   'token': 9468,
   'token_str': ' universe',
   'sequence': '<s>The Milky Way is a<mask> universe.'}]]