In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
eli5 = load_dataset("eli5", split="train_asks[:5000]")

Reusing dataset eli5 (/home/bill/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [3]:
eli5 = eli5.train_test_split(test_size=0.2)
eli5["train"][0]

{'q_id': '1i3n2x',
 'title': 'A question about treeline elivations in the Western US.',
 'selftext': 'I was part of a discussion in /r/hiking about the treeline on Mt Hood in Oregon and why it is so low as compared to the treeline in the Rocky Mountains at the same latitude (5k feet vs 10k feet respectively). One redditor suggested that it was due to costal winds but I pointed out that the treeline in Southern California, which experiences the same wind patterns as costal Oregon, is about 10k feet, which I understand would be even higher if not for dry conditions. \n\nI do understand that treelines become lower as you go north with treelines as low as 1k feet in Alaska but what makes for such a disproportionally low treeline on Mt Hood? ',
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['cb0pi2s'],
  'text': ['Treeline is affected by a wide variety of factors, any number of which could influence this.  The trees will grow until they can\'t, which is determined by how

In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [5]:
eli5 = eli5.flatten()
eli5["train"][0]

{'q_id': '1i3n2x',
 'title': 'A question about treeline elivations in the Western US.',
 'selftext': 'I was part of a discussion in /r/hiking about the treeline on Mt Hood in Oregon and why it is so low as compared to the treeline in the Rocky Mountains at the same latitude (5k feet vs 10k feet respectively). One redditor suggested that it was due to costal winds but I pointed out that the treeline in Southern California, which experiences the same wind patterns as costal Oregon, is about 10k feet, which I understand would be even higher if not for dry conditions. \n\nI do understand that treelines become lower as you go north with treelines as low as 1k feet in Alaska but what makes for such a disproportionally low treeline on Mt Hood? ',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['cb0pi2s'],
 'answers.text': ['Treeline is affected by a wide variety of factors, any number of which could influence this.  The trees will grow until they can\'t, which is determined by 

In [6]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True)

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
tokenized_eli5["train"]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4000
})

In [8]:
block_size = 128


def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [10]:
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

***** Running training *****
  Num examples = 7463
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2799


Epoch,Training Loss,Validation Loss


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.