In [1]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")

Reusing dataset eli5 (/home/bill/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [2]:
eli5 = eli5.train_test_split(test_size=0.2)

In [3]:
eli5["train"][0]

{'q_id': 'iukwf',
 'title': 'Statics question',
 'selftext': "I'm taking statics and mechanics of materials and had a conceptual question that my professor didn't answer confidently. (new prof.) Anyway if you had say a cube in space and pushed it perpendicular to one side by the corner, would it experience translational motion and a moment or just translation motion?",
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['c26ra0j'],
  'text': ['A static force applied eccentric to the center of mass (c.m.) of an initially stationary cube in space would result in rotation and translation. In terms of statics, think about the "penalty" of "moving" the force *from* the corner over *to* the c.m., where this "penalty" is the moment.\nAs a terrestrial analogy, try pushing a ballon at an eccentric location.'],
  'score': [8]},
 'title_urls': {'url': []},
 'selftext_urls': {'url': []},
 'answers_urls': {'url': []}}

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [5]:
eli5 = eli5.flatten()
eli5["train"][0]

{'q_id': 'iukwf',
 'title': 'Statics question',
 'selftext': "I'm taking statics and mechanics of materials and had a conceptual question that my professor didn't answer confidently. (new prof.) Anyway if you had say a cube in space and pushed it perpendicular to one side by the corner, would it experience translational motion and a moment or just translation motion?",
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c26ra0j'],
 'answers.text': ['A static force applied eccentric to the center of mass (c.m.) of an initially stationary cube in space would result in rotation and translation. In terms of statics, think about the "penalty" of "moving" the force *from* the corner over *to* the c.m., where this "penalty" is the moment.\nAs a terrestrial analogy, try pushing a ballon at an eccentric location.'],
 'answers.score': [8],
 'title_urls.url': [],
 'selftext_urls.url': [],
 'answers_urls.url': []}

In [6]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]], truncation=True)

In [7]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
block_size = 128


def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

        

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [11]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

***** Running training *****
  Num examples = 7466
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2802


Epoch,Training Loss,Validation Loss
1,2.3229,2.048742
2,2.1719,2.031908
3,2.1336,2.005469


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1854
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1854
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./

TrainOutput(global_step=2802, training_loss=2.1971572777954362, metrics={'train_runtime': 286.2088, 'train_samples_per_second': 78.258, 'train_steps_per_second': 9.79, 'total_flos': 742615806011904.0, 'train_loss': 2.1971572777954362, 'epoch': 3.0})