In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM

import torch
from torch import nn
from torch.utils.data import DataLoader

from tqdm.notebook import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [2]:
# Load the first 5,000 examples of the ELI5 "train_asks" split.
eli5 = load_dataset("eli5", split="train_asks[:5000]")
# Hold out 20% for evaluation. The seed is fixed so that the split is identical
# after every "Restart Kernel -> Run All"; flatten() exposes nested fields as
# dotted column names (e.g. "answers.text", which the tokenization step reads).
eli5 = eli5.train_test_split(test_size=0.2, seed=42).flatten()


Reusing dataset eli5 (/home/bill/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [3]:
# Tokenizer matching the "distilroberta-base" checkpoint; used by
# preprocess_function below to encode the answer texts.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [4]:
def preprocess_function(examples):
    """Tokenize a batch of examples.

    Each entry of ``examples["answers.text"]`` is a list of answer strings;
    the strings of one example are joined with single spaces and the resulting
    texts are passed to the module-level ``tokenizer`` with truncation enabled.
    """
    joined_answers = []
    for answer_texts in examples["answers.text"]:
        joined_answers.append(" ".join(answer_texts))
    return tokenizer(joined_answers, truncation=True)

# Tokenize every split in batches across 4 worker processes, dropping all of
# the original columns so that only the tokenizer outputs remain.
raw_columns = eli5["train"].column_names
tokenized_eli5 = eli5.map(preprocess_function, batched=True, num_proc=4, remove_columns=raw_columns)

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# Number of tokens in each fixed-length training chunk.
block_size = 128

def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size chunks.

    Args:
        examples: mapping from column name (e.g. ``"input_ids"``) to a list of
            per-example token lists, as passed by ``Dataset.map(batched=True)``.

    Returns:
        A dict with the same keys, where every value is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` key holding a copy
        of the ``"input_ids"`` chunk list. The trailing remainder that does
        not fill a whole chunk is dropped.
    """
    # Flatten each column into one long list (linear time, unlike sum(lists, [])).
    concatenated_examples = {
        key: [item for sequence in sequences for item in sequence]
        for key, sequences in examples.items()
    }
    # All columns have the same total length; measure the first one.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last partial chunk so every chunk has exactly block_size items.
    total_length = (total_length // block_size) * block_size
    # Split every column into consecutive chunks of block_size items.
    result = {
        key: [items[start : start + block_size] for start in range(0, total_length, block_size)]
        for key, items in concatenated_examples.items()
    }
    # Create a new labels column mirroring the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized texts of every split into fixed-length chunks,
# processing batches across 4 worker processes.
lm_dataset = tokenized_eli5.map(
    group_texts,
    batched=True,
    num_proc=4,
)

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
# tensor_dataset = lm_dataset.map(lambda x: {k: torch.tensor(v) for k, v in x.items()}, batched=True)

In [7]:
class MaskedLanguageModelingDataLoader(DataLoader):
    """DataLoader that yields dict batches of stacked tensors.

    Iterating a ``DataLoader`` never calls ``__getitem__`` on the loader
    itself; batches are assembled by ``collate_fn``. With samples that are
    dicts of Python lists, the default collate function returns, per key, a
    list of per-position tensors instead of a single ``(batch, seq_len)``
    tensor. This class therefore installs its own ``collate_fn``.

    Args:
        dataset: indexable dataset whose items are dicts of equal-length
            sequences (e.g. ``input_ids``, ``attention_mask``, ``labels``).
        batch_size: number of samples per batch.
        shuffle: whether to reshuffle the data every epoch.
        num_workers: number of worker subprocesses used for loading.
    """

    def __init__(self, dataset, batch_size=1, shuffle=False, num_workers=0):
        super().__init__(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            collate_fn=self._collate,
        )

    @staticmethod
    def _collate(samples):
        """Stack each field of a list of sample dicts into one tensor per key."""
        # Keys are taken from the first sample; every sample is expected to
        # provide the same keys with sequences of identical length.
        return {
            key: torch.stack([torch.as_tensor(sample[key]) for sample in samples])
            for key in samples[0]
        }

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, labels)`` tensors for sample ``i``.

        Kept for direct indexing (``loader[i]``); it is not used when the
        loader is iterated.
        """
        sample = self.dataset[i]
        input_ids = torch.tensor(sample["input_ids"])
        attention_mask = torch.tensor(sample["attention_mask"])
        labels = torch.tensor(sample["labels"])

        return input_ids, attention_mask, labels
        



In [8]:
# type(tensor_dataset["train"]["input_ids"])

In [9]:
# def collate_fn(batch):
#     return {k: torch.stack(v, dim=0) for k, v in batch.items()}

# Shuffled training batches of 32 chunks, loaded by 8 worker processes.
train_loader = MaskedLanguageModelingDataLoader(
    lm_dataset["train"],
    batch_size=32,
    shuffle=True,
    num_workers=8,
)

In [10]:
# Load the pretrained "distilroberta-base" masked-LM checkpoint and move it to
# the selected device.
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base").to(device)

In [11]:
# Smoke test of the data pipeline: pull a single batch from train_loader and
# print its Python type. The training step itself is still commented out, and
# both loops exit after the first batch.
epochs = 3
for epoch in range(epochs):
    for batch in train_loader:
        print(type(batch))

        # put batch on gpu
        # batch = {k: v.to(device) for k, v in batch.items()}

        # loss = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
        # print(loss)
        # print(type(batch["input_ids"][0]))
        break  # only inspect the first batch
    break  # only run the first epoch

<class 'dict'>


In [None]:
# from transformers import DataCollatorForLanguageModeling

# tokenizer.pad_token = tokenizer.eos_token
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Collator for masked language modeling: it randomly masks 15% of the tokens in
# each batch and builds the matching labels. It is defined in this cell so that
# the Trainer below does not depend on a name that only exists in commented-out
# code.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Fine-tuning configuration: evaluate at the end of every epoch and write
# outputs under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()