In [1]:
import torch
import evaluate

from torch.optim import SGD, AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Migrate online datasets to offline datasets
datasets = load_dataset("JulesBelveze/tldr_news")
datasets.save_to_disk("tldr_news")

Saving the dataset (1/1 shards): 100%|██████████| 7138/7138 [00:00<00:00, 934657.28 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 794/794 [00:00<00:00, 346687.21 examples/s]


In [3]:
dataset_name = "tldr_news"
model_name = "bigscience/bloom-560m"

In [4]:
datasets = load_from_disk(dataset_name)
datasets

DatasetDict({
    train: Dataset({
        features: ['headline', 'content', 'category'],
        num_rows: 7138
    })
    test: Dataset({
        features: ['headline', 'content', 'category'],
        num_rows: 794
    })
})

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    result = tokenizer(examples["content"], max_length=128, truncation=True, padding="max_length")
    result["labels"] = result["input_ids"].copy()
    return result


# Tokenize
# before tokenize: ['headline', 'content', 'category']
# after tokenize: ['input_ids', 'attention_mask', 'labels']
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=datasets["train"].column_names,  # remove columns that are not required for model input
)
tokenized_datasets.set_format("torch")

In [6]:
dataset_size = 0.3
batch_size = 64

In [7]:
train_dataset = tokenized_datasets["train"].shuffle(seed=77).select(range(int(tokenized_datasets["train"].num_rows * dataset_size)))
valid_dataset = tokenized_datasets["test"].shuffle(seed=77).select(range(int(tokenized_datasets["test"].num_rows * dataset_size)))

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

In [9]:
optimizers = ["sgd", "adamw"]
idx = 1

# Set optimizer
if optimizers[idx] == "sgd":
    optimizer = SGD(model.parameters(), lr=1e-3)
elif optimizers[idx] == "adamw":
    optimizer = AdamW(model.parameters(), lr=1e-3)

optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [10]:
num_epochs = 1

with torch.profiler.profile(with_stack=True) as prof:
    for epoch in range(num_epochs):
        metric = evaluate.load("perplexity")

        model.train()
        loss_per_epoch = 0
        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss_per_epoch += loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f"[epoch {epoch+1}] train step: {step + 1}/{len(train_dataloader)}, loss: {loss_per_epoch / (step + 1)}")

        model.eval()
        loss_per_epoch = 0
        for step, batch in enumerate(valid_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss_per_epoch += outputs.loss
            print(f"[epoch {epoch+1}] valid step: {step + 1}/{len(valid_dataloader)}, loss: {loss_per_epoch / (step + 1)}")
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=tokenizer.batch_decode(predictions))

        metric = metric.compute(model_id=model_name)
        print(f"[epoch {epoch+1}] mean perplexity: {metric['mean_perplexity']}")

STAGE:2023-10-06 20:51:59 524716:524716 ActivityProfilerController.cpp:311] Completed Stage: Warm Up


[epoch 1] train step: 1/34, loss: 3.6373510360717773
[epoch 1] train step: 2/34, loss: 29.31656265258789
[epoch 1] train step: 3/34, loss: 36.43174743652344
[epoch 1] train step: 4/34, loss: 38.48322677612305
[epoch 1] train step: 5/34, loss: 37.63285827636719
[epoch 1] train step: 6/34, loss: 35.645751953125
[epoch 1] train step: 7/34, loss: 35.27306365966797
[epoch 1] train step: 8/34, loss: 33.95383071899414
[epoch 1] train step: 9/34, loss: 32.769813537597656
[epoch 1] train step: 10/34, loss: 31.391077041625977
[epoch 1] train step: 11/34, loss: 29.97881317138672
[epoch 1] train step: 12/34, loss: 28.616689682006836
[epoch 1] train step: 13/34, loss: 27.3470401763916
[epoch 1] train step: 14/34, loss: 26.27412986755371
[epoch 1] train step: 15/34, loss: 25.32369041442871
[epoch 1] train step: 16/34, loss: 24.477231979370117
[epoch 1] train step: 17/34, loss: 23.74869728088379
[epoch 1] train step: 18/34, loss: 23.068159103393555
[epoch 1] train step: 19/34, loss: 22.41190719604492

100%|██████████| 15/15 [00:05<00:00,  2.71it/s]


[epoch 1] mean perplexity: 46.49623289629191


STAGE:2023-10-06 20:52:41 524716:524716 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-10-06 20:52:41 524716:524716 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [12]:
print(prof.key_averages(group_by_stack_n=3).table(sort_by='self_cpu_time_total', row_limit=15))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       cudaLaunchKernel        40.31%       15.464s        40.31%       15.464s     187.801us        2.296s         7.24%        2.335s      28.354us         82345  
                                        cudaMemcpyAsync        18.12%        6.951s        18.12%        6.951s       2.323ms      59.036ms         0.19%      59.036ms      19.725us          2993  
         