In [1]:
import numpy as np
from datasets import Dataset


seq_len, dataset_size = 512, 512
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
print_gpu_utilization()

GPU memory occupied: 479 MB.


In [4]:
import torch

torch.ones((1,1)).to('cuda')
print_gpu_utilization()

GPU memory occupied: 683 MB.


In [5]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained('bert-large-uncased').to('cuda')
print_gpu_utilization()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory occupied: 1971 MB.


In [6]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [7]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()


training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)

In [12]:
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



{'train_runtime': 18.7714, 'train_samples_per_second': 27.276, 'train_steps_per_second': 1.705, 'train_loss': 0.07128893584012985, 'epoch': 1.0}
Time: 18.77
Samples/second: 27.28
GPU memory occupied: 11913 MB.


In [13]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 35.6032, 'train_samples_per_second': 14.381, 'train_steps_per_second': 0.899, 'train_loss': 0.00016126169066410512, 'epoch': 1.0}
Time: 35.60
Samples/second: 14.38
GPU memory occupied: 9139 MB.


In [8]:
training_args = TrainingArguments(
    per_device_train_batch_size=4, fp16=True, **default_args
)

model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant":False})

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



{'train_runtime': 23.7746, 'train_samples_per_second': 21.536, 'train_steps_per_second': 1.346, 'train_loss': 0.03352086618542671, 'epoch': 1.0}
Time: 23.77
Samples/second: 21.54
GPU memory occupied: 7643 MB.


In [8]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    **default_args,
)

In [10]:
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader

dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size)

if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()

accelerator = Accelerator(training_args.fp16)
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)

model.train()
for step, batch in enumerate(dataloader, start=1):
    loss = model(**batch).loss
    loss = loss / training_args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % training_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

NameError: name 'adam_bnb_optim' is not defined

In [4]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.17.1%2Bcu118-cp38-cp38-linux_x86_64.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.2.1%2Bcu118-cp38-cp38-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Downloading https://download.pytorch.org/whl/pillow-10.2.0-cp38-cp38-manylinux_2_28_x86_64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pillow, torchvision, torchaudio
Successfully installed pillo

In [1]:
import torch

torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False

In [2]:
torch.__version__

'2.2.1+cu121'