In [1]:
import numpy as np
from datasets import Dataset

seq_len, dataset_size = 512, 128
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

In [2]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
print_gpu_utilization()

GPU memory occupied: 632 MB.


In [4]:
import torch

a = torch.tensor(1).cuda()
print_gpu_utilization()

GPU memory occupied: 850 MB.


In [5]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda")
print_gpu_utilization()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

GPU memory occupied: 2138 MB.


In [6]:
(2219-931) / 336

3.8333333333333335

In [7]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "max_steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [8]:
from transformers import TrainingArguments, Trainer, logging

In [9]:
## GRAD ACCUM + CKPT (4)

training_args = TrainingArguments(per_device_train_batch_size=16, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/4 [00:00<?, ?it/s]

{'train_runtime': 14.0111, 'train_samples_per_second': 9.136, 'train_steps_per_second': 0.285, 'train_loss': 0.5233250856399536, 'epoch': 1.0}
Time: 14.01
Samples/second: 9.14
GPU memory occupied: 8512 MB.


In [9]:
# VANILLA

training_args = TrainingArguments(per_device_train_batch_size=1, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/128 [00:00<?, ?it/s]

{'train_runtime': 18.1932, 'train_samples_per_second': 7.036, 'train_steps_per_second': 7.036, 'train_loss': 0.023147890344262123, 'epoch': 1.0}
Time: 18.19
Samples/second: 7.04
GPU memory occupied: 6770 MB.


### Optimizer

In [9]:
## ADAFACTOR

training_args = TrainingArguments(per_device_train_batch_size=1, optim="adafactor", **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

  0%|          | 0/128 [00:00<?, ?it/s]

{'train_runtime': 22.8572, 'train_samples_per_second': 5.6, 'train_steps_per_second': 5.6, 'train_loss': 0.063768669962883, 'epoch': 1.0}
Time: 22.86
Samples/second: 5.60
GPU memory occupied: 4325 MB.


In [9]:
## ADAFACTOR X2

training_args = TrainingArguments(per_device_train_batch_size=2, optim="adafactor", **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

  0%|          | 0/64 [00:00<?, ?it/s]

{'train_runtime': 16.8882, 'train_samples_per_second': 7.579, 'train_steps_per_second': 3.79, 'train_loss': 0.05527115240693092, 'epoch': 1.0}
Time: 16.89
Samples/second: 7.58
GPU memory occupied: 6014 MB.


In [9]:
## ADAFACTOR X4

training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

  0%|          | 0/32 [00:00<?, ?it/s]

{'train_runtime': 13.6542, 'train_samples_per_second': 9.374, 'train_steps_per_second': 2.344, 'train_loss': 0.04318753629922867, 'epoch': 1.0}
Time: 13.65
Samples/second: 9.37
GPU memory occupied: 9730 MB.


In [9]:
## DEEPSPEED ZERO 2

import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

training_args = TrainingArguments(per_device_train_batch_size=1, deepspeed='ds_config_zero2.json', **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

[2023-02-12 16:28:43,489] [INFO] [comm.py:657:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/zerui/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module cpu_adam...


ninja: no work to do.
Time to load cpu_adam op: 2.2330217361450195 seconds


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Emitting ninja build file /home/zerui/.cache/torch_extensions/py310_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module utils...


ninja: no work to do.
Time to load utils op: 0.007245540618896484 seconds
Rank: 0 partition count [1] and sizes[(335143938, False)] 
Time to load utils op: 0.0004546642303466797 seconds


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


  0%|          | 0/128 [00:00<?, ?it/s]

{'train_runtime': 85.6566, 'train_samples_per_second': 1.494, 'train_steps_per_second': 1.494, 'train_loss': 0.03127874433994293, 'epoch': 1.0}
Time: 85.66
Samples/second: 1.49
GPU memory occupied: 7569 MB.


In [11]:
## DEEPSPEED ZERO 3

import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

training_args = TrainingArguments(per_device_train_batch_size=1, deepspeed='ds_config_zero3.json', **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Creating extension directory /home/zerui/.cache/torch_extensions/py310_cu118/cpu_adam...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/zerui/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/zerui/.local/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include/TH -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__

Loading extension module cpu_adam...
Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Creating extension directory /home/zerui/.cache/torch_extensions/py310_cu118/utils...
Emitting ninja build file /home/zerui/.cache/torch_extensions/py310_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/2] c++ -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include/TH -isystem /home/zerui/.local/lib/python3.10/site-packages/torch/include/THC -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /home/zerui/.local/lib/python3.10/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o 
[2/2] c++ flatten_unflatten.o -shared -L/home/zerui/.local/lib/python3.10/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so
Time to load utils op: 9.119560480117798 seconds


Loading extension module utils...


Parameter Offload: Total persistent parameters: 326658 in 246 params


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


Time to load utils op: 0.00043702125549316406 seconds




{'train_runtime': 139.6129, 'train_samples_per_second': 0.917, 'train_steps_per_second': 0.917, 'train_loss': 0.01930386945605278, 'epoch': 1.0}
Time: 139.61
Samples/second: 0.92
GPU memory occupied: 4786 MB.


In [9]:
## DEEPSPEED ZERO 3 X 4

import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

training_args = TrainingArguments(per_device_train_batch_size=4, deepspeed='ds_config_zero3.json', **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

[2023-02-12 16:21:23,734] [INFO] [comm.py:657:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/zerui/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module cpu_adam...


ninja: no work to do.
Time to load cpu_adam op: 2.217463493347168 seconds


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Emitting ninja build file /home/zerui/.cache/torch_extensions/py310_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module utils...


ninja: no work to do.
Time to load utils op: 0.00684046745300293 seconds
Parameter Offload: Total persistent parameters: 326658 in 246 params
Time to load utils op: 0.0005059242248535156 seconds


Using /home/zerui/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


  0%|          | 0/32 [00:00<?, ?it/s]



{'train_runtime': 39.3838, 'train_samples_per_second': 3.25, 'train_steps_per_second': 0.813, 'train_loss': 0.07862478494644165, 'epoch': 1.0}
Time: 39.38
Samples/second: 3.25
GPU memory occupied: 10124 MB.


### FP Precision

In [9]:
# TF32

import torch
torch.backends.cuda.matmul.allow_tf32 = True

training_args = TrainingArguments(per_device_train_batch_size=1, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/128 [00:00<?, ?it/s]

{'train_runtime': 15.6338, 'train_samples_per_second': 8.187, 'train_steps_per_second': 8.187, 'train_loss': 0.03858725726604462, 'epoch': 1.0}
Time: 15.63
Samples/second: 8.19
GPU memory occupied: 6768 MB.


In [9]:
# FP 16

training_args = TrainingArguments(per_device_train_batch_size=1, fp16=True, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/128 [00:00<?, ?it/s]

{'train_runtime': 14.8333, 'train_samples_per_second': 8.629, 'train_steps_per_second': 8.629, 'train_loss': 0.03560449928045273, 'epoch': 1.0}
Time: 14.83
Samples/second: 8.63
GPU memory occupied: 6876 MB.


In [9]:
# TF32 & FP16

import torch
torch.backends.cuda.matmul.allow_tf32 = True

training_args = TrainingArguments(per_device_train_batch_size=1, tf32=True, fp16=True, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/128 [00:00<?, ?it/s]

{'train_runtime': 15.1285, 'train_samples_per_second': 8.461, 'train_steps_per_second': 8.461, 'train_loss': 0.038383085280656815, 'epoch': 1.0}
Time: 15.13
Samples/second: 8.46
GPU memory occupied: 6876 MB.


### Gradient Accum + Ckpt

In [10]:
# GRAD ACCUM ONLY

training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 14.1982, 'train_samples_per_second': 9.015, 'train_steps_per_second': 2.254, 'train_loss': 5.175313162908424e-06, 'epoch': 1.0}
Time: 14.20
Samples/second: 9.02
GPU memory occupied: 7774 MB.


In [9]:
## GRAD ACCUM + CKPT

training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/32 [00:00<?, ?it/s]

{'train_runtime': 18.4171, 'train_samples_per_second': 6.95, 'train_steps_per_second': 1.738, 'train_loss': 0.03817403316497803, 'epoch': 1.0}
Time: 18.42
Samples/second: 6.95
GPU memory occupied: 6374 MB.


In [9]:
## GRAD ACCUM + CKPT (2)

training_args = TrainingArguments(per_device_train_batch_size=2, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/16 [00:00<?, ?it/s]

{'train_runtime': 16.4072, 'train_samples_per_second': 7.801, 'train_steps_per_second': 0.975, 'train_loss': 0.15510064363479614, 'epoch': 1.0}
Time: 16.41
Samples/second: 7.80
GPU memory occupied: 6500 MB.


In [9]:
## GRAD ACCUM + CKPT (4)

training_args = TrainingArguments(per_device_train_batch_size=4, gradient_accumulation_steps=2, gradient_checkpointing=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



  0%|          | 0/16 [00:00<?, ?it/s]

{'train_runtime': 15.5595, 'train_samples_per_second': 8.226, 'train_steps_per_second': 1.028, 'train_loss': 0.11093101650476456, 'epoch': 1.0}
Time: 15.56
Samples/second: 8.23
GPU memory occupied: 6859 MB.


In [None]:
(7884 - 2219) / 336

16.860119047619047

In [None]:
[p.dtype for p in model.parameters()]

In [None]:
n_params * 4 / 1e6

66.448992

In [None]:
from dataset import TextGenerationDataset
from model import TransformerModelLN

In [None]:
import sentencepiece as sp

In [None]:
tokenizer = sp.SentencePieceProcessor(model_file='shakespeare_15000.model')

In [None]:
model = TransformerModelLN.load_inference_model_from_compiled_checkpoint('gpt-shakespeare-sp15000.pt', vocab_size=tokenizer.vocab_size(), block_len=512, learning_rate=0)

In [None]:
model.cuda()

In [None]:
model.generate('What shall we do after returning from war?', tokenizer, 1000)