In [1]:
from pynvml import *


def print_gpu_utilization():
    """Print the total memory currently in use across all NVIDIA GPUs.

    Every device reported by NVML is queried (instead of assuming exactly
    two GPUs), the ``used`` field of each device's memory info is summed,
    and the total is printed in whole units of 1024**2 bytes, labelled "MB".
    """
    nvmlInit()
    try:
        memory_used = 0
        for index in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(index)
            memory_used += nvmlDeviceGetMemoryInfo(handle).used
    finally:
        # Balance the nvmlInit() above so repeated calls from the notebook
        # do not pile up NVML initialisations.
        nvmlShutdown()
    print(f"GPU memory occupied: {memory_used//1024**2} MB.")


def print_summary(result):
    """Report runtime and throughput of a training run, then GPU memory use.

    ``result`` must expose a ``metrics`` mapping that contains the keys
    ``train_runtime`` and ``train_samples_per_second``; both are printed
    with two decimal places before the GPU memory line.
    """
    metrics = result.metrics
    fields = (
        ("Time", 'train_runtime'),
        ("Samples/second", 'train_samples_per_second'),
    )
    for label, key in fields:
        print(f"{label}: {metrics[key]:.2f}")
    print_gpu_utilization()


# Baseline reading, taken before any model has been loaded in this notebook.
print_gpu_utilization()

GPU memory occupied: 620 MB.


In [2]:
from datasets import load_dataset
from transformers import LlamaTokenizer

# Length, in tokens, of each fixed-size chunk produced by group_texts.
block_size = 128
# Used both as the batch size of the grouping map() call and as the numerator
# when deriving the number of gradient-accumulation steps for training.
batch_size = 512
model_ckpt = 'openlm-research/open_llama_3b'
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
# LlamaTokenizer is the slow (SentencePiece) tokenizer class; `use_fast` only
# chooses between implementations when going through AutoTokenizer, so passing
# it here had no effect and wrongly suggested the fast tokenizer was in use.
tokenizer = LlamaTokenizer.from_pretrained(model_ckpt)


def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of a batch."""
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into blocks.

    Each column of ``examples`` (a mapping from column name to a list of
    token lists) is flattened into one long list and cut into consecutive
    chunks of ``block_size`` items. Trailing items that do not fill a whole
    block are dropped. A ``"labels"`` column is added as a shallow copy of
    the chunked ``"input_ids"`` column.

    Returns:
        dict mapping every input column name, plus ``"labels"``, to a list
        of ``block_size``-long chunks.
    """
    # Imported locally so the function carries its own dependency.
    from itertools import chain

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the accumulated list for every example.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the
    # model supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
    

# Tokenize every split in batches, discarding the raw "text" column.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)
# Re-pack the token streams into fixed-size blocks, batch_size examples at a time.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=batch_size,
    num_proc=4,
)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset wikitext (/home/zonghang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|███████████████████████████████████████████| 3/3 [00:00<00:00, 1238.60it/s]
Loading cached processed dataset at /home/zonghang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-061cec7cfeb153a3_*_of_00004.arrow
Loading cached processed dataset at /home/zonghang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-9bfb3be9041641d2_*_of_00004.arrow
Loading cached processed dataset at /home/zonghang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-220d4d46917abe66_*_of_00004.arrow
Loading cached processed dataset at /home/zon

In [3]:
from transformers import LlamaForCausalLM

# device_map='auto' leaves the placement of the weights across the available
# devices to the library rather than pinning the model to one device.
model = LlamaForCausalLM.from_pretrained(model_ckpt, device_map='auto')
# Report GPU memory again now that the checkpoint has been loaded.
print_gpu_utilization()

[2023-06-26 01:28:19,748] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


GPU memory occupied: 15191 MB.


In [4]:
# Keyword arguments shared by every TrainingArguments built in this notebook:
# a two-step run with no evaluation and no external reporting.
default_args = dict(
    output_dir="outputs",
    evaluation_strategy="no",
    max_steps=2,
    log_level="error",
    report_to="none",
)

In [5]:
from transformers import TrainingArguments, Trainer

# Number of samples per forward/backward pass. Named once so the batch size
# and the accumulation-step count below cannot drift apart.
per_device_train_batch_size = 64

training_args = TrainingArguments(
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=per_device_train_batch_size,
    # Accumulate gradients so that one optimizer step covers `batch_size`
    # samples; integer division avoids a float round-trip through int().
    gradient_accumulation_steps=batch_size // per_device_train_batch_size,
    gradient_checkpointing=True,
    # fp16=True, # Disable mixed precision training in model parallel mode
    optim="adafactor",
    **default_args,
)

# Wire the loaded model, the training arguments and the block-grouped splits
# into a Trainer. The validation split is passed as well, although
# default_args sets evaluation_strategy to "no".
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [6]:
# Run training (default_args caps it at max_steps=2), then print runtime,
# throughput and the GPU memory in use afterwards.
result = trainer.train()
print_summary(result)

Step,Training Loss


Time: 179.22
Samples/second: 5.71
GPU memory occupied: 42455 MB.


Summary:

* Model: OpenLLaMA 3B (`openlm-research/open_llama_3b`)
* Model Size: ~14 GB
* GPU Mem Required: ~41.5 GB
* GPUs Required: 2 × NVIDIA GeForce RTX 3090 (total CUDA memory: 48 GB)
* Batch Size: 64 per forward/backward pass, with 8 gradient-accumulation steps (effective batch size: 512)
* Gradient Checkpointing: Enabled
* Optimizer: AdaFactor
* Mixed Precision Training: Disabled (due to incompatibility with model parallel training)
* Throughput: 5.71 samples per second
* Training Time: ~1.5 minutes per optimizer step (179.22 s for 2 steps)