In [2]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit quantization settings; passed as `quantization_config` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    # Load the weights in 4-bit precision.
    load_in_4bit=True,
    # Normalized 4-bit float (NF4), the quantization type used for QLoRA.
    bnb_4bit_quant_type="nf4",
    # Compute dtype; torch.bfloat16 can be used instead on newer GPUs.
    bnb_4bit_compute_dtype=torch.float16,
)

In [3]:
from transformers import LlamaTokenizer, LlamaForCausalLM

# OpenLLaMA v2 7B checkpoint identifier.
model_path = "openlm-research/open_llama_7b_v2"

# Load the causal-LM weights using the quantization settings in `bnb_config`;
# device_map="auto" leaves device placement to the library.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=bnb_config,
)

# Matching tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Training-time adjustments to the loaded model's config.
# NOTE(review): pretraining_tp = 1 presumably selects the plain (non-sliced) linear-layer path — confirm against the LlamaConfig docs.
model.config.pretraining_tp = 1
# Caching is switched off because the model is about to be trained.
model.config.use_cache = False

In [6]:
from peft import LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, counting every parameter's
    ``numel()`` towards the total and, when ``requires_grad`` is set,
    towards the trainable count. Prints both counts together with the
    trainable share as a percentage. Returns nothing.
    """
    # Collect (element count, trainable?) once per parameter.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

# LoRA adapter hyperparameters for a causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the model with the adapters described by `config`.
model = get_peft_model(model, config)

# The reported total is about half of 7B because of the 4-bit quantization.
print_trainable_parameters(model)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


In [7]:
from datasets import load_dataset

# Fixed seed so the random train/test split (and therefore the training run)
# picks the same quotes after every Restart & Run All.
SPLIT_SEED = 42

# test_size=0.9 keeps only ~10% of the quotes for training (250 in the recorded run),
# which keeps this demo fine-tune short.
data = load_dataset("Abirate/english_quotes", split='train').train_test_split(
    test_size=0.9, seed=SPLIT_SEED
)
# Tokenize the "quote" column in batches; map() is applied to both splits.
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
# Expose the train split as torch tensors on the GPU.
# NOTE(review): Trainer normally moves batches to the model's device itself, so
# device='cuda' may be redundant here — confirm before removing.
training_data = data['train'].with_format("torch", device='cuda')

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2258 [00:00<?, ? examples/s]

In [8]:
import transformers

# Hyperparameters for one short fine-tuning epoch.
training_args = transformers.TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    # Effective batch size is 1 x 4 = 4, so 250 training samples need ~60 steps.
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,  # report the training loss at every step
)

# mlm=False: collate for causal language modelling rather than masked-LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    data_collator=causal_lm_collator,
)
trainer.train()

Step,Training Loss
1,2.5512
2,2.1398
3,1.9608
4,1.6003
5,0.7884
6,1.2567
7,1.6399
8,1.3227
9,1.1838
10,1.181


TrainOutput(global_step=62, training_loss=1.5109875884748274, metrics={'train_runtime': 73.7964, 'train_samples_per_second': 3.388, 'train_steps_per_second': 0.84, 'total_flos': 416394186178560.0, 'train_loss': 1.5109875884748274, 'epoch': 0.99})