In [1]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit quantization settings, handed to `from_pretrained` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,  # torch.bfloat16 is an option on newer GPUs
    bnb_4bit_quant_type="nf4",  # normalized 4-bit float, the data type used for QLoRA
    load_in_4bit=True,
)

In [2]:
# One-time setup: uncomment and run this cell once to download the model, tokenizer, and dataset and save them locally.

# from transformers import LlamaTokenizer, LlamaForCausalLM

# model_path = 'openlm-research/open_llama_7b_v2' # download
# model_save_path = '../model/'

# model = LlamaForCausalLM.from_pretrained(
#     model_path
# )
# model.save_pretrained(model_save_path)

# tokenizer = LlamaTokenizer.from_pretrained(model_path)
# tokenizer.save_pretrained(model_save_path)

# from datasets import load_dataset

# dataset_save_path = '../dataset/'

# data = load_dataset("Abirate/english_quotes", split='train').to_json(f"{dataset_save_path}english_quotes.jsonl")

In [3]:
from transformers import LlamaTokenizer, LlamaForCausalLM

# Local directory holding the saved model weights and tokenizer files.
model_path = '../model/'

# Load the causal LM with the 4-bit quantization config defined earlier;
# device_map='auto' leaves placement of the weights to the library.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map='auto',
    quantization_config=bnb_config,
)

tokenizer = LlamaTokenizer.from_pretrained(model_path)
# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
# Config tweaks applied to the loaded model before fine-tuning.
# NOTE(review): pretraining_tp = 1 presumably selects the regular (non-sliced)
# linear-layer computation path -- confirm against the LlamaConfig docs.
model.config.pretraining_tp = 1
# The generation key/value cache is switched off for training.
model.config.use_cache = False

In [5]:
from peft import LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Walks ``model.named_parameters()`` once and prints a single line with the
    trainable element count, the total element count, and the trainable
    share as a percentage.

    Counts are taken directly from ``param.numel()``, i.e. they reflect the
    elements as stored.
    NOTE(review): for weights kept in a packed quantized layout this is
    presumably smaller than the logical weight count -- confirm if exact
    totals matter.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where ``param`` has ``numel()`` and
            ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise divide by zero;
    # report 0% trainable in that case.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# LoRA adapter settings. No target_modules are listed here, so the choice of
# layers that receive adapters is left to PEFT.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the quantized base model with the adapters, then report parameter counts.
model = get_peft_model(model, config)
print_trainable_parameters(model)  # total is about half of 7b due to 4 bit quantization

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


In [6]:
from datasets import load_dataset

dataset_path = '../dataset/english_quotes.jsonl'

# use only 10% due to OOM on 16gb gpu
data = load_dataset("json", data_files=dataset_path, split='train[:10%]')
# Tokenize the "quote" column in batches; the tokenizer's outputs are added as columns.
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
# Return torch tensors but keep them on the CPU: the Trainer moves each collated
# batch to the model's device itself. Putting individual samples on the GPU here
# only adds a device round trip before collation and can clash with pinned-memory
# or multi-worker data loading.
training_data = data.with_format("torch")

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

In [7]:
import transformers

# Hyperparameters for a single pass over the training subset.
training_args = transformers.TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    # effective batch size of 1x4 = 4, hence ~60 steps required for 250 training samples
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
)

# Collator configured for causal language modeling (mlm=False).
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    data_collator=collator,
)
# Kept as the last expression so the returned TrainOutput is shown as the cell output.
trainer.train()

Step,Training Loss
1,1.4014
2,1.4262
3,0.9151
4,1.2045
5,0.7496
6,0.906
7,0.8718
8,0.8741
9,0.91
10,1.8201


TrainOutput(global_step=62, training_loss=0.8901458401833812, metrics={'train_runtime': 72.1074, 'train_samples_per_second': 3.481, 'train_steps_per_second': 0.86, 'total_flos': 353121323188224.0, 'train_loss': 0.8901458401833812, 'epoch': 0.99})

In [8]:
# Peak GPU memory allocated so far, converted from bytes to GiB.
torch.cuda.max_memory_allocated() / 1024 ** 3

9.465446472167969