In [1]:
!pip install --upgrade pip

# Install compatible versions of the packages
!pip install -q accelerate peft bitsandbytes transformers trl torch



In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig

## loading llama-2-7b-chat-hf model
QLoRA will use a rank of 64 with a scaling parameter of 16. We’ll load the Llama 2 model directly in 4-bit precision using the NF4 type and train it for one epoch

also i have chosen per_device_train_batch_size to 1 and gradient_accumulation_steps to 8. since we are using free version of google colab. we are short on resources 😭

In [3]:
model_name = "NousResearch/Llama-2-7b-chat-hf"

dataset_name = "anuzb/humorchains-llama2-1k"

new_model = "Llama-2-7b-chat-finetune"

# QLoRA parameters

# LoRA attention dimension
lora_r = 64

# alpha
lora_alpha = 16  # good choice or common choice bhaneko 2 * rank but in this i chose 16

# dropout prob for LoRA layers
lora_dropout = 0.1

# bitsandbytes params
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Traning args hai ta

output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 8

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

# SFT params
# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = True

# Load the entire model on the GPU 0
device_map = {"": 0}


## Loading the dataset and Start of Finetuning

configured 4 bit quantization, loaded the llama 2 model with 4 bit precision on a GPU with its tokenzier. alsom configured QLoRa, and other hyperparameters everything to SFTTrainer and its SFTConfig.

In [5]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)




# Set training parameters

# SFT config
training_configuration = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    dataset_text_field="text",
    max_length=4096, # Or whatever value you need
    packing=False,
)


# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_configuration,
)

# Train model
trainer.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss
25,4.3935
50,5.1118
75,3.7481
100,4.1576
125,3.0871
150,3.4129
175,2.8347
200,3.1005
225,2.7773
250,3.0498


TrainOutput(global_step=250, training_loss=3.567322479248047, metrics={'train_runtime': 4652.6684, 'train_samples_per_second': 0.43, 'train_steps_per_second': 0.054, 'total_flos': 3912378830610432.0, 'train_loss': 3.567322479248047, 'epoch': 1.0})

In [6]:
trainer.model.save_pretrained(new_model)

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [16]:
import torch
from transformers import pipeline
import logging


# --- Assume your 'model' and 'tokenizer' are already loaded from training ---
prompt = "I was about to post about timetravel joke but you didn't like"
# Create the pipeline and tell it to use the bfloat16 data type
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
    torch_dtype=torch.bfloat16, # <--- THE FIX IS HERE
    device_map="auto"           # <--- Also good practice to ensure it uses the GPU
)
messages = [
    {"role": "user", "content": prompt}
]

# 2. Run the pipeline with your formatted prompt.
print("Generating response...")
result = pipe(messages)

# 3. Print the generated text from the result.
# The output is a list containing a dictionary, so we access it like this.
print("\n--- Model Response ---")
print(result[0]['generated_text'])


Generating response...


RuntimeError: Tensor.__contains__ only supports Tensor or scalar, but you passed in a <class 'str'>.

In [17]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

4704

In [19]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 154923 has 14.72 GiB memory in use. Of the allocated memory 14.33 GiB is allocated by PyTorch, and 264.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)