<a href="https://colab.research.google.com/github/beniamine3155/Fine_Tuning_LLM_with_HuggingFace/blob/main/Prompt_Fine_Tuning_with_QLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prompt Fine Tuning with QLoRA



In [1]:
!pip install transformers peft trl accelerate datasets bitsandbytes

Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.48.2 trl-0.24.0


### Memory uses for full model and 4bit quantization

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc
import torch

In [3]:
def get_memory_usage():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        return round(torch.cuda.memory_allocated() / (1024**2), 2)  # in MB
    else:
        return "No CUDA available"

In [4]:
# Load tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# Full precision model
print("\n--- Loading Full Precision Model ---")
torch.cuda.empty_cache()
full_model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
print("Memory Used (Full Precision):", get_memory_usage(), "MB")



--- Loading Full Precision Model ---


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Memory Used (Full Precision): 487.47 MB


In [7]:
# Print dtype
print("Full precision dtype:", full_model.transformer.h[0].mlp.c_fc.weight.dtype)

Full precision dtype: torch.float32


In [8]:
del full_model
torch.cuda.empty_cache()
gc.collect()

127

In [9]:
# Quantize Model (4 bit)
print("\n--- Loading 4-bit Quantized Model ---")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

quant_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

print("Memory Used (4-bit Quantized):", get_memory_usage(), "MB")
print("Quantized model dtype:", quant_model.transformer.h[0].mlp.c_fc.weight.dtype)


--- Loading 4-bit Quantized Model ---
Memory Used (4-bit Quantized): 129.25 MB
Quantized model dtype: torch.uint8


### Prompt Fine tuning with QLoRA

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import PromptTuningConfig, get_peft_model, TaskType, PromptTuningInit

In [11]:
# Load dataset
dataset = load_dataset("sst2", split="train[:5000]")  # small sample for demo

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [12]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token



In [14]:

# Load quantized model using QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

In [15]:
# prompt tuning config
prompt_config = PromptTuningConfig(
    task_type = TaskType.CAUSAL_LM,
    prompt_tuning_init = PromptTuningInit.TEXT,
    num_virtual_tokens = 20,
    tokenizer_name_or_path = model_name,
    prompt_tuning_init_text="Classify the sentiment"
)

# Add Prompt Tuning layer to model
model = get_peft_model(model, prompt_config)

In [16]:
# Prepare the dataset for causal LM format
def format_sample(example):
    text = f"Sentiment: {example['sentence']} ->"
    label = " positive" if example["label"] == 1 else " negative"
    full_input = text + label
    tokenized = tokenizer(full_input, padding="max_length", truncation=True, max_length=64)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(format_sample, remove_columns=dataset.column_names)



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./qlora-prompt-gpt2",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_steps=10,
    save_steps=50,
    learning_rate=5e-4,
    remove_unused_columns=False,
    fp16=True,
    report_to="none"
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,6.8645
20,6.327
30,5.9624
40,5.5352
50,4.9986
60,4.6258
70,4.4072
80,4.0029
90,3.8419
100,3.3843


TrainOutput(global_step=6250, training_loss=1.1756699999237061, metrics={'train_runtime': 512.6697, 'train_samples_per_second': 48.764, 'train_steps_per_second': 12.191, 'total_flos': 816537600000000.0, 'train_loss': 1.1756699999237061, 'epoch': 5.0})

In [20]:
input_text = "Sentiment: I hate the read history ->"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentiment: I hate the read history -> negative
