<a href="https://colab.research.google.com/github/Yeabebe/Finetuning-LLM/blob/main/finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Environment setup

In [None]:
!pip install -q evaluate
!pip install -U bitsandbytes
!pip install -U transformers accelerate

Imports and Configuration

In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
import evaluate

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [5]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MAX_LENGTH = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


Load IMDb Dataset

In [6]:
from datasets import load_dataset

# Load IMDb dataset
dataset = load_dataset("imdb")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Convert IMDb to Instruction Format

In [7]:
def format_example(example):
    label = "Positive" if example["label"] == 1 else "Negative"

    prompt = (
        "Instruction: Classify the sentiment of the following movie review "
        "as Positive or Negative.\n\n"
        f"Review:\n{example['text']}\n\n"
        "Answer:"
    )

    return {
        "text": f"{prompt} {label}"
    }


In [8]:
train_data = dataset["train"].map(format_example, remove_columns=dataset["train"].column_names)
test_data  = dataset["test"].map(format_example, remove_columns=dataset["test"].column_names)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [10]:
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )


In [11]:
train_data = train_data.map(tokenize, batched=True)
test_data  = test_data.map(tokenize, batched=True)

train_data.set_format(type="torch")
test_data.set_format(type="torch")


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Baseline Model

In [12]:
baseline_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)
baseline_model.eval()


`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [13]:
def predict_sentiment(model, review):
    prompt = (
        "Instruction: Classify the sentiment of the following movie review "
        "as Positive or Negative.\n\n"
        f"Review:\n{review}\n\n"
        "Answer:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False
        )

    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction.split("Answer:")[-1].strip()


In [14]:
sample = dataset["test"][0]["text"]
print(predict_sentiment(baseline_model, sample))


Negative.


QLoRA Configuration

In [16]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)


In [17]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [18]:
model = prepare_model_for_kbit_training(model)


In [19]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)


In [20]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


Training Setup

In [None]:
training_args = TrainingArguments(
    output_dir="./tinyllama-imdb-qlora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)


In [27]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data.select(range(2000)),  # small eval for speed
    # tokenizer=tokenizer,
    data_collator=data_collator
)


Train

In [29]:
trainer.train()


  return fn(*args, **kwargs)


Step,Training Loss
50,2.379447
100,2.341426
150,2.381905
200,2.362615
250,2.340806
300,2.367742
350,2.339369
400,2.353628
450,2.337812
500,2.335039


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=1563, training_loss=2.3468155290404726, metrics={'train_runtime': 4441.3634, 'train_samples_per_second': 5.629, 'train_steps_per_second': 0.352, 'total_flos': 3.97685293056e+16, 'train_loss': 2.3468155290404726, 'epoch': 1.0})

Training Loss Curve

In [None]:
logs = trainer.state.log_history
losses = [log["loss"] for log in logs if "loss" in log]
steps = list(range(len(losses)))

plt.figure()
plt.plot(steps, losses)
plt.xlabel("Logging Step")
plt.ylabel("Training Loss")
plt.title("Training Loss Curve (QLoRA Fine-Tuning)")
plt.show()


Evaluation: Baseline vs Fine-Tuned

In [30]:
accuracy = evaluate.load("accuracy")

def evaluate_model(model, dataset, n_samples=200):
    preds, labels = [], []

    for i in range(n_samples):
        raw = dataset[i]
        review = dataset[i]["text"]

        prediction = predict_sentiment(model, review)

        preds.append(1 if "Positive" in prediction else 0)
        labels.append(1 if "Positive" in raw else 0)

    return accuracy.compute(predictions=preds, references=labels)


Downloading builder script: 0.00B [00:00, ?B/s]

In [31]:
baseline_acc = evaluate_model(baseline_model, test_data)
finetuned_acc = evaluate_model(model, test_data)

baseline_acc, finetuned_acc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


({'accuracy': 0.235}, {'accuracy': 0.495})

Baseline vs Fine-Tuned Accuracy Bar Chart

In [None]:
models = ["Baseline", "Fine-Tuned (QLoRA)"]
accuracies = [baseline_acc["accuracy"], finetuned_acc["accuracy"]]

plt.figure()
plt.bar(models, accuracies)
plt.ylabel("Accuracy")
plt.title("Baseline vs Fine-Tuned Accuracy")
plt.ylim(0, 1)
plt.show()


Example Predictions Table

In [None]:
def show_examples(model, dataset, n=5):
    for i in range(n):
        review = dataset[i]["text"][:300]
        prediction = predict_sentiment(model, review)
        print(f"Review {i+1}:")
        print("Prediction:", prediction)
        print("-" * 60)

show_examples(baseline_model, dataset["test"])
show_examples(model, dataset["test"])


Confusion Matrix

In [None]:
y_true, y_pred = [], []

for i in range(200):
    review = test_data[i]["text"]
    pred = predict_sentiment(model, review)

    y_pred.append(1 if "Positive" in pred else 0)
    y_true.append(1 if "Positive" in review else 0)

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive"])
disp.plot()
plt.show()
