Imports and Configuration

In [None]:
!pip install -q evaluate


In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
import evaluate


In [None]:
# Hugging Face Hub id of the checkpoint; loaded below both as the baseline and as the QLoRA base.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Token length every training example is truncated/padded to in tokenize().
MAX_LENGTH = 256
# "cuda" when torch can see a GPU, otherwise "cpu".
# NOTE(review): not referenced in the visible cells (models use device_map="auto") — confirm it is still needed.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


Load IMDb Dataset

In [None]:
# Load the IMDb dataset; the cells below use its "train" and "test" splits and
# the "text" / "label" fields of each record.
# (load_dataset is already imported in the imports cell at the top.)
dataset = load_dataset("imdb")


Convert IMDb to Instruction Format

In [None]:
def format_example(example):
    """Render one raw record as a single instruction-plus-answer string.

    Args:
        example: Mapping with a "text" field (the review) and a "label"
            field; a label equal to 1 is written as "Positive", anything
            else as "Negative".

    Returns:
        A dict with one key, "text", holding the instruction, the review and
        the expected answer joined into one string.
    """
    if example["label"] == 1:
        answer = "Positive"
    else:
        answer = "Negative"

    pieces = [
        "Instruction: Classify the sentiment of the following movie review ",
        "as Positive or Negative.\n\n",
        f"Review:\n{example['text']}\n\n",
        "Answer:",
        " ",  # single space separating the prompt from the answer word
        answer,
    ]
    return {"text": "".join(pieces)}


In [None]:
# Rewrite both splits with format_example; the original columns are removed, so each
# row ends up with only the "text" string produced by format_example (prompt + answer).
train_data = dataset["train"].map(format_example, remove_columns=dataset["train"].column_names)
test_data  = dataset["test"].map(format_example, remove_columns=dataset["test"].column_names)


Tokenizer

In [None]:
# Tokenizer belonging to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pad with the EOS token; tokenize() pads every example to MAX_LENGTH.
# NOTE(review): with pad == EOS, a collator that masks pad positions out of the loss
# would presumably mask EOS tokens as well — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Marker that separates the review from the expected answer in formatted examples.
_ANSWER_MARKER = "\n\nAnswer:"


def _keep_answer_within_limit(text, ids):
    """Shorten `ids` to MAX_LENGTH tokens while keeping the trailing answer tokens."""
    cut = text.rfind(_ANSWER_MARKER)
    if cut < 0:
        # No answer section to protect: plain right-truncation.
        return ids[:MAX_LENGTH]

    # Tokens belonging to the answer section = everything the full text has
    # beyond the tokenization of the part in front of the marker.
    head_len = len(tokenizer(text[:cut])["input_ids"])
    tail_len = min(max(len(ids) - head_len, 0), MAX_LENGTH - 1)
    if tail_len == 0:
        return ids[:MAX_LENGTH]
    return ids[:MAX_LENGTH - tail_len] + ids[-tail_len:]


def tokenize(example):
    """Tokenize formatted examples to exactly MAX_LENGTH tokens.

    Plain right-truncation would cut the trailing "Answer: <label>" off every
    example longer than MAX_LENGTH, so those examples would carry no
    supervision for the answer. Over-long examples are therefore shortened in
    the middle: the start of the text is kept up to the budget and the tokens
    of the answer section are always preserved at the end.

    Args:
        example: Mapping whose "text" entry is either one string or a list of
            strings (the latter when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer's padded encoding ("input_ids" plus the attention mask),
        with every sequence MAX_LENGTH tokens long.
    """
    texts = example["text"]
    is_single = isinstance(texts, str)
    texts = [texts] if is_single else list(texts)

    # Tokenize without truncation first so the tail of each text is available.
    full_ids = tokenizer(texts)["input_ids"]

    fitted = [
        _keep_answer_within_limit(text, ids) if len(ids) > MAX_LENGTH else ids
        for text, ids in zip(texts, full_ids)
    ]

    # tokenizer.pad honours the tokenizer's padding side and builds the mask.
    return tokenizer.pad(
        {"input_ids": fitted[0] if is_single else fitted},
        padding="max_length",
        max_length=MAX_LENGTH,
    )


In [None]:
# Tokenize both splits in batches; the tokenizer's output is added as new columns.
train_data = train_data.map(tokenize, batched=True)
test_data  = test_data.map(tokenize, batched=True)

# Have both datasets return torch tensors when indexed.
train_data.set_format(type="torch")
test_data.set_format(type="torch")


Baseline Model

In [None]:
# Untuned reference model: same checkpoint, loaded in float16, with device
# placement delegated to the library via device_map="auto".
baseline_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Put the module in evaluation mode; this copy is only used for generation below.
baseline_model.eval()


In [None]:
def predict_sentiment(model, review):
    """Ask `model` for the sentiment of one review and return its raw answer.

    The review is wrapped in the same instruction prompt used for training,
    up to five new tokens are generated greedily, and only those newly
    generated tokens are decoded.

    Args:
        model: Causal language model exposing ``generate``, ``device``,
            ``training``, ``eval`` and ``train``.
        review: Review text to classify.

    Returns:
        The stripped text the model generated after the prompt (expected to
        start with "Positive" or "Negative").
    """
    prompt = (
        "Instruction: Classify the sentiment of the following movie review "
        "as Positive or Negative.\n\n"
        f"Review:\n{review}\n\n"
        "Answer:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # A model that has just been trained is still in train mode, which keeps
    # dropout active; generate in eval mode and restore the previous mode after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False,
                # Explicit pad id so generate() does not have to guess one.
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        model.train(was_training)

    # The output sequence starts with the prompt tokens; decode only what was
    # generated, so an "Answer:" inside the generation cannot confuse parsing.
    generated = outputs[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


In [None]:
# Smoke test: classify the first raw test review with the untuned model.
sample = dataset["test"][0]["text"]
print(predict_sentiment(baseline_model, sample))


QLoRA Configuration

In [None]:
# NOTE(review): this install runs after the imports cell (BitsAndBytesConfig is
# already imported there); a package installed or upgraded at this point may need
# a kernel restart to be picked up — consider moving it to the first install cell.
!pip install -U bitsandbytes

In [None]:
# Quantization settings for the model that will be fine-tuned:
# 4-bit loading, "nf4" quantization type, double quantization enabled,
# and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)


In [None]:
# Second copy of the MODEL_NAME checkpoint, loaded with the quantization
# settings in bnb_config; this is the copy that gets fine-tuned.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)


In [None]:
# Run peft's preparation helper on the quantized model before adapters are attached.
# NOTE: rebinds `model`, so this cell is meant to run once after the load cell.
model = prepare_model_for_kbit_training(model)


In [None]:
# LoRA adapter settings: rank 8, alpha 16, dropout 0.05, applied to the
# modules named "q_proj" and "v_proj", bias setting "none", causal-LM task type.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)


In [None]:
# Wrap the prepared model with the adapters described by lora_config,
# then print the trainable-parameter summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Training Setup

In [None]:
# NOTE(review): transformers was imported in the imports cell and models are already
# loaded by this point; an upgrade here presumably only takes effect after a kernel
# restart — consider moving it to the first install cell and pinning a version.
!pip install -U transformers


In [None]:
# Training hyper-parameters. Batch size 4 with 4 accumulation steps gives
# 16 examples per optimizer step per device; fp16 training is enabled and
# reporting integrations are turned off.
training_args = TrainingArguments(
    output_dir="./tinyllama-imdb-qlora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=50,
    # NOTE(review): the two strategies below are disabled, so no per-epoch
    # evaluation/saving is requested here — confirm whether that is intended.
    #evaluation_strategy="epoch",
    #save_strategy="epoch",
    fp16=True,
    report_to="none"
)


In [None]:
# Batch collator for language modeling; mlm=False selects causal (not masked) LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [None]:
# Trainer for the LoRA-wrapped model on the full tokenized training split.
# NOTE(review): eval_dataset is the first 2000 test rows, but the evaluation
# strategy in training_args is commented out — confirm this subset is ever used,
# and that the first rows of the split are representative.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data.select(range(2000)),  # small eval for speed
    # tokenizer=tokenizer,
    data_collator=data_collator
)


Train

In [None]:
# Run fine-tuning with the trainer configured above (3 epochs per training_args).
trainer.train()


Evaluation: Baseline vs Fine-Tuned

In [None]:
accuracy = evaluate.load("accuracy")

def evaluate_model(model, dataset, n_samples=200):
    """Measure classification accuracy of `model` on a sample of `dataset`.

    Two record layouts are accepted:

    * raw records with "text" (the review) and "label" (1 = positive), and
    * formatted records whose "text" is the full training string ending in
      "Answer: Positive" / "Answer: Negative". For these, the gold label is
      read from the answer suffix and only the review part is passed to the
      model, so the expected answer never leaks into the prompt.

    Examples are taken at evenly spaced positions across the dataset instead
    of from the front, so a split that is stored grouped by label still
    yields both classes.

    Args:
        model: Model passed through to predict_sentiment.
        dataset: Indexable collection of records supporting ``len``.
        n_samples: Maximum number of records to score.

    Returns:
        The result of the "accuracy" metric's ``compute`` call.

    Raises:
        ValueError: If there is nothing to score, or a record has neither a
            "label" field nor an "Answer:" suffix.
    """
    n = min(n_samples, len(dataset))
    if n <= 0:
        raise ValueError("evaluate_model needs at least one example to score")

    stride = len(dataset) / n  # evenly spaced, deterministic sample positions
    preds, labels = [], []

    for k in range(n):
        example = dataset[int(k * stride)]
        text = example["text"]

        if "label" in example:
            # Raw record: the text is the review itself.
            review = text
            gold = 1 if example["label"] == 1 else 0
        else:
            # Formatted record: split the gold answer off the end and strip
            # the instruction header so only the review is re-prompted.
            body, marker, answer = text.rpartition("Answer:")
            if not marker:
                raise ValueError(
                    "example has neither a 'label' field nor an 'Answer:' suffix"
                )
            review = body.split("Review:\n", 1)[-1].strip()
            gold = 1 if "Positive" in answer else 0

        prediction = predict_sentiment(model, review)

        preds.append(1 if "Positive" in prediction else 0)
        labels.append(gold)

    return accuracy.compute(predictions=preds, references=labels)


In [None]:
# Score both models on the raw test split. The "text" column of test_data is the
# formatted training string, which already ends with the gold answer; feeding it
# to the model would leak the label into the prompt.
baseline_acc = evaluate_model(baseline_model, dataset["test"])
finetuned_acc = evaluate_model(model, dataset["test"])

baseline_acc, finetuned_acc
