Imports and Configuration

In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
import evaluate


In [None]:
# Hugging Face Hub id of the checkpoint used for both the tokenizer and the baseline model.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Token budget per example: `tokenize` truncates and pads every sequence to this length.
MAX_LENGTH = 256

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


Load IMDb Dataset

In [None]:
# NOTE(review): `load_dataset` is already imported in the imports cell at the top of
# the notebook, so this second import is redundant (harmless, but it can be dropped).
from datasets import load_dataset

# Load the IMDb dataset; its "train" and "test" splits are used by the cells below.
dataset = load_dataset("imdb")


Convert IMDb to Instruction Format

In [None]:
def format_example(example):
    """Turn one IMDb record into a single instruction-style training string.

    Args:
        example: Mapping with a "text" entry (the review) and a "label" entry.
            A label equal to 1 is rendered as "Positive"; any other value is
            rendered as "Negative".

    Returns:
        A dict with a single key, "text", holding the classification prompt
        followed by a space and the label word.
    """
    if example["label"] == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    # Same template that predict_sentiment builds at inference time, so the
    # training text and the inference prompt line up.
    prompt_parts = [
        "Instruction: Classify the sentiment of the following movie review ",
        "as Positive or Negative.\n\n",
        f"Review:\n{example['text']}\n\n",
        "Answer:",
    ]
    prompt_text = "".join(prompt_parts)

    return {"text": f"{prompt_text} {sentiment}"}


In [None]:
# Rewrite both splits into instruction-formatted text. All original columns are
# removed, leaving the "text" value produced by format_example.
train_data, test_data = (
    dataset[split_name].map(
        format_example,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
)


Tokenizer

In [None]:
# Load the tokenizer published with the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token; `tokenize` pads every
# example with padding="max_length", which needs a pad token to be defined.
# NOTE(review): with pad == EOS, a collator that masks padding out of the labels
# would presumably mask real EOS tokens too — confirm this is acceptable for training.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
def tokenize(example):
    """Tokenize the "text" field to a fixed length of MAX_LENGTH tokens.

    Longer inputs are truncated to MAX_LENGTH and shorter ones are padded up to
    MAX_LENGTH. The map cell calls this with batched=True, so example["text"]
    is presumably a list of strings there.

    NOTE(review): format_example puts "Answer: <label>" at the very end of the
    text. Assuming truncation drops tokens from the end (the usual default),
    reviews longer than MAX_LENGTH tokens lose their label — verify before
    training on this data.

    Args:
        example: Mapping whose "text" entry is passed to the tokenizer.

    Returns:
        Whatever the module-level `tokenizer` returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": MAX_LENGTH,
        "padding": "max_length",
    }
    return tokenizer(example["text"], **tokenizer_options)


In [None]:
# Tokenize both instruction-formatted splits; batched=True hands `tokenize`
# several examples per call instead of one at a time.
train_data, test_data = (
    formatted_split.map(tokenize, batched=True)
    for formatted_split in (train_data, test_data)
)

# Ask both datasets to hand back torch-formatted values when indexed.
train_data.set_format(type="torch")
test_data.set_format(type="torch")


Baseline Model

In [None]:
# Load the untuned checkpoint; it serves as the reference point before any fine-tuning.
# Half precision is only requested when a GPU is available: the notebook explicitly
# allows a CPU fallback, and float16 on CPU is slow and not supported by every op on
# older PyTorch builds, so float32 is used there instead.
baseline_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
# Inference mode (disables dropout and similar train-only behaviour). The trailing
# ";" stops Jupyter from echoing the whole module repr that eval() returns.
baseline_model.eval();


In [None]:
def predict_sentiment(model, review):
    """Ask `model` to classify a movie review and return its raw short answer.

    The prompt uses the same template as format_example, ending in "Answer:",
    and the model continues it greedily (do_sample=False) for at most 5 new tokens.

    Only the newly generated tokens are decoded. Decoding prompt + continuation
    and splitting on the last "Answer:" breaks when the model repeats "Answer:"
    inside its continuation: the real answer is thrown away and an empty string
    can be returned.

    NOTE(review): the prompt is not truncated, so a very long review may exceed
    the model's context window — confirm whether that needs handling.

    Args:
        model: Causal LM exposing `.device` and `.generate(...)`.
        review: Review text to classify.

    Returns:
        The generated continuation with special tokens removed and surrounding
        whitespace stripped. It is not normalised to "Positive"/"Negative" and
        may contain extra text after the label.
    """
    prompt = (
        "Instruction: Classify the sentiment of the following movie review "
        "as Positive or Negative.\n\n"
        f"Review:\n{review}\n\n"
        "Answer:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns the prompt followed by the new tokens.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False
        )

    # Slice off the echoed prompt so only the model's own continuation is decoded.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


In [None]:
# Smoke-test the untuned baseline on the first review of the IMDb test split.
# This reads the raw text from `dataset`, not the instruction-formatted `test_data`;
# predict_sentiment wraps it in the prompt itself.
sample = dataset["test"][0]["text"]
print(predict_sentiment(baseline_model, sample))
