<a href="https://colab.research.google.com/github/atanuc073/Genrative-AI-development-and-deployment/blob/main/Finetuning_Prod_prompt_prefix_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part 1: Prompt fine-tuning

In soft prompt tuning (also known as prompt tuning), we introduce a set of trainable virtual tokens to the model’s input embeddings, while keeping all model weights frozen. We optimize these prompt embeddings on a small labeled dataset.

Here, we format the task as a completion: the model will see "Review: ... Sentiment:" and we train it to complete with "positive" or "negative".

Only the prompt embeddings will be trained, making this a parameter-efficient fine-tuning method.

In [None]:
!pip install -q transformers peft datasets

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model
from datasets import Dataset

In [None]:
model_name = "roneneldan/TinyStories-33M"
tokenizer_name = "EleutherAI/gpt-neo-125M"  # TinyStories uses GPT-Neo tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

config.json:   0%|          | 0.00/968 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/291M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model
from datasets import Dataset

# Load base model and tokenizer (same TinyStories model)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Define a Prompt Tuning configuration – we'll use a few virtual tokens
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,       # We are doing causal language modeling (text generation) task
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify the sentiment of movie reviews as positive or negative.",
    num_virtual_tokens=8,              # number of virtual prompt tokens to prepend
    tokenizer_name_or_path=tokenizer_name
)
# Wrap the base model with the prompt tuning adapter
peft_model = get_peft_model(model, peft_config)

# Print trainable vs frozen parameter counts for verification
peft_model.print_trainable_parameters()
# Expected output: only a small number of prompt embedding params are trainable (<<1% of total)

# Prepare a small training dataset of reviews
train_texts = [
    "I absolutely loved this movie, it was fantastic!",    # positive review
    "The film was terrible and incredibly boring.",        # negative review
    "An amazing film with stellar acting and great plot!"  # positive review
]
train_labels = ["positive", "negative", "positive"]

# Format data as "Review: ... Sentiment: " -> "<label>"
# We will train the model to generate the label text after the prompt.
inputs = [f"Review: {txt} Sentiment:" for txt in train_texts]
# Tokenize inputs and labels
input_encodings = tokenizer(inputs, padding=True, truncation=True)
label_encodings = tokenizer(train_labels, padding=True, truncation=True)

# Create labels tensor, masking out the input part with -100 (so loss is only on the label tokens)
input_ids_list = []
labels_list = []
for i in range(len(inputs)):
    input_ids = input_encodings["input_ids"][i]
    label_ids = label_encodings["input_ids"][i]
    # Combine input and label ids. Add eos token after label for good measure.
    combined_ids = input_ids + label_ids + [tokenizer.eos_token_id]
    # Mask input part in the labels
    labels = [-100]*len(input_ids) + label_ids + [tokenizer.eos_token_id]
    input_ids_list.append(combined_ids)
    labels_list.append(labels)

# Pad sequences to the same length
max_length = max(len(ids) for ids in input_ids_list)
for i in range(len(input_ids_list)):
    pad_len = max_length - len(input_ids_list[i])
    input_ids_list[i] += [tokenizer.pad_token_id] * pad_len
    labels_list[i]    += [-100] * pad_len

# Create Dataset object
train_dataset = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list}).with_format("torch")

# Set up training arguments (small epochs for demonstration)
training_args = TrainingArguments(
    output_dir="prompt_tuning_sentiment",
    learning_rate=5e-3,
    per_device_train_batch_size=1,
    num_train_epochs=100,
    logging_steps=20,
    save_strategy="no"
)
# Initialize Trainer with our PEFT model
trainer = Trainer(model=peft_model, args=training_args, train_dataset=train_dataset)
trainer.train()

# After training, test the model on a new review
test_review = "The storyline was interesting but the acting was mediocre."
test_input = f"Review: {test_review} Sentiment:"
inputs = tokenizer(test_input, return_tensors="pt")
# Move inputs to the same device as the model
inputs = {k: v.to(peft_model.device) for k, v in inputs.items()}
output = peft_model.generate(**inputs, max_new_tokens=3)
result = tokenizer.decode(output[0], skip_special_tokens=True)
print("Model output:", result)
# The output should include the sentiment, e.g. "positive" or "negative"

trainable params: 6,144 || all params: 68,520,192 || trainable%: 0.0090


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraj-dandekar8[0m ([33mraj-dandekar8-massachusetts-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
20,5.7039
40,3.0668
60,1.8273
80,1.2727
100,0.9794
120,0.7381
140,0.5503
160,0.3625
180,0.3242
200,0.2995


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model output: Review: The storyline was interesting but the acting was mediocre. Sentiment:positive


In [None]:
# =========================
# Before/After Prompt Fine-Tuning Comparison (3 examples)
# =========================
import torch

# Ensure both models are on the same device and in eval mode
device = next(peft_model.parameters()).device
base_model = AutoModelForCausalLM.from_pretrained(model_name).to(device).eval()
peft_model.eval()

# Deterministic generation for fair comparison
gen_kwargs = dict(max_new_tokens=3, do_sample=False, num_beams=1, pad_token_id=tokenizer.pad_token_id)

def generate_label_only(model, prompt_text: str):
    """Generate only the continuation tokens (label) after the prompt."""
    with torch.inference_mode():
        enc = tokenizer(prompt_text, return_tensors="pt").to(device)
        out = model.generate(**enc, **gen_kwargs)
        # Slice only the new tokens beyond the prompt length
        new_ids = out[0, enc["input_ids"].shape[1]:]
        return tokenizer.decode(new_ids, skip_special_tokens=True).strip()

def normalize_label(s: str):
    """Light normalization so the table is easier to read."""
    s = s.strip().lower()
    # Keep only the first short word if model rambles
    s = s.split()[0] if s else s
    # Common cleanups
    if s.endswith("."): s = s[:-1]
    return s

# --- Three test cases ---
test_reviews = [
    "The storyline was interesting but the acting was mediocre.",      # mixed/lean negative
    "Absolutely brilliant direction and moving performances!",         # positive
    "I found it dull, predictable, and a complete waste of time.",     # negative
]

results = []
print("\n================== BEFORE vs AFTER (Prompt Fine-Tuning) ==================\n")
for i, review in enumerate(test_reviews, 1):
    test_input = f"Review: {review} Sentiment:"
    # BEFORE: base model (no prompt tuning)
    before_raw = generate_label_only(base_model, test_input)
    before = normalize_label(before_raw)

    # AFTER: prompt-tuned model
    after_raw  = generate_label_only(peft_model, test_input)
    after = normalize_label(after_raw)

    # Pretty print for each example
    print(f"--- Example {i} ---")
    print("PROMPT:")
    print(test_input)
    print("\nBEFORE (Base model, no prompt tuning):")
    print(before_raw)
    print("\nAFTER  (Prompt-tuned model):")
    print(after_raw)
    print("----------------------------------------------------------------------\n")

    results.append({
        "Example": i,
        "Review": review,
        "Before (base)": before,
        "After (prompt-tuned)": after,
    })

# --- Comparison table at the end ---
try:
    import pandas as pd
    df = pd.DataFrame(results, columns=["Example", "Review", "Before (base)", "After (prompt-tuned)"])
    try:
        # Nice Markdown table in many environments
        print("\n=== Comparison Table ===\n")
        print(df.to_markdown(index=False))
    except Exception:
        # Fallback to plain print
        print("\n=== Comparison Table ===\n")
        print(df)
except Exception:
    # Zero-dep fallback
    print("\n=== Comparison Table (plain) ===\n")
    header = f'{"Example":^7} | {"Before (base)":^15} | {"After (prompt-tuned)":^20} | Review'
    print(header)
    print("-"*len(header))
    for r in results:
        print(f'{r["Example"]:^7} | {r["Before (base)"]:^15} | {r["After (prompt-tuned)"]:^20} | {r["Review"]}')




--- Example 1 ---
PROMPT:
Review: The storyline was interesting but the acting was mediocre. Sentiment:

BEFORE (Base model, no prompt tuning):
they look delicious

AFTER  (Prompt-tuned model):
positive
----------------------------------------------------------------------

--- Example 2 ---
PROMPT:
Review: Absolutely brilliant direction and moving performances! Sentiment:

BEFORE (Base model, no prompt tuning):
Famous Wra LOVE

AFTER  (Prompt-tuned model):
positive
----------------------------------------------------------------------

--- Example 3 ---
PROMPT:
Review: I found it dull, predictable, and a complete waste of time. Sentiment:

BEFORE (Base model, no prompt tuning):
can you help

AFTER  (Prompt-tuned model):
hurthenard
----------------------------------------------------------------------


=== Comparison Table ===

|   Example | Review                                                      | Before (base)   | After (prompt-tuned)   |
|----------:|:------------------------

## Part 2: Prefix finetuning Demo 1

In prefix tuning, we attach trainable prefix vectors to the model’s layers (not just at the embedding input).

The model’s weights again stay frozen.

We will train the model on a few short fable examples, each ending in a moral, so that the learned prefix will prompt the model to produce a moral at the end of a story.

In [None]:
from peft import PrefixTuningConfig, get_peft_model
import torch

# Load base model and tokenizer again (TinyStories-33M)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Configure prefix tuning (e.g., 10 virtual tokens as prefix)
prefix_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=10
)
peft_model = get_peft_model(model, prefix_config)
peft_model.print_trainable_parameters()
# Only prefix parameters (much fewer than total) should be trainable

# Prepare a small training set of fable -> moral pairs
# ---- small toy corpus (add MANY more examples for best results) ----
train_stories = [
    "A boy repeatedly lied about a wolf coming. When a wolf truly came, nobody believed him and the sheep were lost.",
    "A hare was boastful and fast, a tortoise slow and steady. They raced; the hare napped and the tortoise won.",
    "A fox saw high grapes he couldn't reach. He walked away claiming the grapes were probably sour anyway.",
    "A thirsty crow dropped pebbles into a pitcher to raise the water level and finally quenched its thirst.",
    "Two friends met a bear; one climbed a tree, the other lay still. The bear left. The survivor asked what it whispered."
]
train_morals = [
    "Moral of the story: If you lie too often, people won't believe you when you tell the truth.",
    "Moral of the story: Slow and steady wins the race.",
    "Moral of the story: It's easy to despise what you cannot have.",
    "Moral of the story: Little by little, a little becomes a lot.",
    "Moral of the story: A friend in need is a friend indeed."
]

# Tokenize and combine story + moral for each example
story_inputs = tokenizer(train_stories, truncation=True)
moral_targets = tokenizer(train_morals, truncation=True)

input_ids_list = []
labels_list = []
for i in range(len(train_stories)):
    story_ids = story_inputs["input_ids"][i]
    moral_ids = moral_targets["input_ids"][i]
    # Combine story and moral, add EOS at end
    combined_ids = story_ids + moral_ids + [tokenizer.eos_token_id]
    labels = [-100]*len(story_ids) + moral_ids + [tokenizer.eos_token_id]  # mask story part
    input_ids_list.append(combined_ids)
    labels_list.append(labels)

# Pad to same length
max_length = max(len(ids) for ids in input_ids_list)
for i in range(len(input_ids_list)):
    pad_len = max_length - len(input_ids_list[i])
    input_ids_list[i] += [tokenizer.pad_token_id]*pad_len
    labels_list[i]    += [-100]*pad_len

train_dataset = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list}).with_format("torch")

# Training setup
training_args = TrainingArguments(
    output_dir="prefix_tuning_fables",
    learning_rate=1e-3,
    per_device_train_batch_size=1,
    num_train_epochs=500,
    logging_steps=10,
    save_strategy="no"
)
trainer = Trainer(model=peft_model, args=training_args, train_dataset=train_dataset)
trainer.train()

# Test the prefix-tuned model on a new story
test_story = "A kind cat saved a mouse from a trap. Later, the mouse helped the cat in return for the kindness."
inputs = tokenizer(test_story, return_tensors="pt")
# Move inputs to the same device as the model
inputs = {k: v.to(peft_model.device) for k, v in inputs.items()}
output = peft_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
# The generated continuation should include a moral at the end of the story.

trainable params: 61,440 || all params: 68,575,488 || trainable%: 0.0896


Step,Training Loss
10,9.6536
20,8.1223
30,6.8448
40,5.6776
50,4.7093
60,3.8934
70,3.1908
80,2.6079
90,2.1126
100,1.696


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A kind cat saved a mouse from a trap. Later, the mouse helped the cat in return for the kindness.Moral of the story: If you was the story: A
.




In [None]:
# --- Additional test stories ---
test_stories = [
    "A greedy dog found a piece of meat. On his way home, he saw his reflection in the river and tried to snatch the other meat. He lost his own piece in the water.",
    "An old lion pretended to be sick. Animals came to visit him, and he ate them one by one. A clever fox noticed many footprints going in but none coming out, so he stayed away.",
    "A farmer’s goose laid a golden egg every day. Greedy for more, the farmer killed the goose, only to find nothing inside."
]

device = getattr(peft_model, "device", next(peft_model.parameters()).device)

for story in test_stories:
    inputs = tokenizer(story, return_tensors="pt").to(device)
    output = peft_model.generate(
        **inputs,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id
    )
    print("Story:", story)
    print("Generated moral:", tokenizer.decode(output[0], skip_special_tokens=True))
    print("-"*80)


Story: A greedy dog found a piece of meat. On his way home, he saw his reflection in the river and tried to snatch the other meat. He lost his own piece in the water.
Generated moral: A greedy dog found a piece of meat. On his way home, he saw his reflection in the river and tried to snatch the other meat. He lost his own piece in the water.
--------------------------------------------------------------------------------
Story: An old lion pretended to be sick. Animals came to visit him, and he ate them one by one. A clever fox noticed many footprints going in but none coming out, so he stayed away.
Generated moral: An old lion pretended to be sick. Animals came to visit him, and he ate them one by one. A clever fox noticed many footprints going in but none coming out, so he stayed away.
--------------------------------------------------------------------------------
Story: A farmer’s goose laid a golden egg every day. Greedy for more, the farmer killed the goose, only to find nothing 

## Part 3: Prefix finetuning Demo 2

In [None]:
 !pip install -q peft transformers datasets

In [None]:
model_name = "roneneldan/TinyStories-33M"


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

device = "cuda"
model_name_or_path = "t5-large"
tokenizer_name_or_path = "t5-large"

text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-2
num_epochs = 5
batch_size = 8

In [None]:
from datasets import load_dataset

ds = load_dataset(
    "autoevaluate/autoeval-eval-financial_phrasebank-sentences_allagree-c1bf87-48200145240",
    split="train"
)

dataset = ds.train_test_split(test_size=0.1, seed=42)
dataset["validation"] = dataset["test"]; del dataset["test"]

# NOTE: class names live under 'target' here
classes = dataset["train"].features["target"].names

# Add human-readable label text
dataset = dataset.map(
    lambda x: {"text_label": [classes[i] for i in x["target"]]},
    batched=True
)

print(dataset["train"][0])
# {'text': '...', 'target': 0/1/2, 'text_label': 'negative/neutral/positive', 'evaluation_predictions': [...]}


README.md:   0%|          | 0.00/950 [00:00<?, ?B/s]

predictions.parquet:   0%|          | 0.00/196k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

Map:   0%|          | 0/2037 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

{'text': 'The inventors are Bylander Johan , Ponten Fredrik and Lundberg Jorgen .', 'target': 1, 'evaluation_predictions': [-2.98046875, 6.28125, -3.431640625], 'text_label': 'neutral'}


In [None]:
classes

['negative', 'neutral', 'positive']

In [None]:
# --- SET COLS + TOKENIZER ---
text_col = "text"       # <- correct for this dataset
label_col = "target"    # <- correct for this dataset

classes = dataset["train"].features[label_col].names  # ['negative','neutral','positive']

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 3
lr = 2e-5
bs_train, bs_eval = 32, 64

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.add_special_tokens({"pad_token": "[PAD]"})


tokenizer_config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [None]:
def preprocess(batch):
    enc = tokenizer(batch["text"], truncation=True)   # <-- use text/target
    enc["labels"] = batch["target"]
    return enc

processed = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,  # removes text, target, text_label, evaluation_predictions, etc.
    desc="Tokenize",
)

collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")
train_loader = DataLoader(processed["train"], batch_size=bs_train, shuffle=True, collate_fn=collator)
eval_loader  = DataLoader(processed["validation"], batch_size=bs_eval, shuffle=False, collate_fn=collator)



Tokenize:   0%|          | 0/2037 [00:00<?, ? examples/s]

Tokenize:   0%|          | 0/227 [00:00<?, ? examples/s]

In [None]:
# --- MODEL ---
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(classes))
model.resize_token_embeddings(len(tokenizer))  # safe if we added PAD
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False                 # no KV cache during training
model.to(device)


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at roneneldan/TinyStories-33M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPTNeoForSequenceClassification(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-3): 4 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): 

In [None]:
from peft import PrefixTuningConfig, TaskType, get_peft_model

# 1) Make sure we’re not using KV cache during training
model.config.use_cache = False

# 2) Keep the classifier head trainable along with the prefix params
modules_to_save = []
if hasattr(model, "classifier"):
    modules_to_save.append("classifier")
if hasattr(model, "score"):         # e.g., GPT2/GPTNeo classifiers
    modules_to_save.append("score")

peft_cfg = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=50,          # tweak: 10–100 typical
    modules_to_save=modules_to_save # keep the head trainable
)

model = get_peft_model(model, peft_cfg)

# (Optional) sanity & visibility
model.print_trainable_parameters()


trainable params: 309,504 || all params: 68,825,856 || trainable%: 0.4497


In [None]:
# --- EVAL UTILITIES ---
@torch.no_grad()
def evaluate_classifier(model, dataloader, classes):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0
    for batch in dataloader:
        labels = batch["labels"]
        batch = {k: v.to(model.device) for k, v in batch.items()}
        out = model(**batch)
        total_loss += out.loss.item()
        preds = out.logits.argmax(dim=-1).cpu()
        all_preds.append(preds)
        all_labels.append(labels)
    all_preds  = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()
    loss = total_loss / len(dataloader)
    acc  = accuracy_score(all_labels, all_preds)
    f1m  = f1_score(all_labels, all_preds, average="macro")
    report = classification_report(all_labels, all_preds, target_names=classes, digits=4)
    cm = confusion_matrix(all_labels, all_preds)
    return {"loss": loss, "acc": acc, "f1_macro": f1m, "report": report, "cm": cm,
            "pred_ids": all_preds, "label_ids": all_labels}

def id2text(ids, classes): return [classes[i] for i in ids]


In [None]:
# === BASELINE (NO FINETUNING) ===
base_metrics = evaluate_classifier(model, eval_loader, classes)
print("=== BASELINE ===")
print(f"val_loss={base_metrics['loss']:.4f}  acc={base_metrics['acc']:.4f}  f1_macro={base_metrics['f1_macro']:.4f}")
print(base_metrics["report"])

# peek at a few examples
pred_text = id2text(base_metrics["pred_ids"][:10], classes)
gold_text = id2text(base_metrics["label_ids"][:10], classes)
for i, (p, g) in enumerate(zip(pred_text, gold_text), 1):
    print(f"{i:02d}. pred={p:8s} | gold={g}")


=== BASELINE ===
val_loss=1.5717  acc=0.2511  f1_macro=0.2103
              precision    recall  f1-score   support

    negative     0.1412    0.9231    0.2449        26
     neutral     0.6889    0.2153    0.3280       144
    positive     0.1667    0.0351    0.0580        57

    accuracy                         0.2511       227
   macro avg     0.3322    0.3911    0.2103       227
weighted avg     0.4950    0.2511    0.2507       227

01. pred=negative | gold=positive
02. pred=positive | gold=positive
03. pred=negative | gold=neutral
04. pred=negative | gold=negative
05. pred=negative | gold=neutral
06. pred=negative | gold=neutral
07. pred=negative | gold=neutral
08. pred=neutral  | gold=positive
09. pred=negative | gold=positive
10. pred=negative | gold=neutral


In [None]:
# --- TRAIN ---
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch} [train]"):
        batch = {k: v.to(device) for k, v in batch.items()}
        out = model(**batch)
        loss = out.loss
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        running_loss += loss.item()
    print(f"epoch={epoch}  train_loss={running_loss/len(train_loader):.4f}")


Epoch 1 [train]: 100%|██████████| 64/64 [00:01<00:00, 33.78it/s]


epoch=1  train_loss=1.2631


Epoch 2 [train]: 100%|██████████| 64/64 [00:01<00:00, 37.09it/s]


epoch=2  train_loss=1.0772


Epoch 3 [train]: 100%|██████████| 64/64 [00:01<00:00, 37.00it/s]

epoch=3  train_loss=1.0450





In [None]:
# === FINETUNED EVAL ===
ft_metrics = evaluate_classifier(model, eval_loader, classes)
print("=== FINETUNED ===")
print(f"val_loss={ft_metrics['loss']:.4f}  acc={ft_metrics['acc']:.4f}  f1_macro={ft_metrics['f1_macro']:.4f}")
print(ft_metrics["report"])

pred_text = id2text(ft_metrics["pred_ids"][:10], classes)
gold_text = id2text(ft_metrics["label_ids"][:10], classes)
for i, (p, g) in enumerate(zip(pred_text, gold_text), 1):
    print(f"{i:02d}. pred={p:8s} | gold={g}")


=== FINETUNED ===
val_loss=1.0245  acc=0.5551  f1_macro=0.3252
              precision    recall  f1-score   support

    negative     0.1000    0.0769    0.0870        26
     neutral     0.6429    0.8125    0.7178       144
    positive     0.2800    0.1228    0.1707        57

    accuracy                         0.5551       227
   macro avg     0.3410    0.3374    0.3252       227
weighted avg     0.4896    0.5551    0.5082       227

01. pred=neutral  | gold=positive
02. pred=positive | gold=positive
03. pred=neutral  | gold=neutral
04. pred=neutral  | gold=negative
05. pred=positive | gold=neutral
06. pred=negative | gold=neutral
07. pred=neutral  | gold=neutral
08. pred=neutral  | gold=positive
09. pred=negative | gold=positive
10. pred=neutral  | gold=neutral


In [None]:
eval_pred_names = id2text(ft_metrics["pred_ids"], classes)
gold_names = dataset["validation"]["text_label"][:len(eval_pred_names)]
acc_text = 100.0 * sum(p.strip() == g.strip() for p, g in zip(eval_pred_names, gold_names)) / len(eval_pred_names)
print(f"Text-label accuracy (finetuned) = {acc_text:.2f}%")


Text-label accuracy (finetuned) = 55.51%
