In [None]:
%pip install -r requirements.txt -q

In [None]:
import warnings
import os
# Ignore every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides potentially useful diagnostics
# (deprecations, convergence warnings) — confirm that is intended.
warnings.filterwarnings("ignore")
# Set before the Hugging Face libraries are imported in later cells; judging by
# its name, this flag turns off the Hub's symlink warning.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


# Lab 5 - Text Classification with Generative Models

This notebook explores different approaches to text classification on the Rotten Tomatoes movie-review dataset: a task-specific model, embedding-based classification (supervised and zero-shot), and a generative model.

## 1. Dataset Loading


In [None]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset; later cells read the "text" and "label"
# columns of its "train" and "test" splits.
data = load_dataset("rotten_tomatoes")
# Bare last expression so the notebook displays the loaded object.
data


## 2. Task-Specific Model (RoBERTa)

Using a pre-trained sentiment classification model:


In [None]:
from transformers import pipeline
import torch
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
pipe = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, device=0 if torch.cuda.is_available() else -1)

# The model predicts three classes (negative / neutral / positive) while the
# dataset is binary. Taking only the top-1 label would count every "neutral"
# prediction as positive, so ask for the score of every label (top_k=None)
# and decide between negative and positive only.
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text"), batch_size=8, top_k=None), total=len(data["test"])):
    # `output` is a list of {"label": ..., "score": ...} dicts, one per class;
    # index by label name rather than by position because the list is sorted by score.
    scores = {entry["label"].lower(): entry["score"] for entry in output}
    # 1 = positive, 0 = negative (ties fall back to negative).
    y_pred.append(int(scores["positive"] > scores["negative"]))


In [None]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
    """Print scikit-learn's classification report for the two review classes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    The classes are shown as "Negative Review" and "Positive Review";
    nothing is returned.
    """
    class_names = ["Negative Review", "Positive Review"]
    print(classification_report(y_true, y_pred, target_names=class_names))


# Score the predictions currently held in `y_pred` against the test-split labels.
evaluate_performance(data["test"]["label"], y_pred)


## 3. Supervised Classification with Embeddings

Using sentence-transformers to generate embeddings and train a logistic regression classifier:


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the zero-shot section further down reuses `model`.
embedding_model_id = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_id)

# Encode the review texts of the train and test splits into embeddings.
train_texts, test_texts = data["train"]["text"], data["test"]["text"]
train_embeddings = model.encode(train_texts, show_progress_bar=True)
test_embeddings = model.encode(test_texts, show_progress_bar=True)

print(f"Train embeddings shape: {train_embeddings.shape}")


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training embeddings
# (`fit` returns the estimator itself, so construction and fitting can be chained).
train_labels = data["train"]["label"]
clf = LogisticRegression(random_state=42).fit(train_embeddings, train_labels)

# Predict on the held-out test embeddings and report the metrics.
y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)


## 4. Zero-Shot Classification with Embeddings

Classifying without labeled training data by describing labels and using cosine similarity:


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Describe each class in natural language; the list position doubles as the
# label id (index 0 -> negative, index 1 -> positive).
label_descriptions = ["A negative review", "A positive review"]
label_embeddings = model.encode(label_descriptions)

# One row per test document, one column per label description.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
# Pick, for every document, the label whose description is most similar.
y_pred = sim_matrix.argmax(axis=1)

evaluate_performance(data["test"]["label"], y_pred)


## 5. Classification with Generative Models (FLAN-T5)

Using a sequence-to-sequence model for text classification:


In [None]:
# Sequence-to-sequence pipeline; runs on the first GPU when available, else on CPU.
t5_pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=0 if torch.cuda.is_available() else -1
)

# Prefix every review with the instruction and store the result in a new "t5" column.
prompt = "Is the following sentence positive or negative? "
data_t5 = data.map(lambda example: {"t5": prompt + example['text']})

y_pred = []
for output in tqdm(t5_pipe(KeyDataset(data_t5["test"], "t5")), total=len(data_t5["test"])):
    # The model emits free text, so normalise whitespace and casing before
    # matching; an exact comparison would send "Negative" or "negative."
    # to the positive class.
    text = output[0]["generated_text"].strip().lower()
    # 0 = negative; anything else is treated as positive, as before.
    y_pred.append(0 if text.startswith("negative") else 1)

evaluate_performance(data["test"]["label"], y_pred)


## Summary

| Approach | Description | Pros | Cons |
|----------|-------------|------|------|
| Task-specific model | Pre-trained for the exact task | High accuracy, fast inference | Limited to trained labels |
| Supervised embeddings | Train classifier on embeddings | Flexible, good performance | Requires labeled data |
| Zero-shot embeddings | Match text to label descriptions | No training needed | Lower accuracy |
| Generative model | Prompt-based classification | Very flexible | Slower; hosted models may incur API costs |

The zero-shot approach achieves ~78% F1 without any labeled training data, demonstrating the power of embeddings for semantic understanding.
