# TP 5: Text Generation with Classification Models

### Apolline Hadjal

In [None]:
!pip install transformers>=4.41.2 accelerate>=0.31.0
!pip install transformers sentence-transformers openai
!pip install -U datasets
!pip install groq

In [None]:
from datasets import load_dataset
from transformers import pipeline
import torch
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import os
from groq import Groq
from dotenv import load_dotenv

In [25]:
from google.colab import userdata

# Bind the secret to a name instead of leaving the call as the cell's last
# expression: a bare `userdata.get(...)` makes Jupyter echo the token into the
# cell output, which is then saved with the notebook.
# NOTE(review): a token was echoed into this cell's saved output before, so it
# should be revoked and regenerated.
hf_token = userdata.get('HF_TOKEN')

In [None]:
# Download (or load from cache) the dataset and show its splits.
dataset_name = "rotten_tomatoes"
data = load_dataset(dataset_name)
print(data)

## Part 1

### Step 1

In [21]:
# Hub checkpoint used for both the model weights and the tokenizer.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the inference pipeline. return_all_scores=True asks for a score for
# every label of each input instead of only the best one.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device=device,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


### Step 2

In [22]:
# Run inference
# Run inference on the test split and collapse the model's per-label scores
# into a binary prediction (0 = negative, 1 = positive).
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text")), total=len(data["test"])):
    # Look the scores up by label name rather than by list position: positional
    # indexing silently reads the wrong class if the pipeline returns its
    # entries in a different order (for example sorted by score).
    # Assumes the checkpoint names its labels "negative" / "positive"; a
    # KeyError here means it does not.
    scores = {entry["label"].lower(): entry["score"] for entry in output}
    # Any other label (e.g. neutral) is ignored. A tie resolves to negative,
    # matching what np.argmax([negative, positive]) returned.
    y_pred.append(int(scores["positive"] > scores["negative"]))

100%|██████████| 1066/1066 [06:19<00:00,  2.81it/s]


### Step 3

In [None]:
def evaluate_performance(y_true, y_pred):
    """Print a classification report for the two review classes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.
    """
    class_names = ["Negative Review", "Positive Review"]
    print(classification_report(y_true, y_pred, target_names=class_names))


evaluate_performance(data["test"]["label"], y_pred)

Accuracy: 80%

Negative recall: 0.88 (the model catches most negative reviews)

Positive recall: 0.72 (it misses 28% of positive reviews)

The model has a negative bias: it tends to over-predict the negative class.

## Part 2
### Step 1

In [28]:
# Read the Hugging Face token from Colab's secret store instead of hardcoding
# it, so the credential is never saved in the notebook source.
# NOTE(review): the token that was hardcoded on this line must be revoked and
# reissued.
token = userdata.get('HF_TOKEN')

In [32]:
# Load the sentence-embedding model from the Hub.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', token=token)

# Encode the train split first, then the test split, one vector per review.
train_embeddings, test_embeddings = (
    model.encode(data[split_name]["text"], show_progress_bar=True)
    for split_name in ("train", "test")
)

# Quick sanity check: encode two toy sentences with the model loaded above.
sentences = ["This is an example sentence", "Eachsentence is converted"]
embeddings = model.encode(sentences)
print(embeddings)

# Displayed as the cell result: (number of training reviews, embedding width).
train_embeddings.shape

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

[[ 0.02250259 -0.07829179 -0.02303076 ... -0.00827927  0.02652692
  -0.00201895]
 [ 0.02576398 -0.04125077  0.02686729 ...  0.031839   -0.02995212
  -0.00993441]]


(8530, 768)

### Step 2

In [33]:
# Train a Logistic Regression on our train embeddings
# Fit a logistic-regression classifier on the precomputed train embeddings.
clf = LogisticRegression(random_state=42).fit(
    train_embeddings, data["train"]["label"]
)

# Classify the test embeddings and report the per-class metrics.
y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



## Part 3

In [34]:
# Average the embeddings of all documents in each target label
# Append the label as one extra trailing column so rows can be grouped by class.
labels_as_column = np.array(data["train"]["label"]).reshape(-1, 1)
df = pd.DataFrame(np.hstack([train_embeddings, labels_as_column]))

# The label column's index equals the embedding width. Derive it from the data
# instead of hardcoding 768 so the cell keeps working with any embedding model.
label_col_index = train_embeddings.shape[1]

# Average the embeddings of all documents in each target label. groupby sorts
# its keys, so row i of the result is the mean embedding of label i.
averaged_target_embeddings = df.groupby(label_col_index).mean().values

# Find the best matching embeddings between test documents and target embeddings
sim_matrix = cosine_similarity(test_embeddings, averaged_target_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

# Evaluate the model
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.84      0.84       533
Positive Review       0.84      0.85      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



## Part 4

In [35]:
# Create embeddings for our labels
# Embed one natural-language description per class. The list position doubles
# as the class id (0 = negative, 1 = positive).
label_descriptions = ["A negative review", "A positive review"]
label_embeddings = model.encode(label_descriptions)

# Give each test document the label whose description embedding is most similar.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = sim_matrix.argmax(axis=1)

evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



Why do label descriptions matter?

- More specific descriptions (e.g., "This movie is terrible") often perform better than generic ones ("negative")
- Descriptions should match the domain and style of the documents
- The embedding model captures semantic similarity between descriptions and documents

## Part 5
### Step 1

In [36]:
# Test with a sample
# Take the first test review as a sample input.
first_test_row = data["test"][0]
sample_text = first_test_row["text"]
print(f"Review: {sample_text}\n")

Review: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .



### Step 2

In [38]:
from google.colab import userdata

# Export the key to the process environment instead of leaving the call as the
# cell's last expression: a bare `userdata.get(...)` makes Jupyter echo the key
# into the saved cell output. The client setup reads it back with
# os.getenv("GROQ_API_KEY").
# NOTE(review): a key was echoed into this cell's saved output before, so it
# should be revoked and regenerated.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')

In [40]:
# Load variables from a .env file, if there is one, before reading the key.
load_dotenv()

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# The system message fixes the answer format; the user message carries the review.
label_messages = [
    {
        "role": "system",
        "content": "You are a sentiment classifier. Respond with only 'positive' or 'negative'.",
    },
    {
        "role": "user",
        "content": f"Classify the sentiment of this movie review: {sample_text}",
    },
]

chat_completion = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    messages=label_messages,
    temperature=0,
    max_tokens=10,
)

# Show the text of the first returned choice.
print(chat_completion.choices[0].message.content)

positive


### Step 3

In [41]:
# Same request shape as the label query, but the system message asks for a
# numeric score instead of a class name.
score_messages = [
    {
        "role": "system",
        "content": "You are a sentiment classifier. Rate the sentiment as a number between 0 (negative) and 1 (positive). Respond with only the number.",
    },
    {
        "role": "user",
        "content": f"Rate the sentiment of this movie review: {sample_text}",
    },
]

chat_completion = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    messages=score_messages,
    temperature=0,
    max_tokens=10,
)

# Show the text of the first returned choice.
print(chat_completion.choices[0].message.content)

0.8


### Step 4

In [42]:
def groq_generation(prompt, model="meta-llama/llama-4-scout-17b-16e-instruct"):
    """Ask the Groq chat model to classify the sentiment of one movie review.

    Uses the module-level ``client``.

    Args:
        prompt: The review text; it is inserted into the user message.
        model: Identifier of the chat model to query.

    Returns:
        The message content of the first choice, exactly as the model wrote it
        (no stripping or normalisation is applied).
    """
    system_turn = {
        "role": "system",
        "content": "You are a sentiment classifier. Respond with only 'positive' or 'negative'.",
    }
    user_turn = {
        "role": "user",
        "content": f"Classify the sentiment of this movie review: {prompt}",
    }

    response = client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0,
        max_tokens=10,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

groq_generation(sample_text)

'positive.'

## Part 6
### Step 1

In [44]:
# Load FLAN-T5 model
# Load FLAN-T5 (small) as a text-to-text generation pipeline, on the first GPU
# when torch can see one and on the CPU otherwise.
flan_device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    device=flan_device,
)

Device set to use cpu


### Step 2

In [45]:
# Prepare our data
# Instruction prepended to every review before it is passed to the model.
prompt = "Is the following sentence positive or negative? "


def _add_t5_prompt(example):
    """Return the new 't5' field: the instruction prompt followed by the review text."""
    return {"t5": prompt + example['text']}


# Add the 't5' column to every split, then display the resulting DatasetDict.
data = data.map(_add_t5_prompt)
data

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

### Step 3

In [47]:
# Run inference
# Run inference on the prompted test split and map each generated answer to a
# binary label (0 = negative, 1 = positive).
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    # Normalise before comparing: an exact match on "negative" would silently
    # count answers such as " negative" or "Negative." as positive.
    answer = output[0]["generated_text"].strip().lower()
    # Anything that does not start with "negative" is treated as positive.
    y_pred.append(0 if answer.startswith("negative") else 1)

100%|██████████| 1066/1066 [05:07<00:00,  3.46it/s]


### Step 4

In [48]:
# Compare the current predictions with the test split's ground-truth labels.
test_labels = data["test"]["label"]
evaluate_performance(test_labels, y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.85      0.84       533
Positive Review       0.85      0.83      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066

