In [1]:
from datasets import load_dataset

# Load our data: the "rotten_tomatoes" dataset, then display its splits
data = load_dataset("rotten_tomatoes")
data

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [2]:
# Peek at the first and last training examples (indices 0 and -1)
data["train"][0, -1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

## **Text Classification with Representation Models**

In [3]:
from transformers import pipeline

import torch

# Path to our HF model
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Pick the best available accelerator instead of assuming Apple Silicon, so the
# notebook also runs on CUDA or CPU-only machines. getattr guards against torch
# builds that do not ship the MPS backend module at all.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load model into pipeline
# return_all_scores=True is kept as-is: the inference code reads several
# per-label scores from each output by position.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device=device
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps


In [4]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Run inference on our data and collapse each output to a binary label.
# Positions 0 and 2 of every output are read as the negative and positive
# scores (position 1 is skipped - presumably "neutral"; confirm against the
# model's label order). argmax over [negative, positive] yields 0 or 1.
test_texts = KeyDataset(data["test"], "text")
y_pred = [
    np.argmax([scores[0]["score"], scores[2]["score"]])
    for scores in tqdm(pipe(test_texts), total=len(data["test"]))
]

100%|██████████| 1066/1066 [00:11<00:00, 92.13it/s] 


In [5]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
    """Print a classification report for binary review predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    The two classes are reported as "Negative Review" and "Positive Review",
    in that order. The report is printed; nothing is returned.
    """
    class_names = ["Negative Review", "Positive Review"]
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

In [6]:
# Print the classification report for the current predictions against the test labels
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



## **Classification Tasks that Leverage Embeddings**
### Supervised Classification

In [7]:
from sentence_transformers import SentenceTransformer

# Load our model
# all-mpnet-base-v2 is a sentence-transformers model that converts texts into
# numeric vectors (embeddings) capturing their semantic meaning. It is commonly
# used for tasks such as semantic search, clustering, and classification based
# on text similarity.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode our data: the train split first, then the test split
train_embeddings, test_embeddings = (
    model.encode(data[split]["text"], show_progress_bar=True)
    for split in ("train", "test")
)

Batches: 100%|██████████| 267/267 [00:23<00:00, 11.17it/s]
Batches: 100%|██████████| 34/34 [00:03<00:00, 11.03it/s]


In [8]:
# Inspect the shape of the training embedding array
train_embeddings.shape

(8530, 768)

In [9]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the training embeddings, using the
# training labels as targets and a fixed random_state
clf = LogisticRegression(random_state=42).fit(train_embeddings, data["train"]["label"])
clf

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [27]:
import pickle
from pathlib import Path

# Save the model, creating the target directory first so the cell does not
# fail with FileNotFoundError when "../model" does not exist yet
model_file = Path("../model/model_lr.pkl")
model_file.parent.mkdir(parents=True, exist_ok=True)
with model_file.open("wb") as f:
    pickle.dump(clf, f)

In [10]:
# Predict previously unseen instances (the test-split embeddings) and report
# how the predictions compare with the true test labels
y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

# Average the embeddings of all documents in each target label
train_labels = np.array(data["train"]["label"]).reshape(-1, 1)
df = pd.DataFrame(np.hstack([train_embeddings, train_labels]))
# The labels were appended as the last column, so its index equals the
# embedding width. Deriving it here (instead of hard-coding 768) keeps the
# cell working when a model with a different embedding size is used.
label_column = train_embeddings.shape[1]
# Relies on groupby's default sorted group order so that row i of the result
# is the mean embedding of label i.
averaged_target_embeddings = df.groupby(label_column).mean().values

# Find the best matching embeddings between evaluation documents and target embeddings
sim_matrix = cosine_similarity(test_embeddings, averaged_target_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

# Evaluate the model
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.84      0.84       533
Positive Review       0.84      0.85      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



### Zero-shot Classification

In [12]:
# Create embeddings for our labels: one short text description per class,
# encoded with `model`
label_embeddings = model.encode(["A negative review",  "A positive review"])
# Display the shape of the resulting array
label_embeddings.shape

(2, 768)

In [13]:
# Zero-shot prediction: give each test document the index of the label
# embedding it is most cosine-similar to, then report the results
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = sim_matrix.argmax(axis=1)
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



## **Classification with Generative Models**
### Encoder-decoder Models

In [14]:
import torch

# Pick the best available accelerator instead of assuming Apple Silicon, so the
# notebook also runs on CUDA or CPU-only machines. getattr guards against torch
# builds that do not ship the MPS backend module at all.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load FLAN-T5 small as a text-to-text generation pipeline
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device
)

Device set to use mps


In [15]:
# Prepare our data for the model: every example gains a "t5" field holding
# the instruction prefix followed by the review text
prompt = "Is the following review positive or negative? Review: "


def add_t5_prompt(example):
    """Return the "t5" field for one example: the prompt prefix plus its text."""
    return {"t5": prompt + example["text"]}


data = data.map(add_t5_prompt)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [16]:
# Inspect the prompt-prefixed "t5" column of the training split
data["train"]["t5"]

Column(['Is the following review positive or negative? Review: the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'Is the following review positive or negative? Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'Is the following review positive or negative? Review: effective but too-tepid biopic', 'Is the following review positive or negative? Review: if you sometimes like to go to the movies to have fun , wasabi is a good place to start .', "Is the following review positive or negative? Review: emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."])

In [17]:
# Map each generated answer to a binary label. The answer is normalised
# (surrounding whitespace stripped, lower-cased) before comparison so that
# variants such as "Negative" or " negative" are not silently counted as
# positive. Any answer other than "negative" is still treated as positive (1).
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    text = output[0]["generated_text"].strip().lower()
    y_pred.append(0 if text == "negative" else 1)

100%|██████████| 1066/1066 [01:20<00:00, 13.29it/s]


In [18]:
# Print the classification report for the current `y_pred` against the test labels
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.86      0.85       533
Positive Review       0.86      0.83      0.84       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



### ChatGPT for Classification

In [19]:
import os
import openai
from dotenv import load_dotenv

# Load our API key
# load_dotenv() runs before the key is read (presumably populating the
# environment from a local .env file - confirm against the project setup)
load_dotenv()

# Initialize our client with the key read from the OPENAI_API_KEY environment variable
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [20]:
def chatgpt_generation(prompt, document, model="gpt-5-nano-2025-08-07"):
    """Send a filled-in prompt to the chat completions API and return the reply.

    Args:
        prompt: Template text; every "[DOCUMENT]" placeholder in it is
            replaced with ``document``.
        document: Text substituted into the template.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": prompt.replace("[DOCUMENT]", document)}
    response = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [21]:
# Define a prompt template as a base; "[DOCUMENT]" is the placeholder that
# chatgpt_generation replaces with the review text
prompt = """Predict whether the following document is a positive or negative movie review:

[DOCUMENT]

If it is positive return 1 and if it is negative return 0. Do not give any other answers.
"""

# Predict the target using GPT for a single example review
document = "unpretentious , charming , quirky , original"
chatgpt_generation(prompt, document)

'1'

In [22]:
# You can skip this if you want to save your (free) credits
# Query the model once per test review, keeping the raw replies in order
predictions = []
for review in tqdm(data["test"]["text"]):
    predictions.append(chatgpt_generation(prompt, review))

100%|██████████| 1066/1066 [53:50<00:00,  3.03s/it]   


In [23]:
# Convert the raw model replies to integer labels
y_pred = list(map(int, predictions))

# Score the predictions against the test labels
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.88      0.94      0.91       533
Positive Review       0.93      0.87      0.90       533

       accuracy                           0.90      1066
      macro avg       0.91      0.90      0.90      1066
   weighted avg       0.91      0.90      0.90      1066



In [8]:
from pathlib import Path

from datasets import load_dataset
import pandas as pd

# Load the dataset
data = load_dataset("rotten_tomatoes")

# Draw a random sample of 10 records from the test split; the fixed seed keeps
# the sample identical across kernel restarts and re-runs
sample_test = data['test'].shuffle(seed=42).select(range(10))

# Convert to a DataFrame for easier viewing
df_sample = pd.DataFrame(sample_test)
print(df_sample)

# Keep only the review text for the exported file
df_sample = df_sample['text']

# Save as CSV, creating the output directory first so the cell does not fail
# with an error when "../data" does not exist yet
sample_path = Path('../data/sample.csv')
sample_path.parent.mkdir(parents=True, exist_ok=True)
df_sample.to_csv(sample_path, index=False)

                                                text  label
0  deuces wild treads heavily into romeo and juli...      0
1  by turns gripping , amusing , tender and heart...      1
2    your children will be occupied for 72 minutes .      1
3  less funny than it should be and less funny th...      0
4  it collapses when mr . taylor tries to shift t...      0
5  whether our action-and-popcorn obsessed cultur...      1
6  if you're looking for an intelligent movie in ...      1
7  this feature is about as necessary as a hole i...      0
8  neither quite a comedy nor a romance , more of...      1
9  fails in making this character understandable ...      0
