![](https://www.soyhenry.com/_next/static/media/HenryLogo.bb57fd6f.svg)

# Introducción a las bases de datos vectoriales 
## Clase #3: Text Classification


In [None]:
#!uv add datasets

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from openai import OpenAI
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from tqdm import tqdm

In [None]:
# Embedding model name shared by every embeddings request in this notebook.
EMBEDDING_MODEL = "text-embedding-3-small"
# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Load the Rotten Tomatoes dataset.
dataset = load_dataset(path="rotten_tomatoes")

In [None]:
# Shuffle each split with a fixed seed and keep the first 500 train / 100 test rows.
train_data, test_data = (
    dataset[split_name].shuffle(seed=42).select(range(n_rows))
    for split_name, n_rows in (("train", 500), ("test", 100))
)

In [None]:
def get_embeddings(texts, model=EMBEDDING_MODEL, batch_size=256):
    """Return one embedding vector per input text, in input order.

    Newlines in each text are replaced by spaces before the request is sent.
    Texts are sent in chunks of ``batch_size`` so a large collection does not
    end up in a single oversized request.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent per request.
            NOTE(review): the embeddings endpoint limits inputs per request;
            confirm the current limits before raising this value.

    Returns:
        A list of embedding vectors, one per text. An empty input yields an
        empty list and no API call is made.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    cleaned = [t.replace("\n", " ") for t in texts]
    embeddings = []
    for start in range(0, len(cleaned), batch_size):
        batch = cleaned[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Each returned item carries the position of its input; sort on it so
        # the output order never depends on the order of the response payload.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return embeddings

In [None]:
# Labels and embedding vectors for the training subset.
y_train = train_data["label"]
X_train = get_embeddings(texts=train_data["text"])

In [None]:
# Labels and embedding vectors for the test subset.
y_test = test_data["label"]
X_test = get_embeddings(texts=test_data["text"])

In [None]:
# Announce the dimensionality-reduction stage, then turn the list of training
# embeddings into a NumPy array for the steps that follow.
print("Calculando reducción de dimensionalidad (t-SNE)...")
X_embedded = np.asarray(X_train)

In [None]:
# PCA first to reduce noise; t-SNE is applied afterwards for local structure.
# random_state is fixed so this step is reproducible like the seeded t-SNE:
# PCA's automatic solver selection may pick a randomized SVD for a matrix of
# this size, which would otherwise change the projection between runs.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_embedded)

In [None]:
# Seeded t-SNE that maps the PCA output down to two dimensions for plotting.
tsne = TSNE(
    perplexity=30,
    random_state=42,
    n_components=2,
)
X_2d = tsne.fit_transform(X_pca)

In [None]:
# Plot-ready frame: the 2-D coordinates plus a readable sentiment label and the
# original review text for each training example.
df_viz = pd.DataFrame(X_2d, columns=["x", "y"]).assign(
    label=["Negativo" if l != 1 else "Positivo" for l in y_train],
    text=train_data["text"],
)

# Scatter plot of the projected embeddings, coloured by sentiment.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x="x",
    y="y",
    hue="label",
    data=df_viz,
    alpha=0.7,
    palette={"Positivo": "#2ecc71", "Negativo": "#e74c3c"},
)
plt.xlabel("Dimensión latente 1")
plt.ylabel("Dimensión latente 2")
plt.title("Visualización de Embeddings: Cómo GPT separa los sentimientos", fontsize=15)
plt.legend(title="Sentimiento")
plt.show()

### Entrenando un modelo 

In [None]:
# Baseline classifier: logistic regression fitted on the training embeddings,
# then used to predict the test subset.
clf = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the baseline classifier on the test subset.
print("\n--- Reporte de Clasificación (Embeddings + Logistic Regression) ---")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=["Negativo", "Positivo"],
    )
)

In [None]:
# Confusion matrix of the baseline classifier, drawn as an annotated heatmap
# (rows: true label, columns: predicted label).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(
    data=cm,
    cmap="Blues",
    annot=True,
    fmt="d",
    xticklabels=["Negativo", "Positivo"],
    yticklabels=["Negativo", "Positivo"],
)
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.title("Matriz de Confusión")
plt.show()

### Buscando los mejores hiperparámetros del modelo (GridSearchCV)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


In [None]:
# Hyper-parameter grid for the logistic regression search.
# `l1_ratio` only has an effect with an elastic-net penalty, and among the
# LogisticRegression solvers only 'saga' supports that penalty. Pairing
# `l1_ratio` with 'lbfgs' / 'liblinear' either ignores the value or fails the
# fit, so the grid uses 'saga' with an explicit elastic-net penalty
# (l1_ratio=0 behaves as pure L2, l1_ratio=1 as pure L1).
# NOTE(review): recent scikit-learn releases deprecate `penalty` in favour of
# `l1_ratio` alone — confirm against the installed version.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['elasticnet'],
    'l1_ratio': [0, 0.5, 1],
    'solver': ['saga'],
    'max_iter': [1000]
}

In [None]:
# 5-fold cross-validated search over `param_grid`, scored by accuracy, with
# n_jobs=-1 for parallel fits and verbose=1 for progress output.
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training embeddings and labels.
grid_search.fit(X=X_train, y=y_train)


In [None]:
# Keep the best estimator found by the search and summarise the result.
best_logreg = grid_search.best_estimator_

print(
    "\n--- Resultados del Fine-Tuning ---",
    f"Mejores Hiperparámetros: {grid_search.best_params_}",
    f"Mejor Accuracy en Validación Cruzada: {grid_search.best_score_ * 100:.2f}%\n",
    sep="\n",
)


In [None]:
# Predict the test subset with the tuned model and print its per-class metrics.
y_pred_tuned = best_logreg.predict(X_test)

print("--- Reporte de Clasificación (LogReg Optimizado) ---")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_tuned,
        target_names=["Negativo", "Positivo"],
    )
)

In [None]:
# Confusion matrix of the tuned model; the title shows the selected C and
# l1_ratio values taken from the grid search result.
cm_tuned = confusion_matrix(y_true=y_test, y_pred=y_pred_tuned)

plt.figure(figsize=(6, 5))
sns.heatmap(
    data=cm_tuned,
    cmap="Oranges",
    annot=True,
    fmt="d",
    xticklabels=["Negativo", "Positivo"],
    yticklabels=["Negativo", "Positivo"],
)
plt.xlabel('Predicción del Modelo')
plt.ylabel('Etiqueta Real')
plt.title(f"Matriz de Confusión: LogReg Optimizado\nC={grid_search.best_params_['C']}, L1 Ratio={grid_search.best_params_['l1_ratio']}")
plt.show()

### Haciendo la clasificación con OpenAI

In [None]:
def classify_review(text):
    """Ask the chat model to label a movie review as 'Positive' or 'Negative'.

    Args:
        text: The review text, sent as the user message.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        sentinel string "Error" when the request fails or the reply has no
        text content. Failures are printed so they are not silently counted
        as predictions by the caller.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant for sentiment analysis. "
                               "Classify the following movie review strictly as 'Positive' or 'Negative'. "
                               "Do not add any other text."
                },
                {"role": "user", "content": text}
            ],
            temperature=0,  # temperature 0 to make the answer as deterministic as possible
            max_tokens=2
        )
        content = response.choices[0].message.content
    except Exception as exc:
        # Best-effort by design: keep the notebook running, but make the
        # failure visible instead of swallowing it.
        print(f"[classify_review] request failed: {type(exc).__name__}: {exc}")
        return "Error"

    if content is None:
        # A reply without text content cannot be mapped to a label.
        print("[classify_review] response contained no text content")
        return "Error"
    return content.strip()

In [None]:
# Zero-shot classification of every test review with the chat model.
print("Iniciando clasificación Generativa con GPT-4o-mini...")
y_true = test_data["label"]  # 1: positive, 0: negative
y_pred_gpt = []
unparsed_count = 0  # replies that were neither 'positive' nor 'negative'
for text in tqdm(test_data["text"]):
    # Compare case-insensitively so replies such as "positive" or "POSITIVE"
    # are still mapped to the right label.
    prediction = classify_review(text).lower()

    # Map the text answer to the numeric label used by the dataset.
    if "positive" in prediction:
        y_pred_gpt.append(1)
    elif "negative" in prediction:
        y_pred_gpt.append(0)
    else:
        # Fallback for request errors or unexpected replies: keep the
        # prediction list aligned with y_true, but count the occurrence so
        # the bias towards the negative class is visible.
        unparsed_count += 1
        y_pred_gpt.append(0)

if unparsed_count:
    print(
        f"Aviso: {unparsed_count} de {len(y_pred_gpt)} respuestas no se pudieron "
        "interpretar y se contaron como 'Negativo'."
    )

In [None]:
# Per-class precision / recall / F1 of the zero-shot chat-model predictions.
print("\n--- Reporte de Clasificación (Zero-Shot GPT-4o-mini) ---")
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred_gpt,
        target_names=["Negativo", "Positivo"],
    )
)

In [None]:
# Confusion matrix of the zero-shot predictions, drawn as an annotated heatmap
# (rows: true label, columns: predicted label).
cm_gpt = confusion_matrix(y_true=y_true, y_pred=y_pred_gpt)

plt.figure(figsize=(6, 5))
sns.heatmap(
    data=cm_gpt,
    cmap="Greens",
    annot=True,
    fmt="d",
    xticklabels=["Negativo", "Positivo"],
    yticklabels=["Negativo", "Positivo"],
)
plt.xlabel('Predicción del Modelo')
plt.ylabel('Etiqueta Real')
plt.title("Matriz de Confusión: GPT-4o-mini (Zero-Shot)")
plt.show()