# Evaluación de embeddings para una tarea de clasificación de texto

In [None]:
!python3 --version

Python 3.11.11


In [None]:
!pip3 install --upgrade numpy==2.2.5 --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytensor 2.27.1 requires numpy<2,>=1.17.0, but you have numpy 2.2.5 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.5 which is incompatible.
langchain 0.3.19 requires numpy<2,>=1.26.4; python_version < "3.12", but you have numpy 2.2.5 which is incompatible.
thinc 8.2.5 requires numpy<2.0.0,>=1.19.0; python_version >= "3.9", but you have numpy 2.2.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which is incompatible.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.5

In [None]:
!pip install scipy==1.14.0 --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.14.0 which is incompatible.[0m[31m
[0m

In [None]:
!pip install numpy pandas scikit-learn gensim transformers torch tqdm  tensorflow --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m111.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m771.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Comparando embeddings

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from gensim.models import Word2Vec, FastText
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertModel

# Descargar el dataset de IMDb desde keras
from tensorflow.keras.datasets import imdb


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:

# Load the IMDb dataset from Keras (vocabulary capped via num_words=10000)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# Invert the word -> index table so index sequences can be turned back into text
word_index = imdb.get_word_index()
index_word = dict(zip(word_index.values(), word_index.keys()))

In [None]:


def decode_review(review):
    """Turn a sequence of integer word ids into a space-separated string.

    Every id is shifted down by 3 before it is looked up in the global
    ``index_word`` table; ids without an entry are rendered as '?'.
    NOTE(review): the offset presumably matches the indices Keras reserves
    for special tokens -- confirm against the ``imdb.load_data`` docs.
    """
    tokens = (index_word.get(token_id - 3, '?') for token_id in review)
    return ' '.join(tokens)

# Replace the integer-encoded reviews of both splits with their decoded text
X_train = list(map(decode_review, X_train))
X_test = list(map(decode_review, X_test))

# Basic preprocessing
def preprocess_text(text):
    """Normalize a review to lowercase, letters-only, single-spaced text.

    Args:
        text: Raw review string.

    Returns:
        The lowercased text with ``<br />`` tags and every run of
        non-letter characters replaced by a single space, stripped of
        leading and trailing whitespace.
    """
    import re  # local import keeps this cell independent of the import cell

    lowered = text.lower().replace('<br />', ' ')
    # str.replace() matches its argument literally, so a character class
    # such as [^a-z] only takes effect when applied through re.sub().
    return re.sub(r'[^a-z]+', ' ', lowered).strip()

# Apply the text normalization to every review in both splits
X_train = list(map(preprocess_text, X_train))
X_test = list(map(preprocess_text, X_test))


In [None]:

# Helper that averages the word vectors of a tokenized document
def average_word_vectors(words, model, num_features):
    """Return the mean vector of the words that ``model.wv`` contains.

    Args:
        words: Iterable of tokens.
        model: Object exposing a ``wv`` attribute that supports ``in`` and
            item lookup by token.
        num_features: Length of the zero vector the sum starts from.

    Returns:
        The element-wise mean of the vectors found, or the all-zero
        float32 vector of length ``num_features`` when no token is found.
    """
    found = [model.wv[token] for token in words if token in model.wv]
    accumulator = np.zeros((num_features,), dtype='float32')
    for vector in found:
        accumulator = accumulator + vector
    if not found:
        return accumulator
    return accumulator / len(found)


In [None]:

# 1. Word2Vec Embeddings
print("Entrenando Word2Vec...")
# Train on the whitespace-tokenized training reviews only
# (sg=1 selects skip-gram according to the gensim documentation).
word2vec_model = Word2Vec(
    sentences=list(map(str.split, X_train)),
    vector_size=100,
    window=5,
    min_count=2,
    sg=1,
)
# Each review becomes the mean of the vectors of its words known to the model
X_train_word2vec = np.array(
    [average_word_vectors(review.split(), word2vec_model, 100) for review in X_train]
)
X_test_word2vec = np.array(
    [average_word_vectors(review.split(), word2vec_model, 100) for review in X_test]
)


In [None]:

# 2. FastText Embeddings
print("Entrenando FastText...")
# Same hyperparameters as the Word2Vec model, trained on the training reviews only
fasttext_model = FastText(
    sentences=list(map(str.split, X_train)),
    vector_size=100,
    window=5,
    min_count=2,
    sg=1,
)
# Each review becomes the mean of the word vectors the model provides
X_train_fasttext = np.array(
    [average_word_vectors(review.split(), fasttext_model, 100) for review in X_train]
)
X_test_fasttext = np.array(
    [average_word_vectors(review.split(), fasttext_model, 100) for review in X_test]
)


### GloVe: Global Vectors for Word Representation

https://nlp.stanford.edu/projects/glove/


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip "glove.6B.zip"

In [None]:

# 3. GloVe Embeddings
print("Cargando GloVe...")
glove_embeddings = {}
# Needs the unpacked GloVe archive: http://nlp.stanford.edu/data/glove.6B.zip
with open("glove.6B.100d.txt", "r", encoding="utf-8") as glove_file:
    for row in glove_file:
        # Whitespace-separated row: the first field is the token, the rest its coefficients
        values = row.split()
        glove_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')


def get_glove_embedding(text, glove_model, num_features=100):
    """Average the GloVe vectors of the whitespace-separated words in ``text``.

    Args:
        text: Document string; it is split on whitespace.
        glove_model: Mapping from token to vector.
        num_features: Length of the zero vector the sum starts from.

    Returns:
        The element-wise mean of the vectors of the tokens present in
        ``glove_model``, or the all-zero float32 vector of length
        ``num_features`` when none of them is present.
    """
    matched = [glove_model[token] for token in text.split() if token in glove_model]
    # sum() adds left to right starting from the zero vector
    total = sum(matched, np.zeros((num_features,), dtype='float32'))
    return total / len(matched) if matched else total

# Build one averaged GloVe vector per review for both splits
X_train_glove = np.array(
    [get_glove_embedding(review, glove_embeddings, 100) for review in X_train]
)
X_test_glove = np.array(
    [get_glove_embedding(review, glove_embeddings, 100) for review in X_test]
)

In [None]:


# 4. BERT Embeddings
print("Generando embeddings con BERT...")
# Load the pretrained 'bert-base-uncased' tokenizer and encoder; both are
# handed to get_bert_embedding() for every review.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text, tokenizer, model):
    """Encode ``text`` with ``model`` and return one vector for the document.

    Args:
        text: Input passed to ``tokenizer``.
        tokenizer: Callable producing keyword inputs for ``model``; it is
            asked for PyTorch tensors, truncated to at most 128 tokens.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        A NumPy array holding the hidden state at position 0 of the first
        sequence, used here as the document embedding ([CLS] token).
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only: no gradients are needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    cls_vector = hidden_states[0][0]
    return cls_vector.numpy()



In [None]:

# Embed every review with BERT. get_bert_embedding() is called once per text,
# so each split costs one model forward pass per review; tqdm shows progress.
X_train_bert = np.array([get_bert_embedding(text, tokenizer, model) for text in tqdm(X_train, desc="BERT embeddings - Train")])
X_test_bert = np.array([get_bert_embedding(text, tokenizer, model) for text in tqdm(X_test, desc="BERT embeddings - Test")])

# Train and evaluate a classifier on one set of embeddings
def train_and_evaluate(X_train, X_test, y_train, y_test, embedding_name):
    """Fit a default ``SVC`` on the training features and print its test report.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels the predictions are compared against.
        embedding_name: Label inserted into the printed messages.

    Returns:
        None; the classification report is printed to stdout.
    """
    print(f"Entrenando clasificador con {embedding_name} embeddings...")
    # fit() returns the estimator itself, so training and prediction can be chained
    predictions = SVC().fit(X_train, y_train).predict(X_test)
    print(f"Resultados con {embedding_name} embeddings:")
    print(classification_report(y_test, predictions))

# Evaluate each embedding with the same classifier setup; every call trains
# a fresh SVC on that embedding's features and prints its report.
train_and_evaluate(X_train_word2vec, X_test_word2vec, y_train, y_test, "Word2Vec")
train_and_evaluate(X_train_fasttext, X_test_fasttext, y_train, y_test, "FastText")
train_and_evaluate(X_train_glove, X_test_glove, y_train, y_test, "GloVe")
train_and_evaluate(X_train_bert, X_test_bert, y_train, y_test, "BERT")