___
<img style="float: right; margin: 15px 15px 15px 15px;" src="https://communist.red/wp-content/uploads/2017/08/Anarchist_flag.png" width="300px" height="180px" />


# <font color= #bbc28d> **Sentiment Analysis in Movies IMDb** </font>
#### <font color= #2E9AFE> `Lab 2 – Text Mining`</font>
- <Strong> Sofía Maldonado, Diana Valdivia & Viviana Toledo </Strong>
- <Strong> Fecha </Strong>: 20/10/2025 

___

<p style="text-align:right;"> Imagen recuperada de: https://communist.red/wp-content/uploads/2017/08/Anarchist_flag.png</p>

In [82]:
# Importar librerías
import re
from collections import Counter
import spacy
import pickle

# Modeling
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import random

In [83]:
# PyTorch Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_default_device(device)
print(device)

cuda


In [84]:
# Leer el archivo de wikipedia
with open(r"text8", "r", encoding="utf-8") as f:
    text = f.read()

In [85]:
# Normalizar los datos
# Convertir todas las palabras a minúsculas
text = text.lower()

# Quedarse solo con las palabras, lo demás lo deja como un espacio en blanco
text = re.sub(r"[^a-z\s]", "", text)

# Tokenizar por el whitespace
tokens = text.split()

# Quedarnos solo con palabras con mas de una letra
tokens = [w for w in tokens if len(w) > 1]

# Quedanros solo con las primeras 50 mil
tokens_models = tokens[:50_000]

In [86]:
# Crear vocabulario a partir de esas palabras
vocab = sorted(list(set(tokens_models)))
vocab_set = set(vocab)

In [87]:
# Crear los word <> index diccionarios
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}

tamano = len(vocab)
print(f"Tamaño del vocabulario: {tamano}")

Tamaño del vocabulario: 7979


In [88]:
tokens_idx = [word_to_idx[w] for w in tokens_models if w in vocab_set]
tokens_idx

[362,
 5086,
 543,
 7161,
 4983,
 39,
 2868,
 7568,
 203,
 2293,
 7894,
 1315,
 5829,
 3659,
 7185,
 2080,
 4983,
 7185,
 2459,
 6175,
 377,
 7185,
 6319,
 1792,
 4983,
 7185,
 2996,
 6175,
 7814,
 7185,
 7161,
 3870,
 6864,
 7568,
 3637,
 5258,
 7759,
 7280,
 1994,
 425,
 96,
 7184,
 7568,
 7666,
 4466,
 7280,
 2014,
 7185,
 5077,
 4983,
 6673,
 3882,
 3312,
 307,
 789,
 7089,
 7550,
 543,
 5483,
 4040,
 1055,
 6447,
 1920,
 366,
 7185,
 7886,
 362,
 3870,
 1988,
 3010,
 7185,
 3206,
 7873,
 490,
 6276,
 1255,
 4009,
 362,
 543,
 5453,
 5342,
 3870,
 7185,
 811,
 7184,
 6277,
 491,
 7529,
 377,
 6554,
 772,
 15,
 315,
 7212,
 491,
 2074,
 3829,
 4983,
 7799,
 7237,
 4466,
 362,
 307,
 5968,
 7280,
 6010,
 6665,
 4680,
 7184,
 166,
 7185,
 2378,
 4983,
 662,
 3779,
 5197,
 7185,
 6832,
 7185,
 7886,
 369,
 543,
 4652,
 366,
 7567,
 3882,
 2192,
 4897,
 3619,
 1220,
 4851,
 5054,
 405,
 1052,
 5871,
 3299,
 417,
 662,
 6673,
 3637,
 5387,
 4983,
 7799,
 491,
 5989,
 543,
 662,
 5453,
 6

In [89]:
# Genrar los pares para cbow
import random

def generate_cbow_pairs(tokens_idx, min_window=2, max_window=5):
    # Guardar los pares en ubna lista
    pairs = []
    # Rango la longitud de el vocabulario/texto
    n = len(tokens_idx)
    for i in range(n):
        # Elegir nuestro target
        target = tokens_idx[i]
        # Ventana random entre 2 y 5
        window_size = random.randint(min_window, max_window)
        # Posiciones de inicio y fin de la window
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, n)
        # Contexto de la palabra a predecir
        context = [tokens_idx[j] for j in range(start, end) if j != i]
        if context:
            pairs.append((context, target))
    return pairs

cbow_pairs = generate_cbow_pairs(tokens_idx, min_window=2, max_window=5)

In [90]:
# Convertir los pares CBOW a texto nuevamente
cbow_pairs_words = [
    ([idx_to_word[i] for i in context], idx_to_word[target])
    for context, target in cbow_pairs
]

for i in range(5):
    print(f"Contexto: {cbow_pairs_words[i][0]}  ->  Target: {cbow_pairs_words[i][1]}")

Contexto: ['originated', 'as', 'term', 'of', 'abuse']  ->  Target: anarchism
Contexto: ['anarchism', 'as', 'term', 'of', 'abuse']  ->  Target: originated
Contexto: ['anarchism', 'originated', 'term', 'of']  ->  Target: as
Contexto: ['anarchism', 'originated', 'as', 'of', 'abuse', 'first', 'used', 'against']  ->  Target: term
Contexto: ['anarchism', 'originated', 'as', 'term', 'abuse', 'first', 'used', 'against']  ->  Target: of


In [91]:
def generate_skipgram_pairs(tokens_idx, min_window=2, max_window=5):
    # Guardar los pares en una lista
    pairs = []
    # Rango la longitud de el vocabulario/texto
    n = len(tokens_idx)
    for i in range(n):
        # Elegir nuestro target
        target = tokens_idx[i]
        # Ventana random entre 2 y 5
        window_size = random.randint(min_window, max_window)
        # Posiciones de inicio y fin de la window
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, n)
        for j in range(start, end):
            #Skipear la target
            if j != i:
                context = tokens_idx[j]
                pairs.append((target, context))
    return pairs

skip_pairs = generate_skipgram_pairs(tokens_idx, min_window=2, max_window=5)

In [92]:
# Convertir los SKIP pairs a texto
skipgram_pairs_words = [
    (idx_to_word[target], idx_to_word[context])
    for target, context in skip_pairs
]

for i in range(5):
    print(f"Target: {skipgram_pairs_words[i][0]}  ->  Contexto: {skipgram_pairs_words[i][1]}")

Target: anarchism  ->  Contexto: originated
Target: anarchism  ->  Contexto: as
Target: originated  ->  Contexto: anarchism
Target: originated  ->  Contexto: as
Target: originated  ->  Contexto: term


In [93]:
# ----------------------
# 1. Datos de ejemplo
# tokens_idx ya está definido, vocab también
# cbow_pairs y skip_pairs generados previamente
# ----------------------

# Set device to CUDA if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ----------------------
# 2. Definir modelos simples
# ----------------------
embedding_dim = 100
vocab_size = len(vocab)

class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, contexts):
        # contexts: lista de tensores ya en el dispositivo correcto
        embeds = [self.embeddings(c) for c in contexts]  # lista de [context_len, emb_dim]
        context_embeds = torch.stack([e.mean(dim=0) for e in embeds])  # [batch_size, emb_dim]
        out = self.output(context_embeds)
        return out

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, centers):
        # centers: tensor [batch_size]
        embeds = self.embeddings(centers)  # [batch_size, emb_dim]
        out = self.output(embeds)
        return out

# ----------------------
# 3. Crear DataLoader con collate_fn para CBOW
# ----------------------
def cbow_collate(batch):
    contexts, targets = zip(*batch)
    # Convertir contextos a tensores y mover a device
    context_tensors = [torch.tensor(c, dtype=torch.long).to(device) for c in contexts]
    return context_tensors, torch.tensor(targets, dtype=torch.long).to(device)

# Set generator for DataLoader to avoid device conflict
generator = torch.Generator(device=device)

cbow_loader = DataLoader(cbow_pairs, batch_size=1024, shuffle=True, collate_fn=cbow_collate, generator=generator)

# Para Skip-gram, crear dataset en CPU y mover durante entrenamiento
skipgram_targets = torch.tensor([t for t, c in skip_pairs], dtype=torch.long)
skipgram_contexts = torch.tensor([c for t, c in skip_pairs], dtype=torch.long)
skipgram_dataset = list(zip(skipgram_targets, skipgram_contexts))
skipgram_loader = DataLoader(skipgram_dataset, batch_size=1024, shuffle=True, generator=generator)

# ----------------------
# 4. Entrenamiento CBOW
# ----------------------

cbow_model = CBOWModel(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cbow_model.parameters(), lr=0.001)
epochs = 15

for epoch in range(epochs):
    total_loss = 0
    cbow_model.train()
    for contexts, targets in cbow_loader:
        # Los datos ya están en el dispositivo correcto gracias al collate_fn
        optimizer.zero_grad()
        output = cbow_model(contexts)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"CBOW Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# ----------------------
# 5. Entrenamiento Skip-gram
# ----------------------
skipgram_model = SkipGramModel(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(skipgram_model.parameters(), lr=0.001)

for epoch in range(epochs):
    total_loss = 0
    skipgram_model.train()
    for centers, contexts in skipgram_loader:
        # Mover datos a device
        centers = centers.to(device)
        contexts = contexts.to(device)
        
        optimizer.zero_grad()
        output = skipgram_model(centers)
        loss = criterion(output, contexts)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Skip-gram Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# ----------------------
# 6. Obtener embeddings
# ----------------------
# Mover embeddings de vuelta a CPU para uso posterior
cbow_embeddings = cbow_model.embeddings.weight.data.cpu()
skipgram_embeddings = skipgram_model.embeddings.weight.data.cpu()

# Ejemplo de vector de una palabra
word = "science"
idx = word_to_idx[word]
print("Vector CBOW:", cbow_embeddings[idx])
print("Vector Skip-gram:", skipgram_embeddings[idx])

Using device: cuda
CBOW Epoch 1/15, Loss: 433.2662
CBOW Epoch 2/15, Loss: 411.1897
CBOW Epoch 3/15, Loss: 382.8012
CBOW Epoch 4/15, Loss: 352.2014
CBOW Epoch 5/15, Loss: 332.5900
CBOW Epoch 6/15, Loss: 322.9110
CBOW Epoch 7/15, Loss: 316.0400
CBOW Epoch 8/15, Loss: 310.0903
CBOW Epoch 9/15, Loss: 304.5161
CBOW Epoch 10/15, Loss: 299.1511
CBOW Epoch 11/15, Loss: 293.9339
CBOW Epoch 12/15, Loss: 288.7579
CBOW Epoch 13/15, Loss: 283.6336
CBOW Epoch 14/15, Loss: 278.5467
CBOW Epoch 15/15, Loss: 273.5027
Skip-gram Epoch 1/15, Loss: 2870.2237
Skip-gram Epoch 2/15, Loss: 2518.0833
Skip-gram Epoch 3/15, Loss: 2356.8852
Skip-gram Epoch 4/15, Loss: 2269.4033
Skip-gram Epoch 5/15, Loss: 2212.4808
Skip-gram Epoch 6/15, Loss: 2170.1928
Skip-gram Epoch 7/15, Loss: 2135.9753
Skip-gram Epoch 8/15, Loss: 2106.9532
Skip-gram Epoch 9/15, Loss: 2081.4767
Skip-gram Epoch 10/15, Loss: 2058.7242
Skip-gram Epoch 11/15, Loss: 2038.2626
Skip-gram Epoch 12/15, Loss: 2019.5557
Skip-gram Epoch 13/15, Loss: 2002.49

In [94]:
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Get embeddings from your trained models
cbow_embeddings = cbow_model.embeddings.weight.data.cpu().numpy()
skipgram_embeddings = skipgram_model.embeddings.weight.data.cpu().numpy()


def get_top_similar_words(embeddings, word, top_k=10):
    """
    Find top-k most similar words using cosine similarity
    """
    if word not in word_to_idx:
        print(f"Word '{word}' not in vocabulary")
        return []
    
    # Get the embedding for the anchor word
    word_idx = word_to_idx[word]
    word_embedding = embeddings[word_idx].reshape(1, -1)
    
    # Calculate cosine similarity with all other words
    similarities = cosine_similarity(word_embedding, embeddings)[0]
    
    # Get top-k most similar words (excluding the word itself)
    similar_indices = np.argsort(similarities)[::-1][1:top_k+1]  # Skip the word itself
    
    similar_words = []
    for idx in similar_indices:
        similar_words.append((idx_to_word[idx], similarities[idx]))
    
    return similar_words

# Choose anchor words
anchor_words = ["king", "anarchism", "communist", "revolution", "paris"]

print("CBOW Model - Top 10 Most Similar Words:")
print("=" * 60)
for word in anchor_words:
    similar_words = get_top_similar_words(cbow_embeddings, word)
    print(f"\n'{word}':")
    for similar_word, similarity in similar_words:
        print(f"  {similar_word}: {similarity:.4f}")

print("\n" + "=" * 60)
print("Skip-gram Model - Top 10 Most Similar Words:")
print("=" * 60)
for word in anchor_words:
    similar_words = get_top_similar_words(skipgram_embeddings, word)
    print(f"\n'{word}':")
    for similar_word, similarity in similar_words:
        print(f"  {similar_word}: {similarity:.4f}")

CBOW Model - Top 10 Most Similar Words:

'king':
  ensouling: 0.3759
  toxin: 0.3755
  lehrman: 0.3197
  lining: 0.3091
  is: 0.3080
  prices: 0.3045
  introduced: 0.3033
  scourge: 0.3030
  corpus: 0.3015
  topics: 0.3008

'anarchism':
  unencumbered: 0.4349
  drags: 0.3969
  argued: 0.3632
  fought: 0.3348
  wuxia: 0.3346
  interest: 0.3292
  plead: 0.3277
  importantly: 0.3260
  deep: 0.3223
  magazine: 0.3222

'communist':
  dating: 0.4331
  tombigbee: 0.3527
  fueled: 0.3285
  son: 0.3225
  caught: 0.3170
  virtuously: 0.3166
  scalped: 0.3115
  conventional: 0.3086
  pictogram: 0.3026
  soviet: 0.2981

'revolution':
  goes: 0.3917
  escape: 0.3738
  contributing: 0.3400
  another: 0.3396
  doren: 0.3245
  ways: 0.3215
  vocabularies: 0.3201
  measurement: 0.3168
  patronymic: 0.3167
  box: 0.3164

'paris':
  group: 0.3656
  echolalia: 0.3591
  legislative: 0.3582
  creating: 0.3468
  criminal: 0.3409
  encyclopaedia: 0.3408
  fine: 0.3380
  significiant: 0.3366
  involving: 0.329