In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values by three independent
    linear layers, and every position attends over all positions of the same
    sequence.

    Args:
        embed_dim: Size of the last (feature) dimension of the input. The
            output keeps the same feature size.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim

        # Linear projections for Query, Key and Value.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, embed_dim)``, typically
                ``(batch_size, seq_len, embed_dim)``. At least two dimensions
                are required; any leading dimensions are treated as batch
                dimensions, so an unbatched ``(seq_len, embed_dim)`` input
                works too.

        Returns:
            A tuple ``(out, attention_weights)`` where ``out`` has the same
            shape as ``x`` and ``attention_weights`` has shape
            ``(..., seq_len, seq_len)``; each row of the weights sums to 1.
        """
        # 1. Build Q, K, V from the same input (that is what makes it *self*-attention).
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)

        # 2. Scaled dot product: affinity matrix (Q @ K^T) / sqrt(d_k).
        #    Dividing by sqrt(d_k) keeps the logits in a range where the
        #    softmax does not saturate as the feature size grows.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.embed_dim ** 0.5)

        # 3. Softmax over the key axis turns the scores into attention probabilities.
        attention_weights = F.softmax(scores, dim=-1)

        # 4. Weighted sum of the values.
        out = torch.matmul(attention_weights, values)

        return out, attention_weights

# --- Quick test ---
# Fixed seed so the random input and the randomly initialised layer weights
# (and therefore the printed/inspected tensors) are the same on every re-run.
torch.manual_seed(0)
batch, seq_len, dim = 1, 5, 16  # 1 sentence, 5 words, 16 dimensions each
x = torch.randn(batch, seq_len, dim)
model = SimpleSelfAttention(dim)
output, weights = model(x)

print(f"Salida del bloque: {output.shape}") # Same shape as the input
print(f"Mapa de atención: {weights.shape}") # (1, 5, 5) -> relation of every word to all the others

Salida del bloque: torch.Size([1, 5, 16])
Mapa de atención: torch.Size([1, 5, 5])


In [2]:
# Inspect the attention map produced by the previous cell: shape (1, 5, 5), where
# each row is the softmax over the scores of one query position (rows sum to 1).
weights

tensor([[[0.1748, 0.1478, 0.1974, 0.1957, 0.2842],
         [0.1474, 0.1674, 0.0995, 0.1847, 0.4010],
         [0.1601, 0.2960, 0.2176, 0.1383, 0.1880],
         [0.2014, 0.1060, 0.1579, 0.2761, 0.2586],
         [0.2459, 0.1262, 0.1730, 0.2418, 0.2132]]],
       grad_fn=<SoftmaxBackward0>)

In [3]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the buffer ``pe`` (not a trainable parameter). Even feature indices hold
    ``sin(pos * w_i)`` and odd feature indices hold ``cos(pos * w_i)`` with
    ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Feature size of the embeddings. Both even and odd values are
            supported.
        max_len: Number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        # Encoding table, one row per position.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair, computed in log space.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        # Sine on even feature indices, cosine on odd ones.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the cosine part is trimmed to the number of odd columns. For an
        # even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a leading batch dimension: (1, max_len, d_model).
        pe = pe.unsqueeze(0)
        # Buffer: moves with .to()/.cuda() and is saved in the state dict,
        # but is not returned by parameters().
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional encoding to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)`` with
                ``seq_len <= max_len``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Take the first seq_len rows of the table; the leading dimension of
        # size 1 broadcasts over the batch.
        x = x + self.pe[:, :x.size(1), :]
        return x

# --- Check: encode an all-zero batch so the result is exactly the positional signal ---
d_model = 16
dummy_input = torch.zeros((1, 10, d_model))  # ten "empty" words
pos_encoder = PositionalEncoding(d_model=d_model)
output = pos_encoder(dummy_input)

# First four features of position 0, before and after adding the encoding.
print(f"Input original (ceros): {dummy_input[0,0,:4]}")
print(f"Input con posición: {output[0,0,:4]}")  # the embeddings now carry a positional "identity"

Input original (ceros): tensor([0., 0., 0., 0.])
Input con posición: tensor([0., 1., 0., 1.])


In [4]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))`` (residual "Add & Norm").

    Args:
        d_model: Feature size of the input and of the output.
        num_heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        dropout: Dropout probability applied to each sub-layer output before
            the residual addition.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()

        # 1. Multi-head attention (it already contains the Q, K, V projections).
        #    batch_first=True -> tensors are (batch, seq, feature).
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=num_heads, batch_first=True)

        # 2. Layer normalisation, one per sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Position-wise feed-forward network with a 4x hidden expansion.
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the block on ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.
            attn_mask: Optional mask forwarded to ``nn.MultiheadAttention``,
                e.g. a boolean ``(seq_len, seq_len)`` tensor where ``True``
                marks positions that may NOT be attended to (causal mask).
            key_padding_mask: Optional boolean ``(batch, seq_len)`` tensor
                forwarded to ``nn.MultiheadAttention``; ``True`` marks padding
                keys that are ignored by the attention.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Step A: self-attention (Q, K and V are all `x`) with residual Add & Norm.
        # The attention weights are never used here, so skip computing them.
        attn_output, _ = self.attention(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.norm1(x + self.dropout(attn_output))

        # Step B: feed-forward with residual Add & Norm.
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))

        return x

# Example usage:
# Fixed seed so the random layer weights, the random input and the dropout masks
# (the block is left in its default training mode) are identical on every re-run.
torch.manual_seed(0)
d_model = 512
block = TransformerBlock(d_model=d_model, num_heads=8)
input_seq = torch.randn(1, 10, d_model) # (Batch, Seq, Dim)
output = block(input_seq)

In [5]:
# Display the TransformerBlock result computed in the previous cell. The name
# `output` is also assigned by earlier cells, so this shows the block's result
# only when the cells are executed in order.
output

tensor([[[-0.9341, -0.9250, -0.2683,  ...,  0.8611, -0.5008, -0.1601],
         [-0.1104,  1.9320, -0.4740,  ..., -0.5466,  0.5767, -0.8379],
         [-0.8129,  1.3367, -2.2882,  ...,  0.6953,  0.9447,  0.0632],
         ...,
         [-0.9207,  2.5044, -0.5374,  ..., -0.6356, -1.9849,  1.0492],
         [ 0.6269,  0.9470, -0.3120,  ...,  0.9391,  0.7806, -0.6197],
         [ 0.1760,  0.1502, -0.6656,  ...,  0.2499, -0.9846, -0.2453]]],
       grad_fn=<NativeLayerNormBackward0>)