<a href="https://colab.research.google.com/github/appnori73/Transformer_Study/blob/main/Transformer_Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
# Positional encoding module (fixed sinusoidal table, not learned)
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal positional encoding to a batch of embeddings.

    The table is precomputed once for ``max_len`` positions and stored as a
    non-persistent buffer named ``encoding`` with shape
    ``(1, max_len, embedding_dim)``. Being a buffer, it follows the module
    through ``.to(device)`` / dtype casts; being non-persistent, it does not
    add a key to ``state_dict()``.

    Args:
        embedding_dim: Size of the last dimension of the inputs. Odd values
            are supported (the final sine column has no cosine partner).
        max_len: Number of positions to precompute.
    """

    def __init__(self, embedding_dim, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2)
            * -(torch.log(torch.tensor(10000.0)) / embedding_dim)
        )
        # One column of angles per sin/cos pair: (max_len, ceil(embedding_dim / 2)).
        angles = position * div_term

        encoding = torch.zeros(max_len, embedding_dim)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd embedding_dim there is one fewer cosine slot than sine
        # slots, so drop the surplus angle column before assigning.
        encoding[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        # Leading batch axis of size 1 so the table broadcasts over the batch.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, embedding_dim)``.
        """
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :]

# Input embedding followed by positional encoding
class EncoderInput(nn.Module):
    """Turn token indices into position-aware embeddings.

    Index 0 is the padding index of the embedding table, so padding tokens
    embed to zero vectors before the positional encoding is added.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size,
            embedding_dim,
            padding_idx=0,
        )
        self.positional_encoding = PositionalEncoding(embedding_dim)

    def forward(self, x):
        """Embed ``x``, print the raw embeddings, and add positional encoding."""
        token_embeddings = self.embedding(x)
        # Kept on purpose: the notebook shows the embeddings before the
        # positional encoding is applied.
        print("x:", token_embeddings)
        return self.positional_encoding(token_embeddings)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        embedding_dim: Width of the query/key/value inputs and of the output.
        num_heads: Number of attention heads; must be positive and divide
            ``embedding_dim`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embedding_dim``.
    """

    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if num_heads <= 0 or embedding_dim % num_heads != 0:
            raise ValueError(
                "Embedding dimension must be divisible by number of heads."
            )

        self.num_heads = num_heads
        self.embedding_dim = embedding_dim
        self.head_dim = embedding_dim // num_heads

        self.query = nn.Linear(embedding_dim, embedding_dim)
        self.key = nn.Linear(embedding_dim, embedding_dim)
        self.value = nn.Linear(embedding_dim, embedding_dim)
        self.out = nn.Linear(embedding_dim, embedding_dim)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, head_dim).

        Reshapes ``(batch, seq, embedding_dim)`` into
        ``(batch, num_heads, seq, head_dim)``.
        """
        return x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            query: Tensor indexed as ``(batch, q_len, embedding_dim)``.
            key: Tensor indexed as ``(batch, k_len, embedding_dim)``.
            value: Tensor indexed as ``(batch, k_len, embedding_dim)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, q_len, k_len)``. Entries equal to 0
                (or ``False``) are excluded from attention, e.g. padding
                keys. ``None`` (the default) attends everywhere.

        Returns:
            Tensor of shape ``(batch, q_len, embedding_dim)``.
        """
        batch_size = query.size(0)

        # Linear projections, then split into heads.
        Q = self.split_heads(self.query(query), batch_size)
        K = self.split_heads(self.key(key), batch_size)
        V = self.split_heads(self.value(value), batch_size)

        # Scaled dot-product attention. A Python scalar avoids allocating a
        # tensor on every call.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            # The most negative finite value (not -inf) keeps a fully masked
            # row from turning into NaN after softmax.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attention_weights = torch.nn.functional.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)

        # Concatenate heads back into a single embedding_dim axis.
        attention_output = attention_output.transpose(1, 2).contiguous()
        attention_output = attention_output.view(batch_size, -1, self.embedding_dim)

        # Final linear layer.
        return self.out(attention_output)

# Feed Forward Network
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        embedding_dim: Width of the input and of the output.
        ff_dim: Width of the hidden layer.
    """

    def __init__(self, embedding_dim, ff_dim):
        super().__init__()
        self.linear1 = nn.Linear(embedding_dim, ff_dim)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(ff_dim, embedding_dim)

    def forward(self, x):
        """Expand to ``ff_dim``, apply ReLU, and project back to ``embedding_dim``."""
        hidden = self.activation(self.linear1(x))
        return self.linear2(hidden)

# Add & Norm Module
class AddAndNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Args:
        embedding_dim: Size of the last dimension, which is the one normalized.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x, sublayer_output):
        """Return ``LayerNorm(x + sublayer_output)``."""
        residual_sum = x + sublayer_output
        return self.layer_norm(residual_sum)

# Complete Encoder Layer
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and feed-forward, each with Add & Norm.

    Args:
        embedding_dim: Width of the layer's input and output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
    """

    def __init__(self, embedding_dim, num_heads, ff_dim):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(embedding_dim, num_heads)
        self.add_and_norm1 = AddAndNorm(embedding_dim)
        self.feed_forward = FeedForward(embedding_dim, ff_dim)
        self.add_and_norm2 = AddAndNorm(embedding_dim)

    def forward(self, x):
        """Run ``x`` through both sub-layers and return the result."""
        # Sub-layer 1: self-attention (query, key and value are all x),
        # then residual + norm.
        attended = self.add_and_norm1(x, self.multi_head_attention(x, x, x))

        # Sub-layer 2: feed-forward, then residual + norm.
        return self.add_and_norm2(attended, self.feed_forward(attended))

# Encoder: Stacking Multiple Encoder Layers
class Encoder(nn.Module):
    """Embedding front-end followed by a stack of encoder layers.

    Args:
        num_layers: Number of ``EncoderLayer`` instances to stack.
        vocab_size: Vocabulary size passed to the embedding front-end.
        embedding_dim: Model width shared by every layer.
        num_heads: Number of attention heads per layer.
        ff_dim: Hidden width of each layer's feed-forward sub-layer.
        max_length: Accepted but not used by this class.
    """

    def __init__(self, num_layers, vocab_size, embedding_dim, num_heads, ff_dim, max_length):
        super().__init__()
        self.embedding_layer = EncoderInput(vocab_size, embedding_dim)
        stacked_layers = [
            EncoderLayer(embedding_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ]
        self.encoder_layers = nn.ModuleList(stacked_layers)

    def forward(self, x):
        """Embed the token indices ``x`` and apply every encoder layer in order."""
        hidden = self.embedding_layer(x)
        for encoder_layer in self.encoder_layers:
            hidden = encoder_layer(hidden)
        return hidden




In [None]:
# Parameters
# Vocabulary definition
vocab = ["<PAD>", "<UNK>", "i", "you", "am", "are", "a", "boy", "girl", "who", "do", "me", "love"]
vocab_size = len(vocab)
embedding_dim = 6  # Embedding dimension

# Token <-> index mappings
token_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_token = {idx: word for word, idx in token_to_index.items()}

# Standalone embedding table (the Encoder below builds its own embedding)
embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

num_heads = 2
ff_dim = 12  # Feed Forward dimension
num_layers = 2  # Number of Encoder layers

# Maximum sentence length
max_length = 5
padding_value = token_to_index["<PAD>"]
unknown_value = token_to_index["<UNK>"]

# Input sentence
sentence = ["i", "love", "you"]
# Out-of-vocabulary tokens map to <UNK> instead of raising KeyError, and the
# sentence is truncated so it never exceeds max_length.
token_indices = [token_to_index.get(token, unknown_value) for token in sentence][:max_length]
# Right-pad with <PAD> up to max_length (adds nothing when already full).
token_indices += [padding_value] * (max_length - len(token_indices))
input_tensor = torch.tensor([token_indices])

# Initialize Encoder
encoder = Encoder(num_layers, vocab_size, embedding_dim, num_heads, ff_dim, max_length)

# Forward Pass
encoder_output = encoder(input_tensor)
print("Encoder Output Shape:", encoder_output.shape)

{'<PAD>': 0, '<UNK>': 1, 'i': 2, 'you': 3, 'am': 4, 'are': 5, 'a': 6, 'boy': 7, 'girl': 8, 'who': 9, 'do': 10, 'me': 11, 'love': 12}
{0: '<PAD>', 1: '<UNK>', 2: 'i', 3: 'you', 4: 'am', 5: 'are', 6: 'a', 7: 'boy', 8: 'girl', 9: 'who', 10: 'do', 11: 'me', 12: 'love'}
x: tensor([[[ 0.1978, -1.2836,  0.3379,  0.3155, -1.1212, -2.3003],
         [ 0.5429, -0.6714,  0.2298, -0.9935, -1.8716, -2.2981],
         [ 0.4123,  0.2096, -0.5443,  1.4981, -1.2774, -1.3460],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward0>)
Encoder Output Shape: torch.Size([1, 5, 6])
