In [1]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embedding + LSTM encoder that reports tensor shapes while it runs.

    Args:
        input_dim: Number of entries in the embedding table.
        emb_dim: Width of each embedding vector (also the LSTM input size).
        hidden_dim: Hidden size of the LSTM.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim):
        super(Encoder, self).__init__()

        # Sub-modules are created in this order so that seeded weight
        # initialisation stays the same.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, src):
        """Embed ``src``, run it through the LSTM, and print each shape.

        Args:
            src: Integer index tensor; (batch, seq_len) in the recorded run.

        Returns:
            A 3-tuple ``(outputs, hidden, cell)``: the per-step LSTM outputs
            followed by the hidden and cell states returned by the LSTM.
        """
        print("입력 Shape:", src.size())

        token_vectors = self.embedding(src)
        print("Embedding Layer를 거친 Shape:", token_vectors.size())

        # nn.LSTM returns (outputs, (hidden_state, cell_state)).
        sequence_out, lstm_state = self.rnn(token_vectors)
        last_hidden, last_cell = lstm_state

        shape_report = (
            ("LSTM Layer의 Output Shape:", sequence_out),
            ("LSTM Layer의 Hidden State Shape:", last_hidden),
            ("LSTM Layer의 Cell State Shape:", last_cell),
        )
        for label, tensor in shape_report:
            print(label, tensor.size())

        return sequence_out, last_hidden, last_cell

In [2]:
# Shared hyper-parameters for the encoder/decoder shape demos below.
vocab_size = 30000      # number of entries in each embedding table
emb_size = 256          # embedding width
lstm_size = 512         # LSTM hidden size
batch_size = 1
sample_seq_len = 3      # length of the random token sequences

# Echo the configuration. The "Embedding" label was previously misspelled
# as "Embedidng".
print(f"Vocab Size: {vocab_size}")
print(f"Embedding Size: {emb_size}")
print(f"LSTM Size: {lstm_size}")
print(f"Batch Size: {batch_size}")
print(f"Sample Sequence Length: {sample_seq_len}\n")

Vocab Size: 30000
Embedidng Size: 256
LSTM Size: 512
Batch Size: 1
Sample Sequence Length: 3



In [3]:
import torch

# Build the encoder from the sizes configured earlier in the notebook.
encoder = Encoder(input_dim=vocab_size, emb_dim=emb_size, hidden_dim=lstm_size)

# One random batch of token ids with shape (batch_size, sample_seq_len).
sample_input = torch.randint(low=0, high=vocab_size, size=(batch_size, sample_seq_len))

# Run the encoder once; forward() prints the shape after each layer.
sample_output, hidden, cell = encoder(sample_input)

입력 Shape: torch.Size([1, 3])
Embedding Layer를 거친 Shape: torch.Size([1, 3, 256])
LSTM Layer의 Output Shape: torch.Size([1, 3, 512])
LSTM Layer의 Hidden State Shape: torch.Size([1, 1, 512])
LSTM Layer의 Cell State Shape: torch.Size([1, 1, 512])


In [4]:
class Decoder(nn.Module):
    """Embedding + LSTM + linear decoder that reports tensor shapes while it runs.

    The LSTM input is the token embedding concatenated with a context tensor
    along the last axis, so its input size is ``embedding_dim + hidden_dim``.

    Args:
        vocab_size: Number of embedding entries and width of the output layer.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Hidden size of the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Sub-modules are created in this order so that seeded weight
        # initialisation stays the same.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim + hidden_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden, cell, context):
        """Run one decoder pass and print the shape after each stage.

        Args:
            x: Integer index tensor; (batch, seq_len) in the recorded run.
            hidden: Initial LSTM hidden state.
            cell: Initial LSTM cell state.
            context: Tensor concatenated to the embeddings on axis 2; it must
                therefore match the embeddings on the first two axes.

        Returns:
            A 3-tuple ``(output, hidden, cell)``: the linear-layer output over
            the vocabulary and the updated LSTM hidden and cell states.
        """
        print("입력 Shape:", x.size())

        token_vectors = self.embedding(x)
        print("Embedding Layer를 거친 Shape:", token_vectors.size())

        # Join embedding and context feature-wise (last axis of a 3-D tensor).
        lstm_input = torch.cat((token_vectors, context), dim=2)
        print("Context Vector가 더해진 Shape:", lstm_input.size())

        initial_state = (hidden, cell)
        lstm_out, (next_hidden, next_cell) = self.lstm(lstm_input, initial_state)
        print("LSTM Layer의 Output Shape:", lstm_out.size())

        logits = self.fc(lstm_out)
        print("Decoder 최종 Output Shape:", logits.size())

        return logits, next_hidden, next_cell

In [5]:
# Re-print the shared configuration before the decoder demo.
# The "Embedding" label was previously misspelled as "Embedidng".
for label, value in (
    ("Vocab Size", vocab_size),
    ("Embedding Size", emb_size),
    ("LSTM Size", lstm_size),
    ("Batch Size", batch_size),
):
    print(f"{label}: {value}")
# The last line keeps its explicit trailing newline (prints one blank line).
print(f"Sample Sequence Length: {sample_seq_len}\n")

Vocab Size: 30000
Embedidng Size: 256
LSTM Size: 512
Batch Size: 1
Sample Sequence Length: 3



In [6]:
# Random decoder-side token ids with shape (batch_size, sample_seq_len).
decoder_input = torch.randint(low=0, high=vocab_size, size=(batch_size, sample_seq_len))

decoder = Decoder(vocab_size=vocab_size, embedding_dim=emb_size, hidden_dim=lstm_size)

# The encoder's per-step outputs are passed as the context tensor.
# NOTE(review): this rebinds `hidden` and `cell`, so after this cell they hold
# the decoder's returned states rather than the encoder's.
dec_output, hidden, cell = decoder(
    x=decoder_input,
    hidden=hidden,
    cell=cell,
    context=sample_output,
)

입력 Shape: torch.Size([1, 3])
Embedding Layer를 거친 Shape: torch.Size([1, 3, 256])
Context Vector가 더해진 Shape: torch.Size([1, 3, 768])
LSTM Layer의 Output Shape: torch.Size([1, 3, 512])
Decoder 최종 Output Shape: torch.Size([1, 3, 30000])


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    """Additive attention that prints intermediate shapes while it runs.

    Both hidden states are projected to ``units`` dimensions, summed, passed
    through ``tanh`` and reduced to one score per encoder step.

    Args:
        units: Width of the shared projection space.
        encoder_dim: Feature size of the encoder states. Defaults to 512, the
            value that used to be hard-coded.
        decoder_dim: Feature size of the decoder state. Defaults to 512, the
            value that used to be hard-coded.
    """

    def __init__(self, units, encoder_dim=512, decoder_dim=512):
        super().__init__()
        # Creation order is kept (decoder, encoder, combine) so that seeded
        # weight initialisation stays the same.
        self.W_decoder = nn.Linear(decoder_dim, units)  # decoder hidden state -> units
        self.W_encoder = nn.Linear(encoder_dim, units)  # encoder hidden state -> units
        self.W_combine = nn.Linear(units, 1)            # combined features -> scalar score

    def forward(self, H_encoder, H_decoder):
        """Compute attention weights over the encoder steps and a context vector.

        Args:
            H_encoder: Encoder states, (batch, seq_len, encoder_dim).
            H_decoder: Decoder state, (batch, decoder_dim).

        Returns:
            A 2-tuple ``(context_vector, attention_weights)`` with shapes
            (batch, units) and (batch, seq_len, 1).
        """
        print("[ H_encoder ] Shape:", H_encoder.shape)  # (batch, seq_len, encoder_dim)

        # The projection gets its own name instead of overwriting the argument.
        enc_proj = self.W_encoder(H_encoder)
        print("[ W_encoder X H_encoder ] Shape:", enc_proj.shape)  # (batch, seq_len, units)

        print("\n[ H_decoder ] Shape:", H_decoder.shape)  # (batch, decoder_dim)
        # Add a length-1 step axis so the sum below broadcasts over seq_len.
        dec_proj = self.W_decoder(H_decoder.unsqueeze(1))  # (batch, 1, units)

        print("[ W_decoder X H_decoder ] Shape:", dec_proj.shape)  # (batch, 1, units)

        score = self.W_combine(torch.tanh(dec_proj + enc_proj))  # (batch, seq_len, 1)
        print("[ Score_alignment ] Shape:", score.shape)

        # Normalise over the sequence axis so the weights of each sample sum to 1.
        attention_weights = F.softmax(score, dim=1)  # (batch, seq_len, 1)
        # .cpu() is required before .numpy() when the tensor is not on the CPU.
        print("\n최종 Weight:\n", attention_weights.squeeze(-1).detach().cpu().numpy())

        # NOTE(review): the weighted sum is taken over the *projected* encoder
        # states, so the context is `units` wide rather than `encoder_dim` wide.
        # Kept as-is to preserve the return shape — confirm this is intended.
        context_vector = torch.sum(attention_weights * enc_proj, dim=1)  # (batch, units)

        return context_vector, attention_weights

# Configuration: width of the space both hidden states are projected into.
W_size = 100
print(f"Hidden State를 {W_size}차원으로 Mapping\n")

# Build the attention module.
attention = BahdanauAttention(units=W_size)

# Random input data (batch size = 1).
enc_state = torch.rand(1, 10, 512)  # (batch, seq_len, hidden_dim)
dec_state = torch.rand(1, 512)      # (batch, hidden_dim)

# Run once; forward() prints the shapes and weights, the return value is discarded.
_ = attention(enc_state, dec_state)

Hidden State를 100차원으로 Mapping

[ H_encoder ] Shape: torch.Size([1, 10, 512])
[ W_encoder X H_encoder ] Shape: torch.Size([1, 10, 100])

[ H_decoder ] Shape: torch.Size([1, 512])
[ W_decoder X H_decoder ] Shape: torch.Size([1, 1, 100])
[ Score_alignment ] Shape: torch.Size([1, 10, 1])

최종 Weight:
 [[0.11019357 0.09919199 0.09694339 0.09879664 0.09815952 0.09675518
  0.10399728 0.09687828 0.10417452 0.09490961]]


In [8]:
class LuongAttention(nn.Module):
    """Multiplicative attention that prints intermediate shapes while it runs.

    Each encoder state is passed through a linear layer and scored against
    the decoder state with a dot product.

    Args:
        units: Feature size of both the encoder and decoder states.
    """

    def __init__(self, units):
        super().__init__()
        self.W_combine = nn.Linear(units, units)  # transforms the encoder hidden states

    def forward(self, H_encoder, H_decoder):
        """Compute attention weights over the encoder steps and a context vector.

        Args:
            H_encoder: Encoder states, (batch, seq_len, hidden_dim).
            H_decoder: Decoder state, (batch, hidden_dim).

        Returns:
            A 2-tuple ``(context_vector, attention_weights)`` with shapes
            (batch, hidden_dim) and (batch, seq_len).
        """
        print("[ H_encoder ] Shape:", H_encoder.shape)  # (batch, seq_len, hidden_dim)

        WH = self.W_combine(H_encoder)  # (batch, seq_len, hidden_dim)
        print("[ W_encoder X H_encoder ] Shape:", WH.shape)

        # Turn the decoder state into a column so bmm yields one score per step.
        decoder_column = H_decoder.unsqueeze(1).transpose(1, 2)  # (batch, hidden_dim, 1)
        alignment = torch.bmm(WH, decoder_column)  # (batch, seq_len, 1)
        print("[ Score_alignment ] Shape:", alignment.shape)

        # Normalise over the sequence axis, then drop the trailing singleton.
        attention_weights = F.softmax(alignment, dim=1).squeeze(-1)  # (batch, seq_len)
        # .cpu() is required before .numpy() when the tensor is not on the CPU.
        print("\n최종 Weight:\n", attention_weights.detach().cpu().numpy())

        # Weighted sum of the original (untransformed) encoder states.
        context_vector = torch.bmm(attention_weights.unsqueeze(1), H_encoder)  # (batch, 1, hidden_dim)
        context_vector = context_vector.squeeze(1)  # (batch, hidden_dim)

        return context_vector, attention_weights

# Configuration: feature size shared by the encoder and decoder states.
emb_dim = 512
attention = LuongAttention(units=emb_dim)

# Random input data (batch size = 1).
enc_state = torch.rand(1, 10, emb_dim)  # (batch, seq_len, hidden_dim)
dec_state = torch.rand(1, emb_dim)      # (batch, hidden_dim)

# Run once; forward() prints the shapes and weights, the return value is discarded.
_ = attention(enc_state, dec_state)

[ H_encoder ] Shape: torch.Size([1, 10, 512])
[ W_encoder X H_encoder ] Shape: torch.Size([1, 10, 512])
[ Score_alignment ] Shape: torch.Size([1, 10, 1])

최종 Weight:
 [[0.00220163 0.55174404 0.00279592 0.00392605 0.00060809 0.10223878
  0.00263917 0.0074951  0.2951521  0.03119915]]
