# Attention

입력 시퀀스 전체에서 중요한 정보를 선택적으로 참조할 수 있도록 하는 구조

Seq2Seq 구조의 디코더에서 출력을 계산할 때 인코더의 hidden state 출력을 모두 사용

## 장점


긴 시퀀스 처리 능력 향상
- 고정된 크기의 컨텍스트 벡터를 사용하지 않기 때문에 긴 시퀀스에서도 중요한 정보를 놓치지 않고 사용할 수 있음

정렬 문제 해결
- Seq2Seq 모델에서 입력과 출력 시퀀스 간의 단어 정렬 문제가 발생할 수 있는데, Attention은 각 단어 간의 상관관계를 학습하여 이를 해결함

해석 가능
- Attention 가중치를 통해 모델이 입력의 어느 부분에 집중하고 있는지 확인할 수 있음

## 단점

연산량 증가
- Attention 메커니즘은 추가적인 연산을 요구하므로 모델의 복잡성이 증가함. 특히 대용량 데이터셋에서 계산 비용이 높아짐

메모리 소모
- 인코더의 모든 시점 은닉 상태를 저장해야 하므로, 시퀀스 길이가 길어질수록 메모리 사용량이 급격히 증가함

훈련 속도 저하
- 모델이 복잡해지는 만큼 학습 속도가 느려짐


## 종류

### 바다나우 어텐션 (Bahdanau Attention)


Hidden state 결합
- 디코더의 이전 은닉 상태와 인코더의 각 은닉 상태를 결합한 후 비선형 변환을 통해 어텐션 가중치를 계산

Additive 방식
- 디코더 은닉 상태 계산 이전에 어텐션 적용
- 은닉 상태를 더한 후, 특정 가중치 행렬을 사용하여 점수를 계산

### 룽 어텐션 (Luong Attention)


은닉 상태 내적
- 디코더의 현재 은닉 상태와 인코더의 은닉 상태 간의 내적을 사용해 어텐션 가중치를 계산. 바다나우 어텐션보다 계산이 효율적임

Multiplicative 방식
- 디코더 은닉 상태 계산 이후에 어텐션 적용
- 두 은닉 상태를 내적하여 어텐션 점수를 계산하고, 이 점수로 인코더 은닉 상태를 가중합하여 컨텍스트 벡터 생성

## 바다나우 어텐션 구현

In [2]:
import torch
import torch.nn as nn
class BahdanauAttention(nn.Module):
    """Additive (Bahdanau-style) attention over the encoder outputs.

    The decoder state and every encoder output are projected, summed, passed
    through ``tanh`` and reduced to one scalar score per source position.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # W1 projects the encoder outputs, W2 projects the decoder state and
        # V collapses the combined representation to a single score.
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            hidden: decoder state, ``[batch_size, hidden_size]``.
            encoder_outputs: ``[batch_size, seq_len, hidden_size]``.

        Returns:
            Tuple ``(context_vector, attn_weights)`` with shapes
            ``[batch_size, hidden_size]`` and ``[batch_size, seq_len, 1]``.
        """
        # Add a length-1 time axis so the query broadcasts over seq_len.
        projected_query = self.W2(hidden.unsqueeze(1))   # [batch_size, 1, hidden_size]
        projected_keys = self.W1(encoder_outputs)        # [batch_size, seq_len, hidden_size]

        energy = torch.tanh(projected_keys + projected_query)
        score = self.V(energy)                           # [batch_size, seq_len, 1]

        # Normalise across source positions (dim=1).
        attn_weights = torch.softmax(score, dim=1)

        # Weighted sum of the encoder outputs along the time axis.
        context_vector = (attn_weights * encoder_outputs).sum(dim=1)
        return context_vector, attn_weights

## 룽 어텐션 구현

In [3]:
import torch
import torch.nn as nn
class LuongAttention(nn.Module):
    """Multiplicative (Luong-style) attention over the encoder outputs.

    The decoder state is passed through a linear layer and then scored
    against every encoder output with a batched dot product.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            hidden: decoder state, ``[batch_size, hidden_size]``.
            encoder_outputs: ``[batch_size, seq_len, hidden_size]``.

        Returns:
            Tuple ``(context, attn_weights)`` with shapes
            ``[batch_size, 1, hidden_size]`` and ``[batch_size, 1, seq_len]``.
        """
        # Attention scores: projected query dotted with every encoder output.
        query = self.attn(hidden).unsqueeze(1)  # [batch_size, 1, hidden_size]
        scores = torch.bmm(query, encoder_outputs.transpose(1, 2))  # [batch_size, 1, seq_len]

        # torch.softmax is used instead of F.softmax: only `torch` and
        # `torch.nn` are imported alongside this class, so relying on `F`
        # raised NameError unless torch.nn.functional had been imported
        # elsewhere beforehand.
        attn_weights = torch.softmax(scores, dim=2)  # [batch_size, 1, seq_len]

        # Context vector: attention-weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights, encoder_outputs)  # [batch_size, 1, hidden_size]
        return context, attn_weights

## Attention 기반 번역 모델 구현

### 인코더 클래스

In [4]:
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a batch-first LSTM."""

    def __init__(self, input_vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(input_vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: token ids, ``[batch_size, seq_len]``.

        Returns:
            ``(outputs, (hidden, cell))`` where ``outputs`` is
            ``[batch_size, seq_len, hidden_size]`` and ``hidden`` / ``cell``
            are the final LSTM states.
        """
        token_vectors = self.embedding(x)  # [batch_size, seq_len, embed_size]
        outputs, final_state = self.lstm(token_vectors)
        hidden, cell = final_state
        return outputs, (hidden, cell)

### 디코더 클래스

In [5]:
import torch.nn.functional as F
class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Each call consumes one target token per batch element, builds a context
    vector with Bahdanau attention and returns log-probabilities over the
    target vocabulary.
    """

    def __init__(self, target_vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(target_vocab_size, embed_size)
        # The LSTM input is the token embedding concatenated with the context vector.
        self.lstm = nn.LSTM(
            input_size=embed_size + hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.attention = BahdanauAttention(hidden_size)
        self.fc = nn.Linear(hidden_size, target_vocab_size)

    def forward(self, input_step, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            input_step: current target token ids, ``[batch_size]``.
            hidden: LSTM hidden state from the previous step.
            cell: LSTM cell state from the previous step.
            encoder_outputs: ``[batch_size, seq_len, hidden_size]``.

        Returns:
            ``(output, hidden, cell, attn_weights)`` where ``output`` holds
            log-probabilities of shape ``[batch_size, target_vocab_size]``.
        """
        # [batch_size] -> [batch_size, 1, embed_size]
        token_vec = self.embedding(input_step).unsqueeze(1)

        # Query the attention with the last layer's hidden state.
        context_vector, attn_weights = self.attention(hidden[-1], encoder_outputs)

        # [batch_size, 1, embed_size + hidden_size]
        step_input = torch.cat((token_vec, context_vector.unsqueeze(1)), dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))

        logits = self.fc(step_output.squeeze(1))  # [batch_size, target_vocab_size]
        return F.log_softmax(logits, dim=1), hidden, cell, attn_weights

### Seq2Seq 클래스 정의

In [6]:
import random
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together for training."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode the whole target sequence one step at a time.

        Args:
            source: source token ids, first dimension is the batch.
            target: target token ids, ``[batch_size, tgt_len]``; column 0 is
                used as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own
                prediction as the next input.

        Returns:
            Tensor ``[batch_size, tgt_len, tgt_vocab_size]`` with the decoder
            output of every step; position 0 is left as zeros.
        """
        n_batch, n_steps = source.size(0), target.size(1)
        vocab_dim = self.decoder.fc.out_features
        outputs = torch.zeros(n_batch, n_steps, vocab_dim, device=self.device)

        encoder_outputs, (hidden, cell) = self.encoder(source)

        # The first decoder input is the first target column.
        input_step = target[:, 0]  # [batch_size]
        for t in range(1, n_steps):
            # Predict the token at position t.
            output, hidden, cell, _ = self.decoder(input_step, hidden, cell, encoder_outputs)
            outputs[:, t] = output

            # Either feed the ground truth (teacher forcing) or the greedy prediction.
            if random.random() < teacher_forcing_ratio:
                input_step = target[:, t]
            else:
                input_step = output.argmax(1)  # [batch_size]
        return outputs

# 실습

강의자료를 바탕으로 어텐션 구조를 적용한 Seq2Seq 훈련 코드를 작성하고 훈련을 수행해보시오

In [7]:
# Toy parallel corpus: each entry is a (French source sentence, English target
# sentence) tuple. Some source sentences still carry accents and capital
# letters, so they are not normalised at this point.
data = [
    ('je suis etudiant', 'i am a student'),
    ('j aime le football', 'i love football'),
    ('il fait beau aujourd hui', 'it is nice today'),
    ('je mange une pomme', 'i eat an apple'),
    ('nous aimons apprendre', 'we love learning'),
    ('je vais à l école', 'i go to school'),
    ('tu es mon ami', 'you are my friend'),
    ('elle lit un livre', 'she is reading a book'),
    ('il écrit une lettre', 'he is writing a letter'),
    ('nous regardons un film', 'we are watching a movie'),
    ('vous parlez français', 'you speak french'),
    ('ils jouent au tennis', 'they play tennis'),
    ('je fais du sport', 'i do sports'),
    ('tu écoutes de la musique', 'you listen to music'),
    ('elle cuisine un gâteau', 'she is baking a cake'),
    ('il conduit une voiture', 'he is driving a car'),
    ('nous visitons le musée', 'we are visiting the museum'),
    ('vous aimez la plage', 'you love the beach'),
    ('ils dansent bien', 'they dance well'),
    ('je prends le train', 'i take the train'),
    ('tu joues de la guitare', 'you play the guitar'),
    ('elle dessine un portrait', 'she draws a portrait'),
    ('il apprend l anglais', 'he learns english'),
    ('nous voyageons en avion', 'we travel by plane'),
    ('vous travaillez dur', 'you work hard'),
    ('ils étudient la biologie', 'they study biology'),
    ('je bois du café', 'i drink coffee'),
    ('tu manges du pain', 'you eat bread'),
    ('elle porte une robe', 'she wears a dress'),
    ('il lit le journal', 'he reads the newspaper'),
    ('nous aimons la nature', 'we love nature'),
    ('vous prenez le bus', 'you take the bus'),
    ('ils chantent une chanson', 'they sing a song'),
    ('je visite Paris', 'i visit paris'),
    ('tu écris un poème', 'you write a poem'),
    ('elle étudie la médecine', 'she studies medicine'),
    ('il fait ses devoirs', 'he does his homework'),
    ('nous préparons le dîner', 'we prepare dinner'),
    ('vous jouez au basketball', 'you play basketball'),
    ('ils regardent la télévision', 'they watch television'),
    ('je dors bien', 'i sleep well'),
    ('tu travailles dans un bureau', 'you work in an office'),
    ('elle nage dans la piscine', 'she swims in the pool'),
    ('il se réveille tôt', 'he wakes up early'),
    ('nous chantons ensemble', 'we sing together'),
    ('vous écrivez des emails', 'you write emails'),
    ('ils jouent aux cartes', 'they play cards'),
    ('je visite un parc', 'i visit a park'),
    ('tu fais du vélo', 'you ride a bike'),
    ('elle regarde les étoiles', 'she watches the stars'),
    ('il monte les escaliers', 'he climbs the stairs'),
    ('nous lisons un roman', 'we read a novel'),
    ('vous écoutez la radio', 'you listen to the radio'),
    ('ils se promènent en ville', 'they walk around the city'),
    ('je cours dans le parc', 'i run in the park'),
    ('tu achètes des légumes', 'you buy vegetables'),
    ('elle joue au volley', 'she plays volleyball'),
    ('il nettoie la maison', 'he cleans the house'),
    ('nous prenons le petit déjeuner', 'we have breakfast'),
    ('vous apprenez une nouvelle langue', 'you learn a new language'),
    ('ils font la cuisine', 'they cook'),
    ('je dessine une maison', 'i draw a house'),
    ('tu regardes un documentaire', 'you watch a documentary'),
    ('elle visite un château', 'she visits a castle'),
    ('il photographie le paysage', 'he photographs the landscape'),
    ('nous organisons une fête', 'we organize a party'),
    ('vous jouez aux échecs', 'you play chess'),
    ('ils courent ensemble', 'they run together'),
    ('je regarde un match de football', 'i watch a football match'),
    ('tu lis un magazine', 'you read a magazine'),
    ('elle prépare une salade', 'she makes a salad'),
    ('il voyage en train', 'he travels by train'),
    ('nous faisons du shopping', 'we go shopping'),
    ('vous dansez au club', 'you dance at the club'),
    ('ils étudient l histoire', 'they study history'),
    ('je visite le marché', 'i visit the market'),
    ('tu achètes un cadeau', 'you buy a gift'),
    ('elle travaille dans une école', 'she works in a school'),
    ('il joue du piano', 'he plays the piano'),
    ('nous regardons le coucher du soleil', 'we watch the sunset'),
    ('vous apprenez à cuisiner', 'you learn to cook'),
    ('ils se reposent après le travail', 'they rest after work'),
    ('je prends des photos', 'i take photos'),
    ('tu fais de la natation', 'you go swimming'),
    ('elle sourit toujours', 'she always smiles'),
    ('il étudie à l université', 'he studies at the university'),
    ('nous visitons nos amis', 'we visit our friends'),
    ('vous mangez au restaurant', 'you eat at the restaurant'),
    ('ils jouent dans le jardin', 'they play in the garden'),
    ('je prends des notes', 'i take notes'),
    ('tu conduis prudemment', 'you drive carefully'),
    ('elle chante magnifiquement', 'she sings beautifully'),
    ('il lit un roman policier', 'he reads a detective novel'),
    ('nous partons en vacances', 'we go on vacation'),
    ('vous regardez les étoiles', 'you watch the stars'),
    ('ils écoutent de la musique classique', 'they listen to classical music'),
    ('je prépare un café', 'i make a coffee'),
    ('tu joues avec ton chien', 'you play with your dog'),
    ('elle porte des lunettes', 'she wears glasses'),
    ('il aime le chocolat', 'he loves chocolate')
]

In [8]:
import re
import unicodedata

# Sentence normalisation helper
def preprocess_sentence(sentence):
    """Normalise a raw sentence into lowercase ASCII words separated by single spaces.

    Accents are stripped, the text is lowercased, every run of characters
    other than ASCII letters and digits becomes one space, and surrounding
    whitespace is removed.
    """
    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # so dropping the marks removes the accents.
    decomposed = unicodedata.normalize('NFD', sentence)
    without_accents = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    lowered = without_accents.lower()
    # Collapse anything that is not a letter or digit into a single space.
    collapsed = re.sub(r"[^a-zA-Z0-9]+", " ", lowered)
    return collapsed.strip()

# Preprocess every sentence pair and collect the vocabulary of each language.
input_texts = []
target_texts = []
input_vocab = set()
target_vocab = set()

for input_sentence, target_sentence in data:
    input_sentence = preprocess_sentence(input_sentence)
    # Wrap the target in start/end markers so decoding has explicit boundaries.
    target_sentence = '<sos> ' + preprocess_sentence(target_sentence) + ' <eos>'

    input_texts.append(input_sentence)
    target_texts.append(target_sentence)

    # Grow the vocabularies with the words of this pair.
    input_vocab |= set(input_sentence.split(' '))
    target_vocab |= set(target_sentence.split(' '))

# Index 0 is reserved for the padding token; the remaining words are sorted
# so that the index assignment is deterministic.
input_vocab = ['<pad>', *sorted(input_vocab)]
target_vocab = ['<pad>', *sorted(target_vocab)]

# Word <-> index lookup tables.
input_idx2word = dict(enumerate(input_vocab))
input_word2idx = {word: idx for idx, word in input_idx2word.items()}

target_idx2word = dict(enumerate(target_vocab))
target_word2idx = {word: idx for idx, word in target_idx2word.items()}

# Longest sentence (in words) on each side; used as the padding length.
max_input_len = max(len(text.split(' ')) for text in input_texts)
max_target_len = max(len(text.split(' ')) for text in target_texts)

# Convert a sentence into a padded list of vocabulary indices
def text_to_sequence(text, word2idx, max_len):
    """Map a space-separated sentence to word indices, right-padded to ``max_len``.

    Sentences already longer than ``max_len`` are returned unpadded and
    untruncated. A word missing from ``word2idx`` raises ``KeyError``.
    """
    token_ids = [word2idx[token] for token in text.split(' ')]
    pad_id = word2idx['<pad>']
    # A non-positive count yields an empty list, i.e. no padding is added.
    token_ids.extend([pad_id] * (max_len - len(token_ids)))
    return token_ids

# Encode every sentence as a fixed-length list of vocabulary indices.
input_sequences = [
    text_to_sequence(sentence_text, input_word2idx, max_input_len)
    for sentence_text in input_texts
]
target_sequences = [
    text_to_sequence(sentence_text, target_word2idx, max_target_len)
    for sentence_text in target_texts
]

# Show the first encoded pair as a sanity check.
print("입력 시퀀스 예시:", input_sequences[0])
print("출력 시퀀스 예시:", target_sequences[0])

입력 시퀀스 예시: [91, 173, 72, 0, 0, 0]
출력 시퀀스 예시: [2, 71, 6, 3, 138, 1, 0]


In [9]:
import torch.optim as optim

# Hyperparameters
input_vocab_size = len(input_vocab)
target_vocab_size = len(target_vocab)
embed_size = 16
hidden_size = 32
learning_rate = 0.001
num_epochs = 5000

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model construction
encoder = Encoder(input_vocab_size, embed_size, hidden_size).to(device)
decoder = Decoder(target_vocab_size, embed_size, hidden_size).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

# Loss function and optimizer.
# The decoder already returns log-probabilities (it ends with log_softmax), so
# NLLLoss is the matching criterion; CrossEntropyLoss expects raw logits and
# would apply log_softmax a second time. Padding positions are excluded from
# the loss via ignore_index.
criterion = nn.NLLLoss(ignore_index=target_word2idx['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
# Convert the padded index sequences into tensors on the training device.
input_tensor = torch.LongTensor(input_sequences).to(device)
target_tensor = torch.LongTensor(target_sequences).to(device)

# translate() switches the model to eval mode; make sure training mode is
# active again whenever this cell is (re-)run.
model.train()

# Training loop: the whole corpus is fed as a single batch every epoch.
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()
    output = model(input_tensor, target_tensor)
    # Drop time step 0 (it is never predicted and stays all-zero) and flatten
    # to [batch_size * (target_len - 1), target_vocab_size] for the loss.
    output_dim = output.shape[-1]
    output = output[:, 1:].reshape(-1, output_dim)
    target = target_tensor[:, 1:].reshape(-1)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    # Report progress every 500 epochs.
    if epoch % 500 == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item():.4f}')

Epoch: 500, Loss: 0.8891
Epoch: 1000, Loss: 0.1156
Epoch: 1500, Loss: 0.0371
Epoch: 2000, Loss: 0.0180
Epoch: 2500, Loss: 0.0105
Epoch: 3000, Loss: 0.0067
Epoch: 3500, Loss: 0.0045
Epoch: 4000, Loss: 0.0031
Epoch: 4500, Loss: 0.0022
Epoch: 5000, Loss: 0.0016


In [13]:
def translate(sentence, max_len=20):
    """Greedily translate one source sentence with the trained model.

    Args:
        sentence: raw source sentence; it is normalised with
            ``preprocess_sentence`` before being encoded.
        max_len: maximum number of target tokens to generate before giving
            up on seeing ``<eos>``.

    Returns:
        The generated target words joined by single spaces (without the
        ``<sos>`` / ``<eos>`` markers). An input that is empty after
        preprocessing yields an empty string.

    Raises:
        KeyError: if the sentence contains a word that is not in the source
            vocabulary.
    """
    model.eval()
    sentence = preprocess_sentence(sentence)
    if not sentence:
        # Nothing to translate; avoids a KeyError on the empty token.
        return ''

    # Pad to the *source* length so the encoder sees the same padding it was
    # trained with (the target length was used here by mistake).
    sequence = text_to_sequence(sentence, input_word2idx, max_input_len)
    sequence = torch.LongTensor(sequence).unsqueeze(0).to(device)

    eos_idx = target_word2idx['<eos>']
    result = []
    with torch.no_grad():
        encoder_outputs, (hidden, cell) = model.encoder(sequence)
        # Decoding starts from the <sos> token.
        input_token = torch.LongTensor([target_word2idx['<sos>']]).to(device)
        for _ in range(max_len):
            output, hidden, cell, _attn_weights = model.decoder(
                input_token, hidden, cell, encoder_outputs
            )
            top1 = output.argmax(1)
            if top1.item() == eos_idx:
                break
            result.append(top1.item())
            # Feed the prediction back in as the next input (greedy decoding).
            input_token = top1
    return ' '.join(target_idx2word[idx] for idx in result)

In [14]:
# 예측 예시
test_sentences = [
    'je suis etudiant',
    'il fait beau aujourd hui',
    'nous aimons apprendre'
]

for sentence in test_sentences:
    translation = translate(sentence)
    print(f"입력 문장: {sentence}")
    print(f"번역 문장: {translation}\n")

입력 문장: je suis etudiant
번역 문장: i am a student

입력 문장: il fait beau aujourd hui
번역 문장: it is nice today

입력 문장: nous aimons apprendre
번역 문장: we love learning

