<a href="https://colab.research.google.com/github/alxiom/Basic-NLP/blob/main/NLP_03_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers
!git clone https://github.com/alxiom/Basic-NLP.git

fatal: destination path 'Basic-NLP' already exists and is not an empty directory.


In [2]:
import math
import random

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as ftn
from torch.utils.data import Dataset, DataLoader
from tokenizers import CharBPETokenizer
from bokeh.layouts import column
from bokeh.plotting import figure, show
from bokeh.io import output_notebook


output_notebook()

In [3]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

special = ["<pad>", "<unk>", "<bos>", "<eos>", "<sep>", "<cls>", "<mask>"]
device = "cpu"

train_seq2seq_attention = True

In [4]:
tokenizer = CharBPETokenizer(vocab="Basic-NLP/data/vocab.json", merges="Basic-NLP/data/merges.txt")

In [5]:
train_data = pd.read_csv("Basic-NLP/data/chat_sample.csv", header=0)
print(train_data.head(5))
print(len(train_data))
print("--")

             Q              A
0       죽을거 같네  나쁜 생각 하지 마세요.
1      내일 시험이야    컨디션 조절 하세요.
2  정말.내 자신이 싫다    자신은 사랑해주세요.
3      이별후 네달째  바쁘게 살면서 잊어가요.
4      쌍커풀 해볼까       눈은 기본이죠.
128
--


In [6]:
query_tokens = []
answer_tokens = []
for i in range(len(train_data)):
    row = train_data.loc[i]
    query = row["Q"]
    answer = row["A"]

    tokenize_query = tokenizer.encode(query)
    tokenize_answer = tokenizer.encode(answer)

    query_tokens.append(tokenize_query.ids)
    answer_tokens.append(tokenize_answer.ids)

In [7]:
class LoadDataset(Dataset):

    def __init__(self, x_data, y_data):
        super(LoadDataset, self).__init__()
        self.x_data = x_data
        self.y_data = y_data

    def __getitem__(self, item):
        return self.x_data[item], self.y_data[item]

    def __len__(self):
        return len(self.y_data)


class MaxPadBatch:

    def __init__(self, max_len=24):
        super(MaxPadBatch, self).__init__()
        self.max_len = max_len

    def __call__(self, batch):
        batch_x = []
        batch_y = []
        for x, y in batch:
            batch_x.append(torch.tensor(x).long())
            batch_y.append(torch.tensor([special.index("<bos>")] + y + [special.index("<eos>")]).long())
        pad_index = special.index("<pad>")
        pad_x = [ftn.pad(item, [0, self.max_len - item.shape[0]], value=pad_index).detach() for item in batch_x]
        pad_y = [ftn.pad(item, [0, self.max_len - item.shape[0]], value=pad_index).detach() for item in batch_y]
        return torch.stack(pad_x), torch.stack(pad_y), len(batch)


max_seq_length = 20
chat_dataset = LoadDataset(query_tokens, answer_tokens)
chat_data_loader = DataLoader(chat_dataset, batch_size=32, collate_fn=MaxPadBatch(max_seq_length))

In [8]:
class Encoder(nn.Module):

    def __init__(self, input_size, embedding_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(self.embedding_size, self.hidden_size, batch_first=True)

    def forward(self, x, embedding):
        # x: [batch, seq_length]
        x = embedding(x)
        x, hidden = self.rnn(x)
        return x, hidden

In [9]:
class Decoder(nn.Module):

    def __init__(self, output_size, embedding_size, hidden_size):
        super(Decoder, self).__init__()
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(self.embedding_size, self.hidden_size, batch_first=True)

    def forward(self, x, hidden, embedding):
        # x: [batch] --> need second dimension as 1
        # hidden: [encoder_layers = 1, batch, hidden_dim]
        x = x.unsqueeze(1)
        x = embedding(x)
        x, hidden = self.rnn(x, hidden)
        return x, hidden

In [10]:
class Attention(nn.Module):

    def __init__(self):
        super(Attention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, encoder_output, decoder_output):
        # 이번 decoder 출력이 encoder 모든 출력들과 얼마나 강한 관계가 있는지 측정
        # 이번 decoder 출력과 encoder 모든 출력과 dot product 실행 --> sequence of scala (=attention score)
        # attention score --> softmax --> attention weight
        # 위에서 구한 강도에 따라서 encoder 모든 출력을 weight sum --> context_vector
        attention_score = torch.bmm(decoder_output, encoder_output.transpose(1, 2))
        attention_weight = self.softmax(attention_score)
        context_vector = torch.bmm(attention_weight, encoder_output)
        return context_vector

In [11]:
class Seq2SeqAttention(nn.Module):

    def __init__(self, encoder, decoder, attention):
        super(Seq2SeqAttention, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.embedding = nn.Embedding(self.encoder.input_size, self.encoder.embedding_size)
        self.target_vocab_size = self.decoder.output_size
        self.linear = nn.Linear(self.encoder.hidden_size + self.decoder.hidden_size, self.target_vocab_size)

    def forward(self, source, target, teacher_forcing=0.5):
        # source: [batch, seq_length]
        # target: [batch, seq_length]
        batch_size = target.shape[0]
        target_seq_length = target.shape[1]

        encoder_output, hidden = self.encoder(source, self.embedding)
        decoder_input = torch.tensor([special.index("<bos>")] * batch_size).long()

        attention_outputs = torch.zeros(batch_size, target_seq_length, self.target_vocab_size)
        for t in range(1, target_seq_length):
            decoder_output, hidden = self.decoder(decoder_input, hidden, self.embedding)
            # encoder output, decoder output 두 값을 이용하여 지금 decoding 할 context 생성
            # decoder output, context 이용하여 attention 적용된 output 도출
            # attention output 사용하여 greedy decoding
            context = self.attention(encoder_output, decoder_output)
            attention_output = self.linear(torch.cat([decoder_output, context], dim=2).squeeze(1))
            attention_outputs[:, t, :] = attention_output
            teacher = target[:, t]
            top1 = attention_output.argmax(1)
            decoder_input = teacher if random.random() < teacher_forcing else top1
        return attention_outputs

In [13]:
embedding_dim = 32
hidden_dim = 32
enc = Encoder(tokenizer.get_vocab_size(), embedding_dim, hidden_dim)
dec = Decoder(tokenizer.get_vocab_size(), embedding_dim, hidden_dim)
att = Attention()
seq2seq_att = Seq2SeqAttention(enc, dec, att)

decode_test = torch.tensor([[special.index("<bos>")] + [special.index("<pad>")] * (max_seq_length - 1)]).long()

if train_seq2seq_attention:
    learning_rate = 2e-3
    optimizer = torch.optim.Adam(seq2seq_att.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss(ignore_index=special.index("<pad>"))

    for epoch in range(300):
        seq2seq_att.train()
        epoch_loss = 0.0
        for batch_source, batch_target, batch_length in chat_data_loader:
            optimizer.zero_grad()
            seq2seq_attention_output = seq2seq_att(batch_source, batch_target)

            seq2seq_attention_output_dim = seq2seq_attention_output.shape[-1]
            seq2seq_attention_output_drop = seq2seq_attention_output[:, 1:, :].reshape(-1, seq2seq_attention_output_dim)
            batch_target_drop = batch_target[:, 1:].reshape(-1)
            loss = criterion(seq2seq_attention_output_drop, batch_target_drop)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() / batch_length

        if epoch % 10 == 0:
            print(f"{epoch} epoch loss: {epoch_loss:.4f} / ppl: {math.exp(epoch_loss):.4f}")
            seq2seq_att.eval()
            test = "썸 타는 것도 귀찮아."
            test_token = tokenizer.encode(test)
            test_tensor = torch.tensor(test_token.ids).long().unsqueeze(0)
            test_output = seq2seq_att(test_tensor, decode_test, 0.0)[:, 1:, :].squeeze(0).argmax(1).detach().tolist()
            recover_test_output = tokenizer.decode(test_output)
            print(recover_test_output.split("<eos>")[0])
            test = "죽을거 같네"
            test_token = tokenizer.encode(test)
            test_tensor = torch.tensor(test_token.ids).long().unsqueeze(0)
            test_output = seq2seq_att(test_tensor, decode_test, 0.0)[:, 1:, :].squeeze(0).argmax(1).detach().tolist()
            recover_test_output = tokenizer.decode(test_output)
            print(recover_test_output.split("<eos>")[0])
            test = "한심해서 죽고싶다"
            test_token = tokenizer.encode(test)
            test_tensor = torch.tensor(test_token.ids).long().unsqueeze(0)
            test_output = seq2seq_att(test_tensor, decode_test, 0.0)[:, 1:, :].squeeze(0).argmax(1).detach().tolist()
            recover_test_output = tokenizer.decode(test_output)
            print(recover_test_output.split("<eos>")[0])
        
    torch.save(seq2seq_att.state_dict(), "Basic-NLP/checkpoint/seq2seq_attention.pt")

0 epoch loss: 0.9099 / ppl: 2.4840
서서 꿨지났군요 지났군요 뭐해 . 싸 이게 녀라 요들. 쇼아닌가요 지난 있겠죠 훔서서
구프프이게 프프이게 프프생만해도 는지는 난난골라! 깊은 결참
쉬세요 내세요 간 못답답라도 빌컨해도 싶화를 는거 찍라도 찍찍있지난 요
10 epoch loss: 0.6264 / ppl: 1.8708
. 
연락내 내 . 
. 
20 epoch loss: 0.5884 / ppl: 1.8010
잘 . 
내 내 내 내 내 내 내 내 내 내 내 내 내 내 꼭 그래요 
잘 . 
30 epoch loss: 0.5586 / ppl: 1.7482
잘 더 . 
내 내 내 내 내 내 내 내 내 내 내 내 내 내 내 내 내 내 내
잘 더 . 
40 epoch loss: 0.5427 / ppl: 1.7207
잘 더 더 . 
잘되고싶잘되고싶한 로 이겠죠 . 
잘 더 . 
50 epoch loss: 0.5008 / ppl: 1.6500
잘 더 . 
잘되고싶잘되고싶지에서만 한 곳이라면요 . 
잘 더 더 . 
60 epoch loss: 0.4613 / ppl: 1.5861
잘 는 게 . 
잘되고싶지에서만 한 상황이네요 . 
잘 는 게 거예요 . 
70 epoch loss: 0.4167 / ppl: 1.5170
잘 는 게 . . 
잘되고싶지에서만 한 상황이네요 . 
잘 는 게 . . 
80 epoch loss: 0.3905 / ppl: 1.4778
연락는 게 . 
잘되고싶지에서만 할 수 있는 하면서 크게 심호흡해보세요 . 
잘 는 게 . . 
90 epoch loss: 0.3347 / ppl: 1.3976
연락이었으면 . 
네 말씀하세요 . 
하나 바꾸는 게 수 있다는 . 
100 epoch loss: 0.2719 / ppl: 1.3125
연락사는 는 게 수 있어요 . 
네 말씀하세요 . 
하나 바꾸는 게 수 있다는 . 
110 epoch loss: 0.2443 / ppl: 1.2768
연락사는 참석하. 
네 말씀하세요 . 
그것을 지켜볼 수 있다