<a href="https://colab.research.google.com/github/alxiom/Basic-NLP/blob/main/NLP_02_MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install tokenizers
!git clone https://github.com/alxiom/Basic-NLP.git

fatal: destination path 'Basic-NLP' already exists and is not an empty directory.


In [26]:
import math
import random

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as ftn
from torch.utils.data import Dataset, DataLoader
from tokenizers import CharBPETokenizer
from bokeh.layouts import column
from bokeh.plotting import figure, show
from bokeh.io import output_notebook


output_notebook()

In [27]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

special = ["<pad>", "<unk>", "<bos>", "<eos>", "<sep>", "<cls>", "<mask>"]
device = "cpu"

train_tokenizer = True
show_analysis = True
train_seq2seq = True

In [28]:
if train_tokenizer:
    tokenizer = CharBPETokenizer()
    tokenizer.train(files=["Basic-NLP/data/chat_sample.txt"], vocab_size=1500, special_tokens=special, min_frequency=1)
    tokenizer.save_model("Basic-NLP/data")

tokenizer = CharBPETokenizer(vocab="Basic-NLP/data/vocab.json", merges="Basic-NLP/data/merges.txt")
tokenize_sample_text = tokenizer.encode("인연이 있다고 생각해?")
print(tokenize_sample_text.tokens)
print("--")

recover = tokenizer.decode(tokenize_sample_text.ids)
print(recover)
print("--")


['인', '연', '이</w>', '있', '다', '고</w>', '생각', '해</w>', '?</w>']
--
인연이 있다고 생각해 ?
--


In [29]:
train_data = pd.read_csv("Basic-NLP/data/chat_sample.csv", header=0)
print(train_data.head(5))
print(len(train_data))
print("--")

             Q              A
0       죽을거 같네  나쁜 생각 하지 마세요.
1      내일 시험이야    컨디션 조절 하세요.
2  정말.내 자신이 싫다    자신은 사랑해주세요.
3      이별후 네달째  바쁘게 살면서 잊어가요.
4      쌍커풀 해볼까       눈은 기본이죠.
128
--


In [30]:
query_tokens = []
answer_tokens = []
query_lengths = {}
answer_lengths = {}
for i in range(len(train_data)):
    row = train_data.loc[i]
    query = row["Q"]
    answer = row["A"]

    tokenize_query = tokenizer.encode(query)
    tokenize_answer = tokenizer.encode(answer)

    query_tokens.append(tokenize_query.ids)
    answer_tokens.append(tokenize_answer.ids)

    query_length = len(tokenize_query.ids)
    answer_length = len(tokenize_answer.ids)

    if query_length in query_lengths:
        query_lengths[query_length] += 1
    else:
        query_lengths[query_length] = 1

    if answer_length in answer_lengths:
        answer_lengths[answer_length] += 1
    else:
        answer_lengths[answer_length] = 1

In [31]:
if show_analysis:
    sample_data = train_data.loc[99]
    print(sample_data)
    print("--")

    sample_query = sample_data["Q"]
    print(sample_query)
    print("--")

    tokenize_sample_query = tokenizer.encode(sample_query)
    print(tokenize_sample_query.tokens)
    print(tokenize_sample_query.ids)
    print("--")

    x_axis = list(range(1, max(max(query_lengths), max(answer_lengths)) + 1))
    query_length_list = [query_lengths.get(i, 0) for i in x_axis]
    answer_length_list = [answer_lengths.get(i, 0) for i in x_axis]
    x_axis = [str(i) for i in x_axis]

    plot_query = figure(title="query dist.", x_range=x_axis, plot_height=250, toolbar_location=None, tools="")
    plot_query.vbar(x=x_axis, top=query_length_list, width=0.9)
    plot_query.xgrid.grid_line_color = None
    plot_query.y_range.start = 0

    plot_answer = figure(title="answer dist.", x_range=x_axis, plot_height=250, toolbar_location=None, tools="")
    plot_answer.vbar(x=x_axis, top=answer_length_list, width=0.9)
    plot_answer.xgrid.grid_line_color = None
    plot_answer.y_range.start = 0

    show(column(plot_query, plot_answer))

Q        너무 빨리 철 들었어
A    철은 죽을 때 들어도 돼요.
Name: 99, dtype: object
--
너무 빨리 철 들었어
--
['너무</w>', '빨리</w>', '철</w>', '들었어</w>']
[635, 1183, 529, 772]
--


In [32]:
class LoadDataset(Dataset):

    def __init__(self, x_data, y_data):
        super(LoadDataset, self).__init__()
        self.x_data = x_data
        self.y_data = y_data

    def __getitem__(self, item):
        return self.x_data[item], self.y_data[item]

    def __len__(self):
        return len(self.y_data)


class MaxPadBatch:

    def __init__(self, max_len=24):
        super(MaxPadBatch, self).__init__()
        self.max_len = max_len

    def __call__(self, batch):
        batch_x = []
        batch_y = []
        for x, y in batch:
            batch_x.append(torch.tensor(x).long())
            batch_y.append(torch.tensor([special.index("<bos>")] + y + [special.index("<eos>")]).long())
        pad_index = special.index("<pad>")
        pad_x = [ftn.pad(item, [0, self.max_len - item.shape[0]], value=pad_index).detach() for item in batch_x]
        pad_y = [ftn.pad(item, [0, self.max_len - item.shape[0]], value=pad_index).detach() for item in batch_y]
        return torch.stack(pad_x), torch.stack(pad_y), len(batch)


max_seq_length = 20
chat_dataset = LoadDataset(query_tokens, answer_tokens)
chat_data_loader = DataLoader(chat_dataset, batch_size=32, collate_fn=MaxPadBatch(max_seq_length))

In [33]:
class Encoder(nn.Module):

    def __init__(self, input_size, embedding_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(self.embedding_size, self.hidden_size, batch_first=True)

    def forward(self, x, embedding):
        # x: [batch, seq_length]
        x = embedding(x)
        x, hidden = self.rnn(x)
        return x, hidden

In [34]:
class Decoder(nn.Module):

    def __init__(self, output_size, embedding_size, hidden_size):
        super(Decoder, self).__init__()
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(self.embedding_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, hidden, embedding):
        # x: [batch] --> need second dimension as 1
        # hidden: [encoder_layers = 1, batch, hidden_dim]
        x = x.unsqueeze(1)
        x = embedding(x)
        x, hidden = self.rnn(x, hidden)
        x = self.fc(x.squeeze(1))
        return x, hidden

In [35]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.embedding = nn.Embedding(self.encoder.input_size, self.encoder.embedding_size)
        self.target_vocab_size = self.decoder.output_size

    def forward(self, source, target, teacher_forcing=0.5):
        # source: [batch, seq_length]
        # target: [batch, seq_length]
        batch_size = target.shape[0]
        target_seq_length = target.shape[1]

        _, hidden = self.encoder(source, self.embedding)
        decoder_input = torch.tensor([special.index("<bos>")] * batch_size).long()

        decoder_outputs = torch.zeros(batch_size, target_seq_length, self.target_vocab_size)
        for t in range(1, target_seq_length):
            decoder_output, hidden = self.decoder(decoder_input, hidden, self.embedding)
            decoder_outputs[:, t, :] = decoder_output
            teacher = target[:, t]
            top1 = decoder_output.argmax(1)
            decoder_input = teacher if random.random() < teacher_forcing else top1
        return decoder_outputs

In [36]:
embedding_dim = 128
hidden_dim = 128
enc = Encoder(tokenizer.get_vocab_size(), embedding_dim, hidden_dim)
dec = Decoder(tokenizer.get_vocab_size(), embedding_dim, hidden_dim)
seq2seq = Seq2Seq(enc, dec)

decode_test = torch.tensor([[special.index("<bos>")] + [special.index("<pad>")] * (max_seq_length - 1)]).long()

if train_seq2seq:
    learning_rate = 3e-4
    optimizer = torch.optim.Adam(seq2seq.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss(ignore_index=special.index("<pad>"))

    for epoch in range(300):
        seq2seq.train()
        epoch_loss = 0.0
        for batch_source, batch_target, batch_length in chat_data_loader:
            optimizer.zero_grad()
            seq2seq_output = seq2seq(batch_source, batch_target)

            seq2seq_output_dim = seq2seq_output.shape[-1]
            seq2seq_output_drop = seq2seq_output[:, 1:, :].reshape(-1, seq2seq_output_dim)
            batch_target_drop = batch_target[:, 1:].reshape(-1)
            loss = criterion(seq2seq_output_drop, batch_target_drop)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() / batch_length

        if epoch % 10 == 0:
            print(f"{epoch} epoch loss: {epoch_loss:.4f} / ppl: {math.exp(epoch_loss):.4f}")
            seq2seq.eval()
            test = "썸 타는 것도 귀찮아."
            test_token = tokenizer.encode(test)
            test_tensor = torch.tensor(test_token.ids).long().unsqueeze(0)
            test_output = seq2seq(test_tensor, decode_test, 0.0)[:, 1:, :].squeeze(0).argmax(1).detach().tolist()
            recover_test_output = tokenizer.decode(test_output)
            print(recover_test_output.split("<eos>")[0])
            test = "죽을거 같네"
            test_token = tokenizer.encode(test)
            test_tensor = torch.tensor(test_token.ids).long().unsqueeze(0)
            test_output = seq2seq(test_tensor, decode_test, 0.0)[:, 1:, :].squeeze(0).argmax(1).detach().tolist()
            recover_test_output = tokenizer.decode(test_output)
            print(recover_test_output.split("<eos>")[0])
            test = "한심해서 죽고싶다"
            test_token = tokenizer.encode(test)
            test_tensor = torch.tensor(test_token.ids).long().unsqueeze(0)
            test_output = seq2seq(test_tensor, decode_test, 0.0)[:, 1:, :].squeeze(0).argmax(1).detach().tolist()
            recover_test_output = tokenizer.decode(test_output)
            print(recover_test_output.split("<eos>")[0])

    torch.save(seq2seq.state_dict(), "Basic-NLP/checkpoint/seq2seq.pt")

0 epoch loss: 0.9156 / ppl: 2.4982
알게 리세요 짝남 솔직아무 디까지 네요 지났군요 크게 나만 른집얻은 당보고 말아궁금밖났습니다
일이야 싶은 지워버리자 폭풍 톡 겠죠 톡 웠질투그런 찾찾섬섬드해보세요 싶은 사랑중이 켜
알게 리세요 어디 식 어디 식 어디 식 하기 갈 생인데 란게 나 요긴하었으면 스럽게 지났군요 크게 내가
10 epoch loss: 0.7471 / ppl: 2.1108
. 
. 
저도요 . 
20 epoch loss: 0.6208 / ppl: 1.8604
. 
. 
. 
30 epoch loss: 0.5979 / ppl: 1.8184
. 
잘 . 
잘 . 
40 epoch loss: 0.5772 / ppl: 1.7811
잘 . 
잘 . 
잘 . 
50 epoch loss: 0.5682 / ppl: 1.7650
잘 . 
잘 . 
잘 . 
60 epoch loss: 0.5517 / ppl: 1.7363
잘 . 
잘 . 
잘 한 . 
70 epoch loss: 0.5427 / ppl: 1.7206
좋은 . 
잘 한 . 
잘 한 . 
80 epoch loss: 0.5247 / ppl: 1.6899
좋은 . 
잘 한 . 
얼굴이 . 
90 epoch loss: 0.5054 / ppl: 1.6576
좋은 . 
잘 한 . 
누군가를 기다핑계한 한 상황. 
100 epoch loss: 0.4980 / ppl: 1.6455
좋은 . 
잘 한 . 
맘고생 많았
110 epoch loss: 0.4742 / ppl: 1.6067
좋은 . 
잘 한 한 . 
맘고생 많았
120 epoch loss: 0.4556 / ppl: 1.5771
좋은 . 
잘 다녀. 
맘고생 많았
130 epoch loss: 0.4269 / ppl: 1.5325
좋은 을 . 
나쁜 생각 . 
맘고생 많았어요 . 
140 epoch loss: 0.4019 / ppl: 1.4946
좋은 을 해보세요 . 
나쁜 생각 하지 마세요 . 
맘고생 많았어요 . 
150 epoch lo