# Library


In [9]:
! pip install sentencepiece einops wandb torch-summary icecream -qq


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from pprint import pprint
# from konlpy.tag import Mecab
from nltk.tokenize import word_tokenize as en_tokenizer
import sentencepiece as spm
import urllib.request
import csv
import numpy as np
from einops import rearrange, reduce, repeat
from torch.cuda import amp
from tqdm import tqdm
import wandb
import time
import copy
from collections import defaultdict
from sklearn.metrics import mean_squared_error
import joblib
import gc
import os
from icecream import ic
from sklearn.model_selection import train_test_split
import os


In [11]:
# --- Data / tokenizer settings ---
TRAIN_PATH = './train'
VOCAB_SIZE, SEQ_LEN = 10000, 60

# Special token ids (PAD is masked out in attention, EOS stops decoding).
PAD_IDX, BOS_IDX, EOS_IDX = 0, 2, 3

# Runtime environment: one of 'COLAB', 'KAGGLE' or 'SYSTEM'.
ENV = 'KAGGLE'

# Option for Mixed Precision (set to True to enable).
FP16 = False

# --- Model hyper-parameters ---
# N is the number of stacked encoder/decoder layers.
N, HIDDEN_DIM, NUM_HEAD, INNER_DIM = 3, 128, 8, 256
BATCH_SIZE = 32

CONFIG = dict(
    VOCAB_SIZE=VOCAB_SIZE,
    SEQ_LEN=SEQ_LEN,
    N=N,
    HIDDEN_DIM=HIDDEN_DIM,
    NUM_HEAD=NUM_HEAD,
    INNER_DIM=INNER_DIM,
    BATCH_SIZE=BATCH_SIZE,
)

# Keep a device that was already chosen (e.g. when the cell is re-run);
# otherwise prefer CUDA when it is available.
try:
    device
except NameError:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device}')


In [16]:
# Start a Weights & Biases run and declare the tokenizer/dataset artifact as
# an input of that run.
import wandb
run = wandb.init()
dataset = wandb.run.use_artifact(
    'jiwon7258/Transformer_bible/bible-dataset_10000_60:latest', type='dataset')
# Download the artifact's contents; artifact_dir is used later to locate the
# SentencePiece model files (src.model / trg.model).
artifact_dir = dataset.download()


In [13]:
# Google Drive is only mounted when the notebook runs on Colab.
if ENV == 'COLAB':
    from google.colab import drive
    drive.mount('/content/drive')


  # Load

In [18]:
# Directory holding the parallel corpus for each supported environment;
# any other ENV value falls back to the current directory.
DATASET_PATH = {
    'KAGGLE': '../input/enko-bible',
    'COLAB': '/content/drive/MyDrive/notebooks/transformer_bible/',
}.get(ENV, './')

   # 영어 데이터 로드

In [19]:
# Read the whole English corpus into memory. The context manager closes the
# file handle once the text is read, and the explicit encoding keeps the
# result independent of the platform's default locale.
with open(os.path.join(DATASET_PATH, 'bible-all.en.txt'), encoding='utf-8') as en_train:
    en_train_content = en_train.read()


In [20]:
# One list entry per line of the English corpus. If the text ends with a
# newline, the last entry is an empty string.
en_train_list = en_train_content.split('\n')


   # 한국어 데이터 로드

In [21]:
# Read the whole Korean corpus into memory. The context manager closes the
# file handle once the text is read; the explicit encoding matters here
# because the text is non-ASCII and the platform default may not be UTF-8.
with open(os.path.join(DATASET_PATH, 'bible-all.kr.txt'), encoding='utf-8') as ko_train:
    ko_train_content = ko_train.read()

In [22]:
# One list entry per line of the Korean corpus. If the text ends with a
# newline, the last entry is an empty string.
ko_train_list = ko_train_content.split('\n')

In [23]:
en_train_list[:10]

   # DATA 데이터프레임

In [24]:
# Pair every raw English line with the Korean line at the same position.
# Both lists must have the same length.
data = pd.DataFrame({'en_raw': en_train_list, 'ko_raw': ko_train_list})

In [25]:
data.head()

In [26]:
len(data)


In [27]:
# Renumber the rows 0..len(data)-1 and discard the previous index
# (drop=True keeps it from being added back as a column).
data = data.reset_index(drop = True)
data.head()

In [28]:
# Drop everything up to and including the first space of each raw line
# (presumably the verse reference - confirm against the corpus) and keep the
# remaining text unchanged.
data['en'] = data['en_raw'].apply(lambda line: ' '.join(line.split(' ')[1:]))
data['ko'] = data['ko_raw'].apply(lambda line: ' '.join(line.split(' ')[1:]))


In [29]:
# Keep only the cleaned sentence columns; the raw columns are no longer needed.
data = data[['en','ko']]
data.head()

# Transformer


## Mask Function


In [30]:
def makeMask(tensor, option: str) -> torch.Tensor:
    '''
    Build an attention mask from a batch of token ids.

    Masking is defined along the key axis (srcK seq_len) of the Q K^T score
    matrix: a 0 in the mask means "this key position must not be attended to".

    Input
    - tensor
        shape (bs, srcK seq_len); positions equal to PAD_IDX are padding

    Args
    - option
        'padding'   : hide padded key positions
        'lookahead' : causal (lower-triangular) mask combined with the
                      padding mask; assumes srcQ and srcK have the same
                      seq_len

    Output
    - float Tensor on the same device as `tensor` (1. = visible, 0. = masked)
        option = 'padding'   : shape (bs, 1, 1, seq_len)
        option = 'lookahead' : shape (bs, 1, seq_len, seq_len)
      The singleton dimensions exist for broadcasting over heads / queries.

    Raises
    - ValueError if option is neither 'padding' nor 'lookahead'

    Example for ids [[5, 6, 7, 0]]
        padding   -> [[[[1., 1., 1., 0.]]]]
        lookahead -> [[[[1., 0., 0., 0.],
                        [1., 1., 0., 0.],
                        [1., 1., 1., 0.],
                        [1., 1., 1., 0.]]]]
    '''
    if option not in ('padding', 'lookahead'):
        # Fail loudly instead of hitting an unbound local at the return.
        raise ValueError(
            f"option must be 'padding' or 'lookahead', got {option!r}")

    # Comparing against the scalar keeps the result on the tensor's own
    # device, so no explicit device transfer is needed.
    padding_mask = (tensor != PAD_IDX).float()
    # padding_mask : (bs, seq_len)
    padding_mask = rearrange(padding_mask, 'bs seq_len -> bs 1 1 seq_len')
    # padding_mask : (bs, 1, 1, seq_len)

    if option == 'padding':
        return padding_mask

    # Lookahead: repeat the key-padding row once per query position ...
    padding_mask = repeat(
        padding_mask, 'bs 1 1 k_len -> bs 1 new k_len', new=padding_mask.shape[3])
    # padding_mask : (bs, 1, seq_len, seq_len)

    # ... and intersect it with a lower-triangular matrix so that position i
    # only sees key positions <= i that are not padding.
    causal_mask = torch.tril(torch.ones_like(padding_mask))
    return causal_mask * padding_mask


In [31]:
# test = torch.Tensor([[1,2,3,4,5,6,0,0,0,0]])
# ic(test.shape)
# test1 = makeMask(test, option = 'padding')
# test2 = makeMask(test, option = 'lookahead')
# ic(test1.shape)
# ic(test2.shape)


## Multihead Attention


In [32]:
class Multiheadattention(nn.Module):
    '''
    Multi-head scaled dot-product attention.

    Args
    - hidden_dim : model width (embedding_dim / d_model, 512 in the paper)
    - num_head   : number of attention heads (8 in the paper); must divide
                   hidden_dim

    Raises
    - ValueError if hidden_dim is not divisible by num_head

    NOTE: earlier revisions computed `scale` from an empty tensor and never
    applied it, so scores were unscaled. Weights trained with that forward
    pass presumably need re-training to match the corrected computation.
    '''

    def __init__(self, hidden_dim: int, num_head: int):
        super().__init__()

        if hidden_dim % num_head != 0:
            raise ValueError(
                f'hidden_dim ({hidden_dim}) must be divisible by num_head ({num_head})')

        # embedding_dim, d_model, 512 in paper
        self.hidden_dim = hidden_dim
        # 8 in paper
        self.num_head = num_head
        # head_dim, d_key, d_query, d_value, 64 in paper (= 512 / 8)
        self.head_dim = hidden_dim // num_head
        # sqrt(d_k); a plain float, so it needs no device placement
        self.scale = float(self.head_dim) ** 0.5

        self.fcQ = nn.Linear(hidden_dim, hidden_dim)
        self.fcK = nn.Linear(hidden_dim, hidden_dim)
        self.fcV = nn.Linear(hidden_dim, hidden_dim)
        self.fcOut = nn.Linear(hidden_dim, hidden_dim)

        self.dropout = nn.Dropout(0.1)

    def _split_heads(self, x):
        '''(bs, seq_len, hidden_dim) -> (bs, num_head, seq_len, head_dim)'''
        bs, seq_len, _ = x.shape
        return x.reshape(bs, seq_len, self.num_head, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, srcQ, srcK, srcV, mask=None):
        '''
        srcQ : (bs, q_len, hidden_dim)
        srcK, srcV : (bs, k_len, hidden_dim)
        mask : optional tensor broadcastable to (bs, num_head, q_len, k_len);
               entries equal to 0 mark key positions that must be ignored
               padding mask   : (bs, 1, 1, k_len)
               lookahead mask : (bs, 1, q_len, k_len)

        returns (bs, q_len, hidden_dim)
        '''
        ##### SCALED DOT PRODUCT ATTENTION ######
        Q = self._split_heads(self.fcQ(srcQ))
        K = self._split_heads(self.fcK(srcK))
        V = self._split_heads(self.fcV(srcV))

        # attention_energy : (bs, num_head, q_len, k_len)
        attention_energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            # A large negative score makes the softmax weight of a masked key
            # vanish; -1e4 (rather than -inf) stays representable in fp16.
            attention_energy = attention_energy.masked_fill(mask == 0, -1e+4)

        attention = torch.softmax(attention_energy, dim=-1)

        result = torch.matmul(self.dropout(attention), V)
        # result : (bs, num_head, q_len, head_dim)
        ##### END OF SCALED DOT PRODUCT ATTENTION ######

        # CONCAT the heads back into one vector per position
        bs, _, q_len, _ = result.shape
        result = result.permute(0, 2, 1, 3).reshape(bs, q_len, self.hidden_dim)
        # result : (bs, q_len, hidden_dim)

        # LINEAR
        return self.fcOut(result)


In [33]:
# # TEST CODE #
# bs = 1
# seq_len = 10
# hidden_dim = 256
# # src = torch.Tensor([[   2, 1568,  955,  612,  221,   64,   20, 8905,  928, 8768,  167, 8841,
# #          3834,    9, 1687,   41, 7661,  562, 9073, 5204, 8794, 8931,   14, 8823,
# #          5616, 1289, 8793, 2477,  438,   27, 8783,   14, 8905,  534,  235,  204,
# #          9037, 8745, 9040, 6942,   47, 8738,    3,    0,    0,    0,    0,    0,
# #             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])
# src = torch.Tensor([[1,2,3,4,5,6,0,0,0,0]])

# padding_mask = makeMask(src, option = 'padding')
# ic(padding_mask.shape)
# lookahead_mask = makeMask(src, option = 'lookahead')
# ic(lookahead_mask.shape)


# test_Q = torch.randn((bs,2,hidden_dim))
# test_K = torch.randn((bs, 10, hidden_dim))
# ic(test_Q.shape)
# test_layer = Multiheadattention(hidden_dim=hidden_dim, num_head =8)
# ic(test_layer(srcQ = test_Q, srcK = test_K, srcV = test_K, mask = padding_mask).shape)


## Positionwise Feedforward Network


In [34]:
class FFN(nn.Module):
    '''
    Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args
    - hidden_dim : input/output width (512 in the paper)
    - inner_dim  : width of the hidden layer (2048 in the paper)

    NOTE: earlier revisions applied dropout to the pre-activation tensor and
    thereby discarded the ReLU result, making the block purely linear.
    Weights trained with that forward pass presumably need re-training.
    '''

    def __init__(self, hidden_dim, inner_dim):
        super().__init__()

        # 512 in paper
        self.hidden_dim = hidden_dim
        # 2048 in paper
        self.inner_dim = inner_dim

        self.fc1 = nn.Linear(hidden_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=False)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input):
        '''
        input : (..., hidden_dim)
        returns a tensor of the same shape
        '''
        hidden = self.relu(self.fc1(input))
        # Dropout acts on the activated features, not on the raw fc1 output.
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


## Encoder Layer


In [35]:
class EncoderLayer(nn.Module):
    '''
    One encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalisation.
    '''

    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()

        self.hidden_dim, self.num_head, self.inner_dim = hidden_dim, num_head, inner_dim

        # Sub-modules are created in a fixed order so that parameter names and
        # random initialisation stay reproducible.
        self.multiheadattention = Multiheadattention(hidden_dim, num_head)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)

        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)

    def forward(self, input, mask=None):
        '''
        input : (bs, seq_len, hidden_dim)
        mask  : padding mask passed to the self-attention (may be None)

        returns (bs, seq_len, hidden_dim)
        '''
        # Self-attention sub-layer: queries, keys and values all come from
        # the layer input.
        attended = self.multiheadattention(
            srcQ=input, srcK=input, srcV=input, mask=mask)
        normed = self.layerNorm1(input + self.dropout1(attended))

        # Feed-forward sub-layer.
        transformed = self.ffn(normed)
        return self.layerNorm2(normed + self.dropout2(transformed))


## Encoder Architecture


In [36]:
class Encoder(nn.Module):
    '''
    Transformer encoder: token embedding plus learned positional embedding,
    followed by N encoder layers.

    Args
    - N          : number of encoder layers repeated
    - hidden_dim : model width
    - num_head   : attention heads per layer
    - inner_dim  : hidden width of each feed-forward network
    - max_length : size of the positional embedding table, i.e. the longest
                   sequence the encoder accepts
    '''

    def __init__(self, N, hidden_dim, num_head, inner_dim, max_length=100):
        super().__init__()

        # N : number of encoder layer repeated
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(
            num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.enc_layers = nn.ModuleList(
            [EncoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input):
        '''
        input : (bs, seq_len) token ids

        returns (bs, seq_len, hidden_dim)

        Raises ValueError if seq_len exceeds the positional embedding table.
        '''
        seq_len = input.shape[1]
        max_length = self.pos_embedding.num_embeddings
        if seq_len > max_length:
            # Explicit check: an out-of-range embedding lookup otherwise
            # surfaces as an opaque index error.
            raise ValueError(
                f'sequence length {seq_len} exceeds max_length {max_length}')

        mask = makeMask(input, option='padding')

        # Positions live on the same device as the ids; the leading dimension
        # of size 1 broadcasts over the batch when added to the embeddings.
        pos = torch.arange(seq_len, device=input.device).unsqueeze(0)
        # pos : (1, seq_len)

        # Embedding layer. Dropout is applied exactly once here (it was
        # previously applied twice in a row, raising the effective drop rate
        # during training).
        output = self.dropout(self.embedding(input) + self.pos_embedding(pos))
        # output : (bs, seq_len, hidden_dim)

        # N encoder layers
        for layer in self.enc_layers:
            output = layer(output, mask)

        # output : (bs, seq_len, hidden_dim)
        return output


## Decoder Layer


In [37]:
class DecoderLayer(nn.Module):
    '''
    One decoder block: masked self-attention, attention over the encoder
    output and a feed-forward network, each followed by dropout, a residual
    connection and layer normalisation.
    '''

    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()
        self.hidden_dim, self.num_head, self.inner_dim = hidden_dim, num_head, inner_dim

        # Sub-modules are created in a fixed order so that parameter names and
        # random initialisation stay reproducible.
        self.multiheadattention1 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.multiheadattention2 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm3 = nn.LayerNorm(hidden_dim)

        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)
        self.dropout3 = nn.Dropout(p=0.1)

    def forward(self, input, enc_output, paddingMask, lookaheadMask):
        '''
        input         : (bs, seq_len, hidden_dim) decoder-side features
        enc_output    : (bs, seq_len, hidden_dim) encoder output
        paddingMask   : mask used when attending over enc_output
        lookaheadMask : mask used by the decoder self-attention

        returns a tensor shaped like `input`
        '''
        # 1) self-attention over the decoder input
        self_attended = self.multiheadattention1(input, input, input, lookaheadMask)
        state = self.layerNorm1(self.dropout1(self_attended) + input)

        # 2) attention with decoder queries and encoder keys/values
        cross_attended = self.multiheadattention2(
            state, enc_output, enc_output, paddingMask)
        state = self.layerNorm2(self.dropout2(cross_attended) + state)

        # 3) feed-forward network
        transformed = self.ffn(state)
        return self.layerNorm3(state + self.dropout3(transformed))


## Decoder Architecture


In [38]:
class Decoder(nn.Module):
    '''
    Transformer decoder: token embedding, N decoder layers and a final
    projection onto the vocabulary.

    Args
    - N          : number of decoder layers repeated
    - hidden_dim : model width
    - num_head   : attention heads per layer
    - inner_dim  : hidden width of each feed-forward network
    - max_length : size of the positional embedding table
    '''

    def __init__(self, N, hidden_dim, num_head, inner_dim, max_length=100):
        super().__init__()

        # N : number of decoder layer repeated
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(
            num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        # NOTE(review): this table is registered (so its weights are part of
        # the state dict) but forward() never uses it, unlike Encoder.
        # Confirm whether the decoder is meant to run without positional
        # embeddings before changing this.
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)

        self.dec_layers = nn.ModuleList(
            [DecoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)

        self.finalFc = nn.Linear(hidden_dim, VOCAB_SIZE)

    def forward(self, input, enc_src, enc_output):
        '''
        input (= dec_src) : (bs, seq_len) target-side token ids
        enc_src           : (bs, seq_len) source-side token ids (used only to
                            build the padding mask)
        enc_output        : (bs, seq_len, hidden_dim) encoder output

        returns (logits, output)
        - logits : (bs, seq_len, VOCAB_SIZE)
        - output : (bs, seq_len) int64 ids of the highest-scoring token
        '''
        lookaheadMask = makeMask(input, option='lookahead')
        paddingMask = makeMask(enc_src, option='padding')

        # embedding layer + dropout
        output = self.dropout(self.embedding(input))
        # output : (bs, seq_len, hidden_dim)

        # N decoder layers
        for layer in self.dec_layers:
            output = layer(output, enc_output, paddingMask, lookaheadMask)
        # output : (bs, seq_len, hidden_dim)

        logits = self.finalFc(output)
        # logits : (bs, seq_len, VOCAB_SIZE)

        # softmax is monotonic, so the arg-max of the logits is the arg-max
        # of the probabilities; skipping it avoids a pass over the whole
        # vocabulary and rounding-induced ties.
        output = torch.argmax(logits, dim=-1)
        # output : (bs, seq_len), dtype=int64

        return logits, output


# Transformer


In [39]:
class Transformer(nn.Module):
    '''
    Encoder-decoder model; the encoder and the decoder are built with the
    same hyper-parameters.
    '''

    def __init__(self, N=2, hidden_dim=256, num_head=8, inner_dim=512):
        super().__init__()
        shared_args = (N, hidden_dim, num_head, inner_dim)
        self.encoder = Encoder(*shared_args)
        self.decoder = Decoder(*shared_args)

    def forward(self, enc_src, dec_src):
        '''
        enc_src : (bs, seq_len) source token ids
        dec_src : (bs, seq_len) target token ids

        returns the decoder's (logits, output) pair, where logits is
        (bs, seq_len, VOCAB_SIZE)
        '''
        # memory : (bs, seq_len, hidden_dim)
        memory = self.encoder(enc_src)
        return self.decoder(dec_src, enc_src, memory)


# Inference


## Load Model


In [42]:
# Download Saved Model Weight
# Name of the checkpoint file, the local directory it is restored into, and
# the W&B run it was saved by.
WEIGHT_FILE = 'final.bin'
WEIGHT_PATH = './weight'
WEIGHT_RUN_PATH = 'jiwon7258/Transformer_bible/3rzi4gl9'

# Fetch the checkpoint from the W&B run; the model cell reads it from
# os.path.join(WEIGHT_PATH, WEIGHT_FILE).
wandb.restore(WEIGHT_FILE, run_path=WEIGHT_RUN_PATH, root=WEIGHT_PATH)


# Paths of the trained SentencePiece models (source / target language)
# inside the downloaded dataset artifact
SRC_MODEL_FILE = os.path.join(artifact_dir,'src.model')
TRG_MODEL_FILE = os.path.join(artifact_dir,'trg.model')


In [43]:
# Source-side tokenizer (used by predict() to encode the input sentence).
sp_src = spm.SentencePieceProcessor()
sp_src.Load(SRC_MODEL_FILE)
# Target-side tokenizer (used by predict() for the BOS id and for decoding).
sp_trg = spm.SentencePieceProcessor()
sp_trg.Load(TRG_MODEL_FILE)


In [47]:
%%capture output
# Build the network with the hyper-parameters from the config cell and move
# it to the selected device.
model = Transformer(N, HIDDEN_DIM, NUM_HEAD, INNER_DIM).to(device)
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a trusted source.
model.load_state_dict(torch.load(os.path.join(WEIGHT_PATH, WEIGHT_FILE), map_location=device))
# Switch to evaluation mode (disables the dropout layers) for inference.
model.eval()


In [48]:
def predict(src_sentence, max_len=100):
    '''
    Greedily translate one sentence with the global `model`.

    Args
    - src_sentence : source-language text, encoded with `sp_src`
    - max_len      : upper bound on the number of generated tokens, so that
                     decoding terminates even if EOS is never produced

    Returns
    - the result of `sp_trg.Decode` on the generated id sequence (including
      the leading BOS id), passed as a batch of one sequence
    '''
    enc_ids = sp_src.EncodeAsIds(src_sentence)

    # Build both id tensors directly on the model's device.
    enc_src = torch.tensor(enc_ids, dtype=torch.int, device=device).view(1, -1)
    dec_src = torch.tensor([[sp_trg.bos_id()]], dtype=torch.int, device=device)
    # enc_src : (1, seq_len)
    # dec_src : (1, 1) -> grows by one token per step

    with torch.no_grad():
        # The encoder output does not depend on the decoder input, so it is
        # computed once instead of on every decoding step.
        enc_output = model.encoder(enc_src)
        # enc_output : (1, seq_len, hidden_dim)

        for _ in range(max_len):
            _, dec_output = model.decoder(
                input=dec_src, enc_src=enc_src, enc_output=enc_output
            )
            # dec_output : (1, current_len)

            # The prediction at the last position is the next token. Slicing
            # keeps it on the same device as dec_src for the concatenation.
            next_token = dec_output[:, -1:].to(dec_src.dtype)
            # next_token : (1, 1)
            dec_src = torch.cat((dec_src, next_token), dim=-1)

            # Value comparison (not identity) against the EOS id.
            if next_token.item() == EOS_IDX:
                break

    return sp_trg.Decode(dec_src.tolist())


In [None]:
# Prepare 10 Sample Sentence
# Draw 10 distinct random rows and print, for each one, the English source,
# the reference Korean text and the model's translation.
indices = np.random.choice(len(data['en']), 10, replace=False)
sentences = data['en'][indices].to_list()
answers = data['ko'][indices].to_list()

for sentence, answer in zip(sentences, answers):
    print(f'en = {sentence}')
    print(f'answer = {answer}')
    print(f'ko = {predict(sentence)}')
