In [1]:
from pathlib import Path

IMAGES_PATH = Path() / "images" / "nlp"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# 데이터만 keras에서 받아오기
import tensorflow as tf
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text()

In [3]:
text[:100]

'Go.\tVe.\nGo.\tVete.\nGo.\tVaya.\nGo.\tVáyase.\nHi.\tHola.\nRun!\t¡Corre!\nRun.\tCorred.\nWho?\t¿Quién?\nFire!\t¡Fueg'

In [4]:
import numpy as np

text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)  # extra code – ensures reproducibility on CPU
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)  # separates the pairs into 2 lists

In [5]:
for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

How boring! => Qué aburrimiento!
I love sports. => Adoro el deporte.
Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?


In [6]:
# 글자모음
print('영어문장 : ', ''.join(sorted(''.join(set(''.join(sentences_en))))))
print('스페인어문장 : ', ''.join(sorted(''.join(set(''.join(sentences_es))))))


영어문장 :   !"$%'+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz°áãèéêóöüč‘’₂€
스페인어문장 :   !"$%&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ¨ª«°º»ÁÉÍÓÚáåèéêíñóöúüčśс​—₂€


In [7]:
# puctuation을 제거 후 진행
import string
def delete_puctuation(text, punc=string.punctuation):
    return text.translate(str.maketrans("", "", punc))

In [8]:
punc_en = string.punctuation+'°áãèéêóöüč‘’₂€'
punc_es = string.punctuation+'¨ª«°º»åèêöüč​—₂€'

word_list_en = [delete_puctuation(doc,punc_en).strip().split() for doc in sentences_en]
word_list_es = [delete_puctuation(doc,punc_es).strip().split() for doc in sentences_es]

In [9]:
word_list_es_input = [['<SOS>'] + doc for doc in word_list_es]
word_list_es_output = [doc +['<EOS>'] for doc in word_list_es]

In [10]:
print(''.join(set(''.join(np.concatenate(word_list_en)))))
print(''.join(set(''.join(np.concatenate(word_list_es)))))

Fy2UA9mkYHt7IMuRsC6fjagTlw0QhroOibzG14X8WeqPBnDLSxpdKENV3vcJZ5
Fy2UA9mkYHt7IMuRsC6Ófja5gTlwÉ0QíhéoribzóOG41X8ÍÚqeWPáBnDLSxdpKENV3ZсcvJÁśúñ


In [11]:
# 모든 단어에 인덱스 부여 (??)
en_dict = dict(zip(np.unique(np.concatenate(word_list_en)), range(1, 1+len(np.unique(np.concatenate(word_list_en))))))
es_dict = dict(zip(np.unique(np.concatenate(word_list_es+[['<SOS>','<EOS>']])), range(1, 3+len(np.unique(np.concatenate(word_list_es))))))

In [12]:
en_mapping = lambda x: [en_dict[word] for word in x]
es_mapping = lambda x: [es_dict[word] for word in x]

print(word_list_en[:5])
print([en_mapping(x) for x in word_list_en[:5]])

[['How', 'boring'], ['I', 'love', 'sports'], ['Would', 'you', 'like', 'to', 'swap', 'jobs'], ['My', 'mother', 'did', 'nothing', 'but', 'weep'], ['Croatia', 'is', 'in', 'the', 'southeastern', 'part', 'of', 'Europe']]
[[1478, 4423], [1500, 9485, 13200], [3165, 15243, 9352, 14108, 13686, 9007], [1997, 9989, 6204, 10266, 4663, 14945], [861, 8954, 8650, 13949, 13097, 10672, 10358, 1097]]


In [13]:
def pad_sequence(word_list, max_len, lang='en'):
    try:
        if lang == 'en':
            x = np.asarray(en_mapping(word_list))
        elif lang == 'es':
            x = np.asarray(es_mapping(word_list))
    except:
        print(word_list)
    if len(x) < max_len:
        x = np.concatenate([np.zeros(max_len - len(x)), x])
    return x[len(x) - max_len:]

In [14]:
max_length = 50
vec_list_en = np.array(list(map(lambda x:pad_sequence(x,max_length,'en'),word_list_en)))
vec_list_es_input = np.array(list(map(lambda x:pad_sequence(x,max_length,'es'),word_list_es_input)))
vec_list_es_output = np.array(list(map(lambda x:pad_sequence(x,max_length,'es'),word_list_es_output)))


In [15]:
# decoder의 입력은 <start>로 시작
# decoder의 출력은 <end>로 끝
X_train_enc = vec_list_en[:10000]
X_test_enc = vec_list_en[10000:]
X_train_dec = vec_list_es_input[:10000]
X_test_dec = vec_list_es_input[10000:]
y_train_dec = vec_list_es_output[:10000]
y_test_dec = vec_list_es_output[10000:]

# Encoder

In [94]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

In [95]:
# sine/cosine positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, max_len, embed_size, dtype=torch.float32, **kwargs):
        super().__init__(**kwargs)
        p,i = torch.meshgrid(torch.arange(max_len), 2*torch.arange(embed_size//2))
        self.pos_emb = torch.zeros(1, max_len, embed_size)
        self.pos_emb[:, :, 0::2] = torch.sin(p / 10_000 ** (i / embed_size))
        self.pos_emb[:, :, 1::2] = torch.cos(p / 10_000 ** (i / embed_size))
        self.pos_emb = self.pos_emb.type(dtype)

    def forward(self, x):
        _, batch_max_length = x.size()
        return self.pos_emb[:,:batch_max_length, :]
        
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = PositionalEncoding(maxlen, embed_dim)
        # 논문에서처럼 고정된 positional embedding을 사용하지 않고 학습 가능한 embedding을 사용할 경우
        # self.pos_emb = nn.Embedding(maxlen, embed_dim)
        
    def forward(self, x):
        maxlen = x.shape[-1]
        positions = torch.arange(start=0, end=maxlen, dtype=torch.long)
        # 다음 부분이 없으면 position은 model.to(device) 해도 gpu로 넘어가지 않음
        positions = positions.to(x.device)
        positions = positions.unsqueeze(0).expand(x.shape)
        x = self.token_emb(x) + self.pos_emb(positions).to(x.device)
        return x


In [96]:
class EncoderBlock(nn.Module):
    def __init__(self, embed_dim, heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    def forward(self, x):
        # MultiheadAttention의 forward : (query, key, value) -> (output, output_weights)
        attn_output = self.attention(x, x, x)[0]
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(x + attn_output)
        ff_output = self.ff(out1)
        ff_output = self.dropout2(ff_output)
        out2 = self.norm2(out1 + ff_output)
        return out2

In [97]:
# transformer block의 stack = 1
class Encoder(nn.Module):
    def __init__(self,maxlen, vocab_size, embed_dim, num_heads, ff_dim, stack=6, hidden_dim=20, dropout=0.1):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen,vocab_size,embed_dim)
        self.transformer_block = EncoderBlock(embed_dim,num_heads,ff_dim)
        # self.first_linear = nn.Linear(embed_dim,hidden_dim)
        # self.last_linear = nn.Linear(hidden_dim,2)
        self.stack = stack
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        x = self.embedding_layer(x)
        for _ in range(self.stack):
            x = self.transformer_block(x)
        # x = torch.mean(x, dim=1)   # global average pooling
        # x = self.dropout(x)
        # x = self.first_linear(x)
        # x = F.relu(x)
        # x = self.dropout(x)
        # x = self.last_linear(x)
        return x

In [98]:
vocab_size = 1+len(np.unique(np.concatenate(word_list_en)))
encoder = Encoder(maxlen=50, vocab_size=vocab_size, stack=2, embed_dim=128, num_heads=8, ff_dim=512, hidden_dim=20, dropout=0.1)

In [99]:
encoded = encoder(torch.Tensor(X_train_enc[:15]).long())
encoded.shape

torch.Size([15, 50, 128])

# Decoder
## masking

In [147]:
def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [132]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention1 = nn.MultiheadAttention(embed_dim, heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attention2 = nn.MultiheadAttention(embed_dim, heads, dropout=dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
    def forward(self, x, encoder_outputs):
        # MultiheadAttention의 forward : (query, key, value) -> (output, output_weights)
        # if x.shape[1] == 50:
        #     global yyy
        #     yyy = x
        attn_output1 = self.attention1(x, x, x)[0]
        attn_output1 = self.dropout1(attn_output1)
        out1 = self.norm1(x + attn_output1)
        attn_output2 = self.attention2(out1, encoder_outputs, encoder_outputs)[0]
        attn_output2 = self.dropout2(attn_output2)
        out2 = self.norm2(out1 + attn_output2)
        ff_output = self.ff(out2)
        ff_output = self.dropout3(ff_output)
        out3 = self.norm3(out2 + ff_output)
        return out3


In [133]:
class Decoder(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim, stack=6, hidden_dim=20, dropout=0.1):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = DecoderBlock(embed_dim, num_heads, ff_dim)
        self.first_linear = nn.Linear(embed_dim, hidden_dim)
        self.last_linear = nn.Linear(hidden_dim, embed_dim)
        self.stack = stack
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_outputs):
        # create_mask(x)
        x = self.embedding_layer(x)
        for _ in range(self.stack):
            x = self.transformer_block(x, encoder_outputs)
        x = torch.mean(x, dim=1)      # global average pooling
        # x = self.dropout(x)
        # x = self.first_linear(x)
        # x = F.relu(x)
        # x = self.dropout(x)
        # x = self.last_linear(x)
        return x

In [134]:
# encoder(torch.Tensor(X_train_enc[:15]).long())
# 15,50,128
vocab_size = 1+len(np.unique(np.concatenate(word_list_es)))
decoder = Decoder(maxlen=50, vocab_size=vocab_size, stack=2, embed_dim=128, num_heads=8, ff_dim=512, hidden_dim=20, dropout=0.1)
xx = decoder(torch.Tensor(X_train_dec[:15]).long(), encoded)
xx.shape

torch.Size([15, 128])

# mask 더하는 것 해결하고 학습 코드 돌려보기

In [148]:
# 이걸 어떻게 더하지??
mask_yyy = generate_square_subsequent_mask(yyy.size(1))
yyy.shape, mask_yyy.shape

(torch.Size([15, 50, 128]), torch.Size([50, 50]))

In [149]:
# 더해주기
mask_yyy

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [114]:
xx = torch.Tensor(X_train_dec[:15]).long()
xx_mask = create_mask(xx)

In [115]:
xx.shape, xx_mask.shape

(torch.Size([15, 50]), torch.Size([50, 50]))

In [105]:
class Transformer(nn.Module):
    def __init__(self, maxlen, vocab_size_encoder, vocab_size_decoder, embed_dim, num_heads, ff_dim, stack=6, hidden_dim=20, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(maxlen, vocab_size_encoder, embed_dim, num_heads, ff_dim, stack, hidden_dim, dropout)
        self.decoder = Decoder(maxlen, vocab_size_decoder, embed_dim, num_heads, ff_dim, stack, hidden_dim, dropout)
        self.linear = nn.Linear(embed_dim, vocab_size_decoder)
    def forward(self, x_encode, x_decode, y):
        x_encode = self.encoder(x_encode)
        x_decode = self.decoder(x_decode, x_encode)
        x_decode = self.linear(x_decode)
        return x_decode


In [106]:
vocab_size_encode = 1+len(np.unique(np.concatenate(word_list_en)))
vocab_size_decode = 1+len(np.unique(np.concatenate(word_list_es)))

trsf = Transformer(maxlen=50, vocab_size_encoder=vocab_size_encode, vocab_size_decoder=vocab_size_decode, 
                embed_dim=128, num_heads=8, ff_dim=512, stack=2, hidden_dim=20, dropout=0.1)

In [109]:
a = trsf(torch.Tensor(X_train_enc[:15]).long(), torch.Tensor(X_train_dec[:15]).long(), torch.Tensor(y_train_dec[:15]).long())
a.shape

torch.Size([15, 29015])

torch.Size([15, 29015])