In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd

from transformers import AutoModelWithLMHead, AutoTokenizer
from transformer.preprocessors.tokenizer import MecabTokenizer, SpmTokenizer
from transformer.preprocessors.preprocessor import TransformerPreprocessor
from transformer.layers.attention import MultiheadAttention, PositionwiseFeedForward
from transformer.layers.embedding import TransformerEmbedding
from transformer.layers.transformer import EncoderLayer, DecoderLayer
from transformer.layers.utils import get_pad_mask, get_sub_mask
from transformer.models.transformer import Encoder, Decoder

ModuleNotFoundError: No module named 'transformer'

### Load Dataset

In [None]:
# JSON index that maps each dataset name to its on-disk locations (the cells
# below read its "pickle" and "dir" entries).  The original absolute path is
# kept as the default; set the FILE_PATH_JSON environment variable to point at
# a different copy without editing the notebook.
path = os.environ.get("FILE_PATH_JSON", "/Users/aibud_dev/_jupyter/file_path.json")
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

# # parallel: eng-kor
# dataset = None
# with open(file_path["korean-english-jhe"]["pickle"], "rb") as fp:
#     dataset = pickle.load(fp)

# # conversation: dailydialog dataset
# dataset = None
# with open(file_path["dailydialog"]["pickle"], "rb") as fp:
#     dataset = pickle.load(fp)

# conversation: empatheticdialogues dataset
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that were produced locally by this project.
with open(file_path["empatheticdialogues"]["pickle"], "rb") as fp:
    dataset = pickle.load(fp)

# Quick sanity check on the size of every split.
print("train: {train}\tvalid: {valid}\ttest: {test}".format(
    train=len(dataset["train"]),
    valid=len(dataset["valid"]),
    test=len(dataset["test"]),
))

# One DataFrame per split, built from the loaded records.
train_df = pd.DataFrame(dataset["train"])
valid_df = pd.DataFrame(dataset["valid"])
test_df = pd.DataFrame(dataset["test"])

### Model Configuration

In [None]:
# --- architecture hyperparameters -------------------------------------------
# Source and target sides share the same vocabulary size and sequence length.
src_vocab_size = 8000
tgt_vocab_size = src_vocab_size
src_timesteps = 128
tgt_timesteps = src_timesteps

num_heads = 8
d_model = 768
d_ff = 3072
dropout = 0.1
num_encoder_layer = 6

# --- per-layer details ------------------------------------------------------
pwff_activation = "gelu"
layer_bias = True
layer_norm_epsilon = 1e-5
layer_initialization = "normal"

# --- batching ---------------------------------------------------------------
batch_size = 32

### Load Preprocessor

In [None]:
# The spm model file name encodes the vocabulary size it was trained with.
spm_model_path = "./data/empdial_spm_model_v{vocab_size}".format(
    vocab_size=src_vocab_size,
)

# # train spm_model (kept for reference; run once to create the file above)
# # NOTE: the recipe originally passed `vocab_size=vocab_size`, a name that is
# # not assigned in the visible cells; `src_vocab_size` is used here instead.
# sentences = train_df["utterance"].unique().tolist()
# spm_tokenizer = SpmTokenizer(mlm_ratio=0.15, random_mask_ratio=0.1, skip_mask_ratio=0.1)
# spm_tokenizer.train_spm_model(sentences=sentences, vocab_size=src_vocab_size)
# spm_tokenizer.save_spm_model(path=spm_model_path, copy=False)
# spm_tokenizer.load_spm_model(path=spm_model_path)

# Source and target sides are both given the same spm model file.
prep = TransformerPreprocessor(
    src_spm_model_path=spm_model_path,
    tgt_spm_model_path=spm_model_path,
)

In [None]:
def _consecutive_utterance_pairs(df):
    """Pair every utterance with the one that follows it in the same conversation.

    Rows are grouped by the ``conv_id`` column.  Inside each group the
    ``utterance`` values are taken in row order and turned into
    ``(utterance[i], utterance[i + 1])`` pairs, so a conversation with ``n``
    rows contributes ``n - 1`` pairs and a single-row conversation
    contributes none.

    Args:
        df: DataFrame with at least the ``conv_id`` and ``utterance`` columns.

    Returns:
        Tuple ``(sources, targets)`` of two equally long lists, where
        ``targets[k]`` is the utterance that directly follows ``sources[k]``.
    """
    sources, targets = [], []
    for _, group in df.groupby("conv_id"):
        utterances = group["utterance"].tolist()
        # Slicing yields empty lists for 0- or 1-row groups, matching the
        # original `range(0, len - 1)` loop.
        sources.extend(utterances[:-1])
        targets.extend(utterances[1:])
    return sources, targets


# Same extraction for every split; previously three copy-pasted loops.
src_sentences = {}
tgt_sentences = {}
for split_name, split_df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
    src_sentences[split_name], tgt_sentences[split_name] = _consecutive_utterance_pairs(split_df)

### Stack Layers

In [None]:
import torch

# Random token ids in [0, vocab_size) used as stand-in batches for exercising
# the layers below.  The source batch is drawn before the target batch.
src_inputs = torch.from_numpy(
    np.random.randint(low=0, high=src_vocab_size, size=(batch_size, src_timesteps))
)
tgt_inputs = torch.from_numpy(
    np.random.randint(low=0, high=tgt_vocab_size, size=(batch_size, tgt_timesteps))
)

In [None]:
# Embed the source token ids; the layer returns the embedded batch together
# with a second value that is kept here as `src_token_embed_weights`.
src_embedding_layer = TransformerEmbedding(
    timesteps=src_timesteps,
    d_model=d_model,
    vocab_size=src_vocab_size,
)
src_embed, src_token_embed_weights = src_embedding_layer(token_ids=src_inputs)

# True wherever the token id is 0.
src_key_padding_mask = (src_inputs == 0)

In [None]:
prep

In [None]:
get_pad_mask

In [None]:
get_sub_mask

In [8]:
# import torch
# from torch import nn

# src_inputs = np.array([
#     [1,2,3,4,5,0,0],
#     [1,2,3,4,0,0,0],
#     [1,2,3,4,0,0,0],
#     [1,2,3,4,5,6,7],
#     [1,2,3,4,5,6,0],
# ])
# # src_inputs = np.random.randint(low=0, high=1, size=(batch_size, src_timesteps))
# src_inputs = torch.from_numpy(src_inputs)
# src_embedding_layer = TransformerEmbedding(timesteps=7, d_model=4, vocab_size=8)
# src_embed, src_token_embed_weights = src_embedding_layer(token_ids=src_inputs)
# src_key_padding_mask = src_inputs==0

In [36]:
src_embed.shape

torch.Size([32, 128, 768])

In [8]:
# Build one encoder layer from the notebook-level configuration and run the
# embedded source batch through it (the result is the cell's displayed value).
encoder_layer = EncoderLayer(
    d_model=d_model,
    d_ff=d_ff,
    num_heads=num_heads,
    pwff_activation=pwff_activation,
    dropout=dropout,
    bias=layer_bias,
    layer_norm_epsilon=layer_norm_epsilon,
    initialization=layer_initialization,
)
encoder_layer(src=src_embed, src_key_padding_mask=src_key_padding_mask)

In [10]:
# Full encoder configured with `num_encoder_layer` and the same per-layer
# settings used for the single EncoderLayer.
transformer_encoder = Encoder(
    num_encoder_layer=num_encoder_layer,
    d_model=d_model,
    d_ff=d_ff,
    num_heads=num_heads,
    pwff_activation=pwff_activation,
    dropout=dropout,
    bias=layer_bias,
    layer_norm_epsilon=layer_norm_epsilon,
    initialization=layer_initialization,
)

In [None]:
transformer_encoder(src=src_embed)

In [None]:
self.mha_dropout_layer = nn.modules.dropout.Dropout(dropout)
self.layer_normalization = nn.modules.normalization.LayerNorm(d_model, eps=layer_norm_epsilon)
self.pwff_layer = PositionwiseFeedForward(d_model=d_model, d_ff=d_ff, activation=pwff_activation, dropout=dropout, bias=bias, layer_norm_epsilon=layer_norm_epsilon, initialization=initialization)

In [56]:
# Manually replay the attention step of `encoder_layer`: self-attention over
# the embedded source (query = key = value) with the padding mask applied.
attn_out, _ = encoder_layer.mha_layer(
    query=src_embed,
    key=src_embed,
    value=src_embed,
    attn_mask=None,
    key_padding_mask=src_key_padding_mask,
)
# NOTE(review): this adds the attention output to its own dropped-out copy;
# a residual connection would presumably add `src_embed` instead - confirm intent.
a = attn_out + encoder_layer.mha_dropout_layer(attn_out)

In [49]:
a.dtype

torch.float64

In [55]:
lm = encoder_layer.layer_normalization.double()

In [58]:
encoder_layer.layer_normalization(a)

tensor([[[-1.4627,  0.0140,  0.1592,  ...,  1.0100,  0.4225,  0.2227],
         [-1.5960, -1.8289, -0.3324,  ..., -0.9353, -0.6446, -3.4256],
         [ 0.1701,  2.0713,  1.3529,  ...,  0.0055,  0.3293,  0.4701],
         ...,
         [ 0.0941, -0.0314,  0.7386,  ...,  0.9152,  0.0209, -1.0458],
         [ 0.9894, -0.0057,  0.7685,  ...,  1.5675, -0.6311, -1.1110],
         [-1.0514, -2.1482, -1.2280,  ...,  0.3904, -0.4926, -1.0023]],

        [[ 0.5447, -0.6459, -0.0889,  ...,  1.8235, -0.7732, -0.3643],
         [-0.0459,  1.0868,  1.5638,  ...,  2.1673,  0.5771,  1.9309],
         [ 0.2335,  0.2908,  0.3148,  ...,  0.6025,  0.3757,  1.4460],
         ...,
         [-1.4925, -0.8803,  0.7332,  ...,  0.2888,  0.9605,  1.6413],
         [-1.7312, -2.7324, -0.4192,  ..., -0.0670, -1.1935, -2.4884],
         [-1.2149, -2.2582, -0.0357,  ...,  0.0203, -0.8691, -0.2761]],

        [[ 2.0191,  0.2236,  0.3186,  ...,  0.7660,  1.7304,  0.8944],
         [-0.5629, -1.2309,  0.4925,  ..., -0

RuntimeError: expected scalar type Double but found Float

In [11]:
mha_layer = MultiheadAttention(d_model=4, num_heads=2)
mha_layer(query=src_embed, key=src_embed, value=src_embed, key_padding_mask=key_padding_mask)

(tensor([[[-1.4704e-04,  6.0264e-04, -1.7196e-03, -1.3732e-03],
          [-1.7554e-04,  4.5347e-04, -1.6458e-03, -1.0186e-03],
          [-1.0611e-04,  2.3857e-04, -1.6891e-03, -1.2961e-03],
          [-1.3496e-04,  4.2852e-04, -1.9389e-03, -1.5926e-03],
          [-1.3442e-04,  4.3101e-04, -1.9348e-03, -1.5910e-03],
          [-1.6258e-04,  2.8462e-04, -1.8603e-03, -1.2430e-03],
          [-1.3510e-04,  4.2895e-04, -1.9361e-03, -1.5900e-03]],
 
         [[ 1.0305e-05,  1.5170e-03, -1.7464e-03, -2.5096e-03],
          [-1.7350e-04,  8.5958e-04, -1.9546e-03, -2.4092e-03],
          [ 9.2236e-06,  1.5129e-03, -1.7547e-03, -2.5096e-03],
          [ 9.0960e-06,  1.5125e-03, -1.7566e-03, -2.5105e-03],
          [ 8.8270e-05,  1.3808e-03, -1.6171e-03, -2.6692e-03],
          [ 9.6503e-06,  1.5143e-03, -1.7497e-03, -2.5076e-03],
          [ 9.6465e-06,  1.5143e-03, -1.7497e-03, -2.5076e-03]],
 
         [[ 1.0305e-05,  1.5170e-03, -1.7464e-03, -2.5096e-03],
          [ 1.5601e-05,  1.2196e-0

In [39]:
key_padding_mask

tensor([[False, False, False, False, False,  True,  True],
        [False, False, False, False,  True,  True,  True],
        [False, False, False, False,  True,  True,  True],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False,  True]])

In [73]:
src_inputs.shape

torch.Size([3, 5, 7])

In [96]:
token_embedding_layer.weight

torch.Size([8000, 768])

In [61]:
token_embedding_layer = nn.Embedding(vocab_size, d_model)
token_embedding = token_embedding_layer(src_inputs)

In [14]:
src_inputs.shape

torch.Size([32, 128])

In [8]:
embedding_layer = TransformerEmbedding(timesteps=src_timesteps, d_model=d_model, vocab_size=src_vocab_size)

In [37]:
embedding_layer

TransformerEmbedding(
  (embed_dropout): Dropout(p=0.1, inplace=False)
  (token_embedding_layer): Embedding(8000, 768)
)

In [34]:
a,b = embedding_layer(token_ids=src_inputs)

In [35]:
a

tensor([[[-1.2178,  0.1460,  0.0700,  ...,  0.8063,  0.3599,  0.3523],
         [-0.3894,  0.9744,  0.8984,  ...,  1.6348,  1.1883,  1.1807],
         [-0.2898,  1.0740,  0.9980,  ...,  1.7343,  1.2879,  1.2803],
         ...,
         [-0.7519,  0.6118,  0.5358,  ...,  1.2722,  0.8258,  0.8181],
         [-1.6899, -0.3262, -0.4022,  ...,  0.3342, -0.1122, -0.1198],
         [-2.2125, -0.8488, -0.9248,  ..., -0.1884, -0.6348, -0.6424]],

        [[-1.2178,  0.1460,  0.0700,  ...,  0.8063,  0.3599,  0.3523],
         [-0.3894,  0.9744,  0.8984,  ...,  1.6348,  1.1883,  1.1807],
         [-0.2898,  1.0740,  0.9980,  ...,  1.7343,  1.2879,  1.2803],
         ...,
         [-0.7519,  0.6118,  0.5358,  ...,  1.2722,  0.8258,  0.8181],
         [-1.6899, -0.3262, -0.4022,  ...,  0.3342, -0.1122, -0.1198],
         [-2.2125, -0.8488, -0.9248,  ..., -0.1884, -0.6348, -0.6424]],

        [[-1.2178,  0.1460,  0.0700,  ...,  0.8063,  0.3599,  0.3523],
         [-0.3894,  0.9744,  0.8984,  ...,  1

In [36]:
b

Parameter containing:
tensor([[-1.2178,  0.1460,  0.0700,  ...,  0.8063,  0.3599,  0.3523],
        [-0.0388,  1.3134, -0.4791,  ..., -1.5784, -0.7663,  1.0328],
        [ 0.6664,  1.1107, -0.7510,  ...,  1.0426, -2.4707,  1.5420],
        ...,
        [-2.0501, -0.2044, -0.6225,  ...,  0.2221,  2.1806, -0.3275],
        [ 0.4914, -1.0143, -1.8016,  ...,  2.1990, -1.0328,  0.2856],
        [ 0.8121, -2.6755,  0.6975,  ..., -0.8857,  1.7478,  0.2636]],
       requires_grad=True)

In [76]:
pe = Embedding()._position_embed(5, 7)
pe = torch.from_numpy(pe)

In [63]:
def _position_embed(self, timesteps, d_model):
    """Build the sinusoidal position-encoding table.

    Entry ``[pos, i]`` is computed from the angle
    ``pos / 10000 ** (2 * (i // 2) / d_model)``: even columns hold its sine,
    odd columns its cosine, so each (even, odd) column pair shares one
    frequency.

    The previous version used the literal exponent ``(2 * 1)`` instead of the
    column index, which collapsed the result to shape ``(timesteps, 1)``, and
    it applied ``sin`` to the odd columns as well.

    Args:
        self: not used by the computation; kept so the signature is unchanged.
        timesteps: number of positions (rows of the table).
        d_model: embedding width (columns of the table).

    Returns:
        ``numpy.ndarray`` of shape ``(timesteps, d_model)``.
    """
    def _get_angles(positions, d_model, i):
        default_value = 1e+4
        # i // 2 maps columns (0, 1) -> 0, (2, 3) -> 1, ... so that a sin/cos
        # pair is evaluated at the same frequency.
        rates = 1 / np.power(default_value, (2 * (i // 2)) / np.float64(d_model))
        return positions * rates

    pos_array = np.expand_dims(np.arange(timesteps), axis=1)   # (timesteps, 1)
    i_array = np.expand_dims(np.arange(d_model), axis=0)       # (1, d_model)
    pos_embed_matrix = _get_angles(pos_array, d_model, i_array)
    pos_embed_matrix[:, 0::2] = np.sin(pos_embed_matrix[:, 0::2])
    pos_embed_matrix[:, 1::2] = np.cos(pos_embed_matrix[:, 1::2])
    return pos_embed_matrix

### Load HuggingFace Tokenizer

In [45]:
# model = AutoModelWithLMHead.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base", bos_token="<s>")

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [9]:
src_prep = Preprocessor(timesteps=src_timesteps)
tgt_prep = Preprocessor(timesteps=tgt_timesteps)

TypeError: __init__() got an unexpected keyword argument 'timesteps'

In [10]:
"str" != None

True

In [17]:
from typing import Optional, List

In [4]:
kor_to_eng_prefix = "translate Korean to English: "
eng_to_kor_prefix = "translate English to Korean: "

Imported konlpy.tag.Mecab successfully
Imported Advanced Mecab successfully


In [5]:
rows = [[13959,  1566,    12,  2968,    10, 11560,  3896,  8881,    19, 3,     9,   748,   349,     3,   390,    16,   368,  1060, 11,  1919]]

In [None]:
prep.

In [109]:
timesteps = 6

In [114]:
len(inputs[0])

512

In [116]:
sentence = "translate English to German: Hugging Face is a technology company based in New York and Paris"
inputs = tokenizer.encode(sentence, add_special_tokens=False, return_tensors="np")
inputs

array([[13959,  1566,    12,  2968,    10, 11560,  3896,  8881,    19,
            3,     9,   748,   349,     3,   390,    16,   368,  1060,
           11,  1919]])

In [117]:
tokenizer.decode(inputs[0])

'translate English to German: Hugging Face is a technology company based in New York and Paris'

In [71]:
sentence = "translate English to German: Hugging Face is a technology company based in New York and Seoul"
inputs = tokenizer.encode(sentence, return_tensors="pt")
inputs

tensor([[13959,  1566,    12,  2968,    10, 11560,  3896,  8881,    19,     3,
             9,   748,   349,     3,   390,    16,   368,  1060,    11, 28343,
             1]])

In [72]:
tokenizer.decode(inputs[0])

'translate English to German: Hugging Face is a technology company based in New York and Seoul</s>'

In [3]:
sentence = "translate English to German: Hugging Face is a technology company based in New York and Paris"
inputs = tokenizer.encode(sentence, return_tensors="pt")

In [None]:
sentence = "translate English to German: Hugging Face is a technology company based in New York and Paris"
inputs = tokenizer.encode(sentence, return_tensors="pt")

In [None]:
tokenizer.

In [4]:
inputs

tensor([[13959,  1566,    12,  2968,    10, 11560,  3896,  8881,    19,     3,
             9,   748,   349,     3,   390,    16,   368,  1060,    11,  1919,
             1]])

In [None]:
# Take the first test-split utterance as a probe sentence.
ss = dataset["test"]
s = ss[0]["utterance"]

# Encode it twice - with and without masking (masked variant first) - and
# decode both rows together; the decoded result is the cell's displayed value.
masked, unmasked = (
    prep.src_sentence_encode(sentence=s, language="eng", mask=use_mask)
    for use_mask in (True, False)
)

prep.src_decode(rows=[masked, unmasked])

In [None]:
def _parse_empatheticdialogues_csv(path, num_columns=8, comma_token="_comma_"):
    """Parse one split CSV into row lists and header-keyed records.

    Every line is stripped and split on ``","``.  Lines with at least
    ``num_columns`` fields are truncated to that many fields and kept; shorter
    lines are collected separately.  The first kept line supplies the headers,
    and each later kept line becomes a dict keyed by header with
    ``comma_token`` replaced by a literal comma in every value.

    Args:
        path: path of the CSV file to read (opened as UTF-8).
        num_columns: minimum field count for a line to be kept; kept lines
            are cut to this length.
        comma_token: placeholder that stands for a comma inside a field.

    Returns:
        Tuple ``(rows, short_rows, records)``: the kept lines (header line
        included), the rejected short lines, and the list of record dicts.
        ``records`` is empty when no line was kept.
    """
    rows, short_rows = [], []
    with open(path, "r", encoding="UTF-8") as fp:
        for line in fp:
            cols = line.strip().split(",")
            if len(cols) >= num_columns:
                rows.append(cols[:num_columns])
            else:
                short_rows.append(cols)

    if not rows:
        # Empty or fully malformed file: no header line to build records from.
        return rows, short_rows, []

    headers = rows[0]
    records = [
        {header: col.replace(comma_token, ",") for header, col in zip(headers, row)}
        for row in rows[1:]
    ]
    return rows, short_rows, records


# Rebuild the pickle with every split the loading cell reads ("train",
# "valid", "test").  Previously only the split named by `_type` was stored, so
# the written file lacked the other two keys.
# NOTE(review): assumes train.csv and valid.csv sit next to test.csv in the
# same directory - confirm.
dataset = dict()
for _type in ("train", "valid", "test"):
    path = file_path["empatheticdialogues"]["dir"] + "{_type}.csv".format(_type=_type)
    rows, _rows, _dataset = _parse_empatheticdialogues_csv(path)
    print(len(rows), len(_rows))
    print(len(_dataset))
    dataset[_type] = _dataset

with open(file_path["empatheticdialogues"]["pickle"], "wb") as fp:
    pickle.dump(dataset, fp)

In [None]:
outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)