In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, IterableDataset, DataLoader

from transformer.assertions.object_assertion import DataAssertion
from transformer.utils.tokenizer import MecabTokenizer, SpmTokenizer
from transformer.preprocessor.blender_bot_preprocessor import GeneratorPretrainingPreprocessor
from transformer.data.dataset import DatasetInterface, DatasetFromDir
from transformer.data.blender_bot_data_loader import GeneratorPretrainingDataLoader
from transformer.layers.attention import MultiheadAttention, PositionwiseFeedForward, CodeAttention
from transformer.layers.transformer import EncoderLayer, DecoderLayer
from transformer.layers.head import LanguageModelingHead, PolyEncoderHead, NextSentencePredictionHead
from transformer.layers.utils import get_pad_mask, get_sub_mask
from transformer.models.transformer import Encoder, Decoder, Transformer
from transformer.trainer.blender_bot_trainer import GeneratorPretrainingTransformerTrainer
from transformer.trainer.utils import *

### Load Dataset

In [2]:
# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Picas_Server
# dataset_dir = "/home/picas/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Korea_Server
# dataset_dir = "/home/guest1"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# bigshane_local
dataset_dir = "D:\_jupyter"
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

### Model Configuration

In [3]:
# architecture hyperparams
# TOBE: # 1 ET-encoder Block, 13 ET-decoder Blocks
src_vocab_size = 15000
tgt_vocab_size = 15000
embedding_dict = {
    "segment": 2, # context, condition
}
src_timesteps = 128
tgt_timesteps = 128
num_heads = 32
d_model = 512 # 2560
d_ff = 3072 # 4096
num_encoder_layers = 2
num_decoder_layers = 24 # 24
dropout = 0.1

# layer details
pwff_activation = "gelu"
linear_activation = "gelu"
layer_bias = True
layer_norm_epsilon = 1e-5
layer_initialization = "normal"
shared_embedding = True

In [4]:
# optimizer_params
optimizer_params = {
    "beta_1": 0.9,
    "beta_2": 0.98,
    "optimizer_epsilon": 1e-5,
    "initial_learning_rate": 1e-4
}
num_warmup_steps = 4000

# criterion_params
criterion_params = {
  "lm": 1.0,
  "ul": 0.5
}

# training_params
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 8
save_per_epoch = 1
save_per_batch = -1
keep_last = True
verbose_per_epoch = 1
verbose_per_batch = 500

In [5]:
# data_loader_params
num_workers = 1 * int(bool(torch.cuda.device_count()))
pin_memory = bool(torch.cuda.device_count())
encoding = "UTF-8"
src_sep_tokens = [["cls", "sep"], [None, "sep"]] # [["context", "sep"], ["candidate", "sep"]]
approach = "ignore"
nprocs = 1

### Load Preprocessor

In [6]:
src_language = "kor"
tgt_language = "kor"
encoding = "utf-8"

src_spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(language=src_language, vocab_size=src_vocab_size)
tgt_spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(language=tgt_language, vocab_size=tgt_vocab_size)
trfr_prep = GeneratorPretrainingPreprocessor(src_language=src_language, tgt_language=tgt_language, src_spm_model_path=src_spm_model_path, tgt_spm_model_path=tgt_spm_model_path, embedding_dict=embedding_dict)
src_pad_token_id = trfr_prep.src_spm_tokenizer.special_token_dict["pad"]["id"]
tgt_pad_token_id = trfr_prep.tgt_spm_tokenizer.special_token_dict["pad"]["id"]

Cannot Import konlpy Mecab tagger: <class 'Exception'> - Install MeCab in order to use it: http://konlpy.org/en/latest/install/
Importing MeCab for Windows
Imported MeCab for Windows successfully
loaded spm_model: 'D:\_jupyter/spm_model/kor/spoken_pretrain_spm_v15000/'


## Set Trainer

In [7]:
trainer = GeneratorPretrainingTransformerTrainer()
trainer.set_lr_update(initial_learning_rate=optimizer_params["initial_learning_rate"], num_warmup_steps=num_warmup_steps)

'temp_dir' has been set to './20210818_013419/' to save model while training
LearningRate schedule has been set to 'transformer_lambda'


## Single-GPU Training

### Build Transformer

In [8]:
# transformer = Transformer(src_timesteps=src_timesteps, tgt_timesteps=tgt_timesteps, src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size, embedding_dict=embedding_dict, 
#                     src_pad_token_id=trfr_prep.src_spm_tokenizer.special_token_dict["pad"]["id"], tgt_pad_token_id=trfr_prep.tgt_spm_tokenizer.special_token_dict["pad"]["id"],
#                     d_model=d_model, d_ff=d_ff, num_heads=num_heads, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, shared_embedding=shared_embedding,
#                     dropout=dropout, pwff_activation=pwff_activation, linear_activation=linear_activation, bias=layer_bias, layer_norm_epsilon=layer_norm_epsilon, initialization=layer_initialization)

In [9]:
with open("./scripts/transformer/config/dialog_pretrain_picas.json", "r", encoding="utf-8") as fp:
    config = json.load(fp)
transformer = Transformer(src_pad_token_id=src_pad_token_id, tgt_pad_token_id=tgt_pad_token_id, **config["model"])

### Set criterions & optimizer

In [10]:
criterions, criterion_weights = trainer.get_criterions(tgt_timesteps=tgt_timesteps, tgt_vocab_size=tgt_vocab_size, tgt_pad_token_id=trfr_prep.tgt_spm_tokenizer.special_token_dict["pad"]["id"], lm=criterion_params["lm"], ul=criterion_params["ul"])
optimizer = trainer.get_optimizer(model=transformer, initial_learning_rate=optimizer_params["initial_learning_rate"], beta_1=optimizer_params["beta_1"], beta_2=optimizer_params["beta_2"], optimizer_epsilon=optimizer_params["optimizer_epsilon"])

### Set Device

In [11]:
transformer = GeneratorPretrainingTransformerTrainer.set_device(obj=transformer, device=device)
optimizer = GeneratorPretrainingTransformerTrainer.set_device(obj=optimizer, device=device)
criterions = GeneratorPretrainingTransformerTrainer.set_device(obj=criterions, device=device)

Setting model device: cuda:0
Setting criterions device: cuda:0


## Load Dataset & DataLoader

In [12]:
# dataset_name = "KaggleConversation"
# data_dir = dataset_dir + "/dataset/conversation/{dataset_name}/{language}/multi_turn/".format(dataset_name=dataset_name, language=config["data"]["src_language"])
# dataset = DatasetFromDir(data_dir=data_dir, batch_size=batch_size, encoding=config["data"]["encoding"], extension=config["data"]["extension"], device=device, nprocs=nprocs)
# dialog_data_loader_params = TransformerDialogPreTrainer.get_data_loader_params(dataset=dataset, preprocessor=trfr_prep, batch_size=batch_size, 
#                                                                                device=device, nprocs=nprocs, num_workers=dialog_pretrainer.num_workers, pin_memory=dialog_pretrainer.pin_memory,
#                                                                                **config["model"], **config["data_loader"])
# kaggle_conversation_data_loader = dialog_pretrainer.create_data_loader(**dialog_data_loader_params)

In [13]:
# data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/{language}/multi_turn/".format(language=config["data"]["src_language"])
# data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/{language}/multi_turn/sample/".format(language=config["data"]["src_language"])
dataset_name = "KaggleConversation"
data_dir = dataset_dir + "/dataset/conversation/{dataset_name}/{language}/multi_turn".format(language=config["data"]["src_language"], dataset_name=dataset_name)

dataset = DatasetFromDir(data_dir=data_dir, batch_size=batch_size, encoding=config["data"]["encoding"], extension=config["data"]["extension"], device=device, nprocs=nprocs)

data_loader_params = GeneratorPretrainingTransformerTrainer.get_data_loader_params(dataset=dataset, preprocessor=trfr_prep, batch_size=batch_size, 
                                                                               device=device, nprocs=nprocs, num_workers=trainer.num_workers, pin_memory=trainer.pin_memory,
                                                                               **config["model"], **config["data_loader"])
train_data_loader = trainer.create_data_loader(**data_loader_params)

## Train test

In [14]:
epoch = 5
amp = True
scaler = None
if amp: scaler = torch.cuda.amp.GradScaler()
save_per_epoch = -1
save_per_batch = -1
keep_last = True
verbose_per_epoch = 1
verbose_per_batch = 100

### trainer.fit

In [15]:
history = dialog_pretrainer.fit(model=transformer, train_data_loader=dialog_pretrain_data_loader, val_data_loader=None, 
                                criterions=criterions, criterion_weights=criterion_weights, optimizer=optimizer, device=device, 
                                epoch=epoch, amp=amp, save_per_epoch=save_per_epoch, save_per_batch=save_per_batch, keep_last=keep_last, verbose_per_epoch=verbose_per_epoch, verbose_per_batch=verbose_per_batch)

NameError: name 'dialog_pretrainer' is not defined

### trainer.train_epoch

In [None]:
dialog_pretrain_data_iter = tqdm(dialog_pretrain_data_loader, initial=dialog_pretrain_data_loader.iter_start, total=len(dialog_pretrain_data_loader))
dialog_pretrain_data_iter.iter_size = dialog_pretrain_data_loader.iter_end - dialog_pretrain_data_loader.iter_start
epoch_train_history = dialog_pretrainer.train_epoch(model=transformer, data_loader=dialog_pretrain_data_iter, 
                                                    criterions=criterions, criterion_weights=criterion_weights, optimizer=optimizer, device=device, 
                                                    amp=amp, scaler=scaler, save_per_batch=save_per_batch, verbose_per_batch=verbose_per_batch)

### trainer.iteration

In [None]:
for batch_idx, batch in enumerate(dialog_pretrain_data_loader):
    batch_idx += 1
    batch = [{k: trainer.convert_to_tensor(data=v, device=device) for k, v in _batch.items()} for _batch in batch]

    loss_dict, acc_dict = trainer.iteration(model=transformer, batch=batch,
                                            criterions=criterions, criterion_weights=criterion_weights, optimizer=optimizer, 
                                            train=True, amp=amp, scaler=scaler)

    print(loss_dict)
    print(acc_dict)
    break
    

In [28]:
data_dir = "D:\_jupyter\dataset\conversation\SelectStar\kor\multi_turn/"
dataset = DatasetFromDir(data_dir=data_dir, batch_size=4, encoding="utf-8", extension="json", device="cuda:0", nprocs=1)
data = dataset.get_all_data()

utterances = [utterance for row in data for utterance in row["utterances"]]
target_prev_token_distribution = trfr_prep.extract_prev_token_distribution(sentences=utterances, ngram=5)

from collections import Counter
predicted_prev_token_distribution = dict()
for k, v in target_prev_token_distribution.items():
    counter = Counter()
    for _k,_v in v.items():
        counter[_k] = np.random.randint(0, 10000)
    predicted_prev_token_distribution[k] = counter
    
for k,v in predicted_prev_token_distribution.items(): break

Extracting token_ids: 100%|██████████████████████████████████████████████████| 304082/304082 [00:45<00:00, 6691.13it/s]
Normalizing distribution: 100%|█████████████████████████████████████████████████| 5964/5964 [00:00<00:00, 17706.58it/s]


Extracted prev_token_distribution from total 5964 tokens


In [16]:
_batch = [next(dialog_pretrain_data_loader.dataset.__iter__()) for i in range(0, batch_size)]
batch_idx = 1
batch = dialog_pretrain_data_loader.collate_fn(batch=_batch)
batch = [{k: trainer.convert_to_tensor(data=v, device=device) for k, v in _batch.items()} for _batch in batch]
src_inputs, tgt_inputs, tgt_outputs = batch
predictions = transformer(src_inputs=src_inputs, tgt_inputs=tgt_inputs)

# loss_dict, acc_dict = trainer.iteration(model=transformer, batch=batch,
#                                         criterions=criterions, criterion_weights=criterion_weights, optimizer=optimizer, 
#                                         train=True, amp=amp, scaler=scaler)

# print(loss_dict)
# print(acc_dict)
# break
    

In [142]:
len(_targets.shape)

2

In [423]:
_targets = prev_token_distribution
_prediction = predictions["lm"]
prediction_token_ids = torch.argmax(_prediction, axis=-1)
# prediction_token_ids = trainer.convert_to_numpy(tensor=prediction_token_ids)

In [429]:
a = prediction_token_ids[0][0]

In [436]:
a.item()

14795

In [327]:
_prediction = _prediction.exp()
for input_row, target_row in zip(_prediction, _targets):
#     candidate_mask = target_row
#     if not ngram_distribution:
#         # make subsequent mask as a default
#         target_expanded = target_row.unsqueeze(0).expand(self.timesteps, self.timesteps)
#         target_tril = target_expanded.tril(-1)
#         ignore_index_mask = target_expanded.triu().to(torch.bool) * self.ignore_index
#         target_tril = target_tril + ignore_index_mask
#         candidate_mask = target_tril.masked_fill(target_tril == target_row.unsqueeze(1), 0)

    # clamp: prevent underflow
    probs_not_to_be = torch.clamp(1 - input_row, min=1e-7)
    # convert probabilities to log_probailities again
    negatvie_candidates = torch.zeros_like(input_row).scatter_(1, target_row, 1)
    _loss = -1 * torch.log(probs_not_to_be) * negatvie_candidates
    _loss[:, ignore_index] = 0

In [321]:
prev_token_distribution[0][10]

tensor([    0,     0,     0,     0,     0, 12500,  8978, 14120,  4676,  8356,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],

In [405]:
row[0]

tensor(14795, device='cuda:0')

In [408]:
ulk_targets = []
empty_row = [0] * ngram
for row in prediction_token_ids:
    ulk_token_row = [most_different_tokens[token_id] if token_id in most_different_tokens else empty_row for token_id in row]
    ulk_targets.append(ulk_token_row)
ulk_targets = trainer.convert_to_tensor(data=ulk_targets, device=device)

In [34]:
for prev_token_id in prediction_token_ids:
    distribution = predicted_prev_token_distribution[prev_token_id]
    token_ids = list(distribution.keys())
    frequencies = np.array(list(distribution.values()))
    _frequencies = frequencies / sum(frequencies)
    
    dict(zip(token_ids, _frequencies))
    break

TypeError: unhashable type: 'numpy.ndarray'

In [25]:
prediction_token_ids

tensor([[14795, 12500, 12312,  ...,  3325, 14120, 14120],
        [ 6906, 12500, 14469,  ..., 14120, 14120, 14120],
        [  261,  3375, 14469,  ..., 14120,   830, 14120],
        ...,
        [ 9505, 12500, 12689,  ..., 14120, 14120,  3227],
        [ 6734, 12500,  8025,  ...,   830, 14120, 14120],
        [ 9505, 12500, 14469,  ..., 14120, 14120, 12274]], device='cuda:0')

In [21]:
for src_inputs, tgt_inputs, tgt_outputs in dialog_pretrain_data_loader:
    if len(src_inputs["token"]) < 1: continue
    src_inputs = {k: dialog_pretrainer.convert_to_tensor(data=v, device=device) for k, v in src_inputs.items()}
    tgt_inputs = {k: dialog_pretrainer.convert_to_tensor(data=v, device=device) for k, v in tgt_inputs.items()}
    tgt_outputs = {k: dialog_pretrainer.convert_to_tensor(data=v, device=device) for k, v in tgt_outputs.items()}
    break
    
output = model(src_inputs=src_inputs, tgt_inputs=tgt_inputs)

TypeError: cannot pickle 'Tagger' object

### Save & Load

In [None]:
model_path = "./temp/20210607_174705/epoch_1/"
# # save
# BertTrainer.save(path=model_path, model=model, optimizer=optimizer)

# load
checkpoint = BertTrainer.load(path=model_path)
model = checkpoint["model"]
optimizer = checkpoint["optimizer"]

In [22]:
cnt = 0
for inputs, outputs in train_data_loader:
#     print(outputs["nsp"])
#     cnt += 1
#     if cnt > 10: break
#     break
    print(np.array(inputs["token_ids"]).shape, np.array(inputs["segment_ids"]).shape, np.array(outputs["nsp"]).shape, np.array(outputs["mlm"]).shape)
    break
        
for n,i,o in zip(outputs["nsp"], bert_prep.decode(inputs["token_ids"]), bert_prep.decode(outputs["mlm"])):
    print("nsp:", n)
    print("input:", i)
    print("output:", o)
    print()

NameError: name 'train_data_loader' is not defined

In [5]:
# src_sentences = {"train":[], "valid":[], "test":[]}
# tgt_sentences = {"train":[], "valid":[], "test":[]}

# for conv_id, group in train_df.groupby(["conv_id"]):
#     utterances = group["utterance"].tolist()
#     group_length = len(group)
#     for i in range(0, group_length-1):
#         src_sentence = utterances[i]
#         tgt_sentence = utterances[i+1]
#         src_sentences["train"].append(src_sentence)
#         tgt_sentences["train"].append(tgt_sentence)

# for conv_id, group in valid_df.groupby(["conv_id"]):
#     utterances = group["utterance"].tolist()
#     group_length = len(group)
#     for i in range(0, group_length-1):
#         src_sentence = utterances[i]
#         tgt_sentence = utterances[i+1]
#         src_sentences["valid"].append(src_sentence)
#         tgt_sentences["valid"].append(tgt_sentence)

# for conv_id, group in test_df.groupby(["conv_id"]):
#     utterances = group["utterance"].tolist()
#     group_length = len(group)
#     for i in range(0, group_length-1):
#         src_sentence = utterances[i]
#         tgt_sentence = utterances[i+1]
#         src_sentences["test"].append(src_sentence)
#         tgt_sentences["test"].append(tgt_sentence)

# data_type = "test"
# with open(file_path["empatheticdialogues"]["feed_data"][data_type].format(dataset_dir=dataset_dir), "w", encoding="UTF-8") as fp:
#     for src_sentence, tgt_sentence in zip(src_sentences["train"], tgt_sentences["train"]):
#         row = src_sentence + "\t" + tgt_sentence + "\n"
#         fp.write(row)

In [40]:
src_sentence = "test 센텐텐스스입입니니다다."
model.eval()
model.inference(preprocessor=prep, src_sentence=src_sentence, device=None, method="greedy")

[5285,
 4646,
 4260,
 6083,
 3973,
 6620,
 4103,
 3918,
 2997,
 3604,
 4444,
 5162,
 4510,
 2175,
 1908,
 567]

#### Test

In [None]:
segment_ids

In [19]:
from_idx = 0
batch_data = transformer_dataset.get_batch(from_idx=from_idx, to_idx=from_idx+batch_size, device=None)
predictions = model.forward(batch_data["src_inputs"], batch_data["tgt_inputs"])
torch.argmax(predictions, dim=-1)

In [None]:
batch_

In [47]:
for batch_data in train_loader:
    break
    
predictions = model(batch_data["src_inputs"], batch_data["tgt_inputs"])
targets = batch_data["tgt_labels"]
lm_loss.get_loss(predictions=predictions, targets=targets
                

before: torch.Size([16, 16, 8000]) torch.Size([16, 16])
after: torch.Size([256, 8000]) torch.Size([256])


tensor(28.2944, dtype=torch.float64, grad_fn=<NllLossBackward>)

In [14]:
num_epochs = 5

# Loop over epochs
for epoch in range(num_epochs):
    # Training
    for batch_data in train_loader:
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)

        # Model computations
        [...]

    # Validation
    with torch.set_grad_enabled(False):
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            # Model computations
            [...]

TypeError: get_loss() missing 2 required positional arguments: 'predictions' and 'targets'

In [15]:
model.criterions["lm"].get_loss()
model.optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    eps: 1e-05
    lr: 0.0001
    weight_decay: 0
)

### Train

In [None]:
num_epochs = 5

for data in train_loader:
    break


# Loop over epochs
for epoch in range(num_epochs):
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)

        # Model computations
        [...]

    # Validation
    with torch.set_grad_enabled(False):
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            # Model computations
            [...]

In [7]:
# inputs
src_inputs = np.random.randint(low=0, high=src_vocab_size, size=(batch_size, src_timesteps))
tgt_inputs = np.random.randint(low=0, high=tgt_vocab_size, size=(batch_size, tgt_timesteps))
src_inputs = torch.from_numpy(src_inputs)
tgt_inputs = torch.from_numpy(tgt_inputs)
# src_inputs = src_inputs.to(device)
# tgt_inputs = tgt_inputs.to(device)

tgt_labels = []
for i in range(0, batch_size):
    row = [prep.tgt_spm_tokenizer.special_token_dict["pad"]["id"]] * tgt_timesteps
    l = np.random.randint(low=0, high=tgt_timesteps)
    for j in range(0, l):
        row[j] = np.random.randint(low=len(prep.tgt_spm_tokenizer.special_token_dict), high=tgt_vocab_size)
    tgt_labels.append(row)
tgt_labels = np.array(tgt_labels)
tgt_labels = torch.from_numpy(tgt_labels).contiguous().view(-1)
# tgt_labels = tgt_labels.to(device)

def dummy_get_batch(mask, approach):
    output = dict()
    output["src_inputs"] = src_inputs
    output["tgt_inputs"] = tgt_inputs
    output["tgt_labels"] = tgt_labels
    return output

trainer.get_batch = dummy_get_batch
batch_data = trainer.get_batch(mask=False, approach="igrnore")

In [9]:
predictions = model(batch_data["src_inputs"], batch_data["tgt_inputs"])
loss = trainer.loss_function_dict["lm"](predictions, batch_data["tgt_labels"])
print("loss:", loss)
print("parameter:", model.encoder.layers[num_encoder_layers-1].pwff_layer.f1_weight)

loss: tensor(30.3885, dtype=torch.float64, grad_fn=<NllLossBackward>)
parameter: Parameter containing:
tensor([[ 0.0007, -0.0181,  0.0168,  ..., -0.0046, -0.0427,  0.0172],
        [-0.0270, -0.0181, -0.0124,  ...,  0.0088,  0.0038,  0.0095],
        [-0.0252,  0.0149,  0.0377,  ...,  0.0115,  0.0098, -0.0456],
        ...,
        [-0.0165,  0.0137, -0.0111,  ..., -0.0247, -0.0099,  0.0249],
        [-0.0220,  0.0086,  0.0368,  ...,  0.0049, -0.0242,  0.0098],
        [-0.0356,  0.0149,  0.0124,  ..., -0.0130, -0.0306,  0.0397]],
       dtype=torch.float64, requires_grad=True)


In [17]:
predictions = model(batch_data["src_inputs"], batch_data["tgt_inputs"])
loss = trainer.loss_function_dict["lm"](predictions, batch_data["tgt_labels"])
print("loss:", loss)
print("parameter:", model.encoder.layers[num_encoder_layers-1].pwff_layer.f1_weight)

loss: tensor(0.0064, dtype=torch.float64, grad_fn=<NllLossBackward>)
parameter: Parameter containing:
tensor([[ 0.0172, -0.0072,  0.0255,  ...,  0.0112, -0.0251,  0.0327],
        [-0.0369, -0.0546, -0.0205,  ..., -0.0075,  0.0238,  0.0247],
        [-0.0220,  0.0214,  0.0454,  ...,  0.0023,  0.0287, -0.0571],
        ...,
        [-0.0182,  0.0326,  0.0083,  ..., -0.0156, -0.0324,  0.0314],
        [-0.0217, -0.0004,  0.0601,  ..., -0.0231, -0.0172, -0.0039],
        [-0.0243,  0.0155,  0.0115,  ..., -0.0232, -0.0054,  0.0553]],
       dtype=torch.float64, requires_grad=True)


In [9]:
PATH = './model/test/'
# # save
# batch_data = trainer.get_batch(mask=False, approach="igrnore")
# # torch.save(model, PATH + 'model.pt')  # 전체 모델 저장
# if not os.path.isdir(PATH): os.mkdir(PATH)
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer': optimizer,
#     'optimizer_state_dict': optimizer.state_dict(),
#     'batch_data': batch_data
# }, PATH+'all.tar')

# load
checkpoint = torch.load(PATH+"all.tar")
print(checkpoint.keys())
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
# model.load_state_dict(torch.load(PATH, map_location=device))
optimizer = checkpoint["optimizer"]
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

dict_keys(['model_state_dict', 'optimizer', 'optimizer_state_dict', 'batch_data'])


In [10]:
# predictions = model(batch_data["src_inputs"], batch_data["tgt_inputs"])
predictions = model(checkpoint["batch_data"]["src_inputs"], checkpoint["batch_data"]["tgt_inputs"])
loss = trainer.loss_function_dict["lm"](predictions, checkpoint["batch_data"]["tgt_labels"])
print("loss:", loss)
print("parameter:", model.encoder.layers[num_encoder_layers-1].pwff_layer.f1_weight)

loss: tensor(0.0087, dtype=torch.float64, grad_fn=<NllLossBackward>)
parameter: Parameter containing:
tensor([[ 0.0172, -0.0072,  0.0255,  ...,  0.0112, -0.0251,  0.0327],
        [-0.0369, -0.0546, -0.0205,  ..., -0.0075,  0.0238,  0.0247],
        [-0.0220,  0.0214,  0.0454,  ...,  0.0023,  0.0287, -0.0571],
        ...,
        [-0.0182,  0.0326,  0.0083,  ..., -0.0156, -0.0324,  0.0314],
        [-0.0217, -0.0004,  0.0601,  ..., -0.0231, -0.0172, -0.0039],
        [-0.0243,  0.0155,  0.0115,  ..., -0.0232, -0.0054,  0.0553]],
       dtype=torch.float64, requires_grad=True)


In [None]:
# # gpu_setting
# os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3"
# torch.cuda.current_device()
# torch.cuda.device(0)
# torch.cuda.device_count()
# torch.cuda.get_device_name(3)
# torch.cuda.set_device()

NameError: name 'predictions' is not defined

In [None]:
if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

In [None]:
target = target.cuda(args.gpu, non_blocking=True)

In [10]:
import copy
import torch
a = copy.deepcopy(model.src_embedding_layer.token_embedding_layer.weight)
a

Parameter containing:
tensor([[ 2.4639,  0.0497, -0.2549,  ..., -0.2591,  1.0861,  0.1500],
        [-0.3107,  0.4950, -2.7424,  ..., -1.0883,  1.3823,  0.6258],
        [-1.1147, -0.8964, -0.6551,  ...,  1.2953, -0.4786, -0.5413],
        ...,
        [ 0.6452, -0.4104, -0.7140,  ..., -0.8958, -1.1715,  0.9950],
        [ 1.8464,  0.7755,  1.3069,  ..., -0.5902, -0.6805,  0.2433],
        [ 0.2092,  0.9304,  0.1618,  ..., -0.8570, -0.1002, -2.4730]],
       requires_grad=True)

In [11]:
b = copy.deepcopy(model.tgt_embedding_layer.token_embedding_layer.weight)
b

Parameter containing:
tensor([[ 1.6363,  1.2300, -1.5523,  ...,  0.1990, -0.3337,  0.5117],
        [ 1.1525,  1.2044, -0.7870,  ...,  0.2773, -0.6321,  0.6474],
        [-0.8847,  1.1973, -0.0119,  ..., -0.6822,  0.5145,  0.0370],
        ...,
        [ 0.3510,  1.3155,  0.4474,  ...,  1.0271,  0.0918, -0.4542],
        [-1.2046, -1.0570, -0.8211,  ...,  0.3967, -0.2932, -0.1849],
        [-0.7565, -0.5659, -1.2976,  ..., -0.2027, -0.2616,  0.7284]],
       requires_grad=True)

In [12]:
model.encoder.layers[num_encoder_layers-1].mha_layer.out_proj_weight

Parameter containing:
tensor([[-0.0010,  0.0003,  0.0235,  ...,  0.0152,  0.0392,  0.0209],
        [ 0.0052, -0.0196, -0.0027,  ..., -0.0016,  0.0207,  0.0317],
        [-0.0008,  0.0257, -0.0094,  ...,  0.0165,  0.0052,  0.0043],
        ...,
        [-0.0060,  0.0060, -0.0131,  ...,  0.0077, -0.0048, -0.0066],
        [ 0.0039,  0.0254, -0.0172,  ...,  0.0257, -0.0269,  0.0087],
        [ 0.0379, -0.0153,  0.0181,  ..., -0.0191,  0.0176, -0.0002]],
       dtype=torch.float64, requires_grad=True)

In [13]:
model.encoder.layers[num_encoder_layers-1].pwff_layer.f1_weight

Parameter containing:
tensor([[-0.0447, -0.0400, -0.0359,  ...,  0.0119, -0.0061,  0.0235],
        [-0.0030,  0.0002,  0.0044,  ...,  0.0094,  0.0155, -0.0033],
        [-0.0093, -0.0082, -0.0192,  ..., -0.0090, -0.0152,  0.0343],
        ...,
        [ 0.0012,  0.0349,  0.0127,  ..., -0.0248,  0.0058,  0.0218],
        [ 0.0101,  0.0152,  0.0048,  ...,  0.0061,  0.0144,  0.0031],
        [-0.0039, -0.0151,  0.0045,  ..., -0.0449,  0.0122,  0.0017]],
       dtype=torch.float64, requires_grad=True)

In [None]:
epochs = 2

print('Started Training')
model.train()
for epoch in range(0, epochs):  # loop over the dataset multiple times
    epoch_loss = 0.0
    
    num_iters = trainer.get_num_iters()
    for i in range(0,num_iters):  
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward
        batch_data = trainer.get_batch(mask=False, approach="ignore")
        predictions = model(src_inputs=batch_data["src_inputs"], tgt_inputs=batch_data["tgt_inputs"])
        predictions_flatten = predictions.view(-1, predictions.size(-1))
        tgt_labels_flatten = batch_data["tgt_labels"].contiguous().view(-1)
        # backward & optimize
        batch_loss = lm_loss(predictions_flatten, tgt_labels_flatten)
        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss.item()

    # print statistics
    print("[epoch: {epoch}] loss: {loss}".format(epoch=epoch+1, loss=epoch_loss/num_iters))
    epoch_loss = 0.0

print('Finished Training')

Started Training


In [None]:
epochs = 5
for epoch in range(0, epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

In [None]:
print()

In [16]:
loss_function

NameError: name 'loss_function' is not defined

In [37]:
a = nn.functional.log_softmax(input=decoder_output, dim=-1)
a = nn.functional.softmax(input=decoder_output, dim=-1)

In [39]:
a.shape

torch.Size([16, 64, 32])

In [42]:
torch.sum(a, dim=-1)

tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        ...,
        [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
       dtype=torch.float64, grad_fn=<SumBackward1>)

In [21]:
torch.transpose(tgt_token_embed_weights, 0, 1)

AttributeError: module 'torch' has no attribute 'transpose_'

In [20]:
tgt_token_embed_weights.shape

torch.Size([8000, 32])

In [14]:
decoder_output.shape

torch.Size([16, 64, 32])

In [11]:
from datetime import datetime, timedelta

In [12]:
a = datetime.now()

In [13]:
b = datetime.now()

In [14]:
b-a

datetime.timedelta(seconds=4, microseconds=858459)

In [15]:
(b-a).seconds

4

In [12]:
def main():
    best_acc = 0

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    dataset_train = CIFAR10(root='../data', train=True, download=True, 
                            transform=transforms_train)

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, 
                              shuffle=True, num_workers=args.num_worker)

    # there are 10 classes so the dataset name is cifar-10
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 
               'dog', 'frog', 'horse', 'ship', 'truck')

    print('==> Making model..')

    net = pyramidnet()
    net = net.to(device)
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, 
                          momentum=0.9, weight_decay=1e-4)
    
    train(net, criterion, optimizer, train_loader, device)
            

def train_on_single_device(self, model, loss_function, optimizer, device, verbose_per_batch=100):
    model.train()

    train_loss = 0
    correct = 0
    total = 0
    
    train_begin_time = datetime.now()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        batch_begin_time = datetime.now()
        
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        acc = 100 * correct / total
        
        batch_end_time = datetime.now()
        batch_train_time = batch_end_time - batch_begin_time
        
        if batch_idx % verbose_per_batch == 0:
            print('Epoch: [{}/{}]| loss: {:.4f} | acc: {:.4f} | batch time: {:.4f}s '.format(batch_idx, len(train_loader), train_loss/(batch_idx+1), acc, batch_train_time.seconds))
    
    train_end_time = datetime.now()
    total_train_time = train_end_time - train_begin_time
    print("Training time: {seconds}s".format(seconds=total_train_time.seconds))
    

In [None]:
loss_template = 

In [38]:
print("Epoch: [{epoch:^3d}/{epochs:^3d}] | {loss_template} | {acc_template} | train_time: batch time: {:.2f}s".format(epoch=1, epochs=5, loss=0.7123))
      
#       "| loss: {:.4f} | acc: {:.4f} | batch time: {:.4f}s '.format(batch_idx, len(train_loader), train_loss/(batch_idx+1), acc, batch_train_time.seconds))

Epoch: [ 1 / 5 ] | loss: 0.7123


In [8]:
# import torch
# from torch import nn

# src_inputs = np.array([
#     [1,2,3,4,5,0,0],
#     [1,2,3,4,0,0,0],
#     [1,2,3,4,0,0,0],
#     [1,2,3,4,5,6,7],
#     [1,2,3,4,5,6,0],
# ])
# # src_inputs = np.random.randint(low=0, high=1, size=(batch_size, src_timesteps))
# src_inputs = torch.from_numpy(src_inputs)
# src_embedding_layer = TransformerEmbedding(timesteps=7, d_model=4, vocab_size=8)
# src_embed, src_token_embed_weights = src_embedding_layer(token_ids=src_inputs)
# src_key_padding_mask = src_inputs==0

In [25]:
language = "kor"
dataset_name = "open_subtitles"

data_dir = file_path["open_subtitles"]["dir"].format(dataset_dir=dataset_dir)+"/" + language +"/v2018/"
filenames = os.listdir(data_dir)

test = []
data_type = "test"
for filename in filenames:
    if not filename.startswith(data_type): continue
    data = None
    with open(data_dir+filename, "r", encoding="utf-8") as fp:
        data = fp.read()
        data = data.strip()
        if not data.startswith("[") and not data.endswith("]"):
            data = "[" + data + "]"
        data = data.replace("\n", ",")
        data = json.loads(data)
    test += data

train = []
data_type = "train"
for filename in filenames:
    if not filename.startswith(data_type): continue
    data = None
    with open(data_dir+filename, "r", encoding="utf-8") as fp:
        data = fp.read()
        data = data.strip()
        if not data.startswith("[") and not data.endswith("]"):
            data = "[" + data + "]"
        data = data.replace("\n", ",")
        data = json.loads(data)
    train += data

len(train), len(test)
dataset = {"train":train, "test":test}

with open(file_path[dataset_name]["pickle"].format(dataset_dir=dataset_dir, language=language), "wb") as fp:
    pickle.dum(dataset, fp)

In [None]:
len(dataset["train"])

In [None]:
dataset["train"][10]

In [None]:
# conversation - open_subtitles
# sequences 중 약 8%에 전문 영어인 sequence가 있음
with open(file_path[dataset_name]["pickle"].format(dataset_dir=dataset_dir, language=language), "rb") as fp:
    dataset = pickle.load(fp)

def extract_sequence_from_row(row):
    sequence = [None] * (len(row) - 1) # file_id
    file_id = row.pop("file_id")
    context = row.pop("context")
    response = row.pop("response")

    sequence[0] = response
    sequence[1] = context
    for k,v in row.items():
        v = v.strip()
        if v == "nbsp;": continue
        sequence[int(k[-1])+2] = v
    sequence.reverse()
    sequence = [sentence for sentence in sequence if sentence is not None]
    return sequence

train_sequences = []
for row in dataset["train"]:
    sequence = extract_sequence_from_row(row=row)
    if len(sequence) < 2: continue
    train_sequences.append(sequence)

test_sequences = []
for row in dataset["test"]:
    sequence = extract_sequence_from_row(row=row)
    if len(sequence) < 2: continue
    test_sequences.append(sequence)
    
total_sequences = train_sequences + test_sequences
sentences = [sentence for sequence in total_sequences for sentence in sequence]

print(len(train_sequences), len(test_sequences))

In [None]:
512/2560/16/10/10 -> 47m
512/2560/32/10/10 -> 47m
512/3072/16/10/10 -> 47m
512/3072/32/10/10 -> 47m

512/2560/16/12/12 -> 53m *
512/2560/32/12/12 -> 53m
512/3072/16/12/12 -> 53m
512/3072/32/12/12 -> 53m

512/2560/16/2/12 -> 42m
512/2560/32/2/12 -> 42m
512/3072/16/2/12 -> 42m
512/3072/32/2/12 -> 42m

512/2560/16/2/24 -> 68m # 
512/2560/32/2/24 -> 68m
512/3072/16/2/24 -> 68m
512/3072/32/2/24 -> 68m

768/2560/16/10/10 -> 94m
768/2560/32/10/10 -> 94m
768/3072/16/10/10 -> 94m
768/3072/32/10/10 -> 94m

768/2560/16/12/12 -> 108m
768/2560/32/12/12 -> 108m
768/3072/16/12/12 -> 108m
768/3072/32/12/12 -> 108m

768/2560/16/2/12 -> 84m
768/2560/32/2/12 -> 84m
768/3072/16/2/12 -> 84m
768/3072/32/2/12 -> 84m

768/2560/16/2/24 -> 140m
768/2560/32/2/24 -> 140m
768/3072/16/2/24 -> 140m
768/3072/32/2/24 -> 140m

## UnlikelyhoodLoss Test

In [None]:
from datetime import datetime
from transformer.trainer.criterions import UnlikelihoodCriterion
from transformer.trainer.custom_loss import UnlikelihoodLoss, UnlikelihoodLoss2d

def generate_sample(batch_size, timesteps, vocab_size):
    predictions = torch.rand((batch_size, timesteps, vocab_size), dtype=torch.double)
    predictions = torch.log_softmax(predictions, dim=-1)
    targets = torch.randint(low=0, high=vocab_size, size=(batch_size, timesteps))
#     predictions = predictions.view(-1, predictions.size(-1))
#     targets = targets.contiguous().view(-1)
    return predictions, targets

batch_size = 64
timesteps_list = [8, 16, 32, 64, 128, 256]
vocab_size_list = [32, 64, 8000, 15000, 30000]
ignore_index = 0 
underflow = 1e-5

output = []
for timesteps in timesteps_list:
    for vocab_size in vocab_size_list:
        if batch_size * timesteps >= vocab_size: continue
        ulf_1 = UnlikelihoodLoss(batch_size=batch_size, timesteps=timesteps, vocab_size=vocab_size, ignore_index=ignore_index, underflow=underflow)
        ulf_2 = UnlikelihoodLoss2d(timesteps=timesteps, vocab_size=vocab_size, ignore_index=ignore_index, underflow=underflow)
        
        predictions, targets = generate_sample(batch_size, timesteps, vocab_size)
        print("timesteps:{t}, vocab_size:{v}\tpredictions:{p}, targets:{g}".format(t=timesteps, v=vocab_size, p=predictions.shape, g=targets.shape))
        
        begin = datetime.now()
        loss_1 = ulf_1(predictions.view(-1, predictions.size(-1)), targets.contiguous().view(-1))
        end = datetime.now()
        exec_time_1 = (end - begin).seconds
        
        begin = datetime.now()
        loss_2 = ulf_2(predictions, targets)
        end = datetime.now()
        exec_time_2 = (end - begin).seconds
        
        _output = {"type": "1d", "batch_size": batch_size, "timesteps": timesteps, "vocab_size": vocab_size, "loss": loss_1.item(), "exec_time": exec_time_1}
        output.append(_output)
        _output = {"type": "2d", "batch_size": batch_size, "timesteps": timesteps, "vocab_size": vocab_size, "loss": loss_2.item(), "exec_time": exec_time_2}
        output.append(_output)
        
output_df = pd.DataFrame(output)
for group in output_df.groupby(["timesteps", "vocab_size"]):
    print(group)
    print()