In [1]:
import argparse
import json
import os
import random

import easydict
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AdamW, BertTokenizer, get_linear_schedule_with_warmup

from data_utils import (WOSDataset, get_examples_from_dialogues, load_dataset, set_seed,
                        seed_everything)
from eval_utils import DSTEvaluator
from evaluation import _evaluation
from inference import inference
from preprocessor import TRADEPreprocessor
# from my_transformer import Decoder

import pdb

In [2]:
args = easydict.EasyDict({
    'data_dir' : "../../input/data/train_dataset",
    'model_dir' : "results",
    'train_batch_size' : 4,
    'eval_batch_size' : 32,
    'learning_rate' : 1e-4,
    'adam_epsilon' : 1e-8,
    'max_grad_norm' : 1.0,
    'num_train_epochs' : 30,
    'warmup_ratio' : 0.1,
    'random_seed' : 42,
    'model_name_or_path' : 'dsksd/bert-ko-small-minimal',
    'hidden_size' : 768,
    'vocab_size' : None,
    'hidden_dropout_prob' : 0.1,
    'proj_dim' : None,
    'teacher_forcing_ratio' : 0.5,
    'word_dropout' : 0,
    'max_position' : 512,
    'attention_drop_out' : 0.1,
    'num_attention_heads' : 6,
    'ffn_dim' : 768*2,
    'num_decoder_layers' : 3
    
})

In [3]:
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

In [4]:
slot_meta = json.load(open(f"{args.data_dir}/slot_meta.json"))
processor = TRADEPreprocessor(slot_meta, tokenizer, word_dropout_rate = args.word_dropout)


In [10]:
id_list = [11764,21832,11764]
tokenizer.decode(id_list,skip_special_tokens=False)

'##ne none'

In [3]:
train_data_file = f"{args.data_dir}/train_dials.json"
slot_meta = json.load(open(f"{args.data_dir}/slot_meta.json"))
'''['관광-경치 좋은', '관광-교육적', '관광-도보 가능', '관광-문화 예술', '관광-역사적', '관광-이름', '관광-종류', '관광-주차 가능', '관광-지역', '숙소-가격대', '숙소-도보 가능', '숙소-수영장 유무', '숙소-스파 유무', '숙소-예약 기간', '숙소-예약 명수', '숙소-예약 요일', '숙소-이름', '숙소-인터넷 가능', '숙소-조식 가능', '숙소-종류', '숙소-주차 가능', '숙소-지역', '숙소-헬스장 유무', '숙소-흡연 가능', '식당-가격대', '식당-도보 가능', '식당-야외석 유무', '식당-예약 명수', '식당-예약 시간', '식당-예약 요일', '식당-이름', '식당-인터넷 가능', '식당-종류', '식당-주류 판매', '식당-주차 가능', '식당-지역', '식당-흡연 가능', '지하철-도착지', '지하철-출발 시간', '지하철-출발지', '택시-도착 시간', '택시-도착지', '택시-종류', '택시-출발 시간', '택시-출발지']'''
train_data, dev_data, dev_labels = load_dataset(train_data_file)

train_examples = get_examples_from_dialogues(
    train_data, user_first=False, dialogue_level=False
)
'''
train_examples[10] = DSTInputExample(guid='polished-poetry-0057:관광_9-2', context_turns=['', '쇼핑을 하려는데 서울 서쪽에 있을까요?', '서울 서쪽에 쇼핑이 가능한 곳이라면 노량진 수산물 도매시장이 있습니다.', '오 네 거기 주소 좀 알려주세요.'], current_turn=['노량진 수산물 도매시장의 주소는 서울 동작구 93806입니다.', '알려주시는김에 연락처랑 평점도 좀 알려주세요.'], label=['관광-종류-쇼핑', '관광-지역-서울 서쪽', '관광-이름-노량진 수산물 도매시장'])
'''
dev_examples = get_examples_from_dialogues(
    dev_data, user_first=False, dialogue_level=False
)
'''
dev_examples[10] = DSTInputExample(guid='shy-sea-4716:관광_11-2', context_turns=['', '제가 서울을 처음 와봐서 문화 예술과 관련된 곳으로 관광하고 싶은데 어디로 가면 될까요?', '안녕하세요. 관광을 원하시는 지역과 종류가 있으시면 말씀해주세요.', '서울 중앙쪽으로 알려주세요. 종류는 글쎄요 잘 모르겠어요.'], current_turn=['네. 그럼 명동난타극장과 삼성미술관 라움, 정동극장을 추천해드립니다. 어디가 괜찮으세요?', '미술관이 좋을것 같아요. 지하철로 이동하려고 하는데 어디에서 내리면 되나요?'], label=None)
'''

100%|██████████| 6301/6301 [00:00<00:00, 11377.65it/s]
100%|██████████| 699/699 [00:00<00:00, 2841.56it/s]


"\ndev_examples[10] = DSTInputExample(guid='shy-sea-4716:관광_11-2', context_turns=['', '제가 서울을 처음 와봐서 문화 예술과 관련된 곳으로 관광하고 싶은데 어디로 가면 될까요?', '안녕하세요. 관광을 원하시는 지역과 종류가 있으시면 말씀해주세요.', '서울 중앙쪽으로 알려주세요. 종류는 글쎄요 잘 모르겠어요.'], current_turn=['네. 그럼 명동난타극장과 삼성미술관 라움, 정동극장을 추천해드립니다. 어디가 괜찮으세요?', '미술관이 좋을것 같아요. 지하철로 이동하려고 하는데 어디에서 내리면 되나요?'], label=None)\n"

In [4]:
# Define Preprocessor
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
processor = TRADEPreprocessor(slot_meta, tokenizer, word_dropout_rate = args.word_dropout)
args.vocab_size = len(tokenizer)
args.n_gate = len(processor.gating2id) 

# Slot Meta tokenizing for the decoder initial inputs
tokenized_slot_meta = []
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
    )
'''각 domain-slot pair를 토크나이징 한다.
tokenized_slot_meta[0] = [6728, 21170, 3311, 4112]
tokenized_slot_meta[1] = [6728, 6295, 4199, 0]
tokenized_slot_meta[2] = [6728, 17502, 6259, 0]
'''

'각 domain-slot pair를 토크나이징 한다.\ntokenized_slot_meta[0] = [6728, 21170, 3311, 4112]\ntokenized_slot_meta[1] = [6728, 6295, 4199, 0]\ntokenized_slot_meta[2] = [6728, 17502, 6259, 0]\n'

In [5]:
# import importlib
# import model_transformer
# importlib.reload(model_transformer)
# from model_transformer import TRADE, masked_cross_entropy_for_value

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # Model 선언
# model = TRADE(args, tokenized_slot_meta)
# # model.set_subword_embedding(args.model_name_or_path)  # Subword Embedding 초기화
# # print(f"Subword Embeddings is loaded from {args.model_name_or_path}")
# model.to(device)
# print("Model is initialized")

In [5]:
# gating 갯수 none, dontcare, ptr

# Extracting Featrues
train_features = processor.convert_examples_to_features(train_examples)
'''train_features[2]
OpenVocabDSTFeature(guid='snowy-hat-8324:관광_식당_11-2', input_id=[2, 3, 6265, 6672, 4073, 3249, 4034, 8732, 4292, 6722, 4076, 8553, 3, 11655, 4279, 8553, 18, 6336, 4481, 22014, 6771, 4204, 4112, 8538, 4147, 27233, 35, 18790, 4086, 24, 4469, 10749, 14043, 4006, 4073, 4325, 3311, 4112, 6392, 4110, 2734, 4219, 3249, 4576, 6216, 18, 3, 3311, 4116, 4150, 7149, 18790, 4112, 2633, 4151, 4076, 5240, 4050, 6698, 4467, 4029, 4070, 13177, 4479, 4065, 4150, 35, 3, 6698, 4467, 4029, 4034, 9908, 26885, 11684, 25845, 4204, 10561, 18, 2373, 6289, 4279, 4147, 2054, 3249, 4154, 4161, 10397, 35, 3, 2279, 13090, 4192, 2024, 4112, 6249, 4234, 15532, 4403, 4292, 2010, 4219, 4451, 4112, 4244, 4150, 11431, 4221, 4007, 3249, 16868, 4479, 4150, 3], segment_id=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], gating_id=[0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], target_ids=[[21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [6336, 4481, 22014, 6771, 4204, 3], [8732, 3, 0, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [6265, 6672, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [93, 6756, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [15532, 4403, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [6265, 6672, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0], [21832, 11764, 3, 0, 0, 0]])
'''
dev_features = processor.convert_examples_to_features(dev_examples)
'''dev_features[2]
OpenVocabDSTFeature(guid='wild-bonus-5601:식당_택시_12-2', input_id=[2, 3, 11655, 4279, 8553, 18, 6265, 10097, 4073, 8117, 4070, 6259, 4283, 26713, 4403, 4292, 3430, 4219, 3249, 4576, 6216, 18, 3, 11655, 4279, 8553, 18, 8863, 6243, 29365, 4034, 27672, 4034, 13177, 2411, 4114, 4065, 4150, 35, 3, 27672, 4034, 14053, 18781, 4150, 18, 3234, 18, 11139, 4147, 10472, 4110, 6477, 4279, 4034, 2084, 10749, 6465, 8161, 10756, 18, 3, 2279, 18, 3084, 5012, 4576, 6216, 18, 8863, 6265, 16417, 4050, 4073, 6767, 4283, 15119, 4083, 4007, 30524, 2084, 4112, 8538, 4147, 27233, 35, 18790, 4112, 24, 18, 23, 10749, 6465, 4608, 6216, 18, 3, 11946, 4279, 17164, 6479, 3757, 4467, 4172, 6304, 7090, 4151, 4076, 4114, 5012, 7933, 35, 3], segment_id=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], gating_id=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], target_ids=[[21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3]])

--> gating_id가 전부 0이고 taret_id가 전부 똑같다(전부 'none'임).

dev_labels['wild-bonus-5601:식당_택시_12-2'] = ['식당-가격대-dontcare', '식당-지역-서울 북쪽', '식당-종류-중식당', '식당-주차 가능-yes', '식당-주류 판매-yes']

'''
print('hoho')

Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors


hoho


In [6]:


train_data = WOSDataset(train_features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(
    train_data,
    batch_size=3,
    sampler=train_sampler,
    collate_fn=processor.collate_fn,
)
'''collate_fn에서 한 배치안에서 가장 긴 input_id의 길이를 기준으로 다른 input_id를 패딩한다. 
'''
print("# train:", len(train_data))

dev_data = WOSDataset(dev_features)
dev_sampler = SequentialSampler(dev_data)
dev_loader = DataLoader(
    dev_data,
    batch_size=3,
    sampler=dev_sampler,
    collate_fn=processor.collate_fn,
)
print("# dev:", len(dev_data))

# train: 46287
# dev: 4958


In [9]:
import importlib
import model_transformer
import inference
importlib.reload(model_transformer)
importlib.reload(inference)
from inference import inference
from model_transformer import TRADE, masked_cross_entropy_for_value

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Model 선언
model = TRADE(args, tokenized_slot_meta)
# model.set_subword_embedding(args.model_name_or_path)  # Subword Embedding 초기화
# print(f"Subword Embeddings is loaded from {args.model_name_or_path}")
model.to(device)
print("Model is initialized")

# Optimizer 및 Scheduler 선언
n_epochs = args.num_train_epochs
t_total = len(train_loader) * n_epochs
warmup_steps = int(t_total * args.warmup_ratio)
optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

loss_fnc_1 = masked_cross_entropy_for_value  # generation
loss_fnc_2 = nn.CrossEntropyLoss()

Model is initialized


In [9]:
## reference하는 변수가 없는 오브젝트들을 gpu 메모리에서 free시킨다.
torch.cuda.empty_cache()

In [18]:
import pdb

In [None]:
best_score, best_checkpoint = 0, 0
for epoch in range(n_epochs):
    model.train()
    for step, batch in enumerate(train_loader):
        input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
            b.to(device) if not isinstance(b, list) else b for b in batch
        ] ## b가 list가 아니면 b.to(device)를 하고 b가 list면 그냥 b를 쓴다.

        all_point_outputs, all_gate_outputs = model(
            input_ids, target_ids, segment_ids, input_masks, 
        )
        pdb.set_trace()

        # generation loss
        loss_1 = loss_fnc_1(
            all_point_outputs.contiguous(),
            target_ids.contiguous().view(-1),
            tokenizer.pad_token_id,
        )

        # gating loss
        loss_2 = loss_fnc_2(
            all_gate_outputs.contiguous().view(-1, args.n_gate),
            gating_ids.contiguous().view(-1),
        )
        loss = loss_1 + loss_2

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        predictions = inference(model, dev_loader, processor, device)
        eval_result = _evaluation(predictions, dev_labels, slot_meta)
        torch.cuda.empty_cache()

> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(422)[0;36mforward[0;34m()[0m
[0;32m    420 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    421 [0;31m        [0;31m# dom_slot자리를 고려하여 causal_mask를 한칸더 크게 만들기 - 했음.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 422 [0;31m        [0mtrg_len[0m [0;34m=[0m [0mtarget_ids[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    423 [0;31m        [0mcausal_mask[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mgenerate_causal_mask[0m[0;34m([0m[0mtrg_len[0m[0;34m+[0m[0;36m1[0m[0;34m)[0m [0;31m# shape (trg_len+1, trg_len+1)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    424 [0;31m        [0;31m## causal_mask는 parallel decoding을 위한 처리를 따로 할 필요없다. 알아서 브로드캐스팅 됨. MultiHeadAttention의 코드 참고.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(423)[0;36mforward[0;34m()[0m
[0;32m    421 [0;31m        [0;31m# dom_slot자리를 고려하여 causal_mask를 한칸더 크게 만들기 - 했음.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    422 [0;31m        [0mtrg_len[0m [0;34m=[0m [0mtarget_ids[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 423 [0;31m        [0mcausal_mask[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mgenerate_causal_mask[0m[0;34m([0m[0mtrg_len[0m[0;34m+[0m[0;36m1[0m[0;34m)[0m [0;31m# shape (trg_len+1, trg_len+1)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    424 [0;31m        [0;31m## causal_mask는 parallel decoding을 위한 처리를 따로 할 필요없다. 알아서 브로드캐스팅 됨. MultiHeadAttention의 코드 참고.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    425 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(429)[0;36mforward[0;34m()[0m
[0;32m    427 [0;31m        [0;31m#####################################[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    428 [0;31m[0;34m[0m[0m
[0m[0;32m--> 429 [0;31m        [0minput_masks[0m [0;34m=[0m [0minput_masks[0m[0;34m.[0m[0mne[0m[0;34m([0m[0;36m1[0m[0;34m)[0m [0;31m## input_masks의 True와 False를 반전시킨다.(True는 False로, False는 True로)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    430 [0;31m        [0;34m'''transformer 라이브러리의 BERT에서 쓰는 패딩마스크는 [True, True, True, True,True, False, False, False] 와 같은 형태임. 내가 짠 transformer코드에서는 [False, False, False, False, False, True, True, True]같은 형태를 쓰므로 토글시켜줘야 한다.'''[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    431 [0;31m        [0;31m# J, slot_meta : key : [domain, slot] ex> LongTensor([1,2])[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(434)[0;36mforward[0;34m()[0m
[0;32m    432 [0;31m        [0;31m# J,2[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    433 [0;31m[0;34m[0m[0m
[0m[0;32m--> 434 [0;31m        [0mslot[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mLongTensor[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mslot_embed_idx[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0minput_ids[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    435 [0;31m        '''domain-slot을 토크나이징하여 얻은 인덱스들에 패딩까지 넣은것이 slot임
[0m[0;32m    436 [0;31m        [0mshape[0m [0;34m([0m[0mJ[0m[0;34m,[0m [0;36m4[0m[0;34m)[0m[0;34m.[0m [0m현재[0m [0m데이터에서는[0m [0;36m4[0m[0m가[0m [0m토크나이징된[0m [0mdomain[0m[0;34m-[0m[0mslot들[0m [0m중에[0m [0m가장[0m [0m긴[0m [0m것이다[0m[0;34m.[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(438)[0;36mforward[0;34m()[0m
[0;32m    436 [0;31m        [0mshape[0m [0;34m([0m[0mJ[0m[0;34m,[0m [0;36m4[0m[0;34m)[0m[0;34m.[0m [0m현재[0m [0m데이터에서는[0m [0;36m4[0m[0m가[0m [0m토크나이징된[0m [0mdomain[0m[0;34m-[0m[0mslot들[0m [0m중에[0m [0m가장[0m [0m긴[0m [0m것이다[0m[0;34m.[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    437 [0;31m        '''
[0m[0;32m--> 438 [0;31m        [0mslot_e[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0msum[0m[0;34m([0m[0mself[0m[0;34m.[0m[0membedding[0m[0;34m([0m[0mslot[0m[0;34m)[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m  [0;31m# (J, embedding_dim = 768)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    439 [0;31m        '''self.embedding(slot).size() = torch.Size([J, slot_length, hidden_size])
[0m[0;32m    440 [0;31m        [0m한[0m [0mdomain[0m[0;34m-[0m[0mslot에[0m [0m대한[0m [0m모든[0m [0m토큰들의[0m [0m임베딩벡터를[0m [0m합친다[0m[0;34m.[0m 

ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(443)[0;36mforward[0;34m()[0m
[0;32m    441 [0;31m        [0;34m-[0m[0;34m->[0m [0mslot_e[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0m은[0m [0;36m0[0m[0m번째[0m [0mslot의[0m [0membedding[0m [0mvector인[0m [0m것임[0m[0;34m.[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    442 [0;31m        '''
[0m[0;32m--> 443 [0;31m        [0mbatch_size[0m [0;34m=[0m [0mencoder_output[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    444 [0;31m        [0mJ[0m[0;34m,[0m [0mhidden_size[0m [0;34m=[0m [0mslot_e[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    445 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(444)[0;36mforward[0;34m()[0m
[0;32m    442 [0;31m        '''
[0m[0;32m    443 [0;31m        [0mbatch_size[0m [0;34m=[0m [0mencoder_output[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 444 [0;31m        [0mJ[0m[0;34m,[0m [0mhidden_size[0m [0;34m=[0m [0mslot_e[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    445 [0;31m[0;34m[0m[0m
[0m[0;32m    446 [0;31m        [0;31m##inputs_embed에 dom_slot에 대한 embeding vecotor를 맨 앞에 넣어준다.(J*batch_size, trg_len+1, hidden_size)로 만들어야 함.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(447)[0;36mforward[0;34m()[0m
[0;32m    445 [0;31m[0;34m[0m[0m
[0m[0;32m    446 [0;31m        [0;31m##inputs_embed에 dom_slot에 대한 embeding vecotor를 맨 앞에 넣어준다.(J*batch_size, trg_len+1, hidden_size)로 만들어야 함.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 447 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mzeros[0m[0;34m([0m[0mJ[0m[0;34m*[0m[0mbatch_size[0m[0;34m,[0m [0mtrg_len[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0mhidden_size[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0minput_ids[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m [0;31m# (J*batch_size, trg_len+1, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    448 [0;31m        [0mslot_e[0m [0;34m=[0m [0mslot_e[0m[0;34m.[0m[0mrepeat[0m[0;34m([0m[0mbatch_size[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m [0;31m# (J*batch_size, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m

ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(448)[0;36mforward[0;34m()[0m
[0;32m    446 [0;31m        [0;31m##inputs_embed에 dom_slot에 대한 embeding vecotor를 맨 앞에 넣어준다.(J*batch_size, trg_len+1, hidden_size)로 만들어야 함.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    447 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mzeros[0m[0;34m([0m[0mJ[0m[0;34m*[0m[0mbatch_size[0m[0;34m,[0m [0mtrg_len[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0mhidden_size[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0minput_ids[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m [0;31m# (J*batch_size, trg_len+1, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 448 [0;31m        [0mslot_e[0m [0;34m=[0m [0mslot_e[0m[0;34m.[0m[0mrepeat[0m[0;34m([0m[0mbatch_size[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m [0;31m# (J*batch_size, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    449 [0;31m        [0mtarget_ids[0m 

ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(449)[0;36mforward[0;34m()[0m
[0;32m    447 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mzeros[0m[0;34m([0m[0mJ[0m[0;34m*[0m[0mbatch_size[0m[0;34m,[0m [0mtrg_len[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0mhidden_size[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0minput_ids[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m [0;31m# (J*batch_size, trg_len+1, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    448 [0;31m        [0mslot_e[0m [0;34m=[0m [0mslot_e[0m[0;34m.[0m[0mrepeat[0m[0;34m([0m[0mbatch_size[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m [0;31m# (J*batch_size, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 449 [0;31m        [0mtarget_ids[0m [0;34m=[0m [0mtarget_ids[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0mJ[0m[0;34m*[0m[0mbatch_size[0m[0;34m,[0m [0;34m-[0m[0;36m1[0m[0;34m)[0m [0;31m## (J*batch_size, trg

ipdb>  target_ids


tensor([[[21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [ 9459,     3,     0,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [   93,  6756,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [21832, 11764,     3,     0,     0],
         [ 6265,  9778,     3,    

ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(450)[0;36mforward[0;34m()[0m
[0;32m    448 [0;31m        [0mslot_e[0m [0;34m=[0m [0mslot_e[0m[0;34m.[0m[0mrepeat[0m[0;34m([0m[0mbatch_size[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m [0;31m# (J*batch_size, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    449 [0;31m        [0mtarget_ids[0m [0;34m=[0m [0mtarget_ids[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0mJ[0m[0;34m*[0m[0mbatch_size[0m[0;34m,[0m [0;34m-[0m[0;36m1[0m[0;34m)[0m [0;31m## (J*batch_size, trg_len)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 450 [0;31m        [0mtargets_embed[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0membed[0m[0;34m([0m[0mtarget_ids[0m[0;34m)[0m [0;31m## (J*batch_size, trg_len, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    451 [0;31m[0;34m[0m[0m
[0m[0;32m    452 [0;31m        [0mdecoder_input[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m1[0m[0;34m:[0m[0

ipdb>  target_ids


tensor([[21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [ 9459,     3,     0,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [   93,  6756,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [21832, 11764,     3,     0,     0],
        [ 6265,  9778,     3,     0,     0],
        [2

ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(452)[0;36mforward[0;34m()[0m
[0;32m    450 [0;31m        [0mtargets_embed[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0membed[0m[0;34m([0m[0mtarget_ids[0m[0;34m)[0m [0;31m## (J*batch_size, trg_len, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    451 [0;31m[0;34m[0m[0m
[0m[0;32m--> 452 [0;31m        [0mdecoder_input[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m1[0m[0;34m:[0m[0;34m,[0m[0;34m:[0m[0;34m][0m [0;34m=[0m [0mtargets_embed[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    453 [0;31m        [0mdecoder_input[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m0[0m[0;34m,[0m[0;34m:[0m[0;34m][0m [0;34m=[0m [0mslot_e[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    454 [0;31m        '''첫번째 데이터에 대한 slot1 의 value벡터(trg_len+1, hidden_size)가 나오고 그다음
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(453)[0;36mforward[0;34m()[0m
[0;32m    451 [0;31m[0;34m[0m[0m
[0m[0;32m    452 [0;31m        [0mdecoder_input[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m1[0m[0;34m:[0m[0;34m,[0m[0;34m:[0m[0;34m][0m [0;34m=[0m [0mtargets_embed[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 453 [0;31m        [0mdecoder_input[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m0[0m[0;34m,[0m[0;34m:[0m[0;34m][0m [0;34m=[0m [0mslot_e[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    454 [0;31m        '''첫번째 데이터에 대한 slot1 의 value벡터(trg_len+1, hidden_size)가 나오고 그다음
[0m[0;32m    455 [0;31m        [0m첫번째[0m [0m데이터에[0m [0m대한[0m [0mslot2의[0m [0mvalue벡터[0m[0;34m([0m[0mtrg_len[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0mhidden_size[0m[0;34m)[0m[0m가[0m [0m나오고[0m[0;34m...[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(462)[0;36mforward[0;34m()[0m
[0;32m    460 [0;31m[0;34m[0m[0m
[0m[0;32m    461 [0;31m        [0;31m##dom_slot을 고려하여 한칸 만큼 더 큰 pos_embed를 만든다. -> 했음.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 462 [0;31m        [0mpos_embed[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0membed_positions[0m[0;34m([0m[0mtarget_ids[0m[0;34m)[0m [0;31m## (J*batch_size, trg_len+1, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    463 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mdecoder_input[0m [0;34m+[0m [0mpos_embed[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    464 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0mdropout[0m[0;34m([0m[0mdecoder_input[0m[0;34m,[0m [0mp[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mattention_drop_out[0m[0;34m,[0m [0mtraining[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtraining[0m[0;34m)[0m[0;34m[0m[0;34m[0m[

ipdb>  decoder_input[:,0,:3]


tensor([[-0.0447, -0.1087, -0.0159],
        [-0.0150, -0.0414, -0.0780],
        [ 0.1801,  0.0679, -0.0746],
        [ 0.1522, -0.0320,  0.0410],
        [ 0.0706,  0.0852, -0.1165],
        [ 0.1603,  0.1618, -0.1339],
        [ 0.1611,  0.0808, -0.0767],
        [ 0.1622,  0.0553, -0.1443],
        [ 0.2025,  0.0596, -0.1153],
        [ 0.1520,  0.1092, -0.1678],
        [ 0.1547,  0.1440, -0.1313],
        [ 0.1407, -0.0244, -0.2119],
        [ 0.1015, -0.0085, -0.1874],
        [ 0.1322, -0.0368, -0.1185],
        [ 0.1528,  0.0038, -0.0061],
        [ 0.0697,  0.0268, -0.0932],
        [ 0.1349,  0.2378, -0.1905],
        [ 0.1374,  0.2059, -0.0642],
        [ 0.1769,  0.2451, -0.1319],
        [ 0.1357,  0.1568, -0.1334],
        [ 0.1368,  0.1313, -0.2010],
        [ 0.1772,  0.1356, -0.1720],
        [ 0.1460, -0.0657,  0.0242],
        [ 0.1880,  0.2472, -0.1353],
        [ 0.1605,  0.0937, -0.1446],
        [ 0.1631,  0.1285, -0.1081],
        [ 0.0675, -0.1390, -0.0153],
 

ipdb>  decoder_input[0,:,0:3]


tensor([[-0.0447, -0.1087, -0.0159],
        [-0.0521,  0.0296,  0.0271],
        [ 0.0053, -0.0106,  0.0190],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  decoder_input[1,:,0:3]


tensor([[-0.0150, -0.0414, -0.0780],
        [-0.0521,  0.0296,  0.0271],
        [ 0.0053, -0.0106,  0.0190],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  targets_embed[0,:,0:3]


tensor([[-0.0521,  0.0296,  0.0271],
        [ 0.0053, -0.0106,  0.0190],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  targets_embed[9,:,0:3]


tensor([[ 0.0227, -0.0157, -0.0697],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  decoder_input[9,:,0:3]


tensor([[ 0.1520,  0.1092, -0.1678],
        [ 0.0227, -0.0157, -0.0697],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  slot_e[9,:3]


tensor([ 0.1520,  0.1092, -0.1678], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  decoder_input[9+45,:,0:3]


tensor([[ 0.1520,  0.1092, -0.1678],
        [-0.0691,  0.0293, -0.0100],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(463)[0;36mforward[0;34m()[0m
[0;32m    461 [0;31m        [0;31m##dom_slot을 고려하여 한칸 만큼 더 큰 pos_embed를 만든다. -> 했음.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    462 [0;31m        [0mpos_embed[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0membed_positions[0m[0;34m([0m[0mtarget_ids[0m[0;34m)[0m [0;31m## (J*batch_size, trg_len+1, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 463 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mdecoder_input[0m [0;34m+[0m [0mpos_embed[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    464 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0mdropout[0m[0;34m([0m[0mdecoder_input[0m[0;34m,[0m [0mp[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mattention_drop_out[0m[0;34m,[0m [0mtraining[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtraining[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    465 [0;31m[0;34m[0m[

ipdb>  pos_enbed[:,:,0:3]


*** NameError: name 'pos_enbed' is not defined


ipdb>  pos_embed[:,:,0:3]


*** IndexError: too many indices for tensor of dimension 2


ipdb>  target_ids.size()


torch.Size([135, 5])


ipdb>  positions.size()


*** NameError: name 'positions' is not defined


ipdb>  pos_embed.size()


torch.Size([6, 768])


ipdb>  pos_embed[:,:3]


tensor([[ 0.0000,  1.0000,  0.0000],
        [ 0.8415,  0.5403,  0.8284],
        [ 0.9093, -0.4161,  0.9280],
        [ 0.1411, -0.9900,  0.2111],
        [-0.7568, -0.6536, -0.6915],
        [-0.9589,  0.2837, -0.9857]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  decoder_input[0,:,:3]


tensor([[-0.0447, -0.1087, -0.0159],
        [-0.0521,  0.0296,  0.0271],
        [ 0.0053, -0.0106,  0.0190],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  decoder_input[1,:,:3]


tensor([[-0.0150, -0.0414, -0.0780],
        [-0.0521,  0.0296,  0.0271],
        [ 0.0053, -0.0106,  0.0190],
        [-0.0117, -0.0335,  0.0225],
        [ 0.0405,  0.0565, -0.0827],
        [ 0.0405,  0.0565, -0.0827]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(464)[0;36mforward[0;34m()[0m
[0;32m    462 [0;31m        [0mpos_embed[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0membed_positions[0m[0;34m([0m[0mtarget_ids[0m[0;34m)[0m [0;31m## (J*batch_size, trg_len+1, hidden_size)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    463 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mdecoder_input[0m [0;34m+[0m [0mpos_embed[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 464 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0mdropout[0m[0;34m([0m[0mdecoder_input[0m[0;34m,[0m [0mp[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mattention_drop_out[0m[0;34m,[0m [0mtraining[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtraining[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    465 [0;31m[0;34m[0m[0m
[0m[0;32m    466 [0;31m        [0minput_ids[0m [0;34m=[0m [0minput_ids[0m[0;34m.[0m[0mrepeat_interleave[0m[0;34m([

ipdb>  decoder_input[0,:,:3]


tensor([[-0.0447,  0.8913, -0.0159],
        [ 0.7894,  0.5699,  0.8555],
        [ 0.9146, -0.4268,  0.9470],
        [ 0.1294, -1.0235,  0.2336],
        [-0.7163, -0.5971, -0.7742],
        [-0.9184,  0.3402, -1.0684]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  decoder_input[1,:,:3]


tensor([[-0.0150,  0.9586, -0.0780],
        [ 0.7894,  0.5699,  0.8555],
        [ 0.9146, -0.4268,  0.9470],
        [ 0.1294, -1.0235,  0.2336],
        [-0.7163, -0.5971, -0.7742],
        [-0.9184,  0.3402, -1.0684]], device='cuda:0', grad_fn=<SliceBackward>)


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(466)[0;36mforward[0;34m()[0m
[0;32m    464 [0;31m        [0mdecoder_input[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0mdropout[0m[0;34m([0m[0mdecoder_input[0m[0;34m,[0m [0mp[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mattention_drop_out[0m[0;34m,[0m [0mtraining[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtraining[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    465 [0;31m[0;34m[0m[0m
[0m[0;32m--> 466 [0;31m        [0minput_ids[0m [0;34m=[0m [0minput_ids[0m[0;34m.[0m[0mrepeat_interleave[0m[0;34m([0m[0mJ[0m[0;34m,[0m [0mdim[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    467 [0;31m        '''(J*batch_size, src_len)
[0m[0;32m    468 [0;31m        [0m첫번째[0m [0m데이터에[0m [0m대한[0m [0minput_id가[0m [0mJ번[0m [0m반복되고[0m [0m두번째[0m [0m데이터에[0m [0m대한[0m [0minput_id가[0m [0mJ번[0m [0m반복되고[0m [0;34m...[0m[0;34m[0m[0;34m

ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(471)[0;36mforward[0;34m()[0m
[0;32m    469 [0;31m        '''        
[0m[0;32m    470 [0;31m[0;34m[0m[0m
[0m[0;32m--> 471 [0;31m        [0minput_masks[0m [0;34m=[0m [0minput_masks[0m[0;34m.[0m[0mrepeat_interleave[0m[0;34m([0m[0mJ[0m[0;34m,[0m [0mdim[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    472 [0;31m        '''(J*batch_size, seq_len)
[0m[0;32m    473 [0;31m        첫번째 데이터에 대한 input_mask가 J번 반복되고 두번째 데이터에 대한 input_mask가 J번 반복되고 ...'''
[0m


ipdb>  input_ids[:,:3]


tensor([[    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        [    2,     3, 11655],
        

ipdb>  input_ids.size()


torch.Size([135, 446])


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(474)[0;36mforward[0;34m()[0m
[0;32m    472 [0;31m        '''(J*batch_size, seq_len)
[0m[0;32m    473 [0;31m        첫번째 데이터에 대한 input_mask가 J번 반복되고 두번째 데이터에 대한 input_mask가 J번 반복되고 ...'''
[0m[0;32m--> 474 [0;31m        [0mencoder_output[0m [0;34m=[0m [0mencoder_output[0m[0;34m.[0m[0mrepeat_interleave[0m[0;34m([0m[0mJ[0m[0;34m,[0m [0mdim[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    475 [0;31m        '''(J*batch_size, src_len, hidden_size)첫번째 데이터에 대한 (1, src_len, hidden_size)가 J번 반복되고 
[0m[0;32m    476 [0;31m        [0m그[0m [0m다음[0m [0m두번째[0m [0m데이터에[0m [0m대한[0m [0;34m([0m[0;36m1[0m[0;34m,[0m [0mseq_len[0m[0;34m,[0m [0mhidden_size[0m[0;34m)[0m[0m가[0m [0mJ번[0m [0m반복되고[0m [0;34m...[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/opt/ml/code/TRADE_transformer_decoder/model_transformer.py[0m(479)[0;36mforward[0;34m()[0m
[0;32m    477 [0;31m        '''
[0m[0;32m    478 [0;31m[0;34m[0m[0m
[0m[0;32m--> 479 [0;31m        [0menc_dec_attn_weights[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    480 [0;31m        [0;32mfor[0m [0mdecoder_layer[0m [0;32min[0m [0mself[0m[0;34m.[0m[0mlayers[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    481 [0;31m            decoder_input, _, attn_weights = decoder_layer(decoder_input, 
[0m


ipdb>  target_embed[:,:,:3]


*** NameError: name 'target_embed' is not defined


ipdb>  targets_embed[:,:,:3]


tensor([[[-0.0521,  0.0296,  0.0271],
         [ 0.0053, -0.0106,  0.0190],
         [-0.0117, -0.0335,  0.0225],
         [ 0.0405,  0.0565, -0.0827],
         [ 0.0405,  0.0565, -0.0827]],

        [[-0.0521,  0.0296,  0.0271],
         [ 0.0053, -0.0106,  0.0190],
         [-0.0117, -0.0335,  0.0225],
         [ 0.0405,  0.0565, -0.0827],
         [ 0.0405,  0.0565, -0.0827]],

        [[-0.0521,  0.0296,  0.0271],
         [ 0.0053, -0.0106,  0.0190],
         [-0.0117, -0.0335,  0.0225],
         [ 0.0405,  0.0565, -0.0827],
         [ 0.0405,  0.0565, -0.0827]],

        ...,

        [[ 0.0534,  0.0643,  0.0315],
         [ 0.1325,  0.0267, -0.0237],
         [-0.0600, -0.0250,  0.0295],
         [ 0.0294, -0.0456, -0.0897],
         [-0.0117, -0.0335,  0.0225]],

        [[-0.0066,  0.0227, -0.0155],
         [-0.0577,  0.0336, -0.0785],
         [ 0.0541,  0.0101,  0.0498],
         [-0.0117, -0.0335,  0.0225],
         [ 0.0405,  0.0565, -0.0827]],

        [[ 0.0866, -0.0871