In [1]:
import os
import sys
import json
import wandb
import random
import numpy as np
from importlib import import_module
from sklearn.model_selection import StratifiedKFold, train_test_split
from collections import defaultdict

sys.path.insert(0, "../CustomizedModule")
from CustomizedScheduler import get_scheduler
from CustomizedOptimizer import get_optimizer

import torch
import torch.nn as nn
from tqdm import tqdm

from eval_utils import DSTEvaluator
from evaluation import _evaluation
from inference import inference_TRADE
from data_utils import train_data_loading, get_data_loader

from preprocessor import TRADEPreprocessor
from model import TRADE
from criterions import LabelSmoothingLoss, masked_cross_entropy_for_value


In [2]:
def get_informations(args):
    # Define Tokenizer
    tokenizer_module = getattr(
        import_module("transformers"), f"{args.model_name}Tokenizer"
    )
    tokenizer = tokenizer_module.from_pretrained(args.pretrained_name_or_path)

    slot_meta, train_examples, train_labels = train_data_loading(args, isUserFirst=False, isDialogueLevel=False)

    # Define Preprocessor
    processor = TRADEPreprocessor(slot_meta, tokenizer)

    # Extract Features
    train_features = processor.convert_examples_to_features(train_examples)

    # Slot Meta tokenizing for the decoder initial inputs
    tokenized_slot_meta = []
    for slot in slot_meta:
        tokenized_slot_meta.append(
            tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
        )

    args.vocab_size = len(tokenizer)
    args.n_gate = len(processor.gating2id)  # gating 갯수 none, dontcare, ptr

    # json.dump(
    #     vars(args),
    #     open(f"{args.model_dir}/{args.model_fold}/exp_config.json", "w"),
    #     indent=2,
    #     ensure_ascii=False,
    # )
    # json.dump(
    #     slot_meta,
    #     open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "w"),
    #     indent=2,
    #     ensure_ascii=False,
    # )
    return tokenizer, processor, slot_meta, tokenized_slot_meta, train_features, train_labels
    

In [21]:
list(train_labels.values())[0]

['관광-종류-박물관', '관광-지역-서울 중앙']

In [12]:
from argparse import Namespace
args = Namespace()
args.model_name = "Bert"
args.pretrained_name_or_path = "dsksd/bert-ko-small-minimal"
args.model_dir = "../models"
args.model_fold = "trade-kfold"
args.data_dir = "../input/data/train_dataset"

tokenizer, processor, slot_meta, tokenized_slot_meta, train_features, train_labels = get_informations(args)

100%|██████████| 7000/7000 [00:00<00:00, 16035.25it/s]
[Conversion: Examples > Features]:   0%|          | 51/51245 [00:00<05:04, 167.94it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
[Conversion: Examples > Features]: 100%|██████████| 51245/51245 [06:02<00:00, 141.28it/s]


In [13]:
def select_kfold_or_full(args, tokenizer, processor, slot_meta, tokenized_slot_meta, features, labels):
    domain_group = {
        '관광_식당':0,
        '관광':1,
        '지하철':2,
        '택시':3,
        '식당_택시':4,
        '숙소_택시':5,
        '식당':6,
        '숙소_식당':7,
        '숙소':8,
        '관광_택시':9,
        '관광_숙소_식당':10,
        '관광_숙소':11,
        '숙소_식당_택시':12,
        '관광_식당_택시':13,
        '관광_숙소_택시':14
    }

    features = np.array(features)
    dialogue_features, dialogue_labels, domain_labels = defaultdict(list), defaultdict(list), []
    for f in features:
        dialogue = '-'.join(f.guid.split('-')[:-1])
        # TODO: 의심스럽... 순서대로 들어갈까..? 순서대로가 아니어도 괜찮기는 함.. 흠
        # dialogue_features는 키 값이 dialogue_idx이고 value가 feature들의 리스트로 이루어짐
        dialogue_features[dialogue].append(f)

    for k, v in dialogue_features.items():
        feature_domain = '_'.join(sorted(v[0].domain))
        if '지하철' in feature_domain:
            feature_domain = '지하철'
        domain_labels.append(domain_group[feature_domain])

    for k, v in labels.items():
        dialogue_labels['-'.join(k.split('-')[:-1])].append([k, v])

    if args.isKfold:
        kf = StratifiedKFold(n_splits=args.fold_num, random_state=args.seed, shuffle=True)
        fold_idx = 1
        
        print(len(features))
        print(len(domain_labels))

        for train_index, dev_index in kf.split(list(dialogue_features.keys()), domain_labels):
            # 찾은듯!! domain_labels는 길이가 domain 갯수 만큼이고, features는 turn 별 값일 것이다
            # TODO: 여기서 출력 해보고 확인한자 - 갯수 달랐음
            # os.makedirs(f'{args.model_dir}/{args.model_fold}/{fold_idx}-fold', exist_ok=True)

            train_dialogue_features, dev_dialogue_features = np.array(list(dialogue_features.items()))[train_index.astype(int)], np.array(list(dialogue_features.items()))[dev_index.astype(int)]
            
            train_features, dev_features = [], []
            [train_features.extend(t[1]) for t in train_dialogue_features]
            [dev_features.extend(t[1]) for t in dev_dialogue_features]

            dev_dialogue_labels = {k: labels[k] for k in dev_features}
            dev_dialogue_labels = np.array(list(dialogue_labels.items()))[dev_index.astype(int)]
            dev_labels = {t[0]:t[1] for turn in dev_dialogue_labels[:, 1] for t in turn}

            train_loader = get_data_loader(processor, train_features, args.train_batch_size)
            dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)

            print(f"========= {fold_idx} fold =========")
            train_model(args, tokenizer, processor, slot_meta, tokenized_slot_meta, fold_idx, train_loader, dev_loader, dev_labels)
            fold_idx += 1
        
    else:
        fold_idx = None
        train_index, dev_index = train_test_split(np.array(range(len(dialogue_features))), test_size=0.1, random_state=args.seed, stratify=domain_labels)
        
        train_dialogue_features, dev_dialogue_features = np.array(list(dialogue_features.items()))[train_index.astype(int)], np.array(list(dialogue_features.items()))[dev_index.astype(int)]

        train_features, dev_features = [], []
        [train_features.extend(t[1]) for t in train_dialogue_features]
        [dev_features.extend(t[1]) for t in dev_dialogue_features]

        dev_dialogue_labels = np.array(list(dialogue_labels.items()))[dev_index.astype(int)]
        dev_labels = {t[0]:t[1] for turn in dev_dialogue_labels[:, 1] for t in turn}

        train_loader = get_data_loader(processor, train_features, args.train_batch_size)
        dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)

        train_model(args, tokenizer, processor, slot_meta, tokenized_slot_meta, fold_idx, train_loader, dev_loader, dev_labels)
    return train_loader, dev_loader

In [17]:
args.isKfold = True
args.fold_num = 5
args.seed = 42
select_kfold_or_full(args, tokenizer, processor, slot_meta, tokenized_slot_meta, train_features, train_labels)


51245
7000


In [23]:
dialogue_features, dialogue_labels, domain_labels = defaultdict(list), defaultdict(list), []
for f in train_features:
    dialogue = '-'.join(f.guid.split('-')[:-1])
    # TODO: 의심스럽... 순서대로 들어갈까..? 순서대로가 아니어도 괜찮기는 함.. 흠
    # dialogue_features는 키 값이 dialogue_idx이고 value가 feature들의 리스트로 이루어짐
    dialogue_features[dialogue].append(f)

In [24]:
len(dialogue_features)

7000

In [28]:
list(dialogue_features.keys())[0]

'snowy-hat-8324:관광_식당_11'

In [32]:
dialogue_features['snowy-hat-8324:관광_식당_11'][3].guid

'snowy-hat-8324:관광_식당_11-3'

In [35]:
features = np.array(train_features)
labels = train_labels

domain_group = {
        '관광_식당':0,
        '관광':1,
        '지하철':2,
        '택시':3,
        '식당_택시':4,
        '숙소_택시':5,
        '식당':6,
        '숙소_식당':7,
        '숙소':8,
        '관광_택시':9,
        '관광_숙소_식당':10,
        '관광_숙소':11,
        '숙소_식당_택시':12,
        '관광_식당_택시':13,
        '관광_숙소_택시':14
    }
dialogue_features, dialogue_labels, domain_labels = defaultdict(list), defaultdict(list), []
for f in features:
    dialogue = '-'.join(f.guid.split('-')[:-1])
    dialogue_features[dialogue].append(f)

for k, v in dialogue_features.items():
    feature_domain = '_'.join(sorted(v[0].domain))
    if '지하철' in feature_domain:
        feature_domain = '지하철'
    domain_labels.append(domain_group[feature_domain])

for k, v in labels.items():
    dialogue_labels['-'.join(k.split('-')[:-1])].append([k, v])

In [53]:
if True:
    kf = StratifiedKFold(n_splits=args.fold_num, random_state=args.seed, shuffle=True)
    fold_idx = 1
    
    for train_index, dev_index in kf.split(list(features), domain_labels):
        print(train_index, dev_index)
        print(len(train_index), len(dev_index))
        # os.makedirs(f'{args.model_dir}/{args.model_fold}/{fold_idx}-fold', exist_ok=True)

        # train_dialogue_keys, dev_dialogue_keys = np.array(list(dialogue_features.keys()))[train_index.astype(int)], np.array(list(dialogue_features.keys()))[dev_index.astype(int)]
            
        # train_features, dev_features = [], []
        # [train_features.extend(dialogue_features[dialogue_id]) for dialogue_id in train_dialogue_keys]
        # [dev_features.extend(dialogue_features[dialogue_id]) for dialogue_id in dev_dialogue_keys]

        # dev_labels = {f.guid: labels[f.guid] for f in dev_features}
        # train_loader = get_data_loader(processor, train_features, args.train_batch_size)
        # dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)
 
        break

ValueError: Found input variables with inconsistent numbers of samples: [51245, 7000]

In [38]:
print(len(train_features))
print(len(dev_features))

40952
10293


In [39]:
print(len(features))

51245


In [41]:
dev_dialogue_labels_2 = np.array(list(dialogue_labels.items()))[dev_index.astype(int)]
dev_labels_2 = {t[0]:t[1] for turn in dev_dialogue_labels[:, 1] for t in turn}

In [43]:
dev_features[0]

OpenVocabDSTFeature(guid='summer-voice-4986:식당_관광_9-0', domain=['식당', '관광'], input_id=[2, 3, 10238, 27672, 4234, 15532, 4403, 4292, 3430, 4219, 5158, 7933, 3], segment_id=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], gating_id=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], target_ids=[[21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [21832, 11764, 3], [2

In [49]:
dev_labels_3 = {f.guid: labels[f.guid] for f in dev_features}
# labels['summer-voice-4986:식당_관광_9-0']

In [50]:
for k, v in dev_labels_3.items():
    if dev_labels_2[k] != v:
        print("..")