In [1]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
#-*- coding: utf-8 -*-
#import tensorflow as tf

tf.app.flags.DEFINE_string('f', '', 'kernel') # 주피터에서 커널에 전달하기 위한 프레그 방법
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size') # 배치 크기
tf.app.flags.DEFINE_integer('train_steps', 20000, 'train steps') # 학습 에포크
tf.app.flags.DEFINE_float('dropout_width', 0.5, 'dropout width') # 드롭아웃 크기
tf.app.flags.DEFINE_integer('layer_size', 3, 'layer size') # 멀티 레이어 크기 (multi rnn)
tf.app.flags.DEFINE_integer('hidden_size', 128, 'weights size') # 가중치 크기
tf.app.flags.DEFINE_float('learning_rate', 1e-3, 'learning rate') # 학습률
tf.app.flags.DEFINE_string('data_path', './../data_in/ChatBotData.csv', 'data path') #  데이터 위치
tf.app.flags.DEFINE_string('vocabulary_path', './data_out/vocabularyData.voc', 'vocabulary path') # 사전 위치
tf.app.flags.DEFINE_string('check_point_path', './data_out/check_point', 'check point path') # 체크 포인트 위치
tf.app.flags.DEFINE_integer('shuffle_seek', 1000, 'shuffle random seek') # 셔플 시드값
tf.app.flags.DEFINE_integer('max_sequence_length', 25, 'max sequence length') # 시퀀스 길이
tf.app.flags.DEFINE_integer('embedding_size', 128, 'embedding size') # 임베딩 크기
tf.app.flags.DEFINE_boolean('tokenize_as_morph', True, 'set morph tokenize') # 형태소에 따른 토크나이징 사용 유무
tf.app.flags.DEFINE_boolean('embedding', True, 'Use Embedding flag') # 임베딩 유무 설정
tf.app.flags.DEFINE_boolean('multilayer', True, 'Use Multi RNN Cell') # 멀티 RNN 유무
# Define FLAGS
DEFINES = tf.app.flags.FLAGS

In [None]:
#if __name__ == '__main__':
#    tf.logging.set_verbosity(tf.logging.INFO)
#    tf.app.run(main)

# data.py 

In [3]:
from konlpy.tag import Okt
import pandas as pd
#import tensorflow as tf
import enum
import os
import re
from sklearn.model_selection import train_test_split
import numpy as np
#from configs import DEFINES
from tqdm import tqdm

In [4]:
FILTERS = "([~.,!?\"':;)(])"
PAD = "<PADDING>"
STD = "<START>"
END = "<END>"
UNK = "<UNKNOWN>"

PAD_INDEX = 0
STD_INDEX = 1
END_INDEX = 2
UNK_INDEX = 3

MARKER = [PAD, STD, END, UNK]
CHANGE_FILTER = re.compile(FILTERS)

In [5]:
# 판다스를 통해서 데이터를 불러와 학습 셋과 평가 셋으로
# 나누어 그 값을 리턴한다.
def load_data():
    data_df = pd.read_csv(DEFINES.data_path, header=0)
    question, answer = list(data_df['Q']), list(data_df['A'])
    train_input, eval_input, train_label, eval_label = train_test_split(question, answer, test_size=0.33, random_state=42)
    return train_input, train_label, eval_input, eval_label

In [6]:
# Okt.morphs 함수를 통해 토크나이즈 된 
# 리스트 객체를 받아 문자열을 재구성해서 리턴한다.
def prepro_like_morphlized(data):
    morph_analyzer = Okt()
    result_data = list()
    for seq in tqdm(data):
        morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlized_seq)

    return result_data

In [7]:
# 인코딩 데이터를 만드는 함수이며 
# 인덱스화 할 value와 키가 단어이고 값이 인덱스인 딕셔너리를 받아
# 넘파이 배열에 인덱스화된 배열과 그 길이를 넘겨준다.  
def enc_processing(value, dictionary):
    sequences_input_index = []
    sequences_length = []
    
    # step1
    # 형태소 토크나이징 사용 유무
    if DEFINES.tokenize_as_morph:
        value = prepro_like_morphlized(value)

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = []
        
        # 문장을 스페이스 단위로 자르고 있다.
        for word in sequence.split():
            # step2
            # 잘려진 단어들이 딕셔너리에 존재 하는지 보고 
            # 그 값을 가져와 sequence_index에 추가한다.
            if dictionary.get(word) is not None:
                sequence_index.extend([dictionary[word]])
        
            # 잘려진 단어가 딕셔너리에 존재 하지 않는 
            # 경우 이므로 UNK(2)를 넣어 준다.
            else:
                sequence_index.extend([dictionary[UNK]])
        
        # step3
        # 문장 제한 길이보다 길어질 경우 뒤에 토큰을 자르고 있다.
        if len(sequence_index) > DEFINES.max_sequence_length:
            sequence_index = sequence_index[:DEFINES.max_sequence_length]

        # step4
        sequences_length.append(len(sequence_index))
        
        # step5
        # max_sequence_length보다 문장 길이가 작다면 빈 부분에 PAD(0)를 넣어준다.
        sequence_index += (DEFINES.max_sequence_length - len(sequence_index)) * [dictionary[PAD]]
        
        # step6
        sequences_input_index.append(sequence_index)
    
    # 인덱스화된 일반 배열을 넘파이 배열로 변경한다. 
    # 이유는 텐서플로우 dataset에 넣어 주기 위한 사전 작업이다.
    return np.asarray(sequences_input_index), sequences_length

In [8]:
# 디코딩 입력 데이터를 만드는 함수이다.
def dec_input_processing(value, dictionary):
    sequences_output_index = []
    sequences_length = []

    # step1
    if DEFINES.tokenize_as_morph:
        value = prepro_like_morphlized(value)

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = []
        
        # step2
        # 디코딩 입력의 처음에는 START가 와야 하므로 
        # 그 값을 넣어 주고 시작한다.
        sequence_index = [dictionary[STD]] + [dictionary[word] for word in sequence.split()]

        # step3
        if len(sequence_index) > DEFINES.max_sequence_length:
            sequence_index = sequence_index[:DEFINES.max_sequence_length]
        
        # step4
        sequences_length.append(len(sequence_index))
        
        # step5
        sequence_index += (DEFINES.max_sequence_length - len(sequence_index)) * [dictionary[PAD]]
        
        # step6
        sequences_output_index.append(sequence_index)

    return np.asarray(sequences_output_index), sequences_length


In [27]:
# 디코딩 출력 데이터를 만드는 함수이다.
def dec_target_processing(value, dictionary):
    sequences_target_index = []

    # step1
    if DEFINES.tokenize_as_morph:
        value = prepro_like_morphlized(value)
    
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        
        # step2 
        # 문장에서 스페이스 단위별로 단어를 가져와서 
        # 딕셔너리의 값인 인덱스를 넣어 준다.
        # 디코딩 출력의 마지막에 END를 넣어 준다.
        sequence_index = [dictionary[word] for word in sequence.split()]
        
        # step3
        # max_sequence_length보다 문장 길이가 
        # 문장 제한 길이보다 길어질 경우 뒤에 토큰을 자르고 있다.
        # 그리고 END 토큰을 넣어 준다
        if len(sequence_index) >= DEFINES.max_sequence_length:
            sequence_index = sequence_index[:DEFINES.max_sequence_length-1] + [dictionary[END]]
        else:
            sequence_index += [dictionary[END]]
  
        # step5
        sequence_index += (DEFINES.max_sequence_length - len(sequence_index)) * [dictionary[PAD]]
        
        # step6
        sequences_target_index.append(sequence_index)

    return np.asarray(sequences_target_index)

In [10]:
# 인덱스를 스트링으로 변경하는 함수이다.
def pred2string(value, dictionary):
    sentence_string = []
    for v in value:
        # 딕셔너리에 있는 단어로 변경해서 배열에 담는다.
        sentence_string = [dictionary[index] for index in v['indexs']]

    print(sentence_string)
    answer = ""
    # 패딩값과 엔드값이 담겨 있으므로 패딩은 모두 스페이스 처리 한다.
    for word in sentence_string:
        if word not in PAD and word not in END:
            answer += word
            answer += " "

    print(answer)
    return answer

In [11]:
# 데이터 각 요소에 대해서 rearrange 함수를 
# 통해서 요소를 변환하여 맵으로 구성한다.
def rearrange(input, output, target):
    features = {"input": input, "output": output}
    return features, target

In [12]:
# 학습에 들어가 배치 데이터를 만드는 함수이다.
def train_input_fn(train_input_enc, train_output_dec, train_target_dec, batch_size):
    # Dataset을 생성하는 부분으로써 from_tensor_slices부분은 
    # 각각 한 문장으로 자른다고 보면 된다.
    # train_input_enc, train_output_dec, train_target_dec 
    # 3개를 각각 한문장으로 나눈다.
    dataset = tf.data.Dataset.from_tensor_slices((train_input_enc, train_output_dec, train_target_dec))
    dataset = dataset.shuffle(buffer_size=len(train_input_enc))
    # 배치 인자 값이 없다면  에러를 발생 시킨다.
    assert batch_size is not None, "train batchSize must not be None"
    # from_tensor_slices를 통해 나눈것을 배치크기 만큼 묶어 준다.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(rearrange)
    # repeat()함수에 원하는 에포크 수를 넣을수 있으면 
    # 아무 인자도 없다면 무한으로 이터레이터 된다.
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    # 이터레이터를 통해 다음 항목의 텐서 개체를 넘겨준다.
    return iterator.get_next()

In [13]:
# 평가에 들어가 배치 데이터를 만드는 함수이다.
def eval_input_fn(eval_input_enc, eval_output_dec, eval_target_dec, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((eval_input_enc, eval_output_dec, eval_target_dec))
    # 전체 데이터를 섞는다.
    dataset = dataset.shuffle(buffer_size=len(eval_input_enc))
    assert batch_size is not None, "eval batchSize must not be None"
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(rearrange)
    # 평가이므로 1회만 동작 시킨다.
    dataset = dataset.repeat(1)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

In [14]:
# 토크나이징 해서 담을 배열을 생성하고 
# 토그나이징과 정규표현식을 통해 만들어진 값들을 넘겨 준다.
def data_tokenizer(data):
    words = []
    for sentence in data:
        sentence = re.sub(CHANGE_FILTER, "", sentence)
        for word in sentence.split():
            words.append(word)
    return [word for word in words if word]

In [15]:
# 최초 사전 파일을 만드는 함수이며 파일이 존재 한다면 불러오는 함수이다.
def load_vocabulary():
    vocabulary_list = []
    
    # 사전 파일의 존재 유무를 확인한다.
    if (not (os.path.exists(DEFINES.vocabulary_path))):
        if (os.path.exists(DEFINES.data_path)):
            data_df = pd.read_csv(DEFINES.data_path, encoding='utf-8')
            question, answer = list(data_df['Q']), list(data_df['A'])
    
            if DEFINES.tokenize_as_morph:  
                question = prepro_like_morphlized(question)
                answer = prepro_like_morphlized(answer)
            
            data = []
            data.extend(question)
            data.extend(answer)
            words = data_tokenizer(data)
            words = list(set(words))
            
            # 데이터 없는 내용중에 MARKER를 사전에 
            # 추가 하기 위해서 아래와 같이 처리 한다.
            # 아래는 MARKER 값이며 리스트의 첫번째 부터 
            # 순서대로 넣기 위해서 인덱스 0에 추가한다.
            # PAD = "<PADDING>"
            # STD = "<START>"
            # END = "<END>"
            # UNK = "<UNKNOWN>"     
            words[:0] = MARKER
            
        # 사전 리스트를 사전 파일로 만들어 넣는다.
        with open(DEFINES.vocabulary_path, 'w', encoding='utf-8') as vocabulary_file:
            for word in words:
                vocabulary_file.write(word + '\n')

    # 사전 파일이 존재하면 여기에서 그 파일을 불러서 배열에 넣어 준다.
    with open(DEFINES.vocabulary_path, 'r', encoding='utf-8') as vocabulary_file:
        for line in vocabulary_file:
            vocabulary_list.append(line.strip())

    word2idx, idx2word = make_vocabulary(vocabulary_list)
    # 두가지 형태의 키와 값이 있는 형태를 리턴한다. 
    # (예) 단어: 인덱스 , 인덱스: 단어)
    return word2idx, idx2word, len(word2idx)

In [16]:
# 리스트를 키가 단어이고 값이 인덱스인 딕셔너리를 만든다.
# 리스트를 키가 인덱스이고 값이 단어인 딕셔너리를 만든다.
def make_vocabulary(vocabulary_list):
    word2idx = {word: idx for idx, word in enumerate(vocabulary_list)}
    idx2word = {idx: word for idx, word in enumerate(vocabulary_list)}
    return word2idx, idx2word

In [17]:
def main_data(self):
    DATA_OUT_PATH = './data_out/'
    data_out_path = os.path.join(os.getcwd(), DATA_OUT_PATH)
    os.makedirs(data_out_path, exist_ok=True)
    char2idx, idx2char, vocabulary_length = load_vocabulary()

# model.py

In [18]:
#-*- coding: utf-8 -*-
#import tensorflow as tf
import sys
#from configs import DEFINES

# 엘에스티엠(LSTM) 단층 네트워크 구성하는 부분
def make_lstm_cell(mode, hiddenSize, index):
    cell = tf.nn.rnn_cell.BasicLSTMCell(hiddenSize, name = "lstm"+str(index))
    if mode == tf.estimator.ModeKeys.TRAIN:
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=DEFINES.dropout_width) #change: tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=DEFINES.dropout_width)
    return cell

# 에스티메이터 모델 부분이다.
def model(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    # 인코딩 부분 (미리 정의된 임베딩 벡터 사용 유무)
    if params['embedding'] == True:
        # 가중치 행렬에 대한 초기화 함수이다.
        # xavier (Xavier Glorot와 Yoshua Bengio (2010)
        # URL : http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf
        initializer = tf.keras.initializers.glorot_uniform() # change: tf.contrib.layers.xavier_initializer()
        embedding = tf.get_variable(name = "embedding", # 이름
                                 	  shape=[params['vocabulary_length'], params['embedding_size']], #  모양
                                 	  dtype=tf.float32, # 타입
                                 	  initializer=initializer, # 초기화 값
                                 	  trainable=True) # 학습 유무
    else:   
        # tf.eye를 통해서 사전의 크기 만큼의 단위행렬 
        # 구조를 만든다.
        embedding = tf.eye(num_rows = params['vocabulary_length'], dtype = tf.float32)
        embedding = tf.get_variable(name = "embedding", # 이름
                                            initializer = embedding, # 초기화 값
                                            trainable = False) # 학습 유무

    # 임베딩된 인코딩 배치를 만든다.
    embedding_encoder = tf.nn.embedding_lookup(params = embedding, ids = features['input'])

    # 임베딩된 디코딩 배치를 만든다.
    embedding_decoder = tf.nn.embedding_lookup(params = embedding, ids = features['output'])

    with tf.variable_scope('encoder_scope', reuse=tf.AUTO_REUSE):
        # 값이 True이면 멀티레이어로 모델을 구성하고 False이면 
        # 단일레이어로 모델을 구성 한다.
        if params['multilayer'] == True:
            encoder_cell_list = [make_lstm_cell(mode, params['hidden_size'], i) for i in range(params['layer_size'])]
            rnn_cell = tf.nn.rnn_cell.MultiRNNCell(encoder_cell_list) #tf.contrib.rnn.MultiRNNCell(encoder_cell_list)
        else:
            rnn_cell = make_lstm_cell(mode, params['hidden_size'], "")
        
        # rnn_cell에 의해 지정된 dynamic_rnn 반복적인 신경망을 만든다. 
        # encoder_states 최종 상태  [batch_size, cell.state_size]
        encoder_outputs, encoder_states = tf.nn.dynamic_rnn(cell=rnn_cell, # RNN 셀
                                                                inputs=embedding_encoder, # 입력 값
                                                                dtype=tf.float32) # 타입

    with tf.variable_scope('decoder_scope', reuse=tf.AUTO_REUSE):
        if params['multilayer'] == True:
            decoder_cell_list = [make_lstm_cell(mode, params['hidden_size'], i) for i in range(params['layer_size'])]
            rnn_cell = tf.nn.rnn_cell.MultiRNNCell(decoder_cell_list) #tf.contrib.rnn.MultiRNNCell(decoder_cell_list)
        else:
            rnn_cell = make_lstm_cell(mode, params['hidden_size'], "")

        decoder_initial_state = encoder_states
        decoder_outputs, decoder_states = tf.nn.dynamic_rnn(cell=rnn_cell, # RNN 셀
                       inputs=embedding_decoder, # 입력 값
                       initial_state=decoder_initial_state, # 인코딩의 마지막 값으로 초기화
                       dtype=tf.float32) # 타입


    # logits는 마지막 히든레이어를 통과한 결과값이다.
    logits = tf.layers.dense(decoder_outputs, params['vocabulary_length'], activation=None)

	# argmax를 통해서 최대 값을 가져 온다.
    predict = tf.argmax(logits, 2)

    if PREDICT:
        predictions = { # 예측 값들이 여기에 딕셔너리 형태로 담긴다.
            'indexs': predict, # 시퀀스 마다 예측한 값
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    
    #  
    # logits과 같은 차원을 만들어 마지막 결과 값과 정답 값을 비교하여 에러를 구한다.
    labels_ = tf.one_hot(labels, params['vocabulary_length'])
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels_))
    # 라벨과 결과가 일치하는지 빈도 계산을 통해 정확도를 측정하는 방법이다.
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predict,name='accOp')

    # accuracy를 전체 값으로 나눠 확률 값으로 한다.
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy[1])
    
    if EVAL:
        # 에러 값(loss)과 정확도 값(eval_metric_ops) 전달
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    # 수행 mode(tf.estimator.ModeKeys.TRAIN)가 
    # 아닌 경우는 여기 까지 오면 안되도록 방어적 코드를 넣은것이다.
    assert TRAIN

    optimizer = tf.train.AdamOptimizer(learning_rate=DEFINES.learning_rate)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())  

    # 에러 값(loss)과 그라디언트 반환값 (train_op) 전달
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

# main.py

In [19]:
#import tensorflow as tf
#import model as ml
#import data
import numpy as np
import os
import sys

#from configs import DEFINES

In [20]:
DATA_OUT_PATH = './data_out/'
data_out_path = os.path.join(os.getcwd(), DATA_OUT_PATH)
os.makedirs(data_out_path, exist_ok=True)

In [21]:
# 데이터를 통한 사전 구성 한다.
word2idx, idx2word, vocabulary_length = load_vocabulary() #data.load_vocabulary()  

In [22]:
word2idx

{'<PADDING>': 0,
 '<START>': 1,
 '<END>': 2,
 '<UNKNOWN>': 3,
 '안될것도': 4,
 '하셔': 5,
 '쓰일것': 6,
 '하지말아': 7,
 '만나야해': 8,
 '잊혀진답니다': 9,
 '듣는': 10,
 '살수있을지': 11,
 '내가나답게': 12,
 '표시': 13,
 '울었어': 14,
 '무안해': 15,
 '하고있어': 16,
 '식었나': 17,
 '지우고자': 18,
 '믿어요': 19,
 '외향': 20,
 '할거면': 21,
 '왔다는데': 22,
 '5일': 23,
 '여드름': 24,
 '로또': 25,
 '꼴사나워질': 26,
 '환': 27,
 '안다니고': 28,
 '호호': 29,
 '숏컷': 30,
 '새로운시작': 31,
 '물이': 32,
 '절약': 33,
 '갖는게': 34,
 '다르겠죠': 35,
 '잘살지': 36,
 '부추기면': 37,
 '닿지': 38,
 '아닌듯': 39,
 '가배': 40,
 '나위': 41,
 '받으니': 42,
 '나오지': 43,
 '사업': 44,
 '후젤': 45,
 '수박': 46,
 '만들고': 47,
 '해드리는': 48,
 '퍼즐': 49,
 '먹죠': 50,
 '돌아보는것': 51,
 '맞는것과': 52,
 '않은듯': 53,
 '그러려고': 54,
 '떨려서': 55,
 '오더라': 56,
 '안되지만': 57,
 '결혼식장': 58,
 '뭐라고답': 59,
 '태연하게': 60,
 '비슷하죠': 61,
 '걸거': 62,
 '가늘고긴': 63,
 '맞추려고': 64,
 '손목': 65,
 '놀고싶다': 66,
 '버텻': 67,
 '순수했던걸': 68,
 '황당하지만': 69,
 '철벽치': 70,
 '맞추세요': 71,
 '자꾸': 72,
 '되었다': 73,
 '똑같이': 74,
 '어려운게': 75,
 '도와준': 76,
 '살수있': 77,
 '중요한가요': 78,
 '배송': 79,
 '비해': 80,
 '드리러

In [23]:
idx2word

{0: '<PADDING>',
 1: '<START>',
 2: '<END>',
 3: '<UNKNOWN>',
 4: '안될것도',
 5: '하셔',
 6: '쓰일것',
 7: '하지말아',
 8: '만나야해',
 9: '잊혀진답니다',
 10: '듣는',
 11: '살수있을지',
 12: '내가나답게',
 13: '표시',
 14: '울었어',
 15: '무안해',
 16: '하고있어',
 17: '식었나',
 18: '지우고자',
 19: '믿어요',
 20: '외향',
 21: '할거면',
 22: '왔다는데',
 23: '5일',
 24: '여드름',
 25: '로또',
 26: '꼴사나워질',
 27: '환',
 28: '안다니고',
 29: '호호',
 30: '숏컷',
 31: '새로운시작',
 32: '물이',
 33: '절약',
 34: '갖는게',
 35: '다르겠죠',
 36: '잘살지',
 37: '부추기면',
 38: '닿지',
 39: '아닌듯',
 40: '가배',
 41: '나위',
 42: '받으니',
 43: '나오지',
 44: '사업',
 45: '후젤',
 46: '수박',
 47: '만들고',
 48: '해드리는',
 49: '퍼즐',
 50: '먹죠',
 51: '돌아보는것',
 52: '맞는것과',
 53: '않은듯',
 54: '그러려고',
 55: '떨려서',
 56: '오더라',
 57: '안되지만',
 58: '결혼식장',
 59: '뭐라고답',
 60: '태연하게',
 61: '비슷하죠',
 62: '걸거',
 63: '가늘고긴',
 64: '맞추려고',
 65: '손목',
 66: '놀고싶다',
 67: '버텻',
 68: '순수했던걸',
 69: '황당하지만',
 70: '철벽치',
 71: '맞추세요',
 72: '자꾸',
 73: '되었다',
 74: '똑같이',
 75: '어려운게',
 76: '도와준',
 77: '살수있',
 78: '중요한가요',
 79: '배송',
 80: '비해',
 81: 

In [24]:
vocabulary_length

15684

In [25]:
# 훈련 데이터와 테스트 데이터를 가져온다.
train_input, train_label, eval_input, eval_label = load_data() #data.load_data()
print("train_input:", len(train_input))
print("train_label:", len(train_label))
print("train_input:", train_input[10])
print("train_label:", train_label[10])
print("eval_input:", len(eval_input))
print("eval_label:", len(eval_label))
print("eval_input:", eval_input[10])
print("eval_label:", eval_label[10])

train_input: 7921
train_label: 7921
train_input: 너무 힘든데
train_label: 조금만 더 버텨보세요.
eval_input: 3902
eval_label: 3902
eval_input: 뿌염해야지
eval_label: 참 귀찮은 일이죠.


### 훈련셋 인코딩 / 디코딩 입력 / 디코딩 출력 만들기

In [26]:
# 훈련셋 인코딩 / 디코딩 입력 / 디코딩 출력 만드는 부분이다.
train_input_enc, train_input_enc_length = enc_processing(train_input, word2idx) #data.enc_processing(train_input, word2idx)

100%|██████████| 7921/7921 [00:34<00:00, 229.79it/s]


In [28]:
train_input_enc.shape

(7921, 25)

In [29]:
train_input_enc[0]

array([ 2947,  8987, 10198,  8885,  5887,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

In [30]:
max(train_input_enc_length)

19

In [31]:
train_input_dec, train_input_dec_length = dec_input_processing(train_label, word2idx) #data.dec_input_processing(train_label, word2idx)

100%|██████████| 7921/7921 [00:58<00:00, 136.48it/s]


In [33]:
train_input_dec.shape

(7921, 25)

In [34]:
train_input_dec[0]

array([    1,  2187,  7713,  2734, 11588, 11754,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

In [35]:
max(train_input_dec_length)

25

In [36]:
train_target_dec = dec_target_processing(train_label, word2idx) #data.dec_target_processing(train_label, word2idx)

100%|██████████| 7921/7921 [01:09<00:00, 114.30it/s]


In [37]:
train_target_dec.shape

(7921, 25)

In [38]:
train_target_dec[0]

array([ 2187,  7713,  2734, 11588, 11754,     2,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

### 평가셋 인코딩 / 디코딩 입력 / 디코딩 출력 만들기

In [39]:
eval_input_enc, eval_input_enc_length = enc_processing(eval_input,word2idx) #data.enc_processing(eval_input,word2idx)

100%|██████████| 3902/3902 [00:27<00:00, 141.73it/s]


In [40]:
eval_input_dec, eval_input_dec_length = dec_input_processing(eval_label, word2idx) #data.dec_input_processing(eval_label, word2idx)

100%|██████████| 3902/3902 [00:36<00:00, 107.53it/s]


In [41]:
eval_target_dec = dec_target_processing(eval_label, word2idx) #data.dec_target_processing(eval_label, word2idx)

100%|██████████| 3902/3902 [00:38<00:00, 100.60it/s]


In [42]:
# 현재 경로'./'에 현재 경로 하부에 체크 포인트를 저장한 디렉토리를 설정한다.
check_point_path = os.path.join(os.getcwd(), DEFINES.check_point_path)
os.makedirs(check_point_path, exist_ok=True)

### 에스티메이터 구성

In [43]:
# 에스티메이터 구성한다.
classifier = tf.estimator.Estimator(
        model_fn=model, # 모델 등록한다. #ml.model,
        model_dir=DEFINES.check_point_path, 
        params={
            'hidden_size': DEFINES.hidden_size, 
            'layer_size': DEFINES.layer_size, 
            'learning_rate': DEFINES.learning_rate, 
            'vocabulary_length': vocabulary_length, 
            'embedding_size': DEFINES.embedding_size, 
            'embedding': DEFINES.embedding, 
            'multilayer': DEFINES.multilayer,
        })

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './data_out/check_point', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a38cd3910>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### 학습 실행

In [None]:
# 학습 실행
#data.train_input_fn
classifier.train(input_fn=lambda:train_input_fn(
    train_input_enc, train_input_dec, train_target_dec, DEFINES.batch_size), steps=DEFINES.train_steps)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions

In [None]:
"""
INFO:tensorflow:Calling model_fn.
I0907 03:37:04.831048 4584078784 estimator.py:1147] Calling model_fn.
WARNING:tensorflow:From <ipython-input-77-89ff9786b264>:48: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
W0907 03:37:05.137772 4584078784 deprecation.py:323] From <ipython-input-77-89ff9786b264>:48: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
WARNING:tensorflow:From <ipython-input-77-89ff9786b264>:56: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
W0907 03:37:05.139495 4584078784 deprecation.py:323] From <ipython-input-77-89ff9786b264>:56: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:From /Users/csg/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:735: Layer.add_variable (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.
W0907 03:37:05.362714 4584078784 deprecation.py:323] From /Users/csg/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:735: Layer.add_variable (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.
WARNING:tensorflow:From /Users/csg/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:739: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0907 03:37:05.387197 4584078784 deprecation.py:506] From /Users/csg/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/rnn_cell_impl.py:739: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
WARNING:tensorflow:From <ipython-input-77-89ff9786b264>:73: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
W0907 03:37:05.807761 4584078784 deprecation.py:323] From <ipython-input-77-89ff9786b264>:73: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
WARNING:tensorflow:From /Users/csg/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
W0907 03:37:05.809349 4584078784 deprecation.py:323] From /Users/csg/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
INFO:tensorflow:Done calling model_fn.
I0907 03:37:08.739642 4584078784 estimator.py:1149] Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
I0907 03:37:08.743927 4584078784 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
I0907 03:37:09.971866 4584078784 monitored_session.py:240] Graph was finalized.
INFO:tensorflow:Running local_init_op.
I0907 03:37:10.826373 4584078784 session_manager.py:500] Running local_init_op.
INFO:tensorflow:Done running local_init_op.
I0907 03:37:10.971440 4584078784 session_manager.py:502] Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./data_out/check_point/model.ckpt.
I0907 03:37:15.994513 4584078784 basic_session_run_hooks.py:606] Saving checkpoints for 0 into ./data_out/check_point/model.ckpt.
INFO:tensorflow:loss = 9.660523, step = 1
I0907 03:37:19.402611 4584078784 basic_session_run_hooks.py:262] loss = 9.660523, step = 1
INFO:tensorflow:global_step/sec: 0.995075
I0907 03:38:59.897099 4584078784 basic_session_run_hooks.py:692] global_step/sec: 0.995075
INFO:tensorflow:loss = 2.2393522, step = 101 (100.499 sec)
I0907 03:38:59.901525 4584078784 basic_session_run_hooks.py:260] loss = 2.2393522, step = 101 (100.499 sec)
INFO:tensorflow:global_step/sec: 0.921418
I0907 03:40:48.425472 4584078784 basic_session_run_hooks.py:692] global_step/sec: 0.921418
INFO:tensorflow:loss = 1.6588095, step = 201 (108.527 sec)
I0907 03:40:48.428763 4584078784 basic_session_run_hooks.py:260] loss = 1.6588095, step = 201 (108.527 sec)
INFO:tensorflow:global_step/sec: 1.01371
I0907 03:42:27.073051 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.01371
INFO:tensorflow:loss = 1.5958899, step = 301 (98.646 sec)
I0907 03:42:27.074995 4584078784 basic_session_run_hooks.py:260] loss = 1.5958899, step = 301 (98.646 sec)
INFO:tensorflow:global_step/sec: 1.00465
I0907 03:44:06.610792 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.00465
INFO:tensorflow:loss = 1.4954394, step = 401 (99.542 sec)
I0907 03:44:06.617012 4584078784 basic_session_run_hooks.py:260] loss = 1.4954394, step = 401 (99.542 sec)
INFO:tensorflow:global_step/sec: 0.915876
I0907 03:45:55.795295 4584078784 basic_session_run_hooks.py:692] global_step/sec: 0.915876
INFO:tensorflow:loss = 1.5127687, step = 501 (109.180 sec)
I0907 03:45:55.797347 4584078784 basic_session_run_hooks.py:260] loss = 1.5127687, step = 501 (109.180 sec)
INFO:tensorflow:global_step/sec: 1.30263
I0907 03:47:12.563019 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.30263
INFO:tensorflow:loss = 1.4577101, step = 601 (76.769 sec)
I0907 03:47:12.566308 4584078784 basic_session_run_hooks.py:260] loss = 1.4577101, step = 601 (76.769 sec)
INFO:tensorflow:Saving checkpoints for 607 into ./data_out/check_point/model.ckpt.
I0907 03:47:17.131443 4584078784 basic_session_run_hooks.py:606] Saving checkpoints for 607 into ./data_out/check_point/model.ckpt.
INFO:tensorflow:global_step/sec: 1.28965
I0907 03:48:30.103338 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.28965
INFO:tensorflow:loss = 1.4678216, step = 701 (77.539 sec)
I0907 03:48:30.105018 4584078784 basic_session_run_hooks.py:260] loss = 1.4678216, step = 701 (77.539 sec)
INFO:tensorflow:global_step/sec: 1.30026
I0907 03:49:47.011075 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.30026
INFO:tensorflow:loss = 1.552615, step = 801 (76.909 sec)
I0907 03:49:47.014289 4584078784 basic_session_run_hooks.py:260] loss = 1.552615, step = 801 (76.909 sec)
INFO:tensorflow:global_step/sec: 1.29715
I0907 03:51:04.103052 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.29715
INFO:tensorflow:loss = 1.1586355, step = 901 (77.091 sec)
I0907 03:51:04.105104 4584078784 basic_session_run_hooks.py:260] loss = 1.1586355, step = 901 (77.091 sec)
INFO:tensorflow:global_step/sec: 1.31233
I0907 03:52:20.303287 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.31233
INFO:tensorflow:loss = 1.2831036, step = 1001 (76.201 sec)
I0907 03:52:20.306480 4584078784 basic_session_run_hooks.py:260] loss = 1.2831036, step = 1001 (76.201 sec)
INFO:tensorflow:global_step/sec: 1.30454
I0907 03:53:36.958671 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.30454
INFO:tensorflow:loss = 1.3305126, step = 1101 (76.654 sec)
I0907 03:53:36.960531 4584078784 basic_session_run_hooks.py:260] loss = 1.3305126, step = 1101 (76.654 sec)
INFO:tensorflow:global_step/sec: 1.31378
I0907 03:54:53.074881 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.31378
INFO:tensorflow:loss = 1.2443384, step = 1201 (76.117 sec)
I0907 03:54:53.077033 4584078784 basic_session_run_hooks.py:260] loss = 1.2443384, step = 1201 (76.117 sec)
INFO:tensorflow:global_step/sec: 1.29052
I0907 03:56:10.563235 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.29052
INFO:tensorflow:loss = 1.3646286, step = 1301 (77.488 sec)
I0907 03:56:10.565088 4584078784 basic_session_run_hooks.py:260] loss = 1.3646286, step = 1301 (77.488 sec)
INFO:tensorflow:Saving checkpoints for 1386 into ./data_out/check_point/model.ckpt.
I0907 03:57:17.941159 4584078784 basic_session_run_hooks.py:606] Saving checkpoints for 1386 into ./data_out/check_point/model.ckpt.
INFO:tensorflow:global_step/sec: 1.25737
I0907 03:57:30.094022 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.25737
INFO:tensorflow:loss = 1.3437358, step = 1401 (79.531 sec)
I0907 03:57:30.095847 4584078784 basic_session_run_hooks.py:260] loss = 1.3437358, step = 1401 (79.531 sec)
INFO:tensorflow:global_step/sec: 1.29726
I0907 03:58:47.179399 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.29726
INFO:tensorflow:loss = 1.2380457, step = 1501 (77.086 sec)
I0907 03:58:47.181392 4584078784 basic_session_run_hooks.py:260] loss = 1.2380457, step = 1501 (77.086 sec)
INFO:tensorflow:global_step/sec: 1.29136
I0907 04:00:04.617318 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.29136
INFO:tensorflow:loss = 1.2809906, step = 1601 (77.439 sec)
I0907 04:00:04.620599 4584078784 basic_session_run_hooks.py:260] loss = 1.2809906, step = 1601 (77.439 sec)
INFO:tensorflow:global_step/sec: 1.29941
I0907 04:01:21.575551 4584078784 basic_session_run_hooks.py:692] global_step/sec: 1.29941
INFO:tensorflow:loss = 1.3097394, step = 1701 (76.957 sec)
I0907 04:01:21.577551 4584078784 basic_session_run_hooks.py:260] loss = 1.3097394, step = 1701 (76.957 sec)
"""
"""

### 평가 실행

In [None]:
# 평가 실행
eval_result = classifier.evaluate(input_fn=lambda:data.eval_input_fn(
    eval_input_enc, eval_input_dec, eval_target_dec,  DEFINES.batch_size))
print('\nEVAL set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

### 테스트셋 인코딩 / 디코딩 입력 / 디코딩 출력 만들기

In [None]:
# 테스트셋 인코딩 / 디코딩 입력 / 디코딩 출력 만드는 부분이다.
predic_input_enc, predic_input_enc_length = data.enc_processing(["가끔 궁금해"], word2idx)
predic_input_dec, predic_input_dec_length = data.dec_input_processing([""], word2idx)       
predic_target_dec = data.dec_target_processing([""], word2idx)

### 예측 실행

In [None]:
# 예측 실행
predictions = classifier.predict(
    input_fn=lambda:data.eval_input_fn(predic_input_enc, predic_input_dec, predic_target_dec, DEFINES.batch_size))

In [None]:
# 예측한 값을 텍스트로 변경하는 부분이다.
data.pred2string(predictions, idx2word)

# predict.py

In [None]:
"""
#import tensorflow as tf
#import data
import sys
#import model as ml

#from configs import DEFINES
	
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    arg_length = len(sys.argv)
    
    if(arg_length < 2):
        raise Exception("Don't call us. We'll call you")
  
    
    char2idx,  idx2char, vocabulary_length = data.load_vocabulary()
    input = ""
    for i in sys.argv[1:]:
        input += i 
        input += " "
        
    print(input)
    # 테스트셋 인코딩 / 디코딩 입력 / 디코딩 출력 만드는 부분이다.
    predic_input_enc, predic_input_enc_length = data.enc_processing([input], char2idx)
    predic_output_dec, predic_output_dec_length = data.dec_input_processing([""], char2idx)
    predic_target_dec = data.dec_target_processing([""], char2idx)

	# 에스티메이터 구성
    classifier = tf.estimator.Estimator(
            model_fn=ml.model,
            model_dir=DEFINES.check_point_path, 
            params={ 
                'hidden_size': DEFINES.hidden_size, 
                'layer_size': DEFINES.layer_size, 
                'learning_rate': DEFINES.learning_rate, 
                'vocabulary_length': vocabulary_length, 
                'embedding_size': DEFINES.embedding_size, 
                'embedding': DEFINES.embedding, 
                'multilayer': DEFINES.multilayer, 
            })

    predictions = classifier.predict(
        input_fn=lambda:data.eval_input_fn(predic_input_enc, predic_output_dec, predic_target_dec, DEFINES.batch_size))
    
    data.pred2string(predictions, idx2char)
"""