In [2]:
'''
Created on 2018. 2. 14.

@author: phs
'''
"""
1.대화 말뭉치 파일을 읽어들인다.
2.대화 말뭉치를 읽어서 자연어 처리 및  Bag of word 생성
3.Bag of word를 딥러닝 알고리즘 활용을 위한 입력으로 변환
4.딥러닝(tensorflow)을 통한 자연어 이해 모델 생성
5.자연어 이해 모델을 관리한다(저장,읽기)
"""
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

# things we need for NLP
import json
import pickle
import random

# import nltk
# from nltk.stem.lancaster import LancasterStemmer
# stemmer = LancasterStemmer()

# from konlpy.tag import Komoran
from konlpy.tag import Twitter
# komoran = Komoran()
twitter = Twitter()

import tensorflow as tf
import tflearn
import numpy as np

def read_dialog_intents_jsonfile(input_file_name):
    """Load the dialog-intents corpus from a UTF-8 encoded JSON file.

    Args:
        input_file_name: Path of the JSON corpus file to read.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(input_file_name, mode='rt', encoding='UTF8') as corpus_file:
        return json.loads(corpus_file.read())
    
def dialog_nlp_processing(intents):
    """Tokenize every pattern of the corpus and collect the vocabulary.

    Each pattern string is run through the module-level ``twitter`` tagger
    (called with ``norm=True, stem=True``); only the lexical part of every
    (lexeme, POS) pair is kept.

    Args:
        intents: Mapping with an ``'intents'`` list; each entry provides a
            ``'tag'`` and a list of ``'patterns'`` strings.

    Returns:
        A ``(classes, documents, words)`` tuple where
        ``classes`` is the sorted list of distinct tags that own at least
        one pattern, ``documents`` is a list of ``(tokens, tag)`` pairs in
        corpus order, and ``words`` is the sorted list of distinct tokens
        with the ignored tokens (``'?'``) removed.
    """
    skipped_tokens = ['?']
    all_tokens = []
    seen_tags = []
    documents = []

    for intent in intents['intents']:
        for pattern in intent['patterns']:
            # Keep only the lexeme of each (lexeme, POS) pair.
            tokens = [lex for lex, _ in twitter.pos(pattern, norm=True, stem=True)]
            all_tokens += tokens

            # The tag is looked up per pattern, so an intent without any
            # pattern never registers a class.
            tag = intent['tag']
            documents.append((tokens, tag))
            if tag not in seen_tags:
                seen_tags.append(tag)

    # Drop ignored tokens, then de-duplicate and order the vocabulary.
    words = sorted({token for token in all_tokens if token not in skipped_tokens})
    classes = sorted(set(seen_tags))

    return classes, documents, words

def prepare_machine_learning(classes, documents, words):
    """Convert tokenized documents into shuffled bag-of-words training data.

    Args:
        classes: List of class tags; the position of a tag in this list is
            the position of the ``1`` in its one-hot output row.
        documents: Iterable of ``(tokens, tag)`` pairs.
        words: Vocabulary list; one bag position per entry, in this order.

    Returns:
        A ``(train_x, train_y)`` tuple of two parallel lists.  Each element
        of ``train_x`` is a 0/1 list of length ``len(words)`` and each
        element of ``train_y`` is a one-hot list of length ``len(classes)``.
        The samples are shuffled with ``random.shuffle``.  Both lists are
        empty when ``documents`` is empty.

    Raises:
        ValueError: If a document's tag is not present in ``classes``.
    """
    training = []

    for doc in documents:
        # Set membership keeps the bag construction linear in len(words).
        present = set(doc[0])
        bag = [1 if w in present else 0 for w in words]

        # Output is a '0' for each tag and '1' for the current tag.
        output_row = [0] * len(classes)
        output_row[classes.index(doc[1])] = 1

        training.append((bag, output_row))

    # Shuffle features and labels together so the pairs stay aligned.
    random.shuffle(training)

    # Split the pairs directly instead of going through np.array(): a bag
    # and its output row usually differ in length, and NumPy rejects such
    # ragged nested sequences (ValueError since NumPy 1.24).
    train_x = [bag for bag, _ in training]
    train_y = [row for _, row in training]

    return train_x, train_y

def create_tensorflow_learning_model(train_x, train_y, output_model_file_name,
                                     *, hidden_units=8, n_epoch=1000,
                                     batch_size=8,
                                     tensorboard_dir='home_tflearn_kr_logs'):
    """Build, train and save the tflearn intent-classification network.

    The network is: input layer sized to one sample of ``train_x``, two
    fully connected hidden layers, and a softmax output layer sized to one
    row of ``train_y``, wrapped in ``tflearn.regression`` with its default
    settings.

    Args:
        train_x: Sequence of equally sized feature rows (bag-of-words).
        train_y: Sequence of equally sized label rows, parallel to
            ``train_x``.
        output_model_file_name: Path passed to ``model.save``.
        hidden_units: Width of each of the two hidden layers.
        n_epoch: Number of training epochs passed to ``model.fit``.
        batch_size: Mini-batch size passed to ``model.fit``.
        tensorboard_dir: Directory passed to ``tflearn.DNN`` for logs.

    Returns:
        The trained ``tflearn.DNN`` model.

    Raises:
        ValueError: If ``train_x`` or ``train_y`` is empty, because the
            layer sizes are derived from their first rows.
    """
    # Fail early with a clear message; otherwise train_x[0] below would
    # raise a bare IndexError after the graph has already been reset.
    if len(train_x) == 0 or len(train_y) == 0:
        raise ValueError(
            "train_x and train_y must each contain at least one sample")

    # reset underlying graph data
    tf.reset_default_graph()

    # Build neural network
    net = tflearn.input_data(shape=[None, len(train_x[0])])
    net = tflearn.fully_connected(net, hidden_units)
    net = tflearn.fully_connected(net, hidden_units)
    net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
    net = tflearn.regression(net)

    # Define model and setup tensorboard
    model = tflearn.DNN(net, tensorboard_dir=tensorboard_dir)
    # Start training
    model.fit(train_x, train_y, n_epoch=n_epoch, batch_size=batch_size,
              show_metric=True)
    # save the trained model to directory
    model.save(output_model_file_name)

    return model

def clean_up_sentence(sentence):
    """Tokenize a sentence with the module-level ``twitter`` tagger.

    The tagger is called with ``norm=True, stem=True``; the POS label of
    each returned pair is discarded.

    Args:
        sentence: The text to tokenize.

    Returns:
        List of lexemes in the order the tagger produced them.
    """
    tagged_pairs = twitter.pos(sentence, norm=True, stem=True)
    lexemes = []
    for lexeme, _tag in tagged_pairs:
        lexemes.append(lexeme)
    return lexemes

def bow(sentence, words, show_details=False):
    """Return the bag-of-words vector of a sentence over a vocabulary.

    Args:
        sentence: Text to encode; tokenized via ``clean_up_sentence``.
        words: Vocabulary; position ``i`` of the result corresponds to
            ``words[i]``.
        show_details: When true, print a line for every vocabulary match.

    Returns:
        ``np.array`` of 0/1 flags, one per vocabulary entry, with 1 where
        the vocabulary word occurs among the sentence tokens.
    """
    tokens = clean_up_sentence(sentence)
    flags = [0] * len(words)

    for token in tokens:
        for position, candidate in enumerate(words):
            if candidate != token:
                continue
            flags[position] = 1
            if show_details:
                print ("found in bag: %s" % candidate)

    return np.array(flags)

# save all of our data structures
def save_training_data_structures(words, classes, train_x, train_y, output_training_data_file_name):
    """Pickle the training data structures to a single file.

    Args:
        words: Vocabulary list.
        classes: Class tag list.
        train_x: Training feature rows.
        train_y: Training label rows.
        output_training_data_file_name: Path of the file to (over)write.

    The file contains one pickled dict with the keys ``'words'``,
    ``'classes'``, ``'train_x'`` and ``'train_y'``.
    """
    payload = {
        'words': words,
        'classes': classes,
        'train_x': train_x,
        'train_y': train_y,
    }
    # The context manager guarantees the handle is flushed and closed even
    # if pickling fails part-way through.
    with open(output_training_data_file_name, "wb") as out_file:
        pickle.dump(payload, out_file)


# Script entry point: runs the whole pipeline (read corpus -> tokenize ->
# build training data -> train and save model -> save training data) and
# then prints a sample prediction.  Every name bound here is a module-level
# global.
if __name__ == '__main__':
    
    # Read the dialog corpus file.
    input_file_name = './DialogIntents/intents_home_kr.json'
    intents = read_dialog_intents_jsonfile(input_file_name)
    
    # Tokenize the corpus and build the bag-of-words vocabulary.
    classes, documents, words = dialog_nlp_processing(intents)
    print (len(documents), "documents")
    print (len(classes), "classes", classes)
    print (len(words), "unique stemmed words", words)
    
    # Convert the bag of words into input for the deep-learning algorithm.
    train_x, train_y = prepare_machine_learning(classes, documents, words)
    
    # Build the natural-language-understanding model with tflearn/TensorFlow.
    output_model_file_name = './NLUModel/model_home_kr.tflearn'
    model = create_tensorflow_learning_model(train_x, train_y, output_model_file_name)

    # Persist the training data structures next to the model.
    output_training_data_file_name = "./NLUModel/training_data_home_kr"
    save_training_data_structures(words, classes, train_x, train_y, output_training_data_file_name)
    
    # Smoke test: encode one sample sentence and print the model's
    # per-class output for it.
    p = bow("비젼은 무엇입니까?", words)
    print("p is Bag of word for '비젼은 무엇입니까?' :{}".format(p))
    print("classes :{}".format(classes))
    print("model.predict([p]) :{}".format(model.predict([p])))
    
    # Corpus statistics are printed a second time here.
    print (len(documents), "documents")
    print (len(classes), "classes", classes)
    print (len(words), "unique stemmed words", words)


Training Step: 9999  | total loss: 0.62170 | time: 0.041s
| Adam | epoch: 1000 | loss: 0.62170 - acc: 0.9651 -- iter: 72/74
Training Step: 10000  | total loss: 0.55953 | time: 0.044s
| Adam | epoch: 1000 | loss: 0.55953 - acc: 0.9686 -- iter: 74/74
--
INFO:tensorflow:C:\Users\phs\textmining\python\text-mining-camp\note\arkwith\home_chat\NLUModel\model_home_kr.tflearn is not in all_model_checkpoint_paths. Manually adding it.
p is Bag of word for '오늘 가게  여나요?' :[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
classes :['Slang', 'contact', 'goodbye', 'greeting', 'history', 'keyCustomers', 'mission', 'possessTechnology', 'product', 'service', 'thanks', 'vision']
model.predict([p]) :[[1.2514025e-03 5.7955305e-07 4.8065453e-04 9.8171186e-01 2.1619460e-09
  3.8542622e-03 1.2965342e-12 1.2701