# 챗봇

## 예제 데이터

In [38]:
import csv

In [39]:
# Parallel per-utterance containers filled from chatbot.csv:
#   texts   - list of characters for each utterance
#   codes   - entity tag for each character of each utterance
#   intents - intent label for each utterance
texts = []
codes = []
intents = []
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects passed to csv.reader.
with open('chatbot.csv', encoding='utf8', newline='') as f:
    for row in csv.reader(f):
        if not row:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on the two-field unpacking below.
            continue
        char, code = row
        if char == '@':
            # An '@' row starts a new utterance; its second field is the intent.
            texts.append([])
            codes.append([])
            intents.append(code)
        else:
            # An empty character field stands for a space.
            texts[-1].append(char or ' ')
            codes[-1].append(code)

## 발화 텍스트 처리

In [40]:
# Collapse each utterance's list of characters into a single string.
texts = list(map(''.join, texts))

In [41]:
texts

['치킨 한 마리 주세요', '콜라 한 병 주세요', '아까시킨치킨취소', '콜라는 됐어요']

### 글자에 번호를 부여

In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [43]:
# char_level=True: number individual characters rather than words.
# oov_token='@': characters missing from the vocabulary (out-of-vocabulary) are mapped to '@'.
tok = Tokenizer(char_level=True, oov_token='@')

In [44]:
# Build the character-to-number vocabulary from the utterances.
tok.fit_on_texts(texts)

In [45]:
tok.index_word

{1: '@',
 2: ' ',
 3: '킨',
 4: '요',
 5: '치',
 6: '한',
 7: '주',
 8: '세',
 9: '콜',
 10: '라',
 11: '마',
 12: '리',
 13: '병',
 14: '아',
 15: '까',
 16: '시',
 17: '취',
 18: '소',
 19: '는',
 20: '됐',
 21: '어'}

### 문자열을 수열로 변환

In [46]:
# Replace every character with its vocabulary number, one list per utterance.
char_seq = tok.texts_to_sequences(texts)

In [47]:
char_seq

[[5, 3, 2, 6, 2, 11, 12, 2, 7, 8, 4],
 [9, 10, 2, 6, 2, 13, 2, 7, 8, 4],
 [14, 15, 16, 3, 5, 3, 17, 18],
 [9, 10, 19, 2, 20, 21, 4]]

### 패딩

In [48]:
# pad_sequences fills with zeros so that all sentences have the same length.
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [49]:
# Zero-pad the character sequences to a common length to form the model input.
x = pad_sequences(char_seq)

In [50]:
x

array([[ 5,  3,  2,  6,  2, 11, 12,  2,  7,  8,  4],
       [ 0,  9, 10,  2,  6,  2, 13,  2,  7,  8,  4],
       [ 0,  0,  0, 14, 15, 16,  3,  5,  3, 17, 18],
       [ 0,  0,  0,  0,  9, 10, 19,  2, 20, 21,  4]])

## 인텐트 데이터 처리

In [51]:
intents

['ORDER', 'ORDER', 'CANCEL', 'CANCEL']

In [52]:
# Integer class id for each intent label.
intent_index = dict(ORDER=0, CANCEL=1)

In [53]:
# Intent targets: the class id of each utterance's intent label.
y1 = [
    intent_index[label]
    for label in intents
]

In [54]:
y1

[0, 0, 1, 1]

## 엔티티 데이터 처리

In [55]:
codes

[['B_MENU', 'I_MENU', 'O', 'B_QTY', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B_MENU', 'I_MENU', 'O', 'B_QTY', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'B_MENU', 'I_MENU', 'O', 'O'],
 ['B_MENU', 'I_MENU', 'O', 'O', 'O', 'O', 'O']]

In [56]:
# Index 0 is reserved for padding and must not be assigned to a tag,
# so the entity tags are numbered starting from 1.
code_index = {
    tag: number
    for number, tag in enumerate(
        ('B_MENU', 'I_MENU', 'B_QTY', 'I_QTY', 'O'), start=1)
}

In [57]:
# Translate each utterance's tag names into their integer ids.
code_seq = [
    [code_index[tag] for tag in tags]
    for tags in codes
]

In [58]:
code_seq

[[1, 2, 5, 3, 5, 5, 5, 5, 5, 5, 5],
 [1, 2, 5, 3, 5, 5, 5, 5, 5, 5],
 [5, 5, 5, 5, 1, 2, 5, 5],
 [1, 2, 5, 5, 5, 5, 5]]

In [59]:
# Zero-pad the tag sequences with the same call used for x, so each tag stays
# aligned with its character.
y2 = pad_sequences(code_seq)

In [60]:
y2

array([[1, 2, 5, 3, 5, 5, 5, 5, 5, 5, 5],
       [0, 1, 2, 5, 3, 5, 5, 5, 5, 5, 5],
       [0, 0, 0, 5, 5, 5, 5, 1, 2, 5, 5],
       [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5]])

## 엔티티 예측 모형

In [61]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed

In [62]:
from tensorflow.keras.models import Sequential

In [63]:
from tensorflow.keras.optimizers import Adam

In [64]:
# Number of characters in the vocabulary plus one slot for the padding id 0.
NUM_CHAR = len(tok.index_word) + 1
# Number of entity tags plus one slot for the padding id 0.
NUM_CODE = len(code_index) + 1

In [65]:
# Character-level entity tagger, assembled layer by layer.
model = Sequential()
# Turn each input character id into a 4-dimensional vector.
# mask_zero=True: positions holding the padding id 0 are excluded from computation.
model.add(Embedding(input_dim=NUM_CHAR, output_dim=4, mask_zero=True))
# return_sequences=True keeps one output per time step instead of only the last.
model.add(LSTM(4, return_sequences=True))
# Softmax over the NUM_CODE tag ids at every time step.
model.add(Dense(NUM_CODE, activation='softmax'))

In [66]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 4)           88        
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 4)           144       
_________________________________________________________________
dense_2 (Dense)              (None, None, 6)           30        
Total params: 262
Trainable params: 262
Non-trainable params: 0
_________________________________________________________________


In [67]:
# The targets are integer tag ids, hence the sparse categorical loss.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=.1))

In [69]:
import numpy

# Give the tag targets a trailing axis of size 1 before fitting for 30 epochs.
model.fit(x, y2[..., numpy.newaxis], epochs=30)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x21916c4bf28>

## 예측

In [70]:
# Reverse lookup: integer tag id -> tag name.
index_code = dict(zip(code_index.values(), code_index))

In [71]:
# Sequential.predict_classes was removed in TensorFlow 2.6. Taking the argmax
# over the last (class) axis of the predicted probabilities gives the same
# per-position tag ids.
p2 = model.predict(x).argmax(axis=-1)

In [72]:
# Index of the utterance whose prediction is inspected below.
i = 1

In [73]:
# The utterance string selected by i.
text = texts[i]

In [74]:
# Number of characters in the utterance; used below to keep only the last n
# predictions, dropping the zero padding that sits on the left.
n = len(text)

In [75]:
# Pair each character with its predicted tag name. Only the last n predictions
# are used because the padding occupies the left side of the row.
[(char_, index_code[tag_id]) for char_, tag_id in zip(text, p2[i][-n:])]

[('콜', 'B_MENU'),
 ('라', 'B_MENU'),
 (' ', 'O'),
 ('한', 'O'),
 (' ', 'O'),
 ('병', 'O'),
 (' ', 'O'),
 ('주', 'O'),
 ('세', 'O'),
 ('요', 'O')]