# 使用序列到序列网络构建一个聊天机器人


In [7]:
from nlpia.loaders import get_data
df = get_data('moviedialog')
input_texts,target_texts = [],[]
input_vocabulary = set()
output_vocabulary = set()
start_token = '\t'
stop_token = '\n'
max_training_sample = min(25000,len(df)-1)

for input_text,target_text in zip(df.statement,df.reply):
    target_text = start_token+target_text+stop_token
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_vocabulary:
            input_vocabulary.add(char)
    for char in target_text:
        if char not in output_vocabulary:
            output_vocabulary.add(char)

INFO:nlpia.futil:Reading CSV with `read_csv(*('C:\\Users\\24132\\AppData\\Roaming\\Python\\Python38\\site-packages\\nlpia\\data\\moviedialog.csv',), **{'nrows': None, 'low_memory': False})`...


In [17]:
#输入和输出词汇表
input_vocabulary = sorted(input_vocabulary)
output_vocabulary = sorted(output_vocabulary)

input_vocab_size = len(input_vocabulary)
output_vocab_size = len(output_vocabulary)
max_encode_seq_length = max([len(txt) for txt in input_texts])
max_decode_seq_length = max([len(txt) for txt in target_texts])

#建立字典
input_token_index = dict([char,i] for i,char in enumerate(input_vocabulary))
target_token_index = dict([char,i] for i,char in enumerate(output_vocabulary))
reverse_input_char_index = dict((i,char) for char,i in input_token_index.items())
reverse_output_char_index = dict((i,char) for char,i in target_token_index.items())


print('输入字符set之后数量：',input_vocab_size,'     输出字符set之后数量',output_vocab_size)

输入字符set之后数量： 44      输出字符set之后数量 46


In [21]:
#生成独热编码
import numpy as np
encoder_input_data = np.zeros((len(input_texts),max_encode_seq_length,input_vocab_size),dtype='float32')
decoder_input_data = np.zeros((len(input_texts),max_decode_seq_length,output_vocab_size),dtype='float32')
decoder_target_data = np.zeros((len(input_texts),max_decode_seq_length,output_vocab_size),dtype='float32')
for i,(input_text,target_text) in enumerate(zip(input_texts,target_texts)):
    for t,char in enumerate(input_text):
        encoder_input_data[i,t,input_token_index[char]] = 1
    for t,char in enumerate(target_text):
        decoder_input_data[i,t,target_token_index[char]] = 1
        if t>0:
            decoder_target_data[i,t-1,target_token_index[char]] = 1
encoder_input_data[0]

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [51]:
len(encoder_input_data)

64350

In [52]:
from keras.models import Model
from keras.layers import Input,LSTM,Dense
from keras import optimizers
batch_size = 64
epochs = 10
num_neurons = 256

encoder_inputs = Input(shape=(None,input_vocab_size))   #加密输入
encoder = LSTM(num_neurons,return_state=True)
encoder_outputs,state_h,state_c = encoder(encoder_inputs)
encoder_states = [state_h,state_c]

decoder_inputs = Input(shape=(None,output_vocab_size))  #解密输入
decoder_lstm = LSTM(num_neurons,return_sequences=True,return_state=True)
decoder_outputs,_,_ = decoder_lstm(decoder_inputs,initial_state=encoder_states)
decoder_dense = Dense(output_vocab_size,activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)

optimizer = optimizers.Adam(lr=0.001)

model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['acc'])
model.fit([encoder_input_data,decoder_input_data],decoder_input_data,batch_size=batch_size,epochs=epochs,validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2520a8ec850>

In [53]:
#组装序列生成模型
encode_model = Model(encoder_inputs,encoder_states)

thought_input = [Input(shape=(num_neurons,)),Input(shape=(num_neurons,))]
decoder_outputs,state_h,state_c = decoder_lstm(decoder_inputs,initial_state=thought_input)
decoder_states = [state_h,state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(inputs=[decoder_inputs]+thought_input,
                     outputs=[decoder_outputs]+decoder_states)

#建立字符级的翻译器
def decode_sequence(input_seq):
    thought = encode_model.predict(input_seq)
    target_seq = np.zeros((1,1,output_vocab_size))
    target_seq[0,0,target_token_index[stop_token]] = 1.0
    stop_condiction = False
    generated_sequence = ''
    while not stop_condiction:
        output_tokens,h,c = decoder_model.predict([target_seq]+thought)
        generated_token_idx = np.argmax(output_tokens[0,-1,:])
        generated_char = reverse_output_char_index[generated_token_idx]
        generated_sequence += generated_char
        if (generated_char==stop_token or len(generated_sequence)>max_decode_seq_length):
            stop_condiction = True
        target_seq = np.zeros((1,1,output_vocab_size))
        target_seq[0,0,generated_token_idx] = 1.0
        thought = [h,c]
    return generated_sequence

In [54]:
#生成回复
def response(input_text):
    input_seq = np.zeros((1,max_encode_seq_length,input_vocab_size),dtype='float32')
    for t,char in enumerate(input_text):
        input_seq[0,t,input_token_index[char]] = 1.0
    decoded_sentence = decode_sequence(input_seq)
    print('Rebot reply(Decoded sentence):',decoded_sentence)

response('what is the net?')

Rebot reply(Decoded sentence): 	iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii


In [40]:
reverse_output_char_index

{0: '\t',
 1: '\n',
 2: ' ',
 3: '!',
 4: "'",
 5: ',',
 6: '.',
 7: '0',
 8: '1',
 9: '2',
 10: '3',
 11: '4',
 12: '5',
 13: '6',
 14: '7',
 15: '8',
 16: '9',
 17: ':',
 18: ';',
 19: '?',
 20: 'a',
 21: 'b',
 22: 'c',
 23: 'd',
 24: 'e',
 25: 'f',
 26: 'g',
 27: 'h',
 28: 'i',
 29: 'j',
 30: 'k',
 31: 'l',
 32: 'm',
 33: 'n',
 34: 'o',
 35: 'p',
 36: 'q',
 37: 'r',
 38: 's',
 39: 't',
 40: 'u',
 41: 'v',
 42: 'w',
 43: 'x',
 44: 'y',
 45: 'z'}