# 导入库

In [None]:
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model
from keras.utils import plot_model

import pandas as pd
import numpy as np

In [2]:
def create_model(n_input, n_output, n_units):
    """Build an LSTM encoder-decoder and the two models used to decode step by step.

    Args:
        n_input: Size of the last axis of the encoder input.
        n_output: Size of the last axis of the decoder input; also the width
            of the softmax output layer.
        n_units: Number of units in both LSTM layers.

    Returns:
        A tuple ``(model, encoder_infer, decoder_infer)``:

        * ``model`` maps ``[encoder_input, decoder_input]`` to a softmax
          distribution for every decoder time step.
        * ``encoder_infer`` maps the encoder input to the encoder's final
          ``[h, c]`` states.
        * ``decoder_infer`` maps ``[decoder_input, h, c]`` to
          ``[softmax output, h, c]``. It calls the same LSTM and Dense layer
          objects as ``model``, so it uses the same weights.
    """
    # Encoder: only the final hidden and cell states are kept.
    enc_in = Input(shape=(None, n_input))
    enc_lstm = LSTM(n_units, return_state=True)
    enc_result = enc_lstm(enc_in)
    enc_states = [enc_result[1], enc_result[2]]

    # Decoder: starts from the encoder states and emits the full sequence.
    dec_in = Input(shape=(None, n_output))
    dec_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    dec_seq = dec_lstm(dec_in, initial_state=enc_states)[0]
    softmax_head = Dense(n_output, activation='softmax')

    # Training model.
    model = Model([enc_in, dec_in], softmax_head(dec_seq))

    # Inference encoder: input sequence -> [h, c].
    encoder_infer = Model(enc_in, enc_states)

    # Inference decoder: the states are explicit inputs and outputs so the
    # caller can feed them back in on the next step.
    state_in = [Input(shape=(n_units,)), Input(shape=(n_units,))]
    step_seq, step_h, step_c = dec_lstm(dec_in, initial_state=state_in)
    decoder_infer = Model([dec_in, *state_in],
                          [softmax_head(step_seq), step_h, step_c])

    return model, encoder_infer, decoder_infer

In [3]:
# Hyper-parameters shared by the cells below.
N_UNITS = 256  # passed to create_model as n_units (size of both LSTM layers)
BATCH_SIZE = 64  # batch_size argument of model_train.fit
EPOCH = 50  # epochs argument of model_train.fit
NUM_SAMPLES = 10000  # rows kept from the data file; also the first axis of the one-hot arrays

# 数据下载
[下载地址](http://www.manythings.org/anki/)（下载中英文句对数据并解压，使 `cmn.txt` 位于 `cmn-eng/` 目录下，与下文的 `data_path` 一致）

# 数据读取

In [4]:
# Load the sentence pairs: a tab-separated file without a header row.
# `nrows` stops parsing after NUM_SAMPLES rows instead of reading the whole
# file and slicing afterwards.
data_path = 'cmn-eng/cmn.txt'
df = pd.read_table(data_path, header=None, nrows=NUM_SAMPLES)
df.columns = ['inputs', 'targets', 'others']

# Wrap every target sentence in a leading tab and a trailing newline; the
# decoding cell starts generation from the tab and stops at the newline.
df['targets'] = df['targets'].apply(lambda text: '\t' + text + '\n')

input_texts = df.inputs.values.tolist()
target_texts = df.targets.values.tolist()

# Character vocabularies, sorted so that indices are reproducible.
# ''.join is linear, whereas summing the strings re-copies the accumulated
# text on every addition and breaks on an empty frame.
input_characters = sorted(set(''.join(input_texts)))
target_characters = sorted(set(''.join(target_texts)))

In [5]:
# Longest sentence on each side and the two vocabulary sizes.
# NOTE: the `INUPT_LENGTH` spelling is kept because later cells reference it.
INUPT_LENGTH = max(map(len, input_texts))
OUTPUT_LENGTH = max(map(len, target_texts))
INPUT_FEATURE_LENGTH = len(input_characters)
OUTPUT_FEATURE_LENGTH = len(target_characters)

# 向量化

In [6]:
# One-hot buffers shaped (samples, time steps, vocabulary size).
# The first axis follows the number of sentences actually loaded, so a data
# file with fewer than NUM_SAMPLES rows does not leave all-zero samples in
# the training set. float32 matches Keras' default float type and halves
# the memory of NumPy's default float64.
n_samples = len(input_texts)
encoder_input = np.zeros((n_samples, INUPT_LENGTH, INPUT_FEATURE_LENGTH), dtype='float32')
decoder_input = np.zeros((n_samples, OUTPUT_LENGTH, OUTPUT_FEATURE_LENGTH), dtype='float32')
decoder_output = np.zeros((n_samples, OUTPUT_LENGTH, OUTPUT_FEATURE_LENGTH), dtype='float32')

In [7]:
# Lookup tables between characters and their vocabulary index, both ways.
input_dict_reverse = dict(enumerate(input_characters))
input_dict = {char: index for index, char in input_dict_reverse.items()}
target_dict_reverse = dict(enumerate(target_characters))
target_dict = {char: index for index, char in target_dict_reverse.items()}

In [8]:
# One-hot encode the source sentences: for sentence s and position t, set
# encoder_input[s, t, index of the t-th character] to 1.
for seq_index, seq in enumerate(input_texts):
    char_ids = np.fromiter((input_dict[ch] for ch in seq), dtype=np.intp)
    encoder_input[seq_index, np.arange(char_ids.size), char_ids] = 1

In [9]:
# One-hot encode the target sentences. decoder_output is decoder_input
# shifted one step to the left: the target at step t is the character the
# decoder receives at step t + 1.
for seq_index, seq in enumerate(target_texts):
    char_ids = np.fromiter((target_dict[ch] for ch in seq), dtype=np.intp)
    steps = np.arange(char_ids.size)
    decoder_input[seq_index, steps, char_ids] = 1.0
    decoder_output[seq_index, steps[:-1], char_ids[1:]] = 1.0

# 观察向量化的数据

In [10]:
# Decode the first encoder sample back to text, skipping all-zero (padding) steps.
''.join(input_dict_reverse[row.argmax()] for row in encoder_input[0] if row.max() != 0)

'Hi.'

In [11]:
# Decode the first decoder target back to text, skipping all-zero (padding) steps.
''.join(target_dict_reverse[row.argmax()] for row in decoder_output[0] if row.max() != 0)

'嗨。\n'


# 创建模型

In [12]:
# Build the training model and the two inference models from the vocabulary sizes.
model_train, encoder_infer, decoder_infer = create_model(
    n_input=INPUT_FEATURE_LENGTH,
    n_output=OUTPUT_FEATURE_LENGTH,
    n_units=N_UNITS,
)

In [13]:
# Compile the training model (training itself runs in a later cell).
model_train.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
)

In [14]:
# Print the layer table of the training model.
model_train.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 72)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 2561)   0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 336896      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  2885632     input_2[0][0]                    
                                                                 lstm_1[0][1]               

In [15]:
# Print the layer table of the inference encoder.
encoder_infer.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 72)          0         
_________________________________________________________________
lstm_1 (LSTM)                [(None, 256), (None, 256) 336896    
Total params: 336,896
Trainable params: 336,896
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Print the layer table of the inference decoder.
decoder_infer.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 2561)   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  2885632     input_2[0][0]                    
                                                                 input_3[0][0]              

In [None]:
# Train on (source, target-so-far) -> next-character pairs, holding out a
# `validation_split` fraction of the samples for validation.
validation_split = 0.2
model_train.fit(
    x=[encoder_input, decoder_input],
    y=decoder_output,
    epochs=EPOCH,
    batch_size=BATCH_SIZE,
    validation_split=validation_split,
)

# 预测序列

In [18]:
def predict_chinese(source,encoder_inference, decoder_inference, n_steps, features):
    """Greedily decode an output string for ``source``, one character per step.

    Relies on the module-level ``target_dict`` / ``target_dict_reverse``
    lookup tables.

    Args:
        source: Input handed to ``encoder_inference.predict``.
        encoder_inference: Model whose ``predict`` returns the list of
            initial decoder states.
        decoder_inference: Model whose ``predict`` takes
            ``[step input] + states`` and returns ``(probabilities, h, c)``.
        n_steps: Maximum number of characters to generate (the maximum
            sentence length).
        features: Length of the one-hot vector fed to the decoder each step.

    Returns:
        The generated characters joined into one string. Generation stops
        after the first newline character, which is included in the result.
    """
    def one_hot_step(index):
        # A single-sample, single-time-step one-hot input.
        step = np.zeros((1, 1, features))
        step[0, 0, index] = 1
        return step

    state = encoder_inference.predict(source)
    # Decoding starts from the tab character.
    step_input = one_hot_step(target_dict['\t'])

    decoded = []
    for _ in range(n_steps):
        probs, h, c = decoder_inference.predict([step_input] + state)
        best = np.argmax(probs[0, -1, :])
        decoded.append(target_dict_reverse[best])
        if decoded[-1] == '\n':
            break
        # Feed the chosen character and the new states back in.
        state = [h, c]
        step_input = one_hot_step(best)
    return ''.join(decoded)

In [19]:
# Translate training samples 1000-1099 and print each source sentence
# followed by the model's output.
for sample_index in range(1000, 1100):
    # Slice (not index) so the batch axis is kept for predict().
    source = encoder_input[sample_index:sample_index + 1]
    translation = predict_chinese(
        source,
        encoder_infer,
        decoder_infer,
        OUTPUT_LENGTH,
        OUTPUT_FEATURE_LENGTH,
    )
    print(input_texts[sample_index])
    print(translation)

Stop grumbling.
别再念了。

Stop resisting!
别再念了。

Summer is over.
生活很久。

Take your time.
你不要说话，我。

Take your time.
你不要说话，我。

That was wrong.
那不好。

That's a shame.
那是一個好主意。

That's logical.
那太好了。

That's my coat.
那是我的錯。

That's perfect.
那是一個好主意。

That's too bad.
那是不好。

That's too bad.
那是不好。

That's too bad.
那是不好。

The birds sang.
這個男人吃了麵包。

The flag is up.
火车准时到了。

The phone rang.
這個男人吃了麵包。

Their eyes met.
他們的車子很起。

These are pens.
這些是筆。

They hated Tom.
他們不會工作。

They have jobs.
他们没有。

They let me go.
他們不會見。

They love that.
他们不喜欢你。

They trust Tom.
他們會幫助。

They want more.
他们不喜欢你。

They want this.
他们不喜欢你。

They were good.
他們是我的。

This is a book.
这是一个好大的时间。

This is my bag.
这是我的自行车。

Tom can change.
汤姆不会游泳。

Tom can't swim.
汤姆不会游泳。

Tom has a plan.
湯姆有個好人。

Tom is a rabbi.
汤姆是个好主意。

Tom is no fool.
汤姆是个好人。

Tom isn't dumb.
汤姆不傻。

Tom looks pale.
湯姆喜歡這個。

Tom loves dogs.
湯姆喜歡這個。

Tom turned red.
汤姆走得很慢。

Tom walked out.
湯姆喜歡這個。

Tom was crying.
湯姆有個好人。

Tom won't stop.
湯姆不會停。

Tom's fearless