In [37]:
import json
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from opencc import OpenCC
cc = OpenCC('s2tw')


In [38]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False
data = []
endata = []
chdata = []
with open('./translation2019zh_train.json','r', encoding='utf-8') as file:
    for f in file:
        data = json.loads(f)
        if is_chinese(data['chinese']) == True:
            if len(data['chinese'])<15:
                endata.append(data['english'])
                chdata.append('@'+data['chinese']+'。')
                if(len(endata)==1000):
                    break
en_vocab = set(''.join(endata))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

ch_vocab = set(''.join(chdata))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}
print('\n英文字典:\n', endata)
print('\n中文字典共計\n:', chdata)

print('\n英文字典:\n', en2id)
print('\n中文字典共計\n:', ch2id)




英文字典:
 ['And the light breeze moves me to caress her long ear', 'They have the blood of martyrs is the White to flow …', 'Have you shined your shoes?', 'Look at these coasters over here.', 'Show all articles on this topic.', 'One flat can accommodate a family of five.', "Is there any special Tenimyu you're longing for?", 'Choose a recorder.', 'I just came from a fruit market.', 'The Oedipus business comes nearer to home.', 'He transacts business with a large number of stores.', "It's necessary for us to net the currant bushes.", "I hadn't paid the telephone bill.", "Life won't always turn out the way you want.", "That's easier said than done, of course.", 'I grant that yours are of better quality.', "There is also a Children's Corner in our zoo.", 'Side-to-Side Movements.', 'Some desires are formed as the result of rational thought pro-  cesses.', 'LX, do you see in the Internet.', 'They love to run about on the beach.', 'Re-registration and other issues.', "Maybe that's the right dia

In [39]:
en_num_data = [[en2id[en] for en in line ] for line in endata]
ch_num_data = [[ch2id[ch] for ch in line] for line in chdata]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in chdata]
print('char:', endata[1])
print('index:', en_num_data[1])
print('char:', chdata[1])
print('index:', ch_num_data[1])

char: They have the blood of martyrs is the White to flow …
index: [74, 9, 42, 19, 71, 9, 7, 54, 42, 71, 12, 9, 42, 71, 56, 23, 57, 57, 32, 71, 57, 73, 71, 46, 7, 48, 12, 19, 48, 13, 71, 33, 13, 71, 12, 9, 42, 71, 14, 9, 33, 12, 42, 71, 12, 57, 71, 73, 23, 57, 63, 71, 15]
char: @它们的先烈们的鲜血是白流了…。
index: [137, 1511, 1835, 389, 522, 931, 1835, 389, 1588, 1306, 1493, 1251, 1780, 1465, 1428, 962]


In [40]:
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

max encoder length: 124
max decoder length: 16


In [41]:
# 將數據進行onehot處理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

In [42]:
for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

index data:
 [74, 9, 42, 19, 71, 9, 7, 54, 42, 71, 12, 9, 42, 71, 56, 23, 57, 57, 32, 71, 57, 73, 71, 46, 7, 48, 12, 19, 48, 13, 71, 33, 13, 71, 12, 9, 42, 71, 14, 9, 33, 12, 42, 71, 12, 57, 71, 73, 23, 57, 63, 71, 15]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [43]:
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.01
BATCH_SIZE = 20
EPOCHS = 250

In [44]:
# ==============encoder=============
encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
#emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)


In [45]:
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

#emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=CH_VOCAB_SIZE, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
decoder_outputs = decoder_dense(decoder_h2)

In [46]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.)


# Save model
model.save('s2s.h5')

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, None, 92)]   0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, None, 1860)] 0                                            
__________________________________________________________________________________________________
lstm_12 (LSTM)                  [(None, None, 256),  357376      input_19[0][0]                   
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, None, 256),  2167808     input_20[0][0]                   
                                                                 lstm_12[0][1]              

In [47]:
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])

# 預測模型中的decoder的初始化狀態需要傳入新的狀態
decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

# 使用傳入的值來初始化當前模型的輸入狀態
decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
decoder_outputs = decoder_dense(decoder_h2)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                      [decoder_outputs, state_h1, state_c1, state_h2, state_c2])

In [48]:
for k in range(0,100):
    test_data = encoder_input_data[k:k+1]
    h1, c1, h2, c2 = encoder_model.predict(test_data)
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    target_seq[0, 0, ch2id['@']] = 1
    outputs = []
    while True:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict([target_seq, h1, c1, h2, c2])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        if sampled_token_index == ch2id['。']: break
        # if len(outputs) > 11: break
    
    print(endata[k])
    print(''.join([id2ch[i] for i in outputs]))

And the light breeze moves me to caress her long ear
这次是一只小公猫。
They have the blood of martyrs is the White to flow …
这座宝塔有个七边形的底座。
Have you shined your shoes?
他的双冒。
Look at these coasters over here.
没有上娜他。
Show all articles on this topic.
我没水的迎到。
One flat can accommodate a family of five.
我们英为。
Is there any special Tenimyu you're longing for?
我是个欧洲人，总要叶落归根。
Choose a recorder.
吉姆靠的。
I just came from a fruit market.
我是个欧洲人，总要叶落归根。
The Oedipus business comes nearer to home.
这座宝塔有个七边形的底座。
He transacts business with a large number of stores.
他的竟 的迎:到。
It's necessary for us to net the currant bushes.
我是个欧洲人，总要叶落归根。
I hadn't paid the telephone bill.
我是个欧洲人，总要叶落归根。
Life won't always turn out the way you want.
没有上娜他。
That's easier said than done, of course.
这座宝塔有个七边形的底座。
I grant that yours are of better quality.
我是个欧洲人，总要叶落归根。
There is also a Children's Corner in our zoo.
这座宝塔有个七边形的底座。
Side-to-Side Movements.
我没水的迎到。
Some desires are formed as the result of rational thought pro-  cesses.
我没水的迎到