In [49]:
import json
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from opencc import OpenCC
cc = OpenCC('s2tw')


In [50]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False
data = []
endata = []
chdata = []
with open('./translation2019zh_train.json','r', encoding='utf-8') as file:
    for f in file:
        data = json.loads(f)
        if is_chinese(data['chinese']) == True:
            if len(data['chinese'])<10:
                endata.append(data['english'])
                chdata.append('@'+data['chinese']+'。')
                if(len(endata)==100):
                    break
en_vocab = set(''.join(endata))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

ch_vocab = set(''.join(chdata))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}
print('\n英文字典:\n', endata)
print('\n中文字典共計\n:', chdata)

print('\n英文字典:\n', en2id)
print('\n中文字典共計\n:', ch2id)




英文字典:
 ['Look at these coasters over here.', 'Choose a recorder.', "I hadn't paid the telephone bill.", "That's easier said than done, of course.", 'Side-to-Side Movements.', 'about like 80 degrees.', 'We all are from Shandong.', 'She was possessed by a devil.', 'This wool knits up well.', 'The majority was wrong last time.', 'Stone Soup Stories to Go!', 'Done. See you tomorrow.', 'He eased some of the strains on the poor.', 'Could it be that it was written wrongly?', 'What a terrible temper!', 'Great talents flower late.', 'I forbid you to make a sortie today.', 'C：My surname is Jiang.', 'Well, if it was greater .', 'They looked over to the left.', 'To supervise the management of printing industry.', 'no one else can see you shake your head.', 'All photos dials.', 'Stained glass window panels;', 'The murderer was caught red-handed.', 'You don’t love Melanie.', 'His style is very lucid .', 'At this time, there was a male cat.', 'The truth has leaked out.', 'Material evidence must also

In [51]:
en_num_data = [[en2id[en] for en in line ] for line in endata]
ch_num_data = [[ch2id[ch] for ch in line] for line in chdata]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in chdata]
print('char:', endata[1])
print('index:', en_num_data[1])
print('char:', chdata[1])
print('index:', ch_num_data[1])

char: Choose a recorder.
index: [27, 32, 0, 0, 39, 31, 2, 17, 2, 62, 31, 20, 0, 62, 45, 31, 62, 61]
char: @选择一个记录员.。
index: [130, 125, 212, 13, 335, 287, 229, 168, 208, 243]


In [52]:
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

max encoder length: 49
max decoder length: 11


In [53]:
# 將數據進行onehot處理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

In [54]:
for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

index data:
 [27, 32, 0, 0, 39, 31, 2, 17, 2, 62, 31, 20, 0, 62, 45, 31, 62, 61]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [55]:
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.01
BATCH_SIZE = 20
EPOCHS = 250

In [56]:
# ==============encoder=============
encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
#emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)


In [57]:
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

#emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=CH_VOCAB_SIZE, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
decoder_outputs = decoder_dense(decoder_h2)

In [58]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.)


# Save model
model.save('s2s.h5')

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, None, 66)]   0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, None, 403)]  0                                            
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, None, 256),  330752      input_25[0][0]                   
__________________________________________________________________________________________________
lstm_18 (LSTM)                  [(None, None, 256),  675840      input_26[0][0]                   
                                                                 lstm_16[0][1]             

In [59]:
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])

# 預測模型中的decoder的初始化狀態需要傳入新的狀態
decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

# 使用傳入的值來初始化當前模型的輸入狀態
decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
decoder_outputs = decoder_dense(decoder_h2)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                      [decoder_outputs, state_h1, state_c1, state_h2, state_c2])

In [60]:
for k in range(0,99):
    test_data = encoder_input_data[k:k+1]
    h1, c1, h2, c2 = encoder_model.predict(test_data)
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    target_seq[0, 0, ch2id['@']] = 1
    outputs = []
    while True:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict([target_seq, h1, c1, h2, c2])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        if sampled_token_index == ch2id['。']: break
        # if len(outputs) > 11: break
    
    print(endata[k])
    print(''.join([id2ch[i] for i in outputs]))

Look at these coasters over here.
看看这边的杯垫。
Choose a recorder.
拥抱你的孩子。
I hadn't paid the telephone bill.
我还没交电话费。
That's easier said than done, of course.
这种毛线很好织。
Side-to-Side Movements.
侧向运动。
about like 80 degrees.
大概华氏80度吧。
We all are from Shandong.
如果拉力很大。
She was possessed by a devil.
门襟拉链扣在前面。
This wool knits up well.
这种毛线很好织。
The majority was wrong last time.
这种毛线很好织。
Stone Soup Stories to Go!
彩色玻璃窗板；。
Done. See you tomorrow.
一言为定。
He eased some of the strains on the poor.
他因谋杀罪而受审。
Could it be that it was written wrongly?
考虑你的听众。
What a terrible temper!
这是几杆洞？。
Great talents flower late.
大器晚成。
I forbid you to make a sortie today.
我还没交电话费。
C：My surname is Jiang.
雪貂会不会生跳蚤？。
Well, if it was greater .
如果拉力很大。
They looked over to the left.
这种毛线很好织。
To supervise the management of printing industry.
调整起步价。
no one else can see you shake your head.
没人看的见你摇头。
All photos dials.
全套管基桩?。
Stained glass window panels;
彩色玻璃窗板；。
The murderer was caught red-handed.
这种毛线很好织。
You don’t love Melanie