In [1]:
import numpy as np
import tensorflow as tf

In [2]:
file_path = r'C:\Users\DT-Liuxiangfei\Documents\CodeHub\TestCodes-master\TensorflowDemo\cmn.txt'
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print(f"错误：文件 {file_path} 未找到。")
except Exception as e:
    print(f"读取文件时发生错误：{e}")


In [3]:
pairs = [line.split('\t') for line in text.splitlines()]
np.random.shuffle(pairs)
sentences_en, sentences_cmn, *sentences_notes = zip(*pairs)

In [4]:
for i in range(5):
    print(f"{sentences_en[i]} => {sentences_cmn[i]}")

Are you sure that Tom is happy? => 你確定湯姆高興嗎？
He raised his hands. => 他舉起了他的手。
The batteries in my calculator are dead. => 我計算機的電池沒電了。
They obtained a yield of 8 percent on their investment. => 他們在這次的投資中得到百分之八的收益。
It is like looking for a needle in a haystack. => 这好比大海捞针。


In [5]:
def preprocess_chinese(text):
    import jieba
    return " ".join(jieba.cut(text))

In [6]:
sentences_cmn_temp = (preprocess_chinese(s) for s in sentences_cmn)
sentences_cmn = tuple(list(sentences_cmn_temp))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DT-LIU~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.914 seconds.
Prefix dict has been built successfully.


In [7]:
for i in range(5):
    print(f"{sentences_en[i]} => {sentences_cmn[i]}")

Are you sure that Tom is happy? => 你 確定 湯姆高興 嗎 ？
He raised his hands. => 他 舉起 了 他 的 手 。
The batteries in my calculator are dead. => 我 計算機 的 電池 沒電 了 。
They obtained a yield of 8 percent on their investment. => 他們 在 這次 的 投資中 得到 百分之八 的 收益 。
It is like looking for a needle in a haystack. => 这 好比 大海捞针 。


In [None]:
vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_cmn = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

text_vec_layer_en.adapt(sentences_en)
text_vec_layer_cmn.adapt([f"startofseq {s} endofseq" for s in sentences_cmn])

In [9]:
X_train = tf.constant(sentences_en[:20_000])
X_valid = tf.constant(sentences_en[20_000:])
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_cmn[:20_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_cmn[20_000:]])
Y_train = text_vec_layer_cmn([f"endofseq {s}" for s in sentences_cmn[:20_000]])
Y_valid = text_vec_layer_cmn([f"endofseq {s}" for s in sentences_cmn[20_000:]])

In [26]:
encoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_input = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128
encoder_input_ids = text_vec_layer_cmn(encoder_input)
decoder_input_ids = text_vec_layer_en(decoder_input)
encoder_embeddings = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)(encoder_input_ids)
decoder_embeddings = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)(decoder_input_ids)

encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [27]:
model = tf.keras.Model(inputs=[encoder_input, decoder_input], outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [28]:
model.summary()

In [29]:
model.fit((X_train, X_train_dec), Y_train, epochs=10, validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 510ms/step - accuracy: 0.0536 - loss: 3.8691 - val_accuracy: 0.0554 - val_loss: 3.5810
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 612ms/step - accuracy: 0.0554 - loss: 3.5412 - val_accuracy: 0.0552 - val_loss: 3.5621
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 552ms/step - accuracy: 0.0556 - loss: 3.5204 - val_accuracy: 0.0564 - val_loss: 3.5185
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 608ms/step - accuracy: 0.0563 - loss: 3.4772 - val_accuracy: 0.0564 - val_loss: 3.5022
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 607ms/step - accuracy: 0.0565 - loss: 3.4505 - val_accuracy: 0.0562 - val_loss: 3.4915
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 631ms/step - accuracy: 0.0562 - loss: 3.4410 - val_accuracy: 0.0565 - val_loss: 3.4875
Epoc

<keras.src.callbacks.history.History at 0x1de691793d0>

In [49]:
def translate(sentence_en):
    translation = ""
    for word_idx in range(max_length):
        X = tf.constant([sentence_en])
        X_dec = tf.constant(["startofseq " + translation])
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word_id = np.random.choice(len(y_proba), p=y_proba)
        print(predicted_word_id)
        predicted_word = text_vec_layer_cmn.get_vocabulary()[predicted_word_id]
        if predicted_word == 'endofseq':
            break
        translation += " " + predicted_word
        print(translation)
    return translation.strip()

In [50]:
translate("Hello")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
3


''

In [46]:
text_vec_layer_cmn.get_vocabulary()[3]

np.str_('endofseq')