# 從零開始製作英文>中文翻譯機

## 建立資料集

In [1]:
import re
import pickle as pkl

data_path = 'cmn.txt'
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('n')

In [2]:
def preprocess_zh(sentence):
    """
    Lowercases a Chinese sentence and inserts a whitespace between two characters.
    Surrounds the split sentence with <SOS> and <EOS>.
    """
    # removes whitespaces from the beginning of a sentence and from the end of a sentence
    sentence = sentence.lower().strip()
    # removes redundant whitespaces among words
    sentence = re.sub(r"[' ']+", " ", sentence)
    sentence = sentence.strip()
    # inserts a whitespace in between two words
    sentence = " ".join(sentence)
    # attaches starting token and ending token
    sentence = "<SOS> " + sentence + " <EOS>"
    return sentence


def preprocess_eng(sentence):
    """
    Lowercases an English sentence and inserts a whitespace within 2 words or punctuations.
    Surrounds the split sentence with <SOS> and <EOS>
    """
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([,.!?\"'])", r" \1", sentence)
    sentence = re.sub(r"\s+", " ", sentence)
    sentence = re.sub(r"[^a-zA-Z,.!?\"']", ' ', sentence)
    sentence = "<SOS> " + sentence + " <EOS>"
    return sentence

In [3]:
# regardless of source and target languages
seq_pairs = []

for line in lines:
    # ensures that the line loaded contains Chinese and English sentences
    if len(line.split('\t')) >= 3:
        eng_doc, zh_doc, _ = line.split('\t')
        eng_doc = preprocess_eng(eng_doc)
        en_doc = preprocess_zh(zh_doc)
        seq_pairs.append([eng_doc, en_doc])
    else:
        continue

In [4]:
seq_pairs

[['<SOS> hi . <EOS>', '<SOS> 嗨 。 <EOS>'],
 ['<SOS>   tatoeba .org          cm             martha  hi . <EOS>',
  '<SOS> 你 好 。 <EOS>'],
 ['<SOS>  . <EOS>', '<SOS> 你 用 跑 的 。 <EOS>'],
 ['<SOS>   tatoeba .org           jsakuragi              egg      stop ! <EOS>',
  '<SOS> 住 手 ！ <EOS>'],
 ['<SOS>   tatoeba .org          cm             glossamatik  wait ! <EOS>',
  '<SOS> 等 等 ！ <EOS>'],
 ['<SOS>   tatoeba .org           belgavox              wzhd  wait ! <EOS>',
  '<SOS> 等 一 下 ！ <EOS>'],
 ['<SOS>  . <EOS>', '<SOS> 開 始 ！ <EOS>'],
 ['<SOS> g  hello ! <EOS>', '<SOS> 你 好 。 <EOS>'],
 ['<SOS>   tatoeba .org          ck              musclegirlxyp  i try . <EOS>',
  '<SOS> 我 試 試 。 <EOS>'],
 ['<SOS>  ! <EOS>', '<SOS> 我 贏 了 。 <EOS>'],
 ['<SOS> o ! <EOS>', '<SOS> 不 會 吧 。 <EOS>'],
 ['<SOS>   cheers ! <EOS>', '<SOS> 乾 杯 ! <EOS>'],
 ['<SOS>                 martha  got it ? <EOS>', '<SOS> 知 道 了 沒 有 ？ <EOS>'],
 ['<SOS>   tatoeba .org          cm             glossamatik  got it ? <EOS>',
  '<SOS> 懂 了 嗎 ？ <

### 儲存切割好的資料

In [5]:
# Save list seq_pairs to file
with open("eng-zh.pkl", "wb") as f:
    pkl.dump(seq_pairs, f)

## 建立 TF 資料集

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [7]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPU


2022-07-25 12:42:57.879287: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-25 12:42:57.904994: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-25 12:42:57.905220: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-25 12:42:57.906178: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [8]:
with open("eng-zh.pkl", "rb") as f:
    seq_pairs = pkl.load(f)

source_sentences = [pair[0] for pair in seq_pairs]
target_sentences = [pair[1] for pair in seq_pairs]

### 建立斷詞

In [None]:
def create_tokeniser(sentences):
    # create a tokeniser specific to texts
    tokeniser = Tokenizer(filters = ' ')
    tokeniser.fit_on_texts(sentences)
    # preview the first 3 sentences versus their word tokenised versions
    for i in range(3):
        print("original: {} - word tokenised: {}".format(sentences[i], tokeniser.texts_to_sequences(sentences)[i]))
    return tokeniser.texts_to_sequences(sentences), tokeniser

# word tokenise source and target sentences
src_word_tokenised, src_tokeniser = create_tokeniser(src_sentences)
tgt_word_tokenised, tgt_tokeniser = create_tokeniser(tgt_sentences)