In [6]:
# load txt file
def load_def(filename):
    # open a txt file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    lines = file.read()
    file.close()
    
    return lines

In [7]:
# split a words english and japanese
def split_each_languages(text):
    lines = text.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

In [8]:
def create_lang_list():
    # load txt file
    text = load_def("dataset/raw.txt")

    # recieve jap and eng sentence as list
    pairs = split_each_languages(text)

    english_list = list()
    japanese_list = list()
    # create each languages list
    for i in pairs:
        english_list.append(i[0])
        japanese_list.append(i[1])

    return (english_list, japanese_list)

In [9]:
# lang list
en, ja = create_lang_list()

# translate English to Japanese

In [10]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import MeCab
import unicodedata
import re
import numpy as np
import os
import io
import time

In [11]:
# convert unicode file to ascii
def unicode_to_ascii(s):
    # avoid lang accent check → japanese has lang acccent
    return ' '.join(c for c in unicodedata.normalize('NFC', s))

# preprocess
def preprocess_sentence(w, lang):
    # Morphological analysis for japanese lang
    if lang == "ja":
        m = MeCab.Tagger("-Owakati")
        w = m.parse(w)
    
    w = unicode_to_ascii(w.lower().strip())
    # create a space between word and the punctuation
    w = re.sub(r"([?!¿.,。])", r" \1 ", w)
    # replacing everything with space except(a-z, A-Z, ".",  "?",  "!",  ",", "-", "ー", , "。", "Kanji", "Katakana", "Hiragana")
    w = re.sub(r"[^a-zA-Z\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?.!,。¿\-/ {1,}/]+",  " ", w)
    w = w.rstrip().strip()
    
    # add a start and end  token to the sentence
    # model know when to start and end
    w = "<start> " + w + " <end>"
    return w

In [12]:
# check word
en_sentence =u"May I borrow this book?"
ja_sentence = u"プールに行きたい。でも今日は雨"
print(preprocess_sentence(en_sentence, "en"))
print(preprocess_sentence(ja_sentence, "ja"))

<start> m a y   i   b o r r o w   t h i s   b o o k  ? <end>
<start> プ ー ル   に   行 き   た い    。    で も   今 日   は   雨 <end>


In [13]:
print(preprocess_sentence(en[-1], "en"))
print(preprocess_sentence(ja[-1], "ja"))

<start> d o   y o u   w a n t   t o   h e a r   i t  ? <end>
<start> 聞 き   た く   な け れ   ば   言 わ   な い   け れ ど    。 <end>


In [14]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [22]:
def tokenize(lang):
    # vectorize a text corpus     
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters=' ')
    # updates internal vocabulary based on a list of texts
    # e.g. "[this place is good ]"→{this:1, place:2, is:3, good:4} "
    lang_tokenizer.fit_on_texts(lang)
    print(lang_tokenizer)
    # Transforms each text in texts to a sequence of integers.
    # e.g. {this:1, place:2, is:3, good:4} → [[1, 2, 3, 4]]
    tensor = lang_tokenizer.texts_to_sequences(lang)
    print(tensor)
    # transform a list of num sample into a 2D Numpy array of shape 
    # return (len(sequences), maxlen)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                          padding='post')
    print(tensor)
    
    return tensor, lang_tokenizer

In [24]:
# example
tokenize(["this place is good"])

<keras_preprocessing.text.Tokenizer object at 0x1526afa20>
[[1, 2, 3, 4]]
[[1 2 3 4]]


(array([[1, 2, 3, 4]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x1526afa20>)