In [3]:
# load txt file
def load_def(filename):
    """Read a UTF-8 text file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The file content with surrounding whitespace stripped,
        split on newline characters. An empty or whitespace-only file
        yields a list holding one empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read().strip().split('\n')

In [76]:
# create sentence list
def create_lang_list(path="dataset/raw.txt", num_examples=1):
    """Build per-column tuples of preprocessed sentences from a tab-separated file.

    Every selected line is split on tab characters and each field is passed
    through ``preprocess_sentence``.

    Args:
        path: Text file handed to ``load_def``. Defaults to
            "dataset/raw.txt".
        num_examples: How many leading lines to use. Defaults to 1; pass
            ``None`` to use every line of the file.

    Returns:
        A ``zip`` iterator yielding one tuple per tab-separated column
        (callers unpack it as ``en, ja``, so presumably English comes first
        and Japanese second -- confirm against the data file). Because it is
        built with ``zip``, columns beyond the shortest line are dropped.
    """
    lines = load_def(path)

    # one [field, field, ...] list of preprocessed sentences per line
    pairs = [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in lines[:num_examples]
    ]
    # transpose rows into columns so each language ends up in its own tuple
    return zip(*pairs)

# translate English to Japanese

In [5]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [78]:
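# unify dakuten/handakuten marks, decompose the text to NFD and space-separate every character (despite its name, the result is not ASCII)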
def unicode_to_ascii(s):
    """Unify voicing marks, decompose to NFD and space-separate the characters.

    Standalone and half-width dakuten / handakuten are rewritten to their
    combining forms, the text is NFD-normalized, and the resulting
    characters are joined with single spaces. Accents are not stripped and
    the output is not restricted to ASCII.

    Args:
        s: Input string.

    Returns:
        str: The NFD-decomposed characters of ``s`` separated by spaces.
    """
    # (UTF-8 bytes to find, UTF-8 bytes of the combining mark to substitute)
    substitutions = (
        (b"\xe3\x82\x9b", b"\xe3\x82\x99"),  # U+309B dakuten -> U+3099
        (b"\xef\xbe\x9e", b"\xe3\x82\x99"),  # U+FF9E half-width dakuten -> U+3099
        (b"\xe3\x82\x9c", b"\xe3\x82\x9a"),  # U+309C handakuten -> U+309A
        (b"\xef\xbe\x9f", b"\xe3\x82\x9a"),  # U+FF9F half-width handakuten -> U+309A
    )

    # str -> bytes, apply the substitutions, bytes -> str
    raw = s.encode()
    for source, target in substitutions:
        raw = raw.replace(source, target)
    decomposed = unicodedata.normalize('NFD', raw.decode())

    # iterating a str yields its characters, so join() spaces them out
    return ' '.join(decomposed)

def preprocess_sentence(w):
    """Normalize one sentence and wrap it in start/end tokens.

    The sentence is lower-cased, stripped and passed through
    ``unicode_to_ascii``; punctuation is padded with spaces; every run of
    characters outside the allow-list is replaced by a single space; and
    the result is wrapped as "<start> ... <end>".

    Allow-list: a-z, A-Z, hiragana, katakana (full- and half-width), kanji
    in U+4E00..U+9FD0, the dakuten/handakuten marks, and the punctuation
    ? . ! , ¿ - ー. Anything else (digits, apostrophes, Japanese
    punctuation such as 、 and 。, ...) becomes a space.

    Args:
        w: Raw sentence string.

    Returns:
        str: The cleaned sentence prefixed with "<start> " and suffixed
        with " <end>".
    """
    w = unicode_to_ascii(w.lower().strip())
    # create a space between word and the punctuation
    w = re.sub(r"([?!¿.,])", r" \1 ", w)
    # collapse runs of spaces (and double quotes) into one space
    w = re.sub(r'[" "]+', " ", w)

    # replace everything outside the allow-list with a space.
    # The "-" is escaped so it is matched literally; left unescaped,
    # "¿-ー" would be read as the range U+00BF..U+30FC, which lets through
    # a large span of unintended characters and excludes "-" itself.
    w = re.sub(r"[^a-zA-Z\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?.!,¿\-ー]+",  " ", w)
    w = w.strip()

    # add a start and end token so the model knows where a sentence
    # begins and ends
    w = "<start> " + w + " <end>"
    return w

In [79]:
# sanity-check the preprocessing on one English and one Japanese sample
en_sentence = " helloworld.today!HAHAHA "
ja_sentence = " こんにちは,何で?コぱニチはズるい!"
print(
    preprocess_sentence(en_sentence),
    preprocess_sentence(ja_sentence),
    sep="\n",
)

<start> h e l l o w o r l d . t o d a y ! h a h a h a <end>
<start> こ ん に ち は , 何 て ゙ ? コ は ゚ ニ チ は ス ゙ る い ! <end>


In [80]:
# unpack the two sentence tuples and show the last entry of each
en, ja = create_lang_list()
print(en[-1], ja[-1], sep="\n")

<start> y o u a r e b a c k , a r e n t y o u , h a r o l d ? <end>
<start> あ な た は 戻 っ た の ね ハ ロ ル ト ゙ ? <end>
