In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import MeCab
import unicodedata
import re
import numpy as np
import os
import io

# Turn on eager execution up front; a later cell pulls a batch directly with
# next(iter(dataset)) and inspects its shape.
tf.enable_eager_execution()

In [2]:
# load txt file
def load_def(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [3]:
# create each languages list
def create_lang_list(num_example, filename="dataset/raw.txt"):
    """Build per-language tuples of preprocessed sentences from a corpus file.

    The file is read as tab-separated text, one example per line. Every
    column of the first ``num_example`` lines is passed through
    ``preprocess_sentence`` and the rows are then transposed so that each
    returned tuple holds one column (the notebook unpacks it as ``en, ja``).

    Args:
        num_example: Number of leading lines to use; ``None`` uses every line.
        filename: Path of the tab-separated corpus. Defaults to the path that
            used to be hard-coded, so existing callers are unaffected.

    Returns:
        A ``zip`` iterator yielding one tuple of sentences per column. It
        stops at the column count of the shortest row.
    """
    # load txt file
    text = load_def(filename)

    lines = text.strip().split('\n')

    word_pairs = [
        [preprocess_sentence(column) for column in line.split('\t')]
        for line in lines[:num_example]
    ]

    # transpose [[col0, col1], ...] into (col0, ...), (col1, ...)
    return zip(*word_pairs)

# translate English to Japanese

In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import MeCab
import unicodedata
import re
import numpy as np
import os
import io
import time

In [5]:
# NFC-normalize a string and put a space between every character
def unicode_to_ascii(s):
    """Return ``s`` in NFC form with one space inserted between characters.

    Despite the name, nothing is converted to ASCII and no accents are
    removed: the text is only composed (NFC) because, per the original
    author's note, Japanese relies on such marks. Joining on ``' '`` splits
    the text into single characters, so an existing space ends up surrounded
    by two more spaces.

    Args:
        s: Text to normalize.

    Returns:
        The space-separated, NFC-normalized characters of ``s``.
    """
    normalized = unicodedata.normalize('NFC', s)
    return ' '.join(normalized)

# preprocess
def preprocess_sentence(w):
    """Segment, normalize and wrap one sentence in ``<start>``/``<end>`` tokens.

    Steps, in order:
      1. MeCab word segmentation in ``-Owakati`` (space-separated) mode.
      2. Lower-casing, stripping and ``unicode_to_ascii``.
      3. Padding of ``? ! ¿ . , 。`` with spaces.
      4. Replacement of every run of characters outside the allowed set
         with a single space.
      5. Addition of the ``<start>`` / ``<end>`` markers.

    Args:
        w: Raw sentence (the notebook passes both English and Japanese text).

    Returns:
        The cleaned sentence as ``"<start> ... <end>"``.
    """
    # Morphological analysis for japanese lang.
    # The tagger is created once and cached on the function object: this
    # function is called for every sentence of the corpus and constructing a
    # new MeCab.Tagger on each call is needless repeated work.
    tagger = getattr(preprocess_sentence, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger("-Owakati")
        preprocess_sentence._tagger = tagger
    w = tagger.parse(w)

    w = unicode_to_ascii(w.lower().strip())
    # create a space between word and the punctuation
    w = re.sub(r"([?!¿.,。])", r" \1 ", w)
    # replace everything with a space except a-z, A-Z, ".", "?", "!", ",",
    # "-", "。", "¿", Kanji, Katakana, Hiragana and the voicing marks
    # NOTE(review): the trailing `/ {1,}/` sits inside the character class, so
    # it is read as the literal characters "/", " ", "{", "1", ",", "}" and
    # those are kept as well -- confirm whether that was intended.
    w = re.sub(r"[^a-zA-Z\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?.!,。¿\-/ {1,}/]+",  " ", w)
    # strip() already trims both ends; the former rstrip() was redundant
    w = w.strip()

    # add a start and end token to the sentence
    # so the model knows where a sentence starts and ends
    w = "<start> " + w + " <end>"
    return w

In [6]:
# sanity-check the preprocessing on one English and one Japanese sentence
en_sentence = u"May I borrow this book?"
ja_sentence = u"プールに行きたい。でも今日は雨"
for sample in (en_sentence, ja_sentence):
    print(preprocess_sentence(sample))

<start> m a y   i   b o r r o w   t h i s   b o o k    ? <end>
<start> プ ー ル   に   行 き   た い    。    で も   今 日   は   雨 <end>


In [7]:
# preview the first 10 preprocessed sentences of each language
en, ja = create_lang_list(10)
for sentences in (en, ja):
    print(sentences)

('<start> y o u   a r e   b a c k    ,    a r e n       t   y o u    ,    h a r o l d    ? <end>', '<start> m y   o p p o n e n t   i s   s h a r k    . <end>', '<start> t h i s   i s   o n e   t h i n g   i n   e x c h a n g e   f o r   a n o t h e r    . <end>', '<start> y e a h    ,    i       m   f i n e    . <end>', '<start> d o n       t   c o m e   t o   t h e   o f f i c e   a n y m o r e    .    d o n       t   c a l l   m e   e i t h e r    . <end>', '<start> l o o k s   b e a u t i f u l    . <end>', '<start> g e t   h i m   o u t   o f   h e r e    ,    b e c a u s e   i   w i l l   f u c k i n g   k i l l   h i m    . <end>', '<start> y o u   k i l l e d   h i m    ! <end>', '<start> o k a y    ,    t h e n   w h o    ? <end>', '<start> i t   s e e m s   a   f o r m e r   e m p l o y e e    .   .   . <end>')
('<start> あ な た   は   戻 っ   た   の   ね   ハ ロ ル ド    ? <end>', '<start> 俺   の   相 手   は   シ ャ ー ク   だ    。 <end>', '<start> 引 き 換 え   だ   あ る   事   と   あ る   物   の <end>

In [8]:
def max_length(tensor):
    """Return the length of the longest sequence contained in ``tensor``.

    Args:
        tensor: Non-empty iterable of sized sequences.

    Returns:
        The largest ``len()`` found among the sequences.

    Raises:
        ValueError: If ``tensor`` is empty.
    """
    return max(map(len, tensor))

In [9]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: Collection of texts; it is used twice (fit, then encode).

    Returns:
        A ``(tensor, lang_tokenizer)`` pair: ``tensor`` is the 2D array of
        shape ``(len(sequences), maxlen)`` produced by ``pad_sequences``, and
        ``lang_tokenizer`` is the fitted tokenizer.
    """
    preprocessing = tf.keras.preprocessing

    # only the space character is filtered out of the texts
    lang_tokenizer = preprocessing.text.Tokenizer(filters=' ')
    # build the vocabulary, e.g. "this place is good" ->
    # {this: 1, place: 2, is: 3, good: 4}
    lang_tokenizer.fit_on_texts(lang)

    # each text becomes a list of integer ids
    sequences = lang_tokenizer.texts_to_sequences(lang)
    # the id lists differ in length, so pad them at the end to one length
    padded = preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, lang_tokenizer

In [10]:
# example: tokenize a single sentence; the cell displays the returned
# (padded id array, fitted tokenizer) pair
tokenize(["this place is good haha hey"])

(array([[1, 2, 3, 4, 5, 6]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x1a2f437828>)

In [11]:
# create clean input/output pairs
def load_dataset(num_example):
    """Load the corpus and tokenize both languages.

    Args:
        num_example: Number of corpus lines to use (forwarded to
            ``create_lang_list``).

    Returns:
        ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``,
        i.e. the two ``tokenize`` results regrouped as tensors first and
        tokenizers second.
    """
    sources, targets = create_lang_list(num_example)
    src_tensor, src_tokenizer = tokenize(sources)
    tgt_tensor, tgt_tokenizer = tokenize(targets)

    return src_tensor, tgt_tensor, src_tokenizer, tgt_tokenizer

In [12]:
# cap the number of examples so experiments stay quick
num_example = 30000
# tensors plus the fitted tokenizer of each language
input_tensor, target_tensor, input_lang, target_lang = load_dataset(num_example)
# longest sequence on each side
max_length_target = max_length(target_tensor)
max_length_input = max_length(input_tensor)

In [13]:
# create training set and validation set (80% / 20%, fixed seed)
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# show the size of every split
print(*(len(split) for split in (input_tensor_train, input_tensor_val,
                                 target_tensor_train, target_tensor_val)))

24000 6000 24000 6000


In [14]:
def convert(lang, tensor):
    """Print ``index----->word`` for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to token.
        tensor: Iterable of integer ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        # index number assigned to each word
        print("%d----->%s" % (token_id, lang.index_word[token_id]))

In [15]:
# show the id -> token mapping of the first training example of each language
for heading, lang_tok, first_row in (
    ("input lang: index to word mapping", input_lang, input_tensor_train[0]),
    ("output lang: index to word mapping", target_lang, target_tensor_train[0]),
):
    print(heading)
    convert(lang_tok, first_row)

input lang: index to word mapping
12-----><start>
8----->h
1----->e
14----->d
5----->i
14----->d
5----->i
2----->t
18----->w
5----->i
2----->t
8----->h
8----->h
5----->i
7----->s
3----->o
18----->w
6----->n
18----->w
5----->i
10----->l
10----->l
28----->!
13-----><end>
output lang: index to word mapping
1-----><start>
2139----->茂
16----->が
442----->勝
104----->手
10----->に
364----->落
45----->ち
6----->た
3----->の
2-----><end>


In [16]:
# create a dataset
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of batches (not samples) in one epoch. Integer division matches the
# drop_remainder=True batching below, which discards the last partial batch.
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 because the tokenizer's word indices start at 1; id 0 is what the padded
# positions hold
vocab_inp_size = len(input_lang.word_index) + 1
vocab_tar_size = len(target_lang.word_index) + 1

# shuffle over the whole training set, then group into fixed-size batches
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
print(dataset)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<DatasetV1Adapter shapes: ((255,), (80,)), types: (tf.int32, tf.int32)>
<DatasetV1Adapter shapes: ((64, 255), (64, 80)), types: (tf.int32, tf.int32)>


In [17]:
# pull a single batch from the dataset and display the shapes of its
# input and target parts
example_input_batch, example_target_batch =  next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

Instructions for updating:
Colocations handled automatically by placer.


(TensorShape([Dimension(64), Dimension(255)]),
 TensorShape([Dimension(64), Dimension(80)]))

# encoder and decoder model

In [None]:
# load glove
def load_glove(glove_path="dataset/glove.6B.100d.txt"):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Every non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        glove_path: Path of the embeddings file.
            NOTE(review): the original code never named a file, so this
            default is only a placeholder -- confirm where the vectors are
            actually stored.

    Returns:
        A dict mapping each word to a 1-D ``float32`` numpy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component is not a valid number.
    """
    embeddings_dictionary = {}

    # the context manager closes the file even if a line fails to parse
    with open(glove_path, mode='rt', encoding='utf-8') as glove_file:
        for line in glove_file:
            records = line.split()
            if not records:
                # skip blank lines instead of failing on records[0]
                continue
            word = records[0]
            vector_dimensions = np.asarray(records[1:], dtype='float32')
            embeddings_dictionary[word] = vector_dimensions

    return embeddings_dictionary

In [None]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding_dim = embedding_dim
        