In [137]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import MeCab
import unicodedata
import re
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import os
import io
import time
import copy

from gensim.models import word2vec


In [2]:
# load txt file
def load_def(path):
    """Read a UTF-8 text file and return its content as a list of lines.

    Bytes that cannot be decoded as UTF-8 are silently dropped
    (``errors="ignore"``). Leading and trailing whitespace of the whole file
    is stripped before splitting, so a trailing newline does not produce an
    empty last entry.

    Args:
        path: Path of the text file to read.

    Returns:
        list of str: the file content split on ``'\\n'``.
    """
    # The context manager closes the handle as soon as the content is read;
    # the previous version left that to the garbage collector.
    with io.open(path, encoding='UTF-8', errors="ignore") as f:
        return f.read().strip().split('\n')

In [12]:
# create each languages list
def create_lang_list(num_example, path="dataset/raw.txt"):
    """Load a tab-separated parallel corpus and preprocess every sentence.

    Each line of the file is split on tabs and every field is passed through
    ``preprocess_sentence``.

    Args:
        num_example: Number of leading lines to use; ``None`` uses the whole
            file (it is applied as a slice bound).
        path: Location of the corpus file. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A ``zip`` iterator yielding one tuple of sentences per column of the
        file, in column order. For a two-column file it can be unpacked
        directly, e.g. ``first_col, second_col = create_lang_list(10)``.
    """
    # load txt file
    lines = load_def(path)

    # one [col0, col1, ...] list of cleaned sentences per input line
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_example]]

    # transpose rows -> columns
    return zip(*word_pairs)

In [146]:
# NFC-normalize a string and drop any combining marks that remain
def unicode_to_ascii(s):
    """Return ``s`` in NFC form with leftover nonspacing marks removed.

    Despite its name this does not produce ASCII: NFC *composes* base
    characters with their combining marks wherever a precomposed code point
    exists, and only characters whose Unicode category is still ``Mn``
    afterwards are removed. Precomposed letters (accented Latin letters,
    voiced kana, ...) therefore pass through unchanged.
    """
    composed = unicodedata.normalize('NFC', s)
    kept = [ch for ch in composed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# preprocess

# Patterns are compiled once instead of on every call.
# Matches any run of Hiragana / Katakana / half-width Katakana / Kanji.
_JAPANESE_CHARS_RE = re.compile('[\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F]+')
# Punctuation that gets surrounded by spaces so it becomes its own token.
_PUNCTUATION_RE = re.compile(r"([?!¿.,。])")
# Everything that is NOT a Latin letter, a Japanese character, one of the kept
# punctuation marks, "-" or a space. The previous class also contained a stray
# "/ {1,}/" (a pasted JavaScript regex), which unintentionally whitelisted
# "/", "{", "}" and the digit "1"; only the space is kept from that fragment.
_DISALLOWED_RE = re.compile(r"[^a-zA-Z\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?.!,。¿\- ]+")

# Created lazily on first use; see _get_mecab_tagger().
_mecab_tagger = None


def _get_mecab_tagger():
    """Return the shared MeCab wakati tagger, creating it on first use."""
    global _mecab_tagger
    if _mecab_tagger is None:
        _mecab_tagger = MeCab.Tagger("-Owakati")
    return _mecab_tagger


def preprocess_sentence(w):
    """Clean one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps:
      1. If the text contains Japanese characters, segment it into
         space-separated words with MeCab (``-Owakati``).
      2. Lower-case, strip and normalize via ``unicode_to_ascii``.
      3. Put spaces around ``? ! ¿ . , 。``.
      4. Replace every run of characters other than a-z, A-Z, Japanese
         characters, the kept punctuation, ``-`` and space with one space.
      5. Strip and add the ``<start>`` / ``<end>`` tokens.

    Args:
        w: The raw sentence.

    Returns:
        str: ``"<start> " + cleaned sentence + " <end>"``.
    """
    # check japanese lang
    if _JAPANESE_CHARS_RE.search(w):
        # Morphological analysis for japanese lang. The tagger is reused
        # across calls; building one per sentence is needlessly expensive.
        w = _get_mecab_tagger().parse(w)

    w = unicode_to_ascii(w.lower().strip())
    # create a space between word and the punctuation
    w = _PUNCTUATION_RE.sub(r" \1 ", w)
    # replace everything except (a-z, A-Z, ".", "?", "!", ",", "-", "ー", "。",
    # "¿", Kanji, Katakana, Hiragana, space) with a space
    w = _DISALLOWED_RE.sub(" ", w)
    w = w.strip()

    # add a start and end token to the sentence
    # so the model knows when to start and stop
    w = "<start> " + w + " <end>"
    return w

In [147]:
# Sanity check: run the preprocessing on one English and one Japanese sample
# sentence and print the resulting "<start> ... <end>" strings.
en_sentence =u"May I borrow this book?"
ja_sentence = u"プールに行きたい。でも今日は雨."
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(ja_sentence))

<start> may i borrow this book ? <end>
<start> プール に 行き たい  。  でも 今日 は 雨  . <end>


In [148]:
# Load only the first 10 sentence pairs as a quick look at the data:
# the first column of the file is unpacked as `en`, the second as `ja`.
en, ja = create_lang_list(10)
print(en[:10])
print(ja[:10])

('<start> you are back ,  aren t you ,  harold ? <end>', '<start> my opponent is shark . <end>', '<start> this is one thing in exchange for another . <end>', '<start> yeah ,  i m fine . <end>', '<start> don t come to the office anymore .  don t call me either . <end>', '<start> looks beautiful . <end>', '<start> get him out of here ,  because i will fucking kill him . <end>', '<start> you killed him ! <end>', '<start> okay ,  then who ? <end>', '<start> it seems a former employee .  .  . <end>')
('<start> あなた は 戻っ た の ね ハロルド  ? <end>', '<start> 俺 の 相手 は シャーク だ  。 <end>', '<start> 引き換え だ ある 事 と ある 物 の <end>', '<start> もう いい よ ごちそうさま ううん <end>', '<start> もう 会社 に は 来 ない で くれ 電話 も する な <end>', '<start> きれい だ  。 <end>', '<start> 連れ て 行け 殺し そう だ わかっ た か  ? <end>', '<start> 殺し た の か  ! <end>', '<start> わぁ   !  いつも すみません  。  いい の よ   。 <end>', '<start> カンパニー の 元 社員 が <end>')


# Tokenization

In [26]:
def max_length(tensor):
    """Return the length of the longest sequence in ``tensor``.

    Args:
        tensor: An iterable of sized sequences (e.g. a list of lists or a
            2-D array).

    Returns:
        int: the maximum ``len()`` over all sequences, or 0 when ``tensor``
        is empty.
    """
    # The previous version returned the generator itself (the max() call was
    # missing), so callers received a generator object rather than a number.
    return max((len(t) for t in tensor), default=0)

In [22]:
def tokenize(lang):
    """Turn a collection of sentences into a padded matrix of word ids.

    Args:
        lang: Iterable of sentence strings.

    Returns:
        tuple: ``(tensor, tokenizer)`` where ``tensor`` is the array returned
        by ``pad_sequences`` (one row per sentence) and ``tokenizer`` is the
        Keras ``Tokenizer`` fitted on ``lang``.
    """
    # vectorize a text corpus; the only character passed as a filter is the space
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ')
    tokenizer.fit_on_texts(lang)

    # every sentence becomes a list of integer ids,
    # e.g. two sentences -> [[1, 2], [3, 4, 5, 6]]
    sequences = tokenizer.texts_to_sequences(lang)

    # the id lists differ in length, so pad them at the end ("post") to get a
    # rectangular array, e.g. [[1, 2], [3, 4, 5, 6]] -> [[1 2 0 0] [3 4 5 6]]
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")

    return padded, tokenizer

In [8]:
# example: tokenize one Japanese and one English sentence; the cell output is
# the (padded id array, fitted tokenizer) tuple returned by tokenize()
tokenize(['こんにちは 今日　は', "today is so cold"])

[[1, 2], [3, 4, 5, 6]]
[[1 2 0 0]
 [3 4 5 6]]


(array([[1, 2, 0, 0],
        [3, 4, 5, 6]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x1a3eba1590>)

In [33]:
def load_dataset(num_examples=None):
    """Load the corpus and return padded id tensors plus their tokenizers.

    The first column returned by ``create_lang_list`` is used as the target
    (output) language and the second column as the input language.

    Args:
        num_examples: Number of sentence pairs to load; ``None`` loads all.

    Returns:
        tuple: ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``.
    """
    # creating cleaned input, output pairs
    output_lang, input_lang = create_lang_list(num_examples)
    print("input", input_lang[:1])
    print("output", output_lang[:1])

    input_tensor, inp_lang_tokenizer = tokenize(input_lang)
    print('Total unique words in the input: %s' % len(inp_lang_tokenizer.word_index))
    # Debug probe only: .get() prints None instead of raising KeyError when
    # the probe word does not occur in a (small) sample.
    print(inp_lang_tokenizer.word_index.get("こんにちは"))
    target_tensor, targ_lang_tokenizer = tokenize(output_lang)
    print('Total unique words in the output: %s' % len(targ_lang_tokenizer.word_index))
    print(targ_lang_tokenizer.word_index.get("hello"))

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [70]:
# Try experimenting with the size of that dataset
# (number of sentence pairs read from the corpus).
num_example = 50000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(num_example)

# Calculate max_length of the target tensors and of the input tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

input ('<start> あなた は 戻っ た の ね ハロルド  ? <end>',)
output ('<start> you are back ,  aren t you ,  harold ? <end>',)
Total unique words in the input: 26141
1199
Total unique words in the output: 21596
429


In [71]:
# create training set and validation set (20% held out, fixed seed)
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# show the size of each split
print(*(len(split) for split in (input_tensor_train, input_tensor_val,
                                 target_tensor_train, target_tensor_val)))

40000 10000 40000 10000


In [72]:
def convert(lang, tensor):
    """Print the ``id----->word`` mapping for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word
            (e.g. a fitted tokenizer).
        tensor: Iterable of integer ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            # zero entries carry no word, nothing to show
            continue
        # Index number assigned to each word
        word = lang.index_word[token_id]
        print("%d----->%s" % (token_id, word))

In [73]:
# Show the id -> word mapping of the first training pair
# (zero ids are skipped by convert()).
print("input lang: index to word mapping")
convert(inp_lang, input_tensor_train[0])
print("output lang: index to word mapping")
convert(targ_lang, target_tensor_train[0])

input lang: index to word mapping
1-----><start>
122----->彼ら
3----->の
1748----->歌
9----->が
9319----->聴ける
3----->の
4----->は
470----->ホント
1647----->久しぶり
21----->です
2-----><end>
output lang: index to word mapping
1-----><start>
44----->they
377----->haven
19----->t
104----->had
10----->a
1744----->fresh
1189----->audience
18----->in
243----->many
10----->a
1492----->moon
3----->.
2-----><end>


In [138]:
# Training configuration
BUFFER_SIZE = len(input_tensor_train)  # one entry per training example
BATCH_SIZE = 64
# Number of batches (not examples) that make up one epoch. The previous value
# was the raw example count, i.e. BATCH_SIZE times too large.
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 100
units = 1024
# +1 because word ids start at 1 and id 0 is used for padding
vocab_inp_size = len(inp_lang.word_index) + 1
print('Total unique words in the input: %s' % len(inp_lang.word_index))

Total unique words in the input: 26141


In [159]:
# create 2-dimensional list (one list of tokens per sentence) for word2vec
#
# NOTE(review): `ja` appears to be the 10-sentence sample produced earlier by
# `create_lang_list(10)`, not the full training corpus - confirm this is the
# intended input. With so few sentences, gensim's default `min_count` will
# presumably discard most of the vocabulary.

# sentences with the <start>/<end> markers removed
japanese_sentence_list = [
    sentence.replace("<start>", "").replace("<end>", "") for sentence in ja
]

# Word2Vec expects an iterable of token lists. The previous version wrapped
# each whole sentence string in a single-element list, so every sentence was
# treated as one "word" and no real word vectors could be learned.
word2vec_list = [sentence.split() for sentence in japanese_sentence_list]

model = word2vec.Word2Vec(word2vec_list)