In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [27]:
data_en = pd.read_table('data/train/train.en', header=0, names=['source'])
data_vi = pd.read_table('data/train/train.vi', header=0, names=['target'])
data = pd.concat([data_en,data_vi], axis=1)
data.source = '<start> ' + data.source + ' <end>'
data.target = '<start> ' + data.target + ' <end>'
data.head()

Unnamed: 0,source,target
0,"<start> In 4 minutes , atmospheric chemist Rac...","<start> Trong 4 phút , chuyên gia hoá học khí ..."
1,<start> I &apos;d like to talk to you today ab...,<start> Tôi muốn cho các bạn biết về sự to lớn...
2,<start> Headlines that look like this when the...,<start> Có những dòng trông như thế này khi bà...
3,<start> They are both two branches of the same...,<start> Cả hai đều là một nhánh của cùng một l...
4,<start> Recently the headlines looked like thi...,<start> Các tiêu đề gần đây trông như thế này ...


In [29]:
def create_dataset(path, limit_size=None):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    lines = ['<s> ' + line + ' </s>' for line in tqdm(lines[:limit_size])]
    #lines = [line for line in tqdm(lines[:limit_size])]

    # Print examples
    for line in lines[:5]:
        print(line)

    return lines 


def tokenize(text, vocab, max_len):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')

    lang_tokenizer.word_index = vocab

    tensor = lang_tokenizer.texts_to_sequences(text)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=max_len, padding='post')

    return tensor, lang_tokenizer


def load_dataset(path, max_len, limit_size=None, lang=['en', 'vi']):
    dataset_train_input_path = 'train.{}'.format(lang[0]) 
    dataset_train_target_path = 'train.{}'.format(lang[1])

    print('Loading...')
    vocab_input = load_vocab(path, lang[0])
    vocab_target = load_vocab(path, lang[1])
    
    input_text = create_dataset(path + dataset_train_input_path, limit_size)
    target_text = create_dataset(path + dataset_train_target_path, limit_size)

    print('Tokenizing...')
    input_tensor, input_lang_tokenizer = tokenize(input_text, vocab_input, max_len)
    target_tensor, target_lang_tokenizer = tokenize(target_text, vocab_target, max_len)

    return input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer


def max_length(tensor):
    return max(len(t) for t in tensor)

    
def load_dataset_test(path):

    it, tt, ilt, tlt = load_dataset(path, 90, 5000)
    print(tt[0].shape)
    print(it.shape, tt.shape)
    max_it, max_tt = max_length(it), max_length(tt)
    print(max_it, max_tt)


def load_vocab(path, lang):
    lines = io.open(path + '.{}'.format(lang), encoding='UTF-8').read().strip().split('\n')
    vocab = {}
    
    # 0 is padding
    for idx, word in enumerate(lines):
        vocab[word] = idx + 1

    return vocab


def convert_vocab(tokenizer, vocab):
    for key, val in vocab.items():
        tokenizer.index_word[val] = key

In [None]:
load_vocab()