In [1]:
import os

# Environment switches for TensorFlow/CUDA. They are set in the very first
# cell so they are in place before any later cell imports TensorFlow.
#   CUDA_VISIBLE_DEVICES = ''       -> no GPU is exposed to this process.
#   TF_FORCE_GPU_ALLOW_GROWTH=true  -> TensorFlow allocates GPU memory on demand.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
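# One-time setup: download the `ms-en.subwords` file loaded by SubwordTextEncoder below (uncomment to run).
# !wget https://huggingface.co/huseinzol05/bpe/resolve/main/ms-en.subwords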

In [3]:
from malaya.text.t2t import text_encoder
import malaya



2022-07-04 23:01:32.983758: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build the subword encoder from the local `ms-en.subwords` file
# (the file named in the commented-out wget cell above).
encoder = text_encoder.SubwordTextEncoder('ms-en.subwords')

In [5]:
# Probe an id beyond the vocabulary (25891 vs. a reported vocab_size of 25880);
# the recorded output shows it decodes to an empty string.
encoder.decode([25891])

''

In [6]:
# Sanity check: per the recorded output, 'saya' maps to a single subword id.
encoder.encode('saya')

[197]

In [7]:
# Vocabulary size; the Encoder wrapper and Translation.approx_vocab_size below read it.
encoder.vocab_size

25880

In [8]:
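# One-time setup: install `fasttext` before loading the language detector in the next cell (uncomment to run).
# NOTE: `~/tf-nvidia/bin/pip3` is a machine-specific virtualenv path; point it at your own environment.
# !~/tf-nvidia/bin/pip3 install fasttext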

In [9]:
# Load Malaya's fastText-based language detector; Translation.generate_samples
# uses it to filter sentence pairs by predicted language.
fast_text = malaya.language_detection.fasttext()



In [10]:
# Smoke test with one Malay and one English phrase; the recorded output shows
# the label strings ('malay', 'eng') that the filter in generate_samples compares against.
fast_text.predict(['saya suka', 'i like'])

['malay', 'eng']

In [11]:
class Encoder:
    """Batch adapter around a subword text encoder.

    The wrapped object must expose ``encode(str) -> list``, ``decode(ids)``
    and a ``vocab_size`` attribute.
    """

    def __init__(self, encoder):
        """Store the wrapped ``encoder`` and mirror its ``vocab_size``."""
        self.encoder = encoder
        self.vocab_size = encoder.vocab_size

    def encode(self, s):
        """Encode every string in ``s`` and terminate each id list with ``1``.

        Args:
            s: iterable of strings.

        Returns:
            A list with one id list per input string, each ending in ``1``.
        """
        # NOTE(review): 1 is presumably the end-of-sequence id
        # (tensor2tensor's text_encoder.EOS_ID) — confirm.
        encoded_batch = []
        for text in s:
            token_ids = self.encoder.encode(text)
            encoded_batch.append(token_ids + [1])
        return encoded_batch

    def decode(self, ids, strip_extraneous = False):
        """Decode ``ids`` with the wrapped encoder.

        ``strip_extraneous`` is accepted but ignored; it is never forwarded.
        """
        decoded = self.encoder.decode(ids)
        return decoded

In [12]:
# Shared wrapper instance; Translation.generate_samples calls s_encoder.encode on each sentence pair.
s_encoder = Encoder(encoder)

In [13]:
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
from tqdm import tqdm
from glob import glob
import json

In [14]:
@registry.register_problem
class Translation(text_problems.Text2TextProblem):
    """Text-to-text problem that writes the noisy ms-en evaluation set.

    Samples come from two hard-coded sources, read relative to the current
    working directory:

    * ``test/left.txt`` / ``test/right.txt`` — line-aligned parallel text,
      filtered by detected language;
    * ``augmented-ms-en-test.json`` — a JSON object with parallel ``'ms'`` and
      ``'en'`` lists.

    The class relies on the notebook globals ``encoder``, ``s_encoder`` and
    ``fast_text``.
    """

    @property
    def approx_vocab_size(self):
        """Vocabulary size, read from the module-level ``encoder``."""
        return encoder.vocab_size

    @property
    def is_generate_per_split(self):
        """Return False so a single generator feeds every declared split.

        Only EVAL is declared in ``dataset_splits``, so every sample goes to
        the evaluation shard.
        """
        return False

    @property
    def dataset_splits(self):
        """Declare a single EVAL split written to one shard."""
        return [
            {'split': problem.DatasetSplit.EVAL, 'shards': 1},
        ]

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield already-encoded ``{'inputs': ids, 'targets': ids}`` examples.

        ``data_dir``, ``tmp_dir`` and ``dataset_split`` are part of the
        framework signature but are not used; the input paths are hard-coded.

        Pairs with an empty side are skipped. For the ``test/`` files, a pair
        is kept only when the left side is detected as 'malay' or 'eng' and
        the right side as 'eng'. The augmented JSON pairs are not filtered by
        language.
        """
        with open('test/left.txt') as fopen:
            left = fopen.read().split('\n')

        with open('test/right.txt') as fopen:
            right = fopen.read().split('\n')

        # zip() pairs the lines positionally and stops at the shorter file, so
        # an unmatched trailing line is dropped instead of raising IndexError
        # part-way through generation. `total` keeps the tqdm bar sized.
        aligned = zip(left, right)
        for source, target in tqdm(aligned, total = min(len(left), len(right))):
            if not (source and target):
                continue
            lang_source, lang_target = fast_text.predict([source, target])
            if lang_source not in ('malay', 'eng'):
                continue
            if lang_target not in ('eng',):
                continue
            inputs, targets = s_encoder.encode([source, target])
            yield {'inputs': inputs, 'targets': targets}

        with open('augmented-ms-en-test.json') as fopen:
            data = json.load(fopen)

        augmented = zip(data['ms'], data['en'])
        n_augmented = min(len(data['ms']), len(data['en']))
        for source, target in tqdm(augmented, total = n_augmented):
            if not (source and target):
                continue
            inputs, targets = s_encoder.encode([source, target])
            yield {'inputs': inputs, 'targets': targets}

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Pass through ``generate_samples`` unchanged.

        The samples it yields already hold token ids, so no further encoding
        is applied here.
        """
        yield from self.generate_samples(data_dir, tmp_dir, dataset_split)

In [15]:
import os
import tensorflow as tf

In [16]:
# Output locations for the generated dataset (DATA_DIR) and scratch files
# (TMP_DIR), relative to the current working directory.
DATA_DIR, TMP_DIR = map(
    os.path.expanduser,
    ('t2t-noisy-ms-en/data', 't2t-noisy-ms-en/tmp'),
)

In [17]:
# Use makedirs rather than mkdir: plain mkdir requires the parent directory
# (`t2t-noisy-ms-en`) to exist already, which is not the case on a fresh
# checkout. makedirs creates missing parents and succeeds if the directory
# is already there, so this cell is safe to re-run.
tf.io.gfile.makedirs(DATA_DIR)
tf.io.gfile.makedirs(TMP_DIR)




In [18]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

In [19]:
# Registry name used to look up the `Translation` class defined above
# (presumably the snake_case form of the class name — confirm against tensor2tensor).
PROBLEM = 'translation'
t2t_problem = problems.problem(PROBLEM)
# Runs the sample generator; the log below reports the number of generated
# examples followed by a shuffle pass. Output is presumably written under DATA_DIR.
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

  0%|                                                                                                     | 0/100000 [00:00<?, ?it/s]

INFO:tensorflow:Generating case 0.


INFO:tensorflow:Generating case 0.
100%|██████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:15<00:00, 6536.22it/s]
 29%|█████████████████████████▎                                                             | 19118/65642 [00:01<00:03, 13665.07it/s]

INFO:tensorflow:Generating case 100000.


INFO:tensorflow:Generating case 100000.
100%|███████████████████████████████████████████████████████████████████████████████████████| 65642/65642 [00:04<00:00, 13209.64it/s]

INFO:tensorflow:Generated 145209 Examples



INFO:tensorflow:Generated 145209 Examples


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Shuffling data...


INFO:tensorflow:read: 100000


INFO:tensorflow:read: 100000


INFO:tensorflow:write: 100000


INFO:tensorflow:write: 100000


INFO:tensorflow:Data shuffled.


INFO:tensorflow:Data shuffled.
