In [1]:
import os

# Configure TensorFlow's GPU behaviour before any TensorFlow-based import happens:
# an empty CUDA_VISIBLE_DEVICES hides all CUDA devices (CPU-only run), and
# TF_FORCE_GPU_ALLOW_GROWTH asks for on-demand GPU memory allocation.
os.environ.update(
    {
        'CUDA_VISIBLE_DEVICES': '',
        'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
    }
)

In [2]:
import json
import youtokentome as yttm

In [3]:
# Load the YouTokenToMe BPE model stored in the local file 'rumi-jawi.yttm'.
bpe = yttm.BPE(model = 'rumi-jawi.yttm')

In [4]:
# Parallel lists of sentence pairs. NOTE: despite the `train_` prefix, the pairs are
# read from the `.test` file; the names are kept because later cells rely on them.
train_X, train_Y = [], []
# The data contains Jawi (Arabic-script) text, so decode the file explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('../jawi-rumi/jawi-rumi-news-full.test', encoding = 'utf-8') as fopen:
    for line in fopen:
        # Tolerate blank lines (e.g. a trailing newline) rather than failing in json.loads.
        if not line.strip():
            continue
        # Each line is a JSON array: element 0 goes to train_X, element 1 to train_Y.
        d = json.loads(line)
        train_X.append(d[0])
        train_Y.append(d[1])

In [5]:
# Sanity check: number of sentence pairs loaded.
len(train_X)

100000

In [6]:
# Peek at the first pair: train_X holds the Rumi (Latin-script) side, train_Y the Jawi side.
train_X[0], train_Y[0]

('laporan sebuah portal berita baru-baru ini memetik seorang ahli parlimen pkr yang enggan dinamakan yang mendakwa azmin',
 'لاڤورن سبواه ڤورتل بريتا بارو-بارو اين ممتيق سأورڠ اهلي ڤرليمين ڤقر يڠ اڠڬن ديناماكن يڠ مندعوا عزمين')

In [7]:
class Encoder:
    """Adapter around a YouTokenToMe BPE model for converting text to id sequences and back.

    Attributes:
        bpe: the wrapped BPE model.
        vocab_size: number of entries returned by ``bpe.vocab()``.
    """

    def __init__(self, bpe):
        self.bpe = bpe
        vocabulary = self.bpe.vocab()
        self.vocab_size = len(vocabulary)

    def encode(self, s):
        """Tokenize ``s`` into BPE ids and append the id ``1`` to every sequence.

        Args:
            s: passed straight to ``bpe.encode``; this notebook passes a list of strings.

        Returns:
            A list with one id list per encoded sequence, each ending in ``1``.
        """
        sequences = self.bpe.encode(s, output_type = yttm.OutputType.ID)
        terminated = []
        for token_ids in sequences:
            # NOTE(review): the trailing 1 is presumably the end-of-sequence id expected
            # downstream -- confirm against the BPE model's special-token ids.
            terminated.append(token_ids + [1])
        return terminated

    def decode(self, ids, strip_extraneous = False):
        """Convert a single sequence of ids back to text.

        Args:
            ids: iterable of token ids for one sequence.
            strip_extraneous: accepted but not used by this implementation.

        Returns:
            The first element of what ``bpe.decode`` returns for ``ids``.
        """
        id_list = list(ids)
        decoded = self.bpe.decode(id_list)
        return decoded[0]

In [8]:
# Shared encoder instance; Jawi.generate_samples uses it to tokenize each sentence pair.
encoder = Encoder(bpe)

from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
from tqdm import tqdm



2022-08-01 00:26:08.737142: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [9]:
@registry.register_problem
class Jawi(text_problems.Text2TextProblem):
    """tensor2tensor problem that serializes the loaded Rumi/Jawi sentence pairs.

    Samples are tokenized with the notebook-level ``encoder`` and emitted as id
    sequences by ``generate_samples``; ``generate_encoded_samples`` forwards them as-is.
    """

    @property
    def approx_vocab_size(self):
        """Nominal vocabulary size reported by this problem."""
        return 32000

    @property
    def is_generate_per_split(self):
        """Return False: samples are not generated separately for each split."""
        # Only an EVAL split is declared in `dataset_splits`, so presumably every
        # generated sample lands in that split's single shard.
        return False

    @property
    def dataset_splits(self):
        """Declare one EVAL split stored in a single shard."""
        eval_split = {'split': problem.DatasetSplit.EVAL, 'shards': 1}
        return [eval_split]

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield one ``{'inputs': ids, 'targets': ids}`` dict per loaded sentence pair.

        ``data_dir``, ``tmp_dir`` and ``dataset_split`` are part of the signature but
        are not used; the pairs come from the global ``train_X`` / ``train_Y`` lists
        and are tokenized with the global ``encoder``.
        """
        for idx in tqdm(range(len(train_X))):
            source_ids, target_ids = encoder.encode([train_X[idx], train_Y[idx]])
            yield {'inputs': source_ids, 'targets': target_ids}

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Forward the output of ``generate_samples`` unchanged (it already holds ids)."""
        yield from self.generate_samples(data_dir, tmp_dir, dataset_split)

In [10]:
import os
import tensorflow as tf

In [11]:
# To regenerate from scratch, remove the previously generated data directory first:
# os.system('rm -rf t2t-rumi-jawi/data')

In [12]:
# Output locations handed to tensor2tensor's generate_data: the data directory and a
# temporary directory. Both paths are relative to the current working directory.
DATA_DIR, TMP_DIR = map(
    os.path.expanduser,
    ('t2t-rumi-jawi/data', 't2t-rumi-jawi/tmp'),
)

In [13]:
# Create the output directories together with any missing parents. Unlike `mkdir`,
# `makedirs` also succeeds when the directory already exists, so this cell works on a
# fresh checkout (where 't2t-rumi-jawi' does not exist yet) and is safe to re-run.
tf.io.gfile.makedirs(DATA_DIR)
tf.io.gfile.makedirs(TMP_DIR)




In [14]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Name under which the Jawi problem class is looked up in the tensor2tensor registry.
PROBLEM = 'jawi'
t2t_problem = problems.problem(PROBLEM)
# Run the problem's sample generator and write the examples using DATA_DIR and TMP_DIR;
# the log output reports the number of generated examples and the shuffling step.
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

  0%|                                                                                                 | 0/100000 [00:00<?, ?it/s]

INFO:tensorflow:Generating case 0.


INFO:tensorflow:Generating case 0.
100%|█████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:04<00:00, 23697.67it/s]

INFO:tensorflow:Generated 100000 Examples



INFO:tensorflow:Generated 100000 Examples


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Shuffling data...


INFO:tensorflow:read: 100000


INFO:tensorflow:read: 100000


INFO:tensorflow:Data shuffled.


INFO:tensorflow:Data shuffled.
