In [1]:
import os

# Hide all CUDA devices from this process so the notebook runs on CPU only.
# Both variables are set here, before TensorFlow is imported in later cells.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front
# (presumably has no effect while the GPUs are hidden above — TODO confirm).
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
import json
import youtokentome as yttm

In [3]:
# Load the YouTokenToMe BPE model stored in `rumi-jawi.yttm`; the same model is
# used further down to tokenize both the input and the target sentences.
bpe = yttm.BPE(model = 'rumi-jawi.yttm')

In [4]:
# Load the parallel training corpus: one JSON array per line, where element 0
# is the Rumi (Latin-script) sentence and element 1 its Jawi (Arabic-script) form.
train_X, train_Y = [], []
# Decode explicitly as UTF-8 so the Jawi text is read correctly regardless of
# the platform's default locale encoding.
with open('../jawi-rumi/jawi-rumi-news-full.train', encoding = 'utf-8') as fopen:
    for line in fopen:
        pair = json.loads(line)
        train_X.append(pair[0])
        train_Y.append(pair[1])

In [5]:
# Sanity check: number of sentence pairs loaded from the training file.
len(train_X)

3232675

In [6]:
# Peek at the first (input, target) pair to confirm which side holds which script.
train_X[0], train_Y[0]

('laporan sebuah portal berita baru-baru ini memetik seorang ahli parlimen pkr yang enggan dinamakan yang mendakwa azmin',
 'لاڤورن سبواه ڤورتل بريتا بارو-بارو اين ممتيق سأورڠ اهلي ڤرليمين ڤقر يڠ اڠڬن ديناماكن يڠ مندعوا عزمين')

In [7]:
class Encoder:
    """Thin adapter around a YouTokenToMe BPE model.

    Exposes `vocab_size`, `encode` and `decode` on top of the wrapped model.
    """

    def __init__(self, bpe):
        """Keep a reference to `bpe` and record the size of its vocabulary."""
        self.bpe = bpe
        self.vocab_size = len(bpe.vocab())

    def encode(self, s):
        """Tokenize `s` into subword ids and append id 1 to every sequence.

        Assumes `s` is a list of strings, so that the tokenizer returns one
        list of ids per string — TODO confirm against callers.
        Id 1 is presumably the end-of-sequence marker expected downstream.
        """
        token_ids = self.bpe.encode(s, output_type = yttm.OutputType.ID)
        terminated = []
        for ids in token_ids:
            terminated.append(ids + [1])
        return terminated

    def decode(self, ids, strip_extraneous = False):
        """Turn one sequence of ids back into text.

        `strip_extraneous` is accepted but not used by this implementation.
        """
        decoded = self.bpe.decode(list(ids))
        return decoded[0]

In [8]:
# Module-level encoder instance; the problem class defined below reads it as a global.
encoder = Encoder(bpe)

from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
from tqdm import tqdm



2022-08-01 00:23:57.954193: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [9]:
@registry.register_problem
class Jawi(text_problems.Text2TextProblem):
    """Text-to-text problem that serialises the in-memory Rumi/Jawi corpus.

    Reads the notebook-level globals `train_X`, `train_Y` and `encoder`, so
    those must be defined before data generation is triggered.
    """

    @property
    def approx_vocab_size(self):
        """Nominal vocabulary size reported for this problem."""
        return 32000

    @property
    def is_generate_per_split(self):
        """Return False: samples are not generated separately per split."""
        return False

    @property
    def dataset_splits(self):
        """Declare a single TRAIN split spread over 200 shards (no EVAL split)."""
        train_split = {'split': problem.DatasetSplit.TRAIN, 'shards': 200}
        return [train_split]

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield one already-encoded example per training pair.

        The three arguments are part of the problem interface and are not
        used here. Each yielded dict maps 'inputs' and 'targets' to the id
        lists produced by the global `encoder` for the pair at that index.
        """
        for index in tqdm(range(len(train_X))):
            # Encode both sides in one batched call: [source, target].
            source_ids, target_ids = encoder.encode([train_X[index], train_Y[index]])
            yield {'inputs': source_ids, 'targets': target_ids}

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Pass through `generate_samples`, whose output is already encoded."""
        yield from self.generate_samples(data_dir, tmp_dir, dataset_split)

In [11]:
import os
import tensorflow as tf

In [12]:
# Delete any previously generated data directory so the shards are rebuilt from
# scratch; the displayed value is the shell command's exit status.
os.system('rm -rf t2t-rumi-jawi/data')

0

In [14]:
# Output locations handed to the data generator: DATA_DIR for the generated
# data, TMP_DIR for scratch files. expanduser() only rewrites a leading `~`,
# so these relative paths come back unchanged.
DATA_DIR, TMP_DIR = (
    os.path.expanduser(path)
    for path in ('t2t-rumi-jawi/data', 't2t-rumi-jawi/tmp')
)

In [15]:
# Create the output directories. makedirs() also creates the missing parent
# `t2t-rumi-jawi/` on a fresh checkout and succeeds when a directory already
# exists, whereas mkdir() requires the parent to be present.
tf.compat.v1.io.gfile.makedirs(DATA_DIR)
tf.compat.v1.io.gfile.makedirs(TMP_DIR)




In [16]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Registered name of the problem class defined above, as looked up in the registry.
PROBLEM = 'jawi'
t2t_problem = problems.problem(PROBLEM)
# Run data generation into DATA_DIR, using TMP_DIR as the temporary directory;
# the log below shows example generation followed by a shuffle step.
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

  0%|                                                                                                | 0/3232675 [00:00<?, ?it/s]

INFO:tensorflow:Generating case 0.


INFO:tensorflow:Generating case 0.
  3%|██▍                                                                              | 97880/3232675 [00:03<02:04, 25175.34it/s]

INFO:tensorflow:Generating case 100000.


INFO:tensorflow:Generating case 100000.
  6%|████▉                                                                           | 199529/3232675 [00:07<02:01, 24910.17it/s]

INFO:tensorflow:Generating case 200000.


INFO:tensorflow:Generating case 200000.
  9%|███████▍                                                                        | 299225/3232675 [00:11<01:56, 25130.00it/s]

INFO:tensorflow:Generating case 300000.


INFO:tensorflow:Generating case 300000.
 12%|█████████▉                                                                      | 399276/3232675 [00:15<01:53, 24949.48it/s]

INFO:tensorflow:Generating case 400000.


INFO:tensorflow:Generating case 400000.
 15%|████████████▎                                                                   | 499624/3232675 [00:19<01:49, 24907.16it/s]

INFO:tensorflow:Generating case 500000.


INFO:tensorflow:Generating case 500000.
 18%|██████████████▊                                                                 | 597504/3232675 [00:23<01:44, 25099.60it/s]

INFO:tensorflow:Generating case 600000.


INFO:tensorflow:Generating case 600000.
 22%|█████████████████▎                                                              | 697961/3232675 [00:27<01:40, 25099.09it/s]

INFO:tensorflow:Generating case 700000.


INFO:tensorflow:Generating case 700000.
 25%|███████████████████▊                                                            | 798382/3232675 [00:31<01:36, 25132.45it/s]

INFO:tensorflow:Generating case 800000.


INFO:tensorflow:Generating case 800000.
 28%|██████████████████████▏                                                         | 898455/3232675 [00:35<01:33, 24931.98it/s]

INFO:tensorflow:Generating case 900000.


INFO:tensorflow:Generating case 900000.
 31%|████████████████████████▋                                                       | 998927/3232675 [00:39<01:30, 24782.44it/s]

INFO:tensorflow:Generating case 1000000.


INFO:tensorflow:Generating case 1000000.
 34%|██████████████████████████▊                                                    | 1098994/3232675 [00:43<01:25, 25000.26it/s]

INFO:tensorflow:Generating case 1100000.


INFO:tensorflow:Generating case 1100000.
 37%|█████████████████████████████▎                                                 | 1198445/3232675 [00:47<01:22, 24653.60it/s]

INFO:tensorflow:Generating case 1200000.


INFO:tensorflow:Generating case 1200000.
 40%|███████████████████████████████▋                                               | 1297660/3232675 [00:52<01:18, 24771.67it/s]

INFO:tensorflow:Generating case 1300000.


INFO:tensorflow:Generating case 1300000.
 43%|██████████████████████████████████▏                                            | 1399149/3232675 [00:56<01:13, 24827.56it/s]

INFO:tensorflow:Generating case 1400000.


INFO:tensorflow:Generating case 1400000.
 46%|████████████████████████████████████▌                                          | 1498331/3232675 [01:00<01:10, 24529.26it/s]

INFO:tensorflow:Generating case 1500000.


INFO:tensorflow:Generating case 1500000.
 49%|███████████████████████████████████████                                        | 1597847/3232675 [01:04<01:06, 24526.95it/s]

INFO:tensorflow:Generating case 1600000.


INFO:tensorflow:Generating case 1600000.
 53%|█████████████████████████████████████████▌                                     | 1699453/3232675 [01:08<01:01, 24734.41it/s]

INFO:tensorflow:Generating case 1700000.


INFO:tensorflow:Generating case 1700000.
 56%|███████████████████████████████████████████▉                                   | 1798817/3232675 [01:12<00:57, 24758.23it/s]

INFO:tensorflow:Generating case 1800000.


INFO:tensorflow:Generating case 1800000.
 59%|██████████████████████████████████████████████▍                                | 1898358/3232675 [01:16<00:53, 24727.85it/s]

INFO:tensorflow:Generating case 1900000.


INFO:tensorflow:Generating case 1900000.
 62%|████████████████████████████████████████████████▊                              | 1998047/3232675 [01:20<00:50, 24688.78it/s]

INFO:tensorflow:Generating case 2000000.


INFO:tensorflow:Generating case 2000000.
 65%|███████████████████████████████████████████████████▎                           | 2099947/3232675 [01:24<00:45, 24769.73it/s]

INFO:tensorflow:Generating case 2100000.


INFO:tensorflow:Generating case 2100000.
 68%|█████████████████████████████████████████████████████▊                         | 2199800/3232675 [01:28<00:41, 24784.39it/s]

INFO:tensorflow:Generating case 2200000.


INFO:tensorflow:Generating case 2200000.
 71%|████████████████████████████████████████████████████████▏                      | 2299116/3232675 [01:32<00:37, 25163.20it/s]

INFO:tensorflow:Generating case 2300000.


INFO:tensorflow:Generating case 2300000.
 74%|██████████████████████████████████████████████████████████▋                    | 2398959/3232675 [01:36<00:33, 24912.50it/s]

INFO:tensorflow:Generating case 2400000.


INFO:tensorflow:Generating case 2400000.
 77%|█████████████████████████████████████████████████████████████                  | 2498739/3232675 [01:40<00:29, 24774.15it/s]

INFO:tensorflow:Generating case 2500000.


INFO:tensorflow:Generating case 2500000.
 80%|███████████████████████████████████████████████████████████████▍               | 2598183/3232675 [01:44<00:25, 24801.63it/s]

INFO:tensorflow:Generating case 2600000.


INFO:tensorflow:Generating case 2600000.
 83%|█████████████████████████████████████████████████████████████████▉             | 2698673/3232675 [01:48<00:21, 25115.66it/s]

INFO:tensorflow:Generating case 2700000.


INFO:tensorflow:Generating case 2700000.
 87%|████████████████████████████████████████████████████████████████████▍          | 2798610/3232675 [01:52<00:17, 24940.34it/s]

INFO:tensorflow:Generating case 2800000.


INFO:tensorflow:Generating case 2800000.
 90%|██████████████████████████████████████████████████████████████████████▊        | 2898754/3232675 [01:56<00:13, 24689.34it/s]

INFO:tensorflow:Generating case 2900000.


INFO:tensorflow:Generating case 2900000.
 93%|█████████████████████████████████████████████████████████████████████████▎     | 2998617/3232675 [02:01<00:09, 25015.17it/s]

INFO:tensorflow:Generating case 3000000.


INFO:tensorflow:Generating case 3000000.
 96%|███████████████████████████████████████████████████████████████████████████▋   | 3098164/3232675 [02:05<00:05, 22656.91it/s]

INFO:tensorflow:Generating case 3100000.


INFO:tensorflow:Generating case 3100000.
 99%|██████████████████████████████████████████████████████████████████████████████▏| 3199577/3232675 [02:09<00:01, 24944.92it/s]

INFO:tensorflow:Generating case 3200000.


INFO:tensorflow:Generating case 3200000.
100%|███████████████████████████████████████████████████████████████████████████████| 3232675/3232675 [02:10<00:00, 24688.16it/s]

INFO:tensorflow:Generated 3232675 Examples



INFO:tensorflow:Generated 3232675 Examples


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Data shuffled.


INFO:tensorflow:Data shuffled.
