In [1]:
import os

# Configure GPU visibility before any TensorFlow-backed library is imported
# (the `malaya` import in the next cell is what triggers the CUDA library log).
# An empty CUDA_VISIBLE_DEVICES hides all GPUs from this process.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Request on-demand GPU memory growth instead of up-front reservation.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
# !wget https://huggingface.co/huseinzol05/bpe/resolve/main/en-ms.subwords

In [3]:
from malaya.text.t2t import text_encoder
import malaya



2022-07-23 17:37:55.526213: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build the subword encoder from the local `en-ms.subwords` vocabulary file
# (the commented-out wget cell above shows where that file comes from).
encoder = text_encoder.SubwordTextEncoder('en-ms.subwords')

In [5]:
# Spot-check: decode a single subword id back into text.
encoder.decode([25891])

'808'

In [6]:
# Spot-check: encode a short word into its subword ids.
encoder.encode('saya')

[52]

In [7]:
# Size of the loaded vocabulary; reused below as the problem's approx_vocab_size.
encoder.vocab_size

26088

In [8]:
# !~/tf-nvidia/bin/pip3 install fasttext

In [9]:
# Language-detection model; used further down to filter `train-en` sentence
# pairs by the detected language of each side.
fast_text = malaya.language_detection.fasttext()



In [10]:
# Sanity-check the detector on a two-string batch; it returns one label per string.
fast_text.predict(['saya suka', 'i like'])

['malay', 'eng']

In [11]:
class Encoder:
    """Batch adapter over a subword encoder.

    `encode` takes an iterable of strings and returns one id list per string,
    each terminated with the id 1. `decode` delegates straight to the wrapped
    encoder.
    """

    def __init__(self, encoder):
        """Keep a reference to `encoder` and mirror its `vocab_size`."""
        self.encoder = encoder
        self.vocab_size = encoder.vocab_size

    def encode(self, s):
        """Encode every string in `s` and append the terminating id 1 to each result."""
        terminated = []
        for text in s:
            ids = self.encoder.encode(text)
            # `+` builds a new list, so the wrapped encoder's own result is not mutated.
            terminated.append(ids + [1])
        return terminated

    def decode(self, ids, strip_extraneous = False):
        """Decode `ids` with the wrapped encoder.

        `strip_extraneous` is accepted but not forwarded to the wrapped encoder.
        """
        decoded = self.encoder.decode(ids)
        return decoded

In [12]:
# Batch wrapper around `encoder`; the Translation problem below uses it to turn
# each (input, target) sentence pair into id sequences.
s_encoder = Encoder(encoder)

In [13]:
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
from tqdm import tqdm
from glob import glob
import json

In [14]:
# Collect the augmented shards in the working directory, leaving out any file
# whose name contains 'test'.
augmented = [
    shard_path
    for shard_path in glob('augmented-en-ms-*.json')
    if 'test' not in shard_path
]
augmented

['augmented-en-ms-v2-part2.json',
 'augmented-en-ms-v3.json',
 'augmented-en-ms-v2-part3.json',
 'augmented-en-ms-v1.json',
 'augmented-en-ms-v2.json',
 'augmented-en-ms-v3-part2.json']

In [15]:
# Load the first discovered shard for inspection; the cells below read its
# 'ms' and 'en' lists.
with open(augmented[0]) as fopen:
    data = json.load(fopen)

In [16]:
# Preview the first ten 'ms' sentences (these become the targets in Translation).
data['ms'][:10]

['Foto: AP : Enam pengunjung sebuah taman tema di Florida berdepan detik cemas apabila roller coaster yang dinaiki mereka tergelincir.',
 'Era revolusi bermula pada tahun 1763 apabila ancaman ketenteraan Perancis ke atas koloni British di Amerika Utara berakhir.',
 'Pada akhirnya, Robespierre dan Jawatankuasa Keselamatan Awam dipaksa untuk menentang kampanye dengan menggantikan Kultus Alasan dengan deisme, walaupun masih non-Kristen.',
 'Pasukan hoki SMACH telah menjadi juara Zon Timur semenjak 2006 sehingga 2009.',
 'Secara telitinya, ketua-ketua ini meminta pertolongan Perancis, "puak Marion", di mana ia merupakan surat pertama yang ditulis orang Maori meminta campur tangan British.',
 'Menurut bancian India pada tahun 2001 Mallar memiliki kadar pendidikan (kebolehan membaca) 75%, melebihi kadar purata kebangsaan 59.5%; dengan 50% lelaki dan 50% wanita mampu membaca.',
 'Di London beliau bertemu dengan Mary Ann Wilton dan memperanakkan dua orang yang dilahirkannya.',
 '(Ingatlah akan

In [17]:
# Preview the first ten 'en' sentences (these become the inputs in Translation).
data['en'][:10]

['Foto: AP: Sex vistors to a tema taman di Florida berdepan and cemas apabla dinaiki roller coaster mereka yang slides.',
 'The revolutionar era began in 1763 when the French mltary threat to tne British collonies in Noth*2 America ended.',
 'Eventally, Robespierre a*5 Pada Awam Keselamatan Committee were forsed to oppose thoy kampanye by replacing the Kultus onet Alasan with deism walaupun masih non-Kristen.',
 'Duh SMACH hki team telah been juara Timur Zone campion semenjak 2006 sehingga 2009.',
 'Secara telitinya, ini leaders called fr the French, puak "Marion cln which ia merupakan frist letter written by orang Maori asking campur British tangan.',
 'Menurut to the Indian cnsus in 2001 Mallar had kadar education rate (kebolehan ability) othe 75%, melebihi tje national average afoh 59.5%; dengan 50% of lelaki hand 50% with wnita alb to read.',
 'In London if met Mary Ann Wilton and gave birth to two children.',
 'on to Day when He will callllllllll you and you will rise praisng Him 

In [18]:
@registry.register_problem
class Translation(text_problems.Text2TextProblem):
    """Text-to-text problem that emits already-encoded input/target pairs.

    Samples come from two sources, in this order:

    1. every JSON shard listed in the notebook-level `augmented` list
       (`data['en']` -> inputs, `data['ms']` -> targets);
    2. the line-aligned files `train-en/left.txt` (inputs) and
       `train-en/right.txt` (targets), filtered by detected language.

    All text is encoded with the notebook-level `s_encoder`, which is why
    `generate_encoded_samples` forwards the samples unchanged. The problem is
    retrieved later in the notebook under the name 'translation'.
    """

    # Labels returned by `fast_text.predict` that are accepted for each side of
    # a `train-en` pair; a pair with any other label on either side is dropped.
    _INPUT_LANGUAGES = ('malay', 'eng', 'rojak', 'manglish')
    _TARGET_LANGUAGES = ('malay', 'rojak')

    @property
    def approx_vocab_size(self):
        """Vocabulary size, read from the notebook-level subword `encoder`."""
        return encoder.vocab_size

    @property
    def is_generate_per_split(self):
        """False: a single sample stream is produced, not one stream per split."""
        # Only a TRAIN split is declared in `dataset_splits`, so `dataset_split`
        # is never consulted by `generate_samples`.
        return False

    @property
    def dataset_splits(self):
        """Declare a single TRAIN split written as 100 shards."""
        return [
            {'split': problem.DatasetSplit.TRAIN, 'shards': 100},
        ]

    def _augmented_samples(self):
        """Yield encoded pairs from each JSON shard named in `augmented`."""
        for file in augmented:
            with open(file) as fopen:
                data = json.load(fopen)

            for row in tqdm(range(len(data['en']))):
                # Skip rows where either side is empty; the target is only
                # looked up once the input is known to be non-empty.
                source_text = data['en'][row]
                if not len(source_text):
                    continue
                target_text = data['ms'][row]
                if not len(target_text):
                    continue
                inputs, targets = s_encoder.encode([source_text, target_text])
                yield {'inputs': inputs, 'targets': targets}

    def _parallel_corpus_samples(self):
        """Yield encoded, language-filtered pairs from the `train-en` files."""
        with open('train-en/left.txt') as fopen:
            left = fopen.read().split('\n')

        with open('train-en/right.txt') as fopen:
            right = fopen.read().split('\n')

        # NOTE(review): the detector is called once per pair; the recorded
        # progress output shows this loop running at well under half the rate
        # of the augmented-shard loop. Batching `predict` may help - confirm.
        for row in tqdm(range(len(left))):
            if not len(left[row]):
                continue
            if not len(right[row]):
                continue
            lang_left, lang_right = fast_text.predict([left[row], right[row]])
            if lang_left not in self._INPUT_LANGUAGES:
                continue
            if lang_right not in self._TARGET_LANGUAGES:
                continue
            inputs, targets = s_encoder.encode([left[row], right[row]])
            yield {'inputs': inputs, 'targets': targets}

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield `{'inputs': ids, 'targets': ids}` dicts from both data sources.

        `data_dir`, `tmp_dir` and `dataset_split` are accepted for interface
        compatibility but are not used; the sources are fixed paths and
        notebook-level globals.
        """
        yield from self._augmented_samples()
        yield from self._parallel_corpus_samples()

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Forward `generate_samples` unchanged; its samples are already id lists."""
        yield from self.generate_samples(data_dir, tmp_dir, dataset_split)

In [19]:
import os
import tensorflow as tf

In [20]:
import shutil

DATA_DIR = os.path.expanduser('t2t-noisy-en-ms/data')
TMP_DIR = os.path.expanduser('t2t-noisy-en-ms/tmp')

# Start from an empty output directory so shards from an earlier run are not
# mixed with the new ones. A missing directory is not an error.
shutil.rmtree(DATA_DIR, ignore_errors=True)

In [21]:
# `makedirs` also creates the missing parent directory `t2t-noisy-en-ms/` and
# succeeds when the directory already exists, so this cell works on a fresh
# checkout as well as on a re-run.
tf.io.gfile.makedirs(DATA_DIR)
tf.io.gfile.makedirs(TMP_DIR)




In [22]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

In [23]:
# Retrieve the registered problem by name ('translation' is the name under which
# the `Translation` class above is looked up) and run data generation with
# DATA_DIR and TMP_DIR. Per the log below, the examples are generated and then
# shuffled.
PROBLEM = 'translation'
t2t_problem = problems.problem(PROBLEM)
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

  0%|                                                                                                     | 0/231233 [00:00<?, ?it/s]

INFO:tensorflow:Generating case 0.


INFO:tensorflow:Generating case 0.
 43%|█████████████████████████████████████▏                                                | 99882/231233 [00:08<00:11, 11750.14it/s]

INFO:tensorflow:Generating case 100000.


INFO:tensorflow:Generating case 100000.
 86%|█████████████████████████████████████████████████████████████████████████▎           | 199476/231233 [00:16<00:02, 12517.23it/s]

INFO:tensorflow:Generating case 200000.


INFO:tensorflow:Generating case 200000.
100%|█████████████████████████████████████████████████████████████████████████████████████| 231233/231233 [00:19<00:00, 12080.66it/s]
 98%|█████████████████████████████████████████████████████████████████████████████████████  | 68372/69976 [00:05<00:00, 12975.94it/s]

INFO:tensorflow:Generating case 300000.


INFO:tensorflow:Generating case 300000.
100%|███████████████████████████████████████████████████████████████████████████████████████| 69976/69976 [00:05<00:00, 13027.05it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 77100/77100 [00:06<00:00, 12317.03it/s]
  2%|█▊                                                                                    | 20988/984790 [00:01<01:15, 12816.68it/s]

INFO:tensorflow:Generating case 400000.


INFO:tensorflow:Generating case 400000.
 12%|██████████▍                                                                          | 121634/984790 [00:09<01:07, 12863.47it/s]

INFO:tensorflow:Generating case 500000.


INFO:tensorflow:Generating case 500000.
 22%|███████████████████                                                                  | 221504/984790 [00:17<01:01, 12493.95it/s]

INFO:tensorflow:Generating case 600000.


INFO:tensorflow:Generating case 600000.
 33%|███████████████████████████▋                                                         | 320685/984790 [00:25<00:51, 12886.94it/s]

INFO:tensorflow:Generating case 700000.


INFO:tensorflow:Generating case 700000.
 43%|████████████████████████████████████▎                                                | 420804/984790 [00:33<00:44, 12649.55it/s]

INFO:tensorflow:Generating case 800000.


INFO:tensorflow:Generating case 800000.
 53%|████████████████████████████████████████████▉                                        | 520917/984790 [00:41<00:35, 12961.19it/s]

INFO:tensorflow:Generating case 900000.


INFO:tensorflow:Generating case 900000.
 63%|█████████████████████████████████████████████████████▋                               | 621583/984790 [00:49<00:28, 12684.92it/s]

INFO:tensorflow:Generating case 1000000.


INFO:tensorflow:Generating case 1000000.
 73%|██████████████████████████████████████████████████████████████▏                      | 720955/984790 [00:57<00:20, 12702.36it/s]

INFO:tensorflow:Generating case 1100000.


INFO:tensorflow:Generating case 1100000.
 83%|██████████████████████████████████████████████████████████████████████▉              | 821216/984790 [01:05<00:12, 12803.06it/s]

INFO:tensorflow:Generating case 1200000.


INFO:tensorflow:Generating case 1200000.
 94%|███████████████████████████████████████████████████████████████████████████████▌     | 921685/984790 [01:13<00:04, 12793.04it/s]

INFO:tensorflow:Generating case 1300000.


INFO:tensorflow:Generating case 1300000.
100%|█████████████████████████████████████████████████████████████████████████████████████| 984790/984790 [01:18<00:00, 12514.26it/s]
 14%|████████████▎                                                                         | 35647/249095 [00:02<00:16, 12628.38it/s]

INFO:tensorflow:Generating case 1400000.


INFO:tensorflow:Generating case 1400000.
 55%|██████████████████████████████████████████████▌                                      | 136397/249095 [00:10<00:09, 12520.65it/s]

INFO:tensorflow:Generating case 1500000.


INFO:tensorflow:Generating case 1500000.
 95%|████████████████████████████████████████████████████████████████████████████████▌    | 236060/249095 [00:18<00:01, 12798.57it/s]

INFO:tensorflow:Generating case 1600000.


INFO:tensorflow:Generating case 1600000.
100%|█████████████████████████████████████████████████████████████████████████████████████| 249095/249095 [00:19<00:00, 12613.35it/s]
 63%|██████████████████████████████████████████████████████▎                               | 87751/138982 [00:06<00:03, 13003.83it/s]

INFO:tensorflow:Generating case 1700000.


INFO:tensorflow:Generating case 1700000.
100%|█████████████████████████████████████████████████████████████████████████████████████| 138982/138982 [00:11<00:00, 12441.60it/s]
  1%|█▏                                                                                    | 54361/3807616 [00:12<13:07, 4766.96it/s]

INFO:tensorflow:Generating case 1800000.


INFO:tensorflow:Generating case 1800000.
  4%|███▋                                                                                 | 165593/3807616 [00:36<12:44, 4761.91it/s]

INFO:tensorflow:Generating case 1900000.


INFO:tensorflow:Generating case 1900000.
  7%|██████▏                                                                              | 276749/3807616 [01:00<13:12, 4455.81it/s]

INFO:tensorflow:Generating case 2000000.


INFO:tensorflow:Generating case 2000000.
 10%|████████▋                                                                            | 388287/3807616 [01:24<12:12, 4668.72it/s]

INFO:tensorflow:Generating case 2100000.


INFO:tensorflow:Generating case 2100000.
 13%|███████████▏                                                                         | 499702/3807616 [01:48<11:41, 4715.84it/s]

INFO:tensorflow:Generating case 2200000.


INFO:tensorflow:Generating case 2200000.
 16%|█████████████▋                                                                       | 611165/3807616 [02:13<11:55, 4469.90it/s]

INFO:tensorflow:Generating case 2300000.


INFO:tensorflow:Generating case 2300000.
 19%|████████████████▏                                                                    | 722723/3807616 [02:37<10:50, 4744.45it/s]

INFO:tensorflow:Generating case 2400000.


INFO:tensorflow:Generating case 2400000.
 22%|██████████████████▌                                                                  | 834114/3807616 [03:01<09:58, 4967.76it/s]

INFO:tensorflow:Generating case 2500000.


INFO:tensorflow:Generating case 2500000.
 25%|█████████████████████                                                                | 945335/3807616 [03:24<10:01, 4755.78it/s]

INFO:tensorflow:Generating case 2600000.


INFO:tensorflow:Generating case 2600000.
 28%|███████████████████████▎                                                            | 1056559/3807616 [03:47<09:36, 4772.44it/s]

INFO:tensorflow:Generating case 2700000.


INFO:tensorflow:Generating case 2700000.
 31%|█████████████████████████▊                                                          | 1167668/3807616 [04:11<08:54, 4936.22it/s]

INFO:tensorflow:Generating case 2800000.


INFO:tensorflow:Generating case 2800000.
 34%|████████████████████████████▏                                                       | 1279099/3807616 [04:35<08:52, 4749.00it/s]

INFO:tensorflow:Generating case 2900000.


INFO:tensorflow:Generating case 2900000.
 37%|██████████████████████████████▋                                                     | 1390476/3807616 [04:59<08:48, 4574.79it/s]

INFO:tensorflow:Generating case 3000000.


INFO:tensorflow:Generating case 3000000.
 39%|█████████████████████████████████▏                                                  | 1502104/3807616 [05:22<08:05, 4751.84it/s]

INFO:tensorflow:Generating case 3100000.


INFO:tensorflow:Generating case 3100000.
 42%|███████████████████████████████████▌                                                | 1613260/3807616 [05:46<07:26, 4909.55it/s]

INFO:tensorflow:Generating case 3200000.


INFO:tensorflow:Generating case 3200000.
 45%|██████████████████████████████████████                                              | 1724696/3807616 [06:09<07:17, 4758.28it/s]

INFO:tensorflow:Generating case 3300000.


INFO:tensorflow:Generating case 3300000.
 48%|████████████████████████████████████████▍                                           | 1835749/3807616 [06:33<07:01, 4673.23it/s]

INFO:tensorflow:Generating case 3400000.


INFO:tensorflow:Generating case 3400000.
 51%|██████████████████████████████████████████▉                                         | 1947313/3807616 [06:57<06:23, 4856.66it/s]

INFO:tensorflow:Generating case 3500000.


INFO:tensorflow:Generating case 3500000.
 54%|█████████████████████████████████████████████▍                                      | 2058796/3807616 [07:21<06:27, 4513.84it/s]

INFO:tensorflow:Generating case 3600000.


INFO:tensorflow:Generating case 3600000.
 57%|███████████████████████████████████████████████▉                                    | 2170173/3807616 [07:45<05:43, 4760.83it/s]

INFO:tensorflow:Generating case 3700000.


INFO:tensorflow:Generating case 3700000.
 60%|██████████████████████████████████████████████████▎                                 | 2281240/3807616 [08:08<05:26, 4671.99it/s]

INFO:tensorflow:Generating case 3800000.


INFO:tensorflow:Generating case 3800000.
 63%|████████████████████████████████████████████████████▊                               | 2392711/3807616 [08:32<04:56, 4768.68it/s]

INFO:tensorflow:Generating case 3900000.


INFO:tensorflow:Generating case 3900000.
 66%|███████████████████████████████████████████████████████▏                            | 2504070/3807616 [08:55<04:44, 4577.87it/s]

INFO:tensorflow:Generating case 4000000.


INFO:tensorflow:Generating case 4000000.
 69%|█████████████████████████████████████████████████████████▋                          | 2615457/3807616 [09:19<04:25, 4495.94it/s]

INFO:tensorflow:Generating case 4100000.


INFO:tensorflow:Generating case 4100000.
 72%|████████████████████████████████████████████████████████████▏                       | 2726549/3807616 [09:43<03:53, 4628.74it/s]

INFO:tensorflow:Generating case 4200000.


INFO:tensorflow:Generating case 4200000.
 75%|██████████████████████████████████████████████████████████████▌                     | 2838285/3807616 [10:08<03:21, 4821.59it/s]

INFO:tensorflow:Generating case 4300000.


INFO:tensorflow:Generating case 4300000.
 77%|█████████████████████████████████████████████████████████████████                   | 2949634/3807616 [10:32<02:59, 4793.05it/s]

INFO:tensorflow:Generating case 4400000.


INFO:tensorflow:Generating case 4400000.
 80%|███████████████████████████████████████████████████████████████████▌                | 3061204/3807616 [10:56<02:47, 4455.60it/s]

INFO:tensorflow:Generating case 4500000.


INFO:tensorflow:Generating case 4500000.
 83%|█████████████████████████████████████████████████████████████████████▉              | 3172534/3807616 [11:19<02:12, 4785.20it/s]

INFO:tensorflow:Generating case 4600000.


INFO:tensorflow:Generating case 4600000.
 86%|████████████████████████████████████████████████████████████████████████▍           | 3284052/3807616 [11:43<01:47, 4851.99it/s]

INFO:tensorflow:Generating case 4700000.


INFO:tensorflow:Generating case 4700000.
 89%|██████████████████████████████████████████████████████████████████████████▉         | 3395525/3807616 [12:06<01:24, 4854.10it/s]

INFO:tensorflow:Generating case 4800000.


INFO:tensorflow:Generating case 4800000.
 92%|█████████████████████████████████████████████████████████████████████████████▎      | 3506812/3807616 [12:30<01:07, 4473.14it/s]

INFO:tensorflow:Generating case 4900000.


INFO:tensorflow:Generating case 4900000.
 95%|███████████████████████████████████████████████████████████████████████████████▊    | 3617945/3807616 [12:53<00:38, 4882.92it/s]

INFO:tensorflow:Generating case 5000000.


INFO:tensorflow:Generating case 5000000.
 98%|██████████████████████████████████████████████████████████████████████████████████▎ | 3729328/3807616 [13:17<00:16, 4737.06it/s]

INFO:tensorflow:Generating case 5100000.


INFO:tensorflow:Generating case 5100000.
100%|████████████████████████████████████████████████████████████████████████████████████| 3807616/3807616 [13:34<00:00, 4675.18it/s]


INFO:tensorflow:Generated 5170229 Examples


INFO:tensorflow:Generated 5170229 Examples


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Data shuffled.


INFO:tensorflow:Data shuffled.
