In [1]:
import tensorflow as tf
import sentencepiece as spm
from glob import glob
import os
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry

In [2]:
# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.t5.model
# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.t5.vocab

In [3]:
# from tqdm import trange, tqdm

# vocab = 'sp10m.cased.t5.model'
# sp = spm.SentencePieceProcessor()
# sp.Load(vocab)

# vocab = {sp.id_to_piece(index): index for index in trange(sp.GetPieceSize())}
# merges = []
# for piece_l in tqdm(vocab.keys(), total=sp.GetPieceSize()):
#     for piece_r in vocab.keys():
#         merge = f"{piece_l}{piece_r}"
#         piece_id = vocab.get(merge, None)
#         if piece_id:
#             merges += [(piece_l, piece_r, piece_id)]
# merges = sorted(merges, key=lambda val: val[2])
# merges = [(val[0], val[1]) for val in merges]

In [4]:
# from json import dump
# from os import linesep, remove

# with open('out.vocab', 'w') as vocab_f:
#     with open('out.merged', 'w') as merges_f:
#         dump(vocab, vocab_f)
#         merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges))

In [5]:
# from tokenizers import SentencePieceBPETokenizer

# tokenizer = SentencePieceBPETokenizer('out.vocab', 'out.merged')

In [6]:
# Path to the SentencePiece model file, relative to the notebook's working directory.
vocab = '../sp10m.cased.ms-en.model'
# Module-level SentencePiece processor; later cells read it through the name `sp`.
sp = spm.SentencePieceProcessor()
sp.Load(vocab)


class Encoder:
    """Thin adapter around a SentencePiece processor.

    Exposes ``vocab_size``, ``encode`` and ``decode`` on top of the wrapped
    processor's ``GetPieceSize``, ``EncodeAsIds`` and ``DecodeIds``.
    """

    def __init__(self, sp):
        """Wrap ``sp``, a loaded SentencePiece processor."""
        self.sp = sp
        # Vocabulary size is the number of SentencePiece pieces plus 100 extra ids.
        # NOTE(review): presumably room for T5-style sentinel tokens — confirm.
        self.vocab_size = sp.GetPieceSize() + 100

    def encode(self, s):
        """Return the list of piece ids for the string ``s``."""
        return self.sp.EncodeAsIds(s)

    def decode(self, ids, strip_extraneous = False):
        """Return the text for an iterable of piece ids.

        ``strip_extraneous`` is accepted for call compatibility and ignored.
        """
        # Coerce every id to a built-in int: ids coming out of a model are
        # typically NumPy integer scalars, which the SentencePiece bindings
        # do not reliably accept in place of plain Python ints.
        return self.sp.DecodeIds([int(i) for i in ids])
    
# Module-level Encoder instance wrapping the SentencePiece processor `sp`.
encoder = Encoder(sp)

In [7]:
from tqdm import tqdm

@registry.register_problem
class Seq2Seq(text_problems.Text2TextProblem):
    """Text-to-text problem built from line-aligned input/target text files.

    Each line of the "left" file is paired with the same-numbered line of the
    "right" file; both are encoded with the module-level SentencePiece
    processor ``sp`` and emitted as already-encoded id lists.
    """

    @property
    def approx_vocab_size(self):
        """Approximate vocabulary size reported to tensor2tensor."""
        return 32100

    @property
    def is_generate_per_split(self):
        """Return False: samples are not generated separately per split.

        NOTE(review): per the tensor2tensor docs this means one sample stream
        is divided across the train/eval shards below — confirm.
        """
        return False

    @property
    def dataset_splits(self):
        """Shard counts for the training and evaluation splits."""
        return [{
            "split": problem.DatasetSplit.TRAIN,
            "shards": 1000,
        },
        {
            "split": problem.DatasetSplit.EVAL,
            "shards": 6,
        }]

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield ``{'inputs': ids, 'targets': ids}`` dicts, one per line pair.

        ``data_dir``, ``tmp_dir`` and ``dataset_split`` are accepted to match
        the parent signature but are not used; the file paths are relative to
        the current working directory.

        Iteration over a pair of files stops at the end of the shorter file.
        """
        encoder = Encoder(sp)

        # Id appended to the end of every encoded sequence.
        # NOTE(review): presumably the end-of-sequence id (tensor2tensor's
        # text_encoder.EOS_ID is also 1) — confirm against the model.
        eos = [1]

        pairs = [['train/left.txt', 'train/right.txt']]

        for left_path, right_path in pairs:
            # Context managers guarantee both files are closed even when the
            # consumer abandons the generator early or encoding raises.
            # SentencePiece operates on UTF-8 text, so decode explicitly
            # rather than relying on the platform's default encoding.
            with open(left_path, encoding='utf-8') as left, \
                    open(right_path, encoding='utf-8') as right:
                for l, r in zip(left, right):
                    yield {
                        'inputs': encoder.encode(l.strip()) + eos,
                        'targets': encoder.encode(r.strip()) + eos,
                    }
            # Marker printed once a pair of files has been exhausted.
            print('break')

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield the samples from ``generate_samples`` unchanged.

        The samples are already lists of ids, so no further encoding is
        applied; the stream is only wrapped in a tqdm progress counter.
        """
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        yield from tqdm(generator)

In [8]:
import os
import tensorflow as tf

import shutil

# Output locations, relative to the notebook's working directory.
# expanduser() is a no-op for these values; it only matters if they are
# changed to '~'-prefixed paths.
DATA_DIR = os.path.expanduser('t2t-knowledge-graph/data')
TMP_DIR = os.path.expanduser('t2t-knowledge-graph/tmp')

# Remove any previously generated data directory. Using shutil with the
# DATA_DIR constant avoids spawning a shell and keeps the path defined in
# one place; ignore_errors mirrors `rm -rf` succeeding on a missing path.
shutil.rmtree(DATA_DIR, ignore_errors=True)

In [9]:
# makedirs (unlike mkdir) also creates the missing parent directory, so this
# cell works on a fresh checkout; it succeeds if the directory already exists.
tf.compat.v1.io.gfile.makedirs(DATA_DIR)
tf.compat.v1.io.gfile.makedirs(TMP_DIR)

In [10]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

# Registry name used to look up the problem.
# NOTE(review): presumably the snake_case form tensor2tensor derives from the
# registered class name `Seq2Seq` — confirm.
PROBLEM = 'seq2_seq'
t2t_problem = problems.problem(PROBLEM)
# Runs the problem's data generation with DATA_DIR and TMP_DIR as its directories.
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

0it [00:00, ?it/s]

INFO:tensorflow:Generating case 0.


INFO:tensorflow:Generating case 0.
99851it [00:26, 4109.06it/s]

INFO:tensorflow:Generating case 100000.


INFO:tensorflow:Generating case 100000.
199916it [00:53, 3826.44it/s]

INFO:tensorflow:Generating case 200000.


INFO:tensorflow:Generating case 200000.
299684it [01:18, 3168.81it/s]

INFO:tensorflow:Generating case 300000.


INFO:tensorflow:Generating case 300000.
399950it [01:44, 4291.25it/s]

INFO:tensorflow:Generating case 400000.


INFO:tensorflow:Generating case 400000.
499980it [02:11, 3564.99it/s]

INFO:tensorflow:Generating case 500000.


INFO:tensorflow:Generating case 500000.
599973it [02:38, 3577.83it/s]

INFO:tensorflow:Generating case 600000.


INFO:tensorflow:Generating case 600000.
699576it [03:03, 4296.27it/s]

INFO:tensorflow:Generating case 700000.


INFO:tensorflow:Generating case 700000.
799599it [03:29, 3957.57it/s]

INFO:tensorflow:Generating case 800000.


INFO:tensorflow:Generating case 800000.
899751it [03:55, 3578.84it/s]

INFO:tensorflow:Generating case 900000.


INFO:tensorflow:Generating case 900000.
999832it [04:22, 3503.33it/s]

INFO:tensorflow:Generating case 1000000.


INFO:tensorflow:Generating case 1000000.
1099639it [04:48, 4134.34it/s]

INFO:tensorflow:Generating case 1100000.


INFO:tensorflow:Generating case 1100000.
1199704it [05:14, 4387.89it/s]

INFO:tensorflow:Generating case 1200000.


INFO:tensorflow:Generating case 1200000.
1299952it [05:40, 4223.10it/s]

INFO:tensorflow:Generating case 1300000.


INFO:tensorflow:Generating case 1300000.
1399782it [06:06, 4180.07it/s]

INFO:tensorflow:Generating case 1400000.


INFO:tensorflow:Generating case 1400000.
1499805it [06:32, 3336.59it/s]

INFO:tensorflow:Generating case 1500000.


INFO:tensorflow:Generating case 1500000.
1599747it [06:58, 3585.25it/s]

INFO:tensorflow:Generating case 1600000.


INFO:tensorflow:Generating case 1600000.
1699590it [07:23, 4331.74it/s]

INFO:tensorflow:Generating case 1700000.


INFO:tensorflow:Generating case 1700000.
1799913it [07:49, 4010.74it/s]

INFO:tensorflow:Generating case 1800000.


INFO:tensorflow:Generating case 1800000.
1899824it [08:15, 4235.76it/s]

INFO:tensorflow:Generating case 1900000.


INFO:tensorflow:Generating case 1900000.
1999925it [08:41, 4310.59it/s]

INFO:tensorflow:Generating case 2000000.


INFO:tensorflow:Generating case 2000000.
2099857it [09:07, 4107.69it/s]

INFO:tensorflow:Generating case 2100000.


INFO:tensorflow:Generating case 2100000.
2199979it [09:32, 3653.27it/s]

INFO:tensorflow:Generating case 2200000.


INFO:tensorflow:Generating case 2200000.
2299658it [09:59, 3372.03it/s]

INFO:tensorflow:Generating case 2300000.


INFO:tensorflow:Generating case 2300000.
2399749it [10:24, 3805.32it/s]

INFO:tensorflow:Generating case 2400000.


INFO:tensorflow:Generating case 2400000.
2499611it [10:50, 4423.39it/s]

INFO:tensorflow:Generating case 2500000.


INFO:tensorflow:Generating case 2500000.
2599675it [11:16, 3703.48it/s]

INFO:tensorflow:Generating case 2600000.


INFO:tensorflow:Generating case 2600000.
2699706it [11:42, 3872.49it/s]

INFO:tensorflow:Generating case 2700000.


INFO:tensorflow:Generating case 2700000.
2799744it [12:08, 3996.93it/s]

INFO:tensorflow:Generating case 2800000.


INFO:tensorflow:Generating case 2800000.
2899933it [12:34, 3509.12it/s]

INFO:tensorflow:Generating case 2900000.


INFO:tensorflow:Generating case 2900000.
2999678it [12:59, 4188.65it/s]

INFO:tensorflow:Generating case 3000000.


INFO:tensorflow:Generating case 3000000.
3099867it [13:25, 3781.83it/s]

INFO:tensorflow:Generating case 3100000.


INFO:tensorflow:Generating case 3100000.
3199833it [13:50, 4042.63it/s]

INFO:tensorflow:Generating case 3200000.


INFO:tensorflow:Generating case 3200000.
3299732it [14:16, 3520.91it/s]

INFO:tensorflow:Generating case 3300000.


INFO:tensorflow:Generating case 3300000.
3399789it [14:42, 3728.84it/s]

INFO:tensorflow:Generating case 3400000.


INFO:tensorflow:Generating case 3400000.
3499552it [15:08, 3742.31it/s]

INFO:tensorflow:Generating case 3500000.


INFO:tensorflow:Generating case 3500000.
3599771it [15:34, 3898.97it/s]

INFO:tensorflow:Generating case 3600000.


INFO:tensorflow:Generating case 3600000.
3699649it [16:00, 4358.23it/s]

INFO:tensorflow:Generating case 3700000.


INFO:tensorflow:Generating case 3700000.
3799736it [16:27, 4176.68it/s]

INFO:tensorflow:Generating case 3800000.


INFO:tensorflow:Generating case 3800000.
3899584it [16:52, 3998.95it/s]

INFO:tensorflow:Generating case 3900000.


INFO:tensorflow:Generating case 3900000.
3999653it [17:19, 3398.33it/s]

INFO:tensorflow:Generating case 4000000.


INFO:tensorflow:Generating case 4000000.
4099731it [17:45, 3860.68it/s]

INFO:tensorflow:Generating case 4100000.


INFO:tensorflow:Generating case 4100000.
4199691it [18:11, 3914.39it/s]

INFO:tensorflow:Generating case 4200000.


INFO:tensorflow:Generating case 4200000.
4299775it [18:37, 3449.12it/s]

INFO:tensorflow:Generating case 4300000.


INFO:tensorflow:Generating case 4300000.
4399837it [19:02, 3551.41it/s]

INFO:tensorflow:Generating case 4400000.


INFO:tensorflow:Generating case 4400000.
4499878it [19:29, 4059.47it/s]

INFO:tensorflow:Generating case 4500000.


INFO:tensorflow:Generating case 4500000.
4599586it [19:55, 4384.45it/s]

INFO:tensorflow:Generating case 4600000.


INFO:tensorflow:Generating case 4600000.
4699872it [20:21, 3858.05it/s]

INFO:tensorflow:Generating case 4700000.


INFO:tensorflow:Generating case 4700000.
4799632it [20:47, 3953.40it/s]

INFO:tensorflow:Generating case 4800000.


INFO:tensorflow:Generating case 4800000.
4899665it [21:13, 4090.83it/s]

INFO:tensorflow:Generating case 4900000.


INFO:tensorflow:Generating case 4900000.
4999876it [21:39, 3396.99it/s]

INFO:tensorflow:Generating case 5000000.


INFO:tensorflow:Generating case 5000000.
5099961it [22:05, 4442.00it/s]

INFO:tensorflow:Generating case 5100000.


INFO:tensorflow:Generating case 5100000.
5199659it [22:31, 3470.76it/s]

INFO:tensorflow:Generating case 5200000.


INFO:tensorflow:Generating case 5200000.
5299963it [22:57, 4152.51it/s]

INFO:tensorflow:Generating case 5300000.


INFO:tensorflow:Generating case 5300000.
5399706it [23:23, 4194.47it/s]

INFO:tensorflow:Generating case 5400000.


INFO:tensorflow:Generating case 5400000.
5499765it [23:49, 3779.61it/s]

INFO:tensorflow:Generating case 5500000.


INFO:tensorflow:Generating case 5500000.
5599831it [24:15, 3139.67it/s]

INFO:tensorflow:Generating case 5600000.


INFO:tensorflow:Generating case 5600000.
5699798it [24:40, 3360.87it/s]

INFO:tensorflow:Generating case 5700000.


INFO:tensorflow:Generating case 5700000.
5799811it [25:06, 3627.65it/s]

INFO:tensorflow:Generating case 5800000.


INFO:tensorflow:Generating case 5800000.
5899814it [25:32, 3703.65it/s]

INFO:tensorflow:Generating case 5900000.


INFO:tensorflow:Generating case 5900000.
5999698it [25:58, 3629.67it/s]

INFO:tensorflow:Generating case 6000000.


INFO:tensorflow:Generating case 6000000.
6099900it [26:24, 4565.06it/s]

INFO:tensorflow:Generating case 6100000.


INFO:tensorflow:Generating case 6100000.
6199750it [26:51, 3358.95it/s]

INFO:tensorflow:Generating case 6200000.


INFO:tensorflow:Generating case 6200000.
6299539it [27:16, 4601.14it/s]

INFO:tensorflow:Generating case 6300000.


INFO:tensorflow:Generating case 6300000.
6399877it [27:42, 3978.22it/s]

INFO:tensorflow:Generating case 6400000.


INFO:tensorflow:Generating case 6400000.
6499631it [28:08, 3989.60it/s]

INFO:tensorflow:Generating case 6500000.


INFO:tensorflow:Generating case 6500000.
6599973it [28:35, 4011.15it/s]

INFO:tensorflow:Generating case 6600000.


INFO:tensorflow:Generating case 6600000.
6699870it [29:01, 3729.27it/s]

INFO:tensorflow:Generating case 6700000.


INFO:tensorflow:Generating case 6700000.
6799733it [29:27, 4072.29it/s]

INFO:tensorflow:Generating case 6800000.


INFO:tensorflow:Generating case 6800000.
6899806it [29:53, 3441.89it/s]

INFO:tensorflow:Generating case 6900000.


INFO:tensorflow:Generating case 6900000.
6999555it [30:18, 3995.02it/s]

INFO:tensorflow:Generating case 7000000.


INFO:tensorflow:Generating case 7000000.
7099620it [30:45, 3701.93it/s]

INFO:tensorflow:Generating case 7100000.


INFO:tensorflow:Generating case 7100000.
7199841it [31:11, 3632.38it/s]

INFO:tensorflow:Generating case 7200000.


INFO:tensorflow:Generating case 7200000.
7299805it [31:36, 4364.13it/s]

INFO:tensorflow:Generating case 7300000.


INFO:tensorflow:Generating case 7300000.
7399709it [32:03, 3817.54it/s]

INFO:tensorflow:Generating case 7400000.


INFO:tensorflow:Generating case 7400000.
7499828it [32:28, 3764.46it/s]

INFO:tensorflow:Generating case 7500000.


INFO:tensorflow:Generating case 7500000.
7599871it [32:58, 3593.71it/s]

INFO:tensorflow:Generating case 7600000.


INFO:tensorflow:Generating case 7600000.
7699944it [33:23, 3642.40it/s]

INFO:tensorflow:Generating case 7700000.


INFO:tensorflow:Generating case 7700000.
7799558it [33:48, 4561.19it/s]

INFO:tensorflow:Generating case 7800000.


INFO:tensorflow:Generating case 7800000.
7899688it [34:13, 4076.36it/s]

INFO:tensorflow:Generating case 7900000.


INFO:tensorflow:Generating case 7900000.
7999940it [34:39, 4040.71it/s]

INFO:tensorflow:Generating case 8000000.


INFO:tensorflow:Generating case 8000000.
8099763it [35:05, 3513.89it/s]

INFO:tensorflow:Generating case 8100000.


INFO:tensorflow:Generating case 8100000.
8199750it [35:30, 4297.44it/s]

INFO:tensorflow:Generating case 8200000.


INFO:tensorflow:Generating case 8200000.
8299879it [35:55, 4524.30it/s]

INFO:tensorflow:Generating case 8300000.


INFO:tensorflow:Generating case 8300000.
8399958it [36:20, 3682.50it/s]

INFO:tensorflow:Generating case 8400000.


INFO:tensorflow:Generating case 8400000.
8499962it [36:46, 4051.22it/s]

INFO:tensorflow:Generating case 8500000.


INFO:tensorflow:Generating case 8500000.
8599790it [37:11, 4498.22it/s]

INFO:tensorflow:Generating case 8600000.


INFO:tensorflow:Generating case 8600000.
8699700it [37:36, 4249.57it/s]

INFO:tensorflow:Generating case 8700000.


INFO:tensorflow:Generating case 8700000.
8799736it [38:02, 3606.16it/s]

INFO:tensorflow:Generating case 8800000.


INFO:tensorflow:Generating case 8800000.
8899738it [38:28, 3671.92it/s]

INFO:tensorflow:Generating case 8900000.


INFO:tensorflow:Generating case 8900000.
8999745it [38:52, 3952.00it/s]

INFO:tensorflow:Generating case 9000000.


INFO:tensorflow:Generating case 9000000.
9099669it [39:18, 3849.38it/s]

INFO:tensorflow:Generating case 9100000.


INFO:tensorflow:Generating case 9100000.
9199871it [39:44, 3822.26it/s]

INFO:tensorflow:Generating case 9200000.


INFO:tensorflow:Generating case 9200000.
9299955it [40:10, 4008.87it/s]

INFO:tensorflow:Generating case 9300000.


INFO:tensorflow:Generating case 9300000.
9399830it [40:35, 3961.33it/s]

INFO:tensorflow:Generating case 9400000.


INFO:tensorflow:Generating case 9400000.
9499892it [41:01, 3990.21it/s]

INFO:tensorflow:Generating case 9500000.


INFO:tensorflow:Generating case 9500000.
9599908it [41:36, 2471.03it/s]

INFO:tensorflow:Generating case 9600000.


INFO:tensorflow:Generating case 9600000.
9699816it [42:23, 2376.67it/s]

INFO:tensorflow:Generating case 9700000.


INFO:tensorflow:Generating case 9700000.
9799673it [43:00, 4146.23it/s]

INFO:tensorflow:Generating case 9800000.


INFO:tensorflow:Generating case 9800000.
9899722it [43:25, 3876.98it/s]

INFO:tensorflow:Generating case 9900000.


INFO:tensorflow:Generating case 9900000.
9999706it [43:50, 3946.31it/s]

INFO:tensorflow:Generating case 10000000.


INFO:tensorflow:Generating case 10000000.
10099534it [44:15, 3975.90it/s]

INFO:tensorflow:Generating case 10100000.


INFO:tensorflow:Generating case 10100000.
10199789it [44:41, 3796.19it/s]

INFO:tensorflow:Generating case 10200000.


INFO:tensorflow:Generating case 10200000.
10299778it [45:05, 4500.83it/s]

INFO:tensorflow:Generating case 10300000.


INFO:tensorflow:Generating case 10300000.
10399990it [45:31, 3234.30it/s]

INFO:tensorflow:Generating case 10400000.


INFO:tensorflow:Generating case 10400000.
10499894it [45:57, 3713.42it/s]

INFO:tensorflow:Generating case 10500000.


INFO:tensorflow:Generating case 10500000.
10599685it [46:22, 4156.83it/s]

INFO:tensorflow:Generating case 10600000.


INFO:tensorflow:Generating case 10600000.
10699937it [46:48, 4191.25it/s]

INFO:tensorflow:Generating case 10700000.


INFO:tensorflow:Generating case 10700000.
10799781it [47:13, 4014.00it/s]

INFO:tensorflow:Generating case 10800000.


INFO:tensorflow:Generating case 10800000.
10899879it [47:38, 3364.10it/s]

INFO:tensorflow:Generating case 10900000.


INFO:tensorflow:Generating case 10900000.
10999794it [48:02, 3889.23it/s]

INFO:tensorflow:Generating case 11000000.


INFO:tensorflow:Generating case 11000000.
11099840it [48:28, 4264.80it/s]

INFO:tensorflow:Generating case 11100000.


INFO:tensorflow:Generating case 11100000.
11199863it [48:53, 4517.01it/s]

INFO:tensorflow:Generating case 11200000.


INFO:tensorflow:Generating case 11200000.
11299909it [49:21, 3817.72it/s]

INFO:tensorflow:Generating case 11300000.


INFO:tensorflow:Generating case 11300000.
11399726it [49:46, 3768.50it/s]

INFO:tensorflow:Generating case 11400000.


INFO:tensorflow:Generating case 11400000.
11499875it [50:12, 4537.72it/s]

INFO:tensorflow:Generating case 11500000.


INFO:tensorflow:Generating case 11500000.
11599819it [50:37, 4559.36it/s]

INFO:tensorflow:Generating case 11600000.


INFO:tensorflow:Generating case 11600000.
11699753it [51:02, 4462.35it/s]

INFO:tensorflow:Generating case 11700000.


INFO:tensorflow:Generating case 11700000.
11799816it [51:26, 3989.66it/s]

INFO:tensorflow:Generating case 11800000.


INFO:tensorflow:Generating case 11800000.
11899797it [51:51, 4168.05it/s]

INFO:tensorflow:Generating case 11900000.


INFO:tensorflow:Generating case 11900000.
11999985it [52:16, 3846.69it/s]

INFO:tensorflow:Generating case 12000000.


INFO:tensorflow:Generating case 12000000.
12099749it [52:41, 4334.70it/s]

INFO:tensorflow:Generating case 12100000.


INFO:tensorflow:Generating case 12100000.
12199926it [53:05, 4147.70it/s]

INFO:tensorflow:Generating case 12200000.


INFO:tensorflow:Generating case 12200000.
12299898it [53:31, 3570.10it/s]

INFO:tensorflow:Generating case 12300000.


INFO:tensorflow:Generating case 12300000.
12399977it [53:56, 3418.07it/s]

INFO:tensorflow:Generating case 12400000.


INFO:tensorflow:Generating case 12400000.
12499995it [54:20, 4166.44it/s]

INFO:tensorflow:Generating case 12500000.


INFO:tensorflow:Generating case 12500000.
12599728it [54:45, 4169.03it/s]

INFO:tensorflow:Generating case 12600000.


INFO:tensorflow:Generating case 12600000.
12699670it [55:10, 4067.86it/s]

INFO:tensorflow:Generating case 12700000.


INFO:tensorflow:Generating case 12700000.
12799860it [55:34, 3682.92it/s]

INFO:tensorflow:Generating case 12800000.


INFO:tensorflow:Generating case 12800000.
12899672it [55:59, 4250.86it/s]

INFO:tensorflow:Generating case 12900000.


INFO:tensorflow:Generating case 12900000.
12999864it [56:24, 4133.38it/s]

INFO:tensorflow:Generating case 13000000.


INFO:tensorflow:Generating case 13000000.
13099599it [56:49, 3441.09it/s]

INFO:tensorflow:Generating case 13100000.


INFO:tensorflow:Generating case 13100000.
13199853it [57:13, 4511.26it/s]

INFO:tensorflow:Generating case 13200000.


INFO:tensorflow:Generating case 13200000.
13299768it [57:38, 3902.15it/s]

INFO:tensorflow:Generating case 13300000.


INFO:tensorflow:Generating case 13300000.
13399641it [58:02, 4144.39it/s]

INFO:tensorflow:Generating case 13400000.


INFO:tensorflow:Generating case 13400000.
13499932it [58:27, 3366.51it/s]

INFO:tensorflow:Generating case 13500000.


INFO:tensorflow:Generating case 13500000.
13599932it [58:51, 3202.94it/s]

INFO:tensorflow:Generating case 13600000.


INFO:tensorflow:Generating case 13600000.
13699741it [59:15, 4429.68it/s]

INFO:tensorflow:Generating case 13700000.


INFO:tensorflow:Generating case 13700000.
13737121it [59:25, 3853.22it/s]

break
INFO:tensorflow:Generated 13737121 Examples



INFO:tensorflow:Generated 13737121 Examples


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Shuffling data...


Instructions for updating:
Use eager execution and: 
`tf.compat.v1.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.compat.v1.data.TFRecordDataset(path)`


INFO:tensorflow:Data shuffled.


INFO:tensorflow:Data shuffled.
