In [1]:
from bert import tokenization

In [2]:
import json

with open('../bahasa/dataset-en-to-ms.json') as fopen:
    data = json.load(fopen)
    
train_X = data['train_X']
train_Y = data['train_Y']
test_X = data['test_X']
test_Y = data['test_Y']

In [3]:
BERT_VOCAB = '../multi_cased_L-12_H-768_A-12/vocab.txt'
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=False)




In [4]:
EOS = 1

In [5]:
from unidecode import unidecode
from tqdm import tqdm
import collections
import tensorflow as tf
maxlen = 256

def create_int_feature(values):
    feature = tf.compat.v1.train.Feature(
        int64_list = tf.compat.v1.train.Int64List(value = list(values))
    )
    return feature

def get_inputs(x, y, index, prefix = 'train'):
    input_ids, input_masks, segment_ids, ys = [], [], [], []
    for i in tqdm(range(len(x))):
        tokens_a = tokenizer.tokenize(unidecode(x[i]))
        tokens_b = tokenizer.tokenize(unidecode(y[i]))
        tokens_a = tokens_a[:maxlen - 2]
        tokens_b = tokens_b[:maxlen - 1]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        
        segment_id = [0] * len(tokens)
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        
        input_id = input_id + [0] * (maxlen - len(input_id))
        segment_id = segment_id + [0] * (maxlen - len(segment_id))
        input_mask = input_mask + [0] * (maxlen - len(input_mask))

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        
        r = tokenizer.convert_tokens_to_ids(tokens_b) + [EOS]
        if len([k for k in r if k == 0]):
            print(y[i], i)
            break
            
        r = r + [0] * (maxlen - len(r))
        ys.append(r)
    
    r = tf.compat.v1.python_io.TFRecordWriter(f'multilanguagebert-{prefix}-{index}.tfrecord')
    for i in tqdm(range(len(ys))):
        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(input_ids[i])
        features['input_mask'] = create_int_feature(input_masks[i])
        features['segment_ids'] = create_int_feature(segment_ids[i])
        features['y'] = create_int_feature(ys[i])
        tf_example = tf.compat.v1.train.Example(
            features = tf.compat.v1.train.Features(feature = features)
        )
        r.write(tf_example.SerializeToString())
    r.close()

In [6]:
def chunks_multiple(l, n):
    for i in range(0, len(l), n):
        x, y = list(zip(*l[i : i + n]))
        yield (x, y, i)

In [7]:
import multi

In [8]:
multi.multiprocessing(chunks_multiple(list(zip(train_X, train_Y)), len(train_X) // 12),
                     get_inputs)

 99%|█████████▉| 269276/272587 [04:47<00:15, 210.74it/s] 
100%|█████████▉| 272068/272587 [04:52<00:02, 241.32it/s]
 98%|█████████▊| 268129/272587 [04:51<00:24, 183.84it/s]
 98%|█████████▊| 268121/272587 [04:51<00:21, 210.51it/s]
 99%|█████████▉| 269679/272587 [04:56<00:11, 254.66it/s]
100%|█████████▉| 272038/272587 [04:59<00:01, 297.78it/s]
  3%|▎         | 7369/272587 [00:08<04:16, 1032.88it/s]]
  1%|          | 1790/272587 [00:01<04:09, 1083.86it/s]]
  4%|▍         | 11641/272587 [00:13<04:00, 1084.73it/s]
100%|██████████| 272587/272587 [05:04<00:00, 895.30it/s]
  5%|▍         | 12298/272587 [00:14<07:36, 570.16it/s]]
  5%|▌         | 13664/272587 [00:15<05:13, 825.19it/s]]
100%|██████████| 272587/272587 [03:06<00:00, 1458.63it/s]
100%|██████████| 272587/272587 [03:11<00:00, 1419.92it/s]
100%|██████████| 272587/272587 [03:06<00:00, 1463.97it/s]
 96%|█████████▌| 261471/272587 [02:54<00:04, 2664.67it/s]
100%|██████████| 6/6 [00:00<00:00, 697.85it/s]128.19it/s]
100%|██████████| 6/6 [00:

In [9]:
def chunks_multiple(l, n):
    for i in range(0, len(l), n):
        x, y = list(zip(*l[i : i + n]))
        yield (x, y, i, 'test')

In [10]:
multi.multiprocessing(chunks_multiple(list(zip(test_X, test_Y)), len(test_X) // 12),
                     get_inputs)

100%|██████████| 5563/5563 [00:04<00:00, 1229.64it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1203.67it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1201.45it/s]
  4%|▍         | 227/5563 [00:00<00:02, 2263.69it/s]]
 12%|█▏        | 640/5563 [00:00<00:02, 2092.59it/s]]
100%|██████████| 5563/5563 [00:04<00:00, 1147.49it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1145.38it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1151.62it/s]
  0%|          | 0/5563 [00:00<?, ?it/s]2140.74it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1139.78it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1130.13it/s]
100%|██████████| 5563/5563 [00:04<00:00, 1115.46it/s]
100%|██████████| 5563/5563 [00:02<00:00, 2139.11it/s]
100%|██████████| 1/1 [00:00<00:00, 1281.88it/s]7it/s]
100%|██████████| 1/1 [00:00<00:00, 1252.78it/s]
100%|██████████| 5563/5563 [00:02<00:00, 2010.85it/s]
100%|██████████| 5563/5563 [00:02<00:00, 1898.03it/s]
100%|██████████| 5563/5563 [00:02<00:00, 2089.37it/s]
100%|██████████| 5563/5563 [00:02<0