In [1]:
from bert import tokenization

In [2]:
import json

# Load the train/test inputs (X) and targets (Y) from the JSON dump.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('dataset.json', encoding = 'utf-8') as fopen:
    data = json.load(fopen)

train_X = data['train_X']
train_Y = data['train_Y']
test_X = data['test_X']
test_Y = data['test_Y']

In [3]:
# Vocabulary file handed to the tokenizer.
BERT_VOCAB = '../multi_cased_L-12_H-768_A-12/vocab.txt'

# Lower-casing is switched off; the vocabulary directory name suggests a
# cased model.
tokenizer = tokenization.FullTokenizer(
    vocab_file = BERT_VOCAB,
    do_lower_case = False,
)




In [4]:
# Names suggest the start-of-sequence (GO) and end-of-sequence (EOS) token ids.
# NOTE(review): 101 / 102 presumably correspond to [CLS] / [SEP] in BERT_VOCAB;
# confirm against vocab.txt. Neither constant is referenced in the cells shown here.
GO = 101
EOS = 102

In [5]:
from unidecode import unidecode
from tqdm import tqdm
import collections
import tensorflow as tf
# Length used by get_inputs when truncating and zero-padding token-id sequences.
maxlen = 256

def create_int_feature(values):
    """Wrap an iterable of integers in a TensorFlow int64 ``Feature``.

    Args:
        values: iterable of integers; it is materialised with ``list`` before
            being stored.

    Returns:
        A ``tf.compat.v1.train.Feature`` whose ``int64_list`` holds ``values``.
    """
    int64_values = tf.compat.v1.train.Int64List(value = list(values))
    return tf.compat.v1.train.Feature(int64_list = int64_values)

def get_inputs(x, y, index, prefix = 'train'):
    """Tokenize source/target pairs and write them to one TFRecord file.

    Every source string in ``x`` is passed through ``unidecode``, tokenized,
    truncated to ``maxlen - 2`` tokens, wrapped as ``[CLS] ... [SEP]`` and
    zero-padded to ``maxlen`` ids. Every target string in ``y`` is handled the
    same way except that it is truncated to ``maxlen - 1`` tokens and only
    ``[SEP]`` is appended. The examples are serialised to
    ``multilanguagebert-{prefix}-{index}.tfrecord`` with the int64 features
    ``input_ids``, ``input_mask``, ``segment_ids`` and ``y``.

    Args:
        x: sequence of source strings.
        y: sequence of target strings, aligned index-by-index with ``x``.
        index: value embedded in the output filename to identify this chunk.
        prefix: split name embedded in the output filename.

    Note:
        Processing stops at the first target whose ids contain 0 (the value
        also used for padding). The offending text and its position are
        printed, and only the examples before it are written.
    """
    input_ids, input_masks, segment_ids, ys = [], [], [], []
    for i in tqdm(range(len(x))):
        tokens_a = tokenizer.tokenize(unidecode(x[i]))[:maxlen - 2]
        tokens_b = tokenizer.tokenize(unidecode(y[i]))[:maxlen - 1]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        target_id = tokenizer.convert_tokens_to_ids(tokens_b + ["[SEP]"])
        if 0 in target_id:
            # A real token mapped to id 0 would be indistinguishable from
            # padding, so report it and stop here.
            print(y[i], i)
            break

        # Pad relative to the full id sequence (including [CLS] and [SEP]) so
        # input_ids has exactly maxlen entries, matching input_mask and
        # segment_ids.
        padding = [0] * (maxlen - len(input_id))

        input_ids.append(input_id + padding)
        input_masks.append([1] * len(input_id) + padding)
        # Single-segment input: the segment id is 0 at every position.
        segment_ids.append([0] * maxlen)
        ys.append(target_id + [0] * (maxlen - len(target_id)))

    filename = f'multilanguagebert-{prefix}-{index}.tfrecord'
    # The context manager closes the writer even if serialisation fails midway.
    with tf.compat.v1.python_io.TFRecordWriter(filename) as writer:
        for i in tqdm(range(len(ys))):
            features = collections.OrderedDict()
            features['input_ids'] = create_int_feature(input_ids[i])
            features['input_mask'] = create_int_feature(input_masks[i])
            features['segment_ids'] = create_int_feature(segment_ids[i])
            features['y'] = create_int_feature(ys[i])
            tf_example = tf.compat.v1.train.Example(
                features = tf.compat.v1.train.Features(feature = features)
            )
            writer.write(tf_example.SerializeToString())

In [6]:
def chunks_multiple(l, n):
    """Yield ``(x, y, i)`` for consecutive slices of ``n`` pairs taken from ``l``.

    ``x`` and ``y`` are tuples holding the first and second elements of the
    pairs in the slice, and ``i`` is the offset in ``l`` where the slice starts.
    The last slice may contain fewer than ``n`` pairs.
    """
    for start in range(0, len(l), n):
        batch = l[start : start + n]
        firsts, seconds = zip(*batch)
        yield (firsts, seconds, start)

In [7]:
import multi

In [8]:
# Split the training pairs into chunks of len(train_X) // 12 examples and pass
# them, together with get_inputs, to multi.multiprocessing (presumably one
# get_inputs call per chunk; see the multi module). Floor division leaves a
# short trailing chunk whenever the size is not a multiple of 12.
# max(1, ...) keeps the chunk size positive when there are fewer than 12
# examples; a size of 0 would make range() inside chunks_multiple raise.
multi.multiprocessing(
    chunks_multiple(list(zip(train_X, train_Y)), max(1, len(train_X) // 12)),
    get_inputs,
)

100%|██████████| 309379/309379 [05:58<00:00, 863.00it/s] 
100%|██████████| 309379/309379 [05:58<00:00, 864.15it/s]]
100%|██████████| 309379/309379 [05:59<00:00, 861.09it/s] 
100%|██████████| 309379/309379 [05:59<00:00, 859.77it/s] 
100%|██████████| 309379/309379 [05:57<00:00, 865.20it/s] 
100%|██████████| 309379/309379 [05:59<00:00, 861.20it/s] 
100%|██████████| 309379/309379 [06:01<00:00, 856.76it/s] 
100%|██████████| 309379/309379 [06:00<00:00, 857.09it/s] 
100%|██████████| 309379/309379 [05:59<00:00, 860.85it/s]
100%|██████████| 309379/309379 [06:02<00:00, 853.49it/s]
100%|██████████| 309379/309379 [06:01<00:00, 856.97it/s]
100%|██████████| 309379/309379 [06:00<00:00, 857.23it/s]
100%|██████████| 309379/309379 [02:19<00:00, 2212.68it/s]
100%|██████████| 7/7 [00:00<00:00, 916.27it/s]502.26it/s]
 99%|█████████▉| 307358/309379 [02:20<00:01, 1523.06it/s]
100%|██████████| 309379/309379 [02:21<00:00, 2181.47it/s]
100%|██████████| 309379/309379 [02:21<00:00, 2180.15it/s]
100%|██████████| 3

In [9]:
def chunks_multiple(l, n, prefix = 'test'):
    """Yield ``(x, y, i, prefix)`` for consecutive slices of ``n`` pairs from ``l``.

    This redefinition replaces the earlier ``chunks_multiple`` in the notebook
    and additionally tags every chunk with a split name.

    Args:
        l: list of ``(x, y)`` pairs.
        n: number of pairs per chunk; the last chunk may be shorter.
        prefix: split name yielded as the fourth tuple element. It defaults to
            ``'test'``, the value that used to be hard-coded.

    Yields:
        ``(x, y, i, prefix)`` where ``x`` and ``y`` are tuples of the first and
        second pair elements and ``i`` is the offset in ``l`` where the chunk
        starts.
    """
    for i in range(0, len(l), n):
        x, y = zip(*l[i : i + n])
        yield (x, y, i, prefix)

In [10]:
# Split the test pairs into chunks of len(test_X) // 12 examples and pass
# them, together with get_inputs, to multi.multiprocessing (presumably one
# get_inputs call per chunk; see the multi module). The chunks carry the
# 'test' label yielded by chunks_multiple.
# max(1, ...) keeps the chunk size positive when there are fewer than 12
# examples; a size of 0 would make range() inside chunks_multiple raise.
multi.multiprocessing(
    chunks_multiple(list(zip(test_X, test_Y)), max(1, len(test_X) // 12)),
    get_inputs,
)

100%|██████████| 8333/8333 [00:10<00:00, 829.28it/s] 
100%|██████████| 8333/8333 [00:10<00:00, 814.80it/s] 
100%|██████████| 8333/8333 [00:10<00:00, 810.86it/s] 
100%|██████████| 8333/8333 [00:10<00:00, 799.50it/s]
100%|██████████| 8333/8333 [00:10<00:00, 798.03it/s]
100%|██████████| 8333/8333 [00:10<00:00, 799.93it/s]
  7%|▋         | 576/8333 [00:00<00:02, 2847.57it/s]
100%|██████████| 8333/8333 [00:10<00:00, 790.79it/s]
100%|██████████| 8333/8333 [00:10<00:00, 791.80it/s]
100%|██████████| 8333/8333 [00:10<00:00, 787.28it/s]
100%|██████████| 8333/8333 [00:10<00:00, 790.94it/s]
100%|██████████| 8333/8333 [00:10<00:00, 784.32it/s]]
100%|██████████| 8333/8333 [00:02<00:00, 2951.74it/s]
100%|██████████| 8333/8333 [00:02<00:00, 3039.81it/s]
100%|██████████| 4/4 [00:00<00:00, 1487.61it/s]5it/s]
100%|██████████| 4/4 [00:00<00:00, 2171.53it/s]7it/s]
100%|██████████| 8333/8333 [00:03<00:00, 2586.56it/s]
 87%|████████▋ | 7246/8333 [00:02<00:00, 2818.05it/s]
100%|██████████| 8333/8333 [00:02<00