In [1]:
import os

# Export the path of the credentials JSON file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this process.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='mesolitica-tpu.json')

In [2]:
from google.cloud import storage
# Storage client built with no explicit arguments, plus a handle to the
# bucket the notebook reads from.
BUCKET_NAME = 'mesolitica-tpu-general'
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)

In [3]:
# Fetch one TFRecord file from the bucket into the working directory.
# `Bucket.get_blob` returns None (it does not raise) when the object is
# missing, so fail with a clear error instead of an AttributeError on None.
blob_name = 'pegasus-data-v2/tfrecord/pegasus-splitted-parliament00.txt.tfrecord'
blob = bucket.get_blob(blob_name)
if blob is None:
    raise FileNotFoundError(
        'blob not found in bucket: {}'.format(blob_name)
    )
blob.download_to_filename('pegasus-splitted-parliament00.txt.tfrecord')

In [4]:
import tensorflow as tf

In [5]:
def _decode_record(record, name_to_features):
    """Parse one serialized ``tf.Example`` and narrow int64 features to int32.

    Args:
        record: The serialized example handed to
            ``tf.io.parse_single_example``.
        name_to_features: Dict mapping feature names to the feature specs
            (e.g. ``tf.io.FixedLenFeature``) used to parse ``record``.

    Returns:
        Dict mapping each feature name to its parsed tensor. Tensors whose
        dtype is ``tf.int64`` are cast to ``tf.int32``; all others are
        returned unchanged.
    """
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so cast all int64 features to int32. `tf.cast` is used because
    # `tf.to_int32` is deprecated.
    return {
        name: tf.cast(t, tf.int32) if t.dtype == tf.int64 else t
        for name, t in example.items()
    }

def input_fn_builder(
    input_files,
    max_seq_length_encoder,
    max_seq_length_decoder,
    max_predictions_per_seq,
    is_training,
    num_cpu_threads = 4,
):
    """Build an ``input_fn(params)`` closure that batches TFRecord examples.

    Args:
        input_files: List of TFRecord file paths to read.
        max_seq_length_encoder: Fixed length of the ``input_ids`` feature.
        max_seq_length_decoder: Fixed length of the ``target_ids`` feature.
        max_predictions_per_seq: Fixed length of the ``masked_lm_positions``,
            ``masked_lm_ids`` and ``masked_lm_weights`` features.
        is_training: When true, the file list is shuffled, files are read
            through a parallel (non-deterministic) interleave and records
            are shuffled; otherwise the files are read sequentially.
        num_cpu_threads: Upper bound on the number of files interleaved at
            once, and the number of batches assembled in parallel.

    Returns:
        A function that takes ``params`` (a dict with a ``'batch_size'``
        entry) and returns a ``tf.data.Dataset`` of decoded, batched
        examples. The dataset repeats indefinitely in both modes.
    """
    def input_fn(params):
        """Create the dataset, reading the batch size from ``params``."""
        batch_size = params['batch_size']

        name_to_features = {
            'input_ids': tf.io.FixedLenFeature([max_seq_length_encoder], tf.int64),
            'target_ids': tf.io.FixedLenFeature(
                [max_seq_length_decoder], tf.int64
            ),
            'masked_lm_positions': tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),
            'masked_lm_ids': tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),
            'masked_lm_weights': tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.float32
            ),
        }
        if is_training:
            # Shuffle at file level first: repeat the file list forever and
            # shuffle it with a buffer covering every file.
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            d = d.repeat()
            d = d.shuffle(buffer_size = len(input_files))

            # Never interleave more files than actually exist.
            cycle_length = min(num_cpu_threads, len(input_files))

            # `tf.data.experimental.parallel_interleave` replaces the
            # deprecated `tf.contrib.data.parallel_interleave` alias.
            # `sloppy` allows non-deterministic record order.
            d = d.apply(
                tf.data.experimental.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy = is_training,
                    cycle_length = cycle_length,
                )
            )
            # Record-level shuffle on top of the file-level shuffle.
            d = d.shuffle(buffer_size = 100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # NOTE: the non-training dataset also repeats without a count,
            # so callers must bound the number of steps themselves.
            d = d.repeat()

        # `tf.data.experimental.map_and_batch` replaces the deprecated
        # `tf.contrib.data.map_and_batch` alias. `drop_remainder = True`
        # keeps every batch at exactly `batch_size` examples.
        d = d.apply(
            tf.data.experimental.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size = batch_size,
                num_parallel_batches = num_cpu_threads,
                drop_remainder = True,
            )
        )
        return d

    return input_fn

In [16]:
# Training-mode input function over the downloaded TFRecord file; keyword
# arguments spell out what each number configures.
input_fn = input_fn_builder(
    ['pegasus-splitted-parliament00.txt.tfrecord'],
    max_seq_length_encoder = 512,
    max_seq_length_decoder = 256,
    max_predictions_per_seq = 0,
    is_training = True,
)

# Use the public TF1-compat helper instead of the private
# `Dataset._make_one_shot_iterator()` method. `dataset` is bound once, to the
# iterator's next-element tensors (not to the Dataset object), and that is
# what later cells pass to `sess.run`.
dataset = tf.compat.v1.data.make_one_shot_iterator(
    input_fn({'batch_size': 1})
).get_next()

In [18]:
# Session that later cells use to evaluate the next-element tensors.
sess = tf.Session()

In [20]:
# Evaluate the next-element tensors once to pull a single batch; later cells
# index the result by feature name (e.g. r['input_ids'][0]).
r = sess.run(dataset)

In [21]:
import tokenization

# Tokenizer built from the 'pegasus.wordpiece' vocab file with lower-casing
# disabled; later cells use it to map ids back to tokens.
tokenizer = tokenization.FullTokenizer(do_lower_case=False, vocab_file='pegasus.wordpiece')




In [28]:
# Map the 'input_ids' of the first example in the batch back to tokens.
tokenizer.convert_ids_to_tokens(r['input_ids'][0])

['JAWAPAN',
 ':',
 'YB',
 'DATO',
 '*',
 'SERI',
 'DR',
 '.',
 'SHAH',
 '##IDA',
 '##N',
 'BIN',
 'KAS',
 '##SI',
 '##M',
 'MENTERI',
 'DI',
 'JABATAN',
 'PERDANA',
 'MENTERI',
 'Tuan',
 'Yang',
 'di',
 '-',
 'Pertua',
 ',',
 'Bilangan',
 'Rumah',
 'Kekal',
 'Baharu',
 '(',
 'R',
 '##KB',
 ')',
 'yang',
 'dim',
 '##ohon',
 'oleh',
 'mangsa',
 'banjir',
 'di',
 'negeri',
 'Kelantan',
 'ialah',
 'sebanyak',
 '1',
 ',',
 '82',
 '##7',
 'unit',
 '.',
 'Dari',
 'jumlah',
 'tersebut',
 ',',
 'Kerajaan',
 'Persekutuan',
 'dipertanggungjawabkan',
 'untuk',
 'membina',
 '96',
 '##6',
 'unit',
 '.',
 'Sehingga',
 '18',
 'Mei',
 '2016',
 ',',
 'Kerajaan',
 'Persekutuan',
 'telah',
 'melaksanakan',
 'proses',
 'penempatan',
 'semula',
 'mangsa',
 'banjir',
 'melalui',
 'program',
 'rumah',
 'berkel',
 '##ompok',
 '(',
 'Integ',
 '##rated',
 'Res',
 '##et',
 '##tle',
 '##ment',
 'Programme',
 ')',
 'di',
 'sepuluh',
 '(',
 '10',
 ')',
 'lokasi',
 'yang',
 'telah',
 'dikenal',
 'pasti',
 '.',
 '[MAS

In [29]:
# Map the 'target_ids' of the first example in the batch back to tokens.
tokenizer.convert_ids_to_tokens(r['target_ids'][0])

['Berikut',
 'adalah',
 'lokasi',
 'penempatan',
 'semula',
 'mangsa',
 'banjir',
 '2014',
 'di',
 'Kelantan',
 ':',
 '-',
 'Ja',
 '##iah',
 '##an',
 'Kuala',
 'Krai',
 'Kampung',
 'Telek',
 '##ong',
 '94',
 'Unit',
 'R',
 '##KB',
 'ii',
 '.',
 'Sehingga',
 '18',
 'Mei',
 '2016',
 ',',
 'Kerajaan',
 'Persekutuan',
 'telah',
 'melaksanakan',
 'proses',
 'penempatan',
 'semula',
 'mangsa',
 'banjir',
 'melalui',
 'program',
 'rumah',
 'berkel',
 '##ompok',
 '(',
 'Integ',
 '##rated',
 'Res',
 '##et',
 '##tle',
 '##ment',
 'Programme',
 ')',
 'di',
 'sepuluh',
 '(',
 '10',
 ')',
 'lokasi',
 'yang',
 'telah',
 'dikenal',
 'pasti',
 '.',
 'Sepuluh',
 '(',
 '10',
 ')',
 'lokasi',
 'tersebut',
 'melibatkan',
 'enam',
 '(',
 '6',
 ')',
 'tapak',
 'berkel',
 '##ompok',
 'di',
 'Jajahan',
 'Kuala',
 'Krai',
 'dan',
 'empat',
 '(',
 '4',
 ')',
 'tapak',
 'berkel',
 '##ompok',
 'di',
 'Jajahan',
 'Gua',
 'Musang',
 '.',
 '[CLS]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]'