In [12]:
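# One-time download of the train_X / train_Y files opened further down.
# Uncomment and run on first use; left commented so re-runs do not re-download.
# !wget https://f000.backblazeb2.com/file/malay-dataset/knowledge-graph/kelm/train_X
# !wget https://f000.backblazeb2.com/file/malay-dataset/knowledge-graph/kelm/train_Y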

In [11]:
import os

# Set the credentials-file path for the whole process. The path is relative to
# the working directory. Presumably this is what storage.Client() further down
# authenticates with — verify against the deployment environment.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'

In [2]:
from tqdm import tqdm
import re

def cleaning(string):
    """Flatten a string onto one space-normalized line.

    Newlines and tabs are turned into spaces, every run of spaces is squeezed
    to a single space, and surrounding whitespace is stripped. Other
    whitespace characters inside the text (e.g. carriage returns) are kept.

    Args:
        string: text to normalize.

    Returns:
        The normalized text; may be empty.
    """
    for separator in ('\n', '\t'):
        string = string.replace(separator, ' ')
    squeezed = re.sub(r'[ ]+', ' ', string)
    return squeezed.strip()

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Parse the gin operative config (path relative to the working directory).
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# SentencePiece model file; the same path is later handed to the T5 task
# registration as `sentencepiece_model_path`.
vocab = 'sp10m.cased.ms-en.model'
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so the value returned by Load() is displayed.
sp.Load(vocab)

True

In [4]:
# Read both files fully into memory, one list entry per '\n'-separated line.
# A file ending in a newline therefore produces a final empty-string entry.
# Entries are paired by index (train_X[i] with train_Y[i]) when the TSV is written.
with open('train_X') as source_file, open('train_Y') as target_file:
    train_X = source_file.read().split('\n')
    train_Y = target_file.read().split('\n')

In [5]:
# Write one "<input>\t<target>" row per index-aligned pair of lines.
# cleaning() replaces tabs and newlines inside each field with spaces, so a
# field cannot break the two-column, tab-delimited layout of the file.
with tf.compat.v1.io.gfile.GFile('knowledge-graph.tsv', "w") as outfile:
    for i in tqdm(range(len(train_X))):
        source_text = cleaning(train_X[i])
        target_text = cleaning(train_Y[i])
        # Skip pairs where either side is empty after cleaning. The previous
        # guard tested the lengths of the whole lists (always truthy), so blank
        # lines -- e.g. the trailing '' produced by split('\n') -- were written
        # as bare "\t" rows.
        if source_text and target_text:
            outfile.write("%s\t%s\n" % (source_text, target_text))

100%|██████████| 6032873/6032873 [02:26<00:00, 41280.67it/s]


In [6]:
def knowledge_graph_dataset(split, shuffle_files = False):
    """Build a dataset of question/answer string pairs from the local TSV.

    Reads 'knowledge-graph.tsv' line by line and parses every line as two
    tab-separated string fields with quote handling disabled.

    Args:
        split: accepted for the dataset-function signature; not used here.
        shuffle_files: accepted for the dataset-function signature; discarded.

    Returns:
        A dataset whose elements are dicts with the keys 'question' and
        'answer'.
    """
    del shuffle_files

    parse_row = functools.partial(
        tf.compat.v1.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def to_example(*fields):
        # Name the two parsed columns.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.compat.v1.data.TextLineDataset(['knowledge-graph.tsv'])
    rows = lines.map(
        parse_row,
        num_parallel_calls = tf.compat.v1.data.experimental.AUTOTUNE,
    )
    return rows.map(to_example)

def knowledge_graph_preprocessor(ds):
    """Map question/answer examples to the 'inputs'/'targets' text format.

    Args:
        ds: dataset of dicts carrying 'question' and 'answer' entries.

    Returns:
        The mapped dataset, where 'inputs' is the question prefixed with
        'grafik pengetahuan: ' and 'targets' is the answer unchanged.
    """
    def to_inputs_and_targets(ex):
        prefixed_question = tf.compat.v1.strings.join(
            ['grafik pengetahuan: ', ex['question']]
        )
        return {'inputs': prefixed_question, 'targets': ex['answer']}

    autotune = tf.compat.v1.data.experimental.AUTOTUNE
    return ds.map(to_inputs_and_targets, num_parallel_calls = autotune)

In [7]:
# Drop any earlier registration under this name before adding it again, so the
# cell can be re-executed in the same kernel.
# NOTE(review): assumes remove() tolerates a name that is not registered yet
# (fresh kernel) — confirm for the installed t5 version.
t5.data.TaskRegistry.remove('knowledge_graph_dataset')
t5.data.TaskRegistry.add(
    'knowledge_graph_dataset',
    # Dataset function and text preprocessor defined in the previous cell.
    dataset_fn = knowledge_graph_dataset,
    # Only a 'train' split is declared; the dataset function ignores its
    # `split` argument and always reads the same TSV.
    splits = ['train'],
    text_preprocessor = [knowledge_graph_preprocessor],
    # Same SentencePiece model path that was loaded into `sp` above.
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


In [8]:
# Build the registered task's dataset for a quick manual inspection.
nq_task = t5.data.TaskRegistry.get("knowledge_graph_dataset")
# Ask for the split the task was registered with ('train'). The TSV filename
# that used to be passed here is not a registered split; it only worked
# because knowledge_graph_dataset ignores its `split` argument.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
# Wrap in iter() so the following `next(r)` works whether as_numpy() returns a
# generator or a plain iterable (iter() on a generator returns it unchanged).
r = iter(tfds.as_numpy(ds))

In [9]:
# Pull one example to check the prefixed input text, the target text and the
# token-id arrays produced for both.
next(r)

{'inputs_plaintext': b'grafik pengetahuan: Perkahwinan Figaro adalah episod Mad Men dari musim 1 dan diikuti oleh New Amsterdam.',
 'inputs': array([12333,  5836,    31,   881,  2158,  3686,    47,    13,  2397,
         1783,   162,    52,  5319,  3940,  1181,    42,   451,   179,
           22,  5281,    60,   141, 14108,     3,     1]),
 'targets_plaintext': b'Marriage of Figaro ( Mad Men ) season Mad Men ( season 1 ), part of the series Mad Men, followed by New Amsterdam ( Mad Men ).',
 'targets': array([ 1366,   562,  1174,    18,    13,  2397,  1783,   162,    13,
            4,  3940,  1181,    13,     5,  1015,  3940,  1181,    13,
            4,  1015,   179,    13,     5,    14,   523,    18,    15,
         1463,  3940,  1181,    14,  3093,    51,   141, 14108,    13,
            4,  3940,  1181,    13,     5,     3,     1])}

In [13]:
from google.cloud import storage
# The client is created without explicit arguments; presumably it picks up the
# credentials file set through GOOGLE_APPLICATION_CREDENTIALS earlier — verify.
client = storage.Client()
# Handle to the bucket that receives the generated TSV in the next cell.
bucket = client.bucket('mesolitica-tpu-general')

In [None]:
# Upload the generated TSV to the bucket as 't5-data/knowledge-graph-train.tsv'.
blob = bucket.blob('t5-data/knowledge-graph-train.tsv')
blob.upload_from_filename('knowledge-graph.tsv')
# Delete the local copy once the upload call has returned (an exception from
# the upload skips this line). knowledge_graph_dataset reads this same local
# path, so the file must be regenerated before the task is read again.
os.remove('knowledge-graph.tsv')