In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.data_generators import translate
from tensor2tensor.utils import registry
from tensor2tensor import problems
import tensorflow as tf
import os
import logging

# Root logger handle (not referenced again in the visible cells).
logger = logging.getLogger()
# Take both the setter and the level constant from the compat.v1 namespace so
# the call does not also depend on the legacy top-level `tf.logging` alias.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)

In [2]:
import sentencepiece as spm

# Path of the SentencePiece model file; the same path is later handed to the
# t5 task registrations as `sentencepiece_model_path`.
vocab = 'sp10m.cased.t5.model'
sp = spm.SentencePieceProcessor()
# Load the model file into the processor (relative path, so it is resolved
# against the current working directory).
sp.Load(vocab)

class Encoder:
    """Adapter exposing a SentencePiece processor through ``encode``/``decode``.

    Attributes:
        sp: the wrapped processor. It must provide ``GetPieceSize``,
            ``EncodeAsIds`` and ``DecodeIds``.
        vocab_size: ``sp.GetPieceSize() + 100``. The 100 extra ids lie above
            the SentencePiece range (presumably the T5 sentinel tokens --
            TODO confirm against the training setup).
    """

    def __init__(self, sp):
        self.sp = sp
        self.vocab_size = sp.GetPieceSize() + 100

    def encode(self, s):
        """Return the piece ids produced by the processor for string ``s``."""
        return self.sp.EncodeAsIds(s)

    def decode(self, ids, strip_extraneous=False):
        """Turn an iterable of ids back into text.

        Ids outside ``[0, sp.GetPieceSize())`` are dropped before decoding.
        ``vocab_size`` advertises 100 ids beyond that range, and the wrapped
        processor has no pieces for them, so passing them through would make
        ``DecodeIds`` fail.

        Args:
            ids: iterable of integer ids (Python ints or numpy integers).
            strip_extraneous: accepted for signature compatibility; unused.

        Returns:
            The decoded string ('' when no id survives the filter).
        """
        piece_count = self.sp.GetPieceSize()
        # int() normalises numpy integer scalars to plain Python ints.
        in_range = [int(i) for i in ids if 0 <= int(i) < piece_count]
        return self.sp.DecodeIds(in_range)
    
# Module-level encoder instance; the decoding cells further down call it.
encoder = Encoder(sp)

In [3]:
from tqdm import tqdm
from glob import glob

@registry.register_problem
class Seq2Seq(text_problems.Text2TextProblem):
    """Text-to-text problem whose two features share one SentencePiece encoder."""

    @property
    def approx_vocab_size(self):
        """Fixed vocabulary size reported for this problem."""
        return 32100

    @property
    def is_generate_per_split(self):
        """Always ``False``."""
        return False

    def feature_encoders(self, data_dir):
        """Map "inputs" and "targets" to one freshly built ``Encoder``.

        ``data_dir`` is accepted but not used; the encoder wraps the
        module-level ``sp`` processor.
        """
        shared = Encoder(sp)
        return dict.fromkeys(("inputs", "targets"), shared)


In [4]:
# Working directories for the run. None of the paths contains '~', so
# expanduser() returns each one unchanged.
(
    DATA_DIR,
    TMP_DIR,
    TRAIN_DIR,
    EXPORT_DIR,
    TRANSLATIONS_DIR,
    EVENT_DIR,
    USR_DIR,
) = map(
    os.path.expanduser,
    (
        't2t2/data',
        't2t2/tmp',
        't2t2/train-base',
        't2t2/export',
        't2t2/translation',
        't2t2/event',
        't2t2/user',
    ),
)

In [5]:
# Problem name used for the lookup below and passed as `problem_name` when
# Model creates its hparams (presumably the registered name of the Seq2Seq
# class -- TODO confirm against the registry's naming rule).
PROBLEM = 'seq2_seq'
t2t_problem = problems.problem(PROBLEM)

In [7]:
import tensorflow as tf
import os

# Checkpoint prefix that the Saver restores into the session after the model
# graph has been built.
ckpt_path = 'base/model.ckpt-300000'
ckpt_path

'base/model.ckpt-300000'

In [8]:
from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.data_generators import translate
from tensor2tensor.utils import registry







In [9]:
class Model:
    """Builds the transformer graph once and exposes its tensors.

    Attributes set by ``__init__``:
        X, Y: int32 placeholders of rank 2 with unspecified sizes
            (assumed to be (batch, length) token ids -- TODO confirm).
        X_seq_len: per-row count of nonzero entries of ``X``.
        features: the feature dict handed to the transformer.
        translate_model: the instantiated 'transformer' model.
        logits: first element of the pair returned by the forward call.
        fast_result: greedy decoding output, graph name 'greedy'.
        beam_result: beam-search decoding output, graph name 'beam'.
    """

    def __init__(self, HPARAMS = "transformer_base", DATA_DIR = 't2t/data'):
        """Create placeholders, hparams, the forward pass and both decoders.

        Args:
            HPARAMS: name of the hparams set passed to ``create_hparams``.
            DATA_DIR: forwarded as ``data_dir`` to ``create_hparams``.
                NOTE(review): this parameter shadows the module-level
                ``DATA_DIR`` ('t2t2/data') and its default is 't2t/data' --
                confirm which directory is intended.
        """
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        
        # Nonzero entries per row of X (this treats id 0 as padding --
        # presumably; confirm). The maximum is used as the decode length for
        # both decoders below.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        maxlen_decode = tf.reduce_max(self.X_seq_len)
        
        # Append two trailing singleton axes, turning the rank-2 ids into rank-4 tensors.
        x = tf.expand_dims(tf.expand_dims(self.X, -1), -1)
        y = tf.expand_dims(tf.expand_dims(self.Y, -1), -1)
        
        features = {
            "inputs": x,
            "targets": y,
            "target_space_id": tf.constant(1, dtype=tf.int32),
        }
        self.features = features
        
        Modes = tf.estimator.ModeKeys
        # Start from the named hparams set for PROBLEM, then override the
        # fields below.
        hparams = trainer_lib.create_hparams(HPARAMS, data_dir=DATA_DIR, problem_name=PROBLEM)
        hparams.filter_size = 3072
        hparams.hidden_size = 768
        hparams.num_heads = 12
        hparams.num_hidden_layers = 8
        hparams.vocab_divisor = 128
        hparams.label_smoothing = 0.0
        hparams.shared_embedding_and_softmax_weights = False
        hparams.dropout = 0.1
        hparams.max_length = 1024
        hparams.multiproblem_mixing_schedule = "pretrain"

        hparams.optimizer = "Adafactor"
        hparams.learning_rate_warmup_steps = 10000
        hparams.learning_rate_schedule = "rsqrt_decay"
        
        # NOTE(review): the model is instantiated in TRAIN mode although the
        # decoding tensors built below are used for inference -- confirm
        # whether dropout (0.1) is meant to stay active there.
        translate_model = registry.model('transformer')(hparams, Modes.TRAIN)
        self.translate_model = translate_model
        logits, _ = translate_model(features)
        self.logits = logits
        
        # Re-enter the current variable scope with reuse=True so the decoding
        # ops reuse the variables created by the forward call above.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            self.fast_result = translate_model._greedy_infer(features, maxlen_decode)["outputs"]
            self.beam_result = translate_model._beam_decode_slow(
                features, maxlen_decode, beam_size=5, 
                top_beams=1, alpha=1.0)["outputs"]
        
        # Give both outputs fixed graph names ('greedy', 'beam').
        self.fast_result = tf.identity(self.fast_result, name = 'greedy')
        self.beam_result = tf.identity(self.beam_result, name = 'beam')
        
# Clear the default graph before building the model in a new interactive session.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
# Only the trainable variables are handed to the Saver, so only those are
# read from the checkpoint.
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, ckpt_path)

Instructions for updating:
reduction_indices is deprecated, use axis instead


Instructions for updating:
reduction_indices is deprecated, use axis instead


INFO:tensorflow:Setting T2TModel mode to 'train'


INFO:tensorflow:Setting T2TModel mode to 'train'


INFO:tensorflow:Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Using variable initializer: uniform_unit_scaling






Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_768.bottom


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_768.bottom


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_768.targets_bottom


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_768.targets_bottom


INFO:tensorflow:Building model body


INFO:tensorflow:Building model body


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






INFO:tensorflow:Transforming body output with symbol_modality_32128_768.top


INFO:tensorflow:Transforming body output with symbol_modality_32128_768.top


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


INFO:tensorflow:Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Using variable initializer: uniform_unit_scaling


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_768.bottom


INFO:tensorflow:Transforming feature 'inputs' with symbol_modality_32128_768.bottom


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_768.targets_bottom


INFO:tensorflow:Transforming feature 'targets' with symbol_modality_32128_768.targets_bottom


INFO:tensorflow:Building model body


INFO:tensorflow:Building model body


INFO:tensorflow:Transforming body output with symbol_modality_32128_768.top


INFO:tensorflow:Transforming body output with symbol_modality_32128_768.top


INFO:tensorflow:Restoring parameters from base/model.ckpt-300000


INFO:tensorflow:Restoring parameters from base/model.ckpt-300000


In [10]:
# saver = tf.train.Saver(tf.trainable_variables())
# saver.save(sess, 'b2b-base/model.ckpt')

'b2b-base/model.ckpt'

In [11]:
#!rm -rf b2b-base

In [12]:
glob('dumping*txt.tsv')

['dumping-watpadd.txt.tsv',
 'dumping-wiki.txt.tsv',
 'dumping-instagram.txt.tsv',
 'dumping-parliament.txt.tsv',
 'dumping-pdf.txt.tsv',
 'dumping-news.txt.tsv',
 'dumping-iium.txt.tsv']

In [13]:
from t5.data import preprocessors as prep
import functools
import t5
import gin

# Load gin bindings from the operative config file.
# NOTE(review): presumably this is what configures the `unsupervised` token
# preprocessor used by the task below -- confirm.
gin.parse_config_file('pretrained_models_base_operative_config.gin')

def dumping_dataset(split, shuffle_files = False):
    """Read 'dumping-wiki.txt.tsv' as a dataset of {'title', 'text'} dicts.

    Both ``split`` and ``shuffle_files`` are accepted but have no effect: the
    same single file is read on every call. Each line is parsed as two
    tab-separated string columns with quote handling disabled.
    """
    del shuffle_files

    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def to_example(*fields):
        # Pair the parsed columns with their feature names.
        return dict(zip(['title', 'text'], fields))

    lines = tf.data.TextLineDataset(['dumping-wiki.txt.tsv'])
    parsed = lines.map(
        parse_line,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_example)

# Remove any existing registration under this name before adding it again
# (presumably so the cell can be re-run -- confirm `remove` tolerates a
# missing name).
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    dataset_fn = dumping_dataset,
    splits = ['train'],
    # Re-key each example: 'targets' is taken from the 'text' column.
    text_preprocessor = functools.partial(
        t5.data.preprocessors.rekey,
        key_map = {'inputs': None, 'targets': 'text'},
    ),
    token_preprocessor = t5.data.preprocessors.unsupervised,
    sentencepiece_model_path = vocab,
    metric_fns = [],
)

nq_task = t5.data.TaskRegistry.get("dumping_dataset")
# Ask for 'train', the only split this task registers. dumping_dataset ignores
# the split value it receives, so the data read is the same.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 768, "targets": 768})

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


In [20]:
def pair_dataset(split, shuffle_files = False):
    """Read every file matching 'dumping*pair.tsv' as a dataset of {'text'} dicts.

    ``split`` and ``shuffle_files`` are accepted but have no effect. The file
    list is globbed at call time. Each line is parsed as two tab-separated
    string columns with quote handling disabled; only the first column is
    kept (under 'text'), because zip() stops at the single key name.
    """
    del shuffle_files

    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def to_example(*fields):
        # Only the first parsed column gets a name; the rest is dropped.
        return dict(zip(['text'], fields))

    pair_files = glob('dumping*pair.tsv')
    lines = tf.data.TextLineDataset(pair_files)
    parsed = lines.map(
        parse_line,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_example)


# Remove any existing registration under this name before adding it again.
t5.data.TaskRegistry.remove('pair_dataset')
t5.data.TaskRegistry.add(
    'pair_dataset',
    dataset_fn = pair_dataset,
    splits = ['train'],
    text_preprocessor = [prep.next_sentence_prediction],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

# NOTE(review): this rebinds `nq_task` and `ds`, the same names the
# 'dumping_dataset' cell assigns. In a top-to-bottom run the cells that follow
# would therefore iterate over the pair dataset -- confirm that is intended.
nq_task = t5.data.TaskRegistry.get("pair_dataset")
# Ask for 'train', the only split this task registers. pair_dataset ignores
# the split value it receives, so the data read is the same.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 768, "targets": 768})

      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.


      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.


      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.


In [14]:
import tensorflow_datasets as tfds
import numpy as np

# Later cells call next(iterator), so make sure this really is an iterator:
# iter() returns a generator unchanged and turns a plain iterable into one.
iterator = iter(tfds.as_numpy(ds))

In [15]:
# Pull one example from the dataset iterator.
r = next(iterator)
# Decode the target ids, keeping only ids below 32000 (presumably ids at or
# above that value are sentinel tokens outside the SentencePiece range --
# TODO confirm).
encoder.decode([i for i in r['targets'].tolist() if i < 32000])

'tentera daripada kekuasaannya dan menyebabkan Khalaf terpaksa mengalahkan.amir.M pertempuran sehingga kekuasaan Khalf tegak kembali berakhirf. www.iranchamber.com padarieilung Verkehrswesen 7 Perang Penggangkut) sebagai Jerman semasa Perang Dunia pengendali A708/15 7 5. hadapan. Ia tidak jelas berapa banyak hanya kerangka mm di bukannya kepingan perisai Ini menawarkan perlindungan perisai kereta kebal menghasilkan 100 kuasa kuda ( km (132 gelen. Ia seperlahan kereta dari bererti kawasan. Bagaimanapun di( keupayaan diu 24 April 1918 apabila sengaja kebal dan satu kereta kebal Jantan dengan meriam enam kebal hadapan, 2nd Lt mesingan berundur rosak terkena kereta kebalnya dan memusnahkannya. Dia sebanyak tiga kali, dan membunuh lima krew untuk Dua A7 L L pasukan dua terbalik kedalam lubang, sesetengah mengalami ke dapat digunakan dan dilupuskan A'

In [16]:
# Evaluate the logits for this one example; inputs and targets are each
# wrapped in a list, which adds a leading axis of size 1.
f = sess.run(model.logits, feed_dict = {model.X: [r['inputs']], model.Y: [r['targets']]})

In [17]:
# Arg-max over the last axis of the first element's logits, then decode the
# predicted ids that are below 32000.
encoder.decode([i for i in np.argmax(f[0,:,0,0], axis = -1).tolist() if i < 32000])

'dari kerajaan keatas denganaf terpaksa membalas."mir. 9 Pertempuran Khal Abu Khalf berakhirf Dinasti(.tsjmads Termasuk pada tahunlausseklehr"" Keubahsuaigal) sebagai Perang Perang Dunia belas dan A7,,h 8. hadapan yang A banyak bahawa gun mm, tidak perisai perisai, Senjata adalah kepingan  kuasa kW tembakan (3 km (atau tan m pada Tiada boleht keretanya, parit. Ia kereta( dan dengan hadapan pertembungan Mac 1917, sengaja kebal infantri kebal kebal, kebal A mendahului sungguhpunring. kebal... kebal palang dengan lebih lebih menolak lebih lebih.t. kebal kereta. medan kelihatannya. lebih'

In [18]:
np.argmax(f[0,:,0,0], axis = -1)

array([32099,    37, 32098, 32098,    72, 32097, 14018,    22, 32097,
          13,   492,  1060,  6390, 32097,     3, 32096,     6,  6160,
           3, 32095,   482, 32094,  4141, 12529,  1048, 12529, 32093,
         492, 32092, 32092, 32092,  1902, 32091,   492, 32090,  4869,
           4,     3,  2268,   963,  4407,    25, 16216, 32090,    23,
          31,  4938, 32088,  8822, 32087, 32087, 15562, 32087, 32087,
        5253, 32087,     6,     6, 32086,   309, 26823,  4832, 32085,
           5, 32084,    41, 32083,  1000, 32082,  1000,   653, 32082,
        2961,    16,   128,   376, 32080,    14,    14,   211, 32079,
       32079,   410,     3, 32078,   942, 32077,    17,   128, 32076,
       32076, 32076,   127, 32076,    90, 32075, 13654,  4791,    14,
       32074,    29,  6925,  6925,    14, 10922,    35, 32072, 32072,
       13085, 32071, 32071, 32071,    15,   284, 22331,  4733,    15,
           4,   205,  1468, 32069,    15,     4,  3096,  2298,   856,
       32068,    23,

In [19]:
r['targets']

array([32099,   460,    47, 32098,  4806,    26,    16,   370, 12529,
          13,   492,  1060,  4087, 32097,     3, 32096,    13,  6160,
           3, 32095,   238, 32094,  2815,   135,  4806, 12529, 32093,
         492, 16580,   448, 32092,  1902, 32091,   492,     3,    15,
       28759,     3,  5691, 21240,   839,     3,  1065, 32090,    23,
       32089,  6691, 32088,   752,  1086,  3376, 18098,  3923,  3814,
        1840, 32087,   362,  1000, 32086,  1639,  1255, 17040, 32085,
           5, 32084,    41, 32083,   426,   251,  1000,   653, 32082,
        4331, 32081,   128,   376, 32080,  3052,   214,   871,   362,
       32079,   241,     3, 32078,   942, 32077,     3,   132,    29,
         740,  3620,   127, 32076,   121,  5553, 32075,  4791,    18,
       32074,  2282, 13085,  6925, 32073,   297,  1342,  1745, 32072,
        6925,   542,  2506, 32071,   803,  1026,   284,  3971,    15,
           4, 32070,  1468, 32069,    15,     4,  1009,   143, 20578,
       32068,     3,

In [None]:
# NOTE(review): `n` is not defined in any visible cell, so this cell fails on
# a fresh top-to-bottom run -- confirm where it came from or remove the cell.
r = sess.run(n)
r[0]['inputs'][:,:,0,0].shape

In [None]:
r[0]['inputs'][:,:,0,0][-3]

In [None]:
r[0]['targets'][:10,:,0,0]

In [None]:
# NOTE(review): in the visible cells `hparams` exists only as a local inside
# Model.__init__; no module-level `hparams` is defined -- confirm.
hparams.data_dir

In [None]:
r['targets'], np.argmax(f[0,:,0,0], axis = -1)

In [None]:
encoder.decode(np.argmax(f[0,:,0,0], axis = -1).tolist())

In [None]:
import re
from unidecode import unidecode

def cleaning(string):
    """Transliterate ``string`` with unidecode and tidy its spaces.

    Runs of the space character are collapsed to a single space (other
    whitespace inside the text, such as newlines, is left alone), then leading
    and trailing whitespace is stripped.
    """
    transliterated = unidecode(string)
    single_spaced = re.sub(r'[ ]+', ' ', transliterated)
    return single_spaced.strip()

In [None]:
string = """
SHAH ALAM - Langkah Tun Dr Mahathir Mohamad yang mahu menubuhkan parti baharu sekali lagi dilihat akan memecahbelahkan orang Melayu, kata bekas pegawai khas Datuk Seri Najib Tun Razak, Isham Jalil.

Dakwa beliau, orang Melayu akan menjadi semakin lemah akibat tindakan bekas Perdana Menteri itu.

"Lebih dua puluh tahun dahulu, Dr Mahathir telah memecahbelahkan orang-orang Melayu dengan penubuhan Parti Keadilan Rakyat (PKR) oleh Datuk Seri Anwar Ibrahim akibat pergaduhan Dr Mahathir dengan beliau.

"Lima tahun lalu juga, Dr Mahathir telah memecahbelahkan orang-orang Melayu melalui penubuhan Bersatu untuk menjatuhkan Najib kerana tidak membantu Mukhriz dalam pemilihan UMNO serta tidak membantu kroni-kroni beliau untuk menyambung kontrak dan konsesi kerajaan yang diperoleh selama berpuluh tahun sebelum itu.

"Ini tidak akan berhenti sehinggalah Mukhriz menjadi Perdana Menteri dan kepentingan peribadi serta kroni-kroninya dipenuhi," katanya dalam satu kenyataan hari ini.

Justeru katanya, Ahli Parlimen Langkawi itu layak dilabelkan sebagai 'Bapa Pemecahbelah Orang-orang Melayu'.

"Parti baharu Dr Mahathir ini dikhabarkan yang terdiri daripada serpihan ahli-ahli Bersatu dalam kalangan orang-orang Melayu.

"Sebab jika ada banyak parti Melayu, orang-orang Melayu berkemungkinan besar akan lebih bertelagah dan berpecah belah," katanya.

Isham yang juga Ketua Penerangan Barisan Nasional (BN) Selangor berkata, musuh Melayu dan DAP kini paling beruntung dan bersorak dengan tindakan Dr Mahathir ini.

"Jika masih ada lagi orang-orang Melayu yang masih tidak sedar dan masih mengikut Dr Mahathir, saya tidak tahu nak kata apa.

"Kepada Dr Mahathir dan pengikut-pengikutnya ini, daripada mereka menjadi musuh dalam selimut untuk orang-orang Melayu, mungkin lebih baik mereka terus masuk DAP secara terang-terangan seperti segelintir Melayu DAP yang lain.

"Dengan ini mereka tidak perlu menyorok atau berselindung lagi tentang perjuangan sebenar mereka," katanya.
"""

In [None]:
# Prefix the cleaned article with 'tajuk: ' and append id 1 to the encoded
# ids (presumably the end-of-sequence id -- TODO confirm).
encoded = encoder.encode(f'tajuk: {cleaning(string)}') + [1]
# Greedy decoding; only model.X is fed.
f = sess.run(model.fast_result, feed_dict = {model.X: [encoded]})
encoder.decode(f[0].tolist())

In [None]:
# Prefix the cleaned article with 'ringkasan: ' and append id 1 to the encoded
# ids (presumably the end-of-sequence id -- TODO confirm).
encoded = encoder.encode(f'ringkasan: {cleaning(string)}') + [1]
# Greedy decoding; only model.X is fed.
f = sess.run(model.fast_result, feed_dict = {model.X: [encoded]})
encoder.decode(f[0].tolist())

In [None]:
# Plain string literal: the text has no placeholders, so no f-prefix is
# needed. Id 1 is appended to the encoded ids (presumably the end-of-sequence
# id -- TODO confirm).
encoded = encoder.encode('soalan: siapakah perdana menteri malaysia?') + [1]
# Greedy decoding; only model.X is fed.
f = sess.run(model.fast_result, feed_dict = {model.X: [encoded]})
encoder.decode(f[0].tolist())