In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/summarization/cnn-news/translated-cnn-test.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/summarization/dailymail/translated-dailymail-test.json

In [2]:
import os

# Restrict this process to the GPU with index 2. This is set before the cell
# that imports TensorFlow so the restriction is in place when it initialises.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
import sentencepiece as spm
# SentencePiece vocabulary used later to turn reference summaries into ids.
SP_MODEL_PATH = 'prepare/sp10m.cased.ms-en.model'

sp_model = spm.SentencePieceProcessor()
# Kept as the last expression so the cell displays the value returned by Load.
sp_model.Load(SP_MODEL_PATH)

True

In [4]:
import tensorflow as tf
import tensorflow_text
import struct

# Four 0xFF bytes: the little-endian int32 encoding of -1. Written over the
# first dimension of constant Reshape shapes in `load_graph`.
unknown = b'\xff\xff\xff\xff'

def load_graph(frozen_graph_filename):
    """
    Read a serialized GraphDef, patch some of its nodes and import it.

    Patches applied to every node before import:

    * ``RefSwitch`` becomes ``Switch`` and any input whose name contains
      ``moving_`` gets ``/read`` appended.
    * ``AssignSub`` / ``AssignAdd`` / ``Assign`` become ``Sub`` / ``Add`` /
      ``Identity``; the attributes ``use_locking`` (and ``validate_shape``
      for ``Assign``) are dropped, and a two-input ``Assign`` keeps only
      its second input.
    * For nodes whose name contains ``Reshape/shape`` or ``Reshape_1/shape``
      the first dimension of the constant value is overwritten with -1,
      both in ``tensor_content`` and in ``int_val``.

    Parameters
    ----------
    frozen_graph_filename : str
        Path of the frozen ``.pb`` file.

    Returns
    -------
    tf.Graph
        A new graph containing the patched definition. ``import_graph_def``
        is called without a ``name``, so callers look tensors up under the
        ``import/`` prefix.
    """
    with tf.compat.v1.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())

    for node in graph_def.node:

        if node.op == 'RefSwitch':
            node.op = 'Switch'
            # `range`, not `xrange`: the latter does not exist on Python 3.
            for index in range(len(node.input)):
                if 'moving_' in node.input[index]:
                    node.input[index] = node.input[index] + '/read'
        elif node.op == 'AssignSub':
            node.op = 'Sub'
            if 'use_locking' in node.attr: del node.attr['use_locking']
        elif node.op == 'AssignAdd':
            node.op = 'Add'
            if 'use_locking' in node.attr: del node.attr['use_locking']
        elif node.op == 'Assign':
            node.op = 'Identity'
            if 'use_locking' in node.attr: del node.attr['use_locking']
            if 'validate_shape' in node.attr: del node.attr['validate_shape']
            if len(node.input) == 2:
                node.input[0] = node.input[1]
                del node.input[1]

        if 'Reshape/shape' in node.name or 'Reshape_1/shape' in node.name:
            b = node.attr['value'].tensor.tensor_content
            if len(b):
                # Overwrite only the first 4 bytes and keep the rest verbatim.
                # Decoding the remaining dims as unsigned and re-packing them
                # as signed int32 fails on any negative dim (e.g. another -1).
                node.attr['value'].tensor.tensor_content = unknown + b[4:]

            if len(node.attr['value'].tensor.int_val):
                node.attr['value'].tensor.int_val[0] = -1

    with tf.compat.v1.Graph().as_default() as graph:
        tf.compat.v1.import_graph_def(graph_def)
    return graph

In [5]:
# Location of the frozen model and the names of the tensors used for inference.
# The 'import/' prefix matches load_graph, which imports the GraphDef without
# passing an explicit name.
FROZEN_MODEL_PATH = 'base/frozen_model.pb'
INPUT_TENSOR_NAME = 'import/inputs:0'
OUTPUT_TENSOR_NAME = 'import/SelectV2_3:0'

g = load_graph(FROZEN_MODEL_PATH)
x = g.get_tensor_by_name(INPUT_TENSOR_NAME)
logits = g.get_tensor_by_name(OUTPUT_TENSOR_NAME)
# The session stays open on purpose: the inference cell below runs against it.
test_sess = tf.compat.v1.InteractiveSession(graph = g)

In [6]:
import json

# Concatenate the records of both test files into one list, CNN first.
# NOTE(review): these are absolute, machine-specific paths; the file names
# match the ones in the commented-out wget lines of the first cell.
data = []
for json_path in (
    '/home/husein/malaya/malaya/pretrained-model/pegasus/translated-cnn-test.json',
    '/home/husein/malaya/malaya/pretrained-model/pegasus/translated-dailymail-test.json',
):
    with open(json_path) as fopen:
        data.extend(json.load(fopen))

len(data)

12000

In [7]:
# Join the pieces stored under each key with single spaces:
# X holds the articles (model input), Y the abstracts (references).
# Presumably each field is a list of sentences - verify against the JSON.
X = [' '.join(record['ms_article']) for record in data]
Y = [' '.join(record['ms_abstract']) for record in data]

In [8]:
import re
from unidecode import unidecode
from malaya.text.rules import normalized_chars

def filter_news(string):
    """
    Return True when the lower-cased text contains any of the marker phrases
    listed below, False otherwise.
    """
    markers = (
        'javascript is disabled',
        'requires javascript',
        'javascript',
        'président',
    )
    lowered = string.lower()
    return any(marker in lowered for marker in markers)

def make_cleaning(s, c_dict):
    """
    Apply the translation table `c_dict` to `s` via ``str.translate`` and
    return the resulting string.
    """
    return s.translate(c_dict)

def transformer_textcleaning(string):
    """
    Clean a text before it is tokenized for a transformer model.

    Steps, in order:

    1. Transliterate with ``unidecode``.
    2. Run every whitespace-separated word through ``make_cleaning`` with
       ``normalized_chars``.
    3. Replace the literal text ``(dot)`` with ``.``.
    4. If the first ``<a ...>`` tag contains ``href``, remove that tag's
       attribute text (everywhere it occurs), leaving a bare ``<a>``.
    5. Replace URLs of the form ``scheme://host/...`` with a space.
    6. Turn newlines into spaces, collapse runs of spaces and drop every
       word that starts with ``@``.

    Parameters
    ----------
    string : str

    Returns
    -------
    str
        The cleaned text with words separated by single spaces.
    """
    string = unidecode(string)
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub(r'\(dot\)', '.', string)

    anchor_attrs = re.findall(r'\<a (.*?)\>', string)
    if anchor_attrs and 'href' in anchor_attrs[0]:
        # Remove the attributes as literal text. Feeding them to re.sub as a
        # pattern breaks on URL characters such as '?', '+' or '(', and taking
        # them from a looser '<a(.*?)>' match could hit an '<abbr ...>' tag.
        # The leading space is included so the tag is left as '<a>'.
        string = string.replace(' ' + anchor_attrs[0], '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = string.replace('\n', ' ')
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    return ' '.join(string)

In [9]:
from tqdm import tqdm

# Run the model over all articles in fixed-size batches and collect the
# predicted id sequences as plain Python lists.
batch_size = 10
results = []
for i in tqdm(range(0, len(X), batch_size)):
    batch_x = X[i: i + batch_size]
    # Every input is cleaned and prefixed with the 'ringkasan: ' task prompt.
    batches = [f'ringkasan: {transformer_textcleaning(b)}' for b in batch_x]
    # Use a dedicated name here: `g` is the graph handle created when the
    # frozen model was loaded and must not be rebound to an output array.
    batch_logits = test_sess.run(logits, feed_dict = {x:batches})
    results.extend(batch_logits.tolist())

100%|██████████| 1200/1200 [1:11:10<00:00,  3.56s/it]


In [10]:
from tensor2tensor.utils import rouge
from tensorflow.keras.preprocessing import sequence

In [11]:
import numpy as np

In [12]:
def _reference_prediction_pairs(predicted, batch_y):
    """
    Encode the references and pair each one with its prediction.

    The references are cleaned with ``transformer_textcleaning`` and encoded
    with ``sp_model``; references and predictions are then zero-padded at the
    end to a common length. For row ``i`` both sequences are cut to the first
    ``n`` positions, where ``n`` is the number of non-zero ids in the padded
    reference. A shorter prediction therefore keeps its padding zeros and a
    longer one is truncated.
    NOTE(review): this affects the reported scores - confirm it is intended.

    Returns a list of ``(reference_ids, predicted_ids)`` array pairs.
    """
    references = [sp_model.EncodeAsIds(transformer_textcleaning(row)) for row in batch_y]
    maxlen = max(max(len(row) for row in references), max(len(row) for row in predicted))
    references = sequence.pad_sequences(references, padding = 'post', maxlen = maxlen)
    predicted = sequence.pad_sequences(predicted, padding = 'post', maxlen = maxlen)

    lengths = np.count_nonzero(references, axis = 1)
    # Slicing the first n positions is the same as indexing with a boolean
    # prefix mask, and avoids the `np.bool` alias that newer NumPy removed.
    return [(references[i][:n], predicted[i][:n]) for i, n in enumerate(lengths)]

def calculate_rouges(predicted, batch_y, n_size = 2):
    """
    Mean of ``rouge.rouge_n`` over all (reference, prediction) pairs.

    Parameters
    ----------
    predicted : list of list of int
        Predicted id sequences, one per reference.
    batch_y : list of str
        Reference summaries as raw text.
    n_size : int, optional (default=2)
        N-gram size passed to ``rouge.rouge_n``.

    Returns
    -------
    The ``np.mean`` of the per-row scores.
    """
    rouges = [
        rouge.rouge_n([a], [p], n = n_size)
        for a, p in _reference_prediction_pairs(predicted, batch_y)
    ]
    return np.mean(rouges)

def calculate_rouge_l(predicted, batch_y):
    """
    Mean of ``rouge.rouge_l_sentence_level`` over all (reference, prediction)
    pairs.

    Parameters
    ----------
    predicted : list of list of int
        Predicted id sequences, one per reference.
    batch_y : list of str
        Reference summaries as raw text.

    Returns
    -------
    The ``np.mean`` of the per-row scores.
    """
    rouges = [
        rouge.rouge_l_sentence_level([a], [p])
        for a, p in _reference_prediction_pairs(predicted, batch_y)
    ]
    return np.mean(rouges)

In [13]:
# ROUGE-N with n = 1 (unigrams) over every prediction and its reference.
calculate_rouges(results, Y[:len(results)], n_size = 1)

0.3717403

In [14]:
# ROUGE-N with n = 2 (bigrams) over every prediction and its reference.
calculate_rouges(results, Y[:len(results)], n_size = 2)

0.18471429

In [15]:
# Sentence-level ROUGE-L over every prediction and its reference.
calculate_rouge_l(results, Y[:len(results)])

0.2582724