In [1]:
import os

# The credentials path must be exported before the storage client is built,
# because the client is created without any explicit credentials argument.
CREDENTIALS_FILE = 'mesolitica-storage.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = CREDENTIALS_FILE

from google.cloud import storage

# `bucket` is the upload target used at the end of the notebook.
BUCKET_NAME = 'mesolitica-public'
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)

In [2]:
from glob import glob
import itertools
from tqdm import tqdm
import json

# glob() yields matches in arbitrary (filesystem-dependent) order; sorting keeps
# the order of the generated pairs, and therefore the batch contents, reproducible.
files = sorted(glob('results-semi*.json'))

In [3]:
import malaya

def cleaning(string):
    """
    Turn a block of text into one line of numbered, cleaned sentences.

    The text is split with ``malaya.text.function.split_into_sentences``
    (presumably returning a list of sentence strings -- confirm against the
    malaya docs). If that yields nothing, the text is split on '.' instead.
    Each sentence is passed through ``transformer_textcleaning`` and prefixed
    with its 1-based position, e.g. ``'1. foo 2. bar'``.

    Parameters
    ----------
    string : str
        Raw text to split and number.

    Returns
    -------
    str
        The numbered sentences joined by single spaces.

    Raises
    ------
    ValueError
        If no non-empty sentence can be extracted from ``string``.
    """
    splitted = malaya.text.function.split_into_sentences(string)
    if not len(splitted):
        # Fallback splitter. This must stay a *list* of sentences: joining it
        # into one string made the code below enumerate single characters.
        splitted = [k.strip() for k in string.split('.') if len(k.strip())]
    if not len(splitted):
        raise ValueError('no sentences found in input text')
    # Drop a leading bullet marker. NOTE: this removes every '- ' occurrence in
    # the first sentence, not only the leading one (kept from the original).
    if splitted[0].startswith('-'):
        splitted[0] = splitted[0].replace('- ', '')
    points = [
        f'{no + 1}. {malaya.text.function.transformer_textcleaning(s)}'
        for no, s in enumerate(splitted)
    ]
    return ' '.join(points)

In [4]:
# Build the parallel lists: `before` holds the numbered points (model input),
# `after` holds the cleaned full text. Entry k of one pairs with entry k of the other.
before, after = [], []
skipped = 0  # rows dropped because cleaning raised

for file in files:
    with open(file) as fopen:
        x = json.load(fopen)
    # Each file holds a list of lists of rows; flatten one level.
    merged = list(itertools.chain.from_iterable(x))
    for row in tqdm(merged):
        try:
            if len(row) != 2:
                continue
            # Compute both sides before appending either one, so a failure in
            # the second call cannot leave `before` one element longer than `after`.
            points = cleaning(row[1] + '.')
            text = malaya.text.function.transformer_textcleaning(row[0])
        except Exception:
            # Best-effort: skip rows that cannot be cleaned, but keep a count and
            # let KeyboardInterrupt / SystemExit propagate (a bare except hid both).
            skipped += 1
            continue
        before.append(points)
        after.append(text)

print(f'skipped {skipped} rows that failed cleaning')

100%|██████████| 35824/35824 [00:37<00:00, 952.03it/s] 
100%|██████████| 35824/35824 [00:44<00:00, 813.62it/s] 
100%|██████████| 35824/35824 [00:44<00:00, 799.95it/s]


In [5]:
# Show how many entries each list holds; the two numbers are expected to match.
pair_counts = (len(before), len(after))
pair_counts

(107471, 107471)

In [6]:
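# One-time setup (uncomment to run): download the karangan RTF file and install the RTF parser used below.
# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/summarization/karangan/1.rtf
# !pip3 install striprtf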

In [7]:
from striprtf.compat.v1.striprtf import rtf_to_text

# Read the whole RTF document into memory.
with open('1.rtf') as rtf_file:
    x = rtf_file.read()

# Convert the RTF to plain text, cut it on the '===' separator and group the
# resulting segments two at a time (segment 0 with 1, 2 with 3, ...).
splitted = rtf_to_text(x).split('===')
results = [splitted[start: start + 2] for start in range(0, len(splitted), 2)]

In [8]:
# Parenthesised annotation tags that are stripped from the RTF text before cleaning.
remove = [
    '(AR5)', '(AR1)', '(AR2)', '(AR3)', '(AR4)',
    '(AA)', '(AH1)', '(AH2)', '(AH3)', '(AC)',
    '(AH4)', '(AP)', '(A Ksmpln)', '(A Cdngn)',
    '(A Pndpt)', '(A Pntp)', '(AJ)', '(AH5)',
]


def cleaning_rtf(string):
    """
    Delete every tag listed in ``remove`` from ``string``, then return the
    result of ``malaya.text.function.transformer_textcleaning`` on what is left.
    """
    untagged = string
    for tag in remove:
        untagged = untagged.replace(tag, '')
    return malaya.text.function.transformer_textcleaning(untagged)

In [9]:
# Add the RTF-derived pairs to the same parallel lists built earlier.
for pair in results:
    # First segment: keep lines longer than two characters and number them from 1.
    outline = [line for line in pair[0].split('\n') if len(line) > 2]
    numbered = ' '.join(f'{pos}. {line}' for pos, line in enumerate(outline, start=1))
    # Second segment: strip the annotation tags and clean the text.
    essay = cleaning_rtf(pair[1])

    # Append only after both values exist, so the lists stay index-aligned.
    before.append(numbered)
    after.append(essay)

In [10]:
# Split the paired lists into consecutive chunks of at most `batch` items.
# Slicing past the end of a list is clamped, so the last chunk may be shorter.
batch = 20000
batches = [
    (before[start: start + batch], after[start: start + batch])
    for start in range(0, len(before), batch)
]

In [11]:
import tensorflow as tf

# Write every batch to its own two-column TSV file and upload it to the bucket.
# Distinct loop names are used on purpose: the original rebound the global
# `before` / `after` lists to a single batch and reused `i` for both loops,
# so re-running the batching cell afterwards silently lost data.
for batch_no, (batch_before, batch_after) in enumerate(batches):
    filename = f'bahasa-generator-{batch_no}.tsv'
    with tf.compat.v1.io.gfile.GFile(filename, 'w') as outfile:
        for source_text, target_text in zip(batch_before, batch_after):
            # A tab or newline inside a field would break the one-row,
            # two-column layout written below, so flatten them to spaces.
            source_text = str(source_text).replace('\t', ' ').replace('\n', ' ')
            target_text = str(target_text).replace('\t', ' ').replace('\n', ' ')
            outfile.write('%s\t%s\n' % (source_text, target_text))

    blob = bucket.blob(f't5-data/{filename}')
    blob.upload_from_filename(filename)