In [1]:
# !pip3 install git+https://github.com/huseinzol05/malaya.git@4.6.1 --no-deps

In [2]:
import os
# Path to the service-account key file, relative to the notebook's working
# directory. Presumably picked up by the google-cloud client created further
# down (storage.Client()) -- confirm the file exists before running the uploads.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'

In [3]:
import malaya
from malaya.preprocessing import Tokenizer
from malaya.text.function import case_of
from malaya.augmentation import (
    replace_similar_consonants, 
    replace_similar_vowels, 
    socialmedia_form,
    vowel_alternate)
from malaya.text import rules
from collections import defaultdict
import random
import re
import tensorflow as tf
from malaya.text.tatabahasa import alphabet, consonants, vowels
from malaya.text.function import augmentation_textcleaning, simple_textcleaning

def cleaning_row(string):
    """
    Flatten a piece of text so it fits in a single TSV cell.

    Newlines and tabs become spaces, runs of spaces are collapsed into one,
    and surrounding whitespace is stripped.

    Parameters
    ----------
    string: str

    Returns
    -------
    result: str
    """
    # newline / tab would break the one-row-per-line, tab-separated layout
    flattened = string.translate(str.maketrans({'\n': ' ', '\t': ' '}))
    collapsed = re.sub(r'[ ]+', ' ', flattened)
    return collapsed.strip()

In [4]:
# Reverse index over rules.rules_normalizer: value -> list of keys that map to it.
# Only values without a space are indexed, so every entry stays a
# single-token replacement.
replace_normalizer = defaultdict(list)
for shortform, normalized in rules.rules_normalizer.items():
    if ' ' not in normalized:
        replace_normalizer[normalized].append(shortform)

In [5]:
def socialmedia_form(word: str):
    """
    Augment a word into its possible social media spellings.

    This definition rebinds the `socialmedia_form` name imported from
    `malaya.augmentation` earlier in the notebook.

    Parameters
    ----------
    word: str
        cleaned with `simple_textcleaning` before any rule is applied.

    Returns
    -------
    result: List[str]
        unique candidate spellings in sorted order; empty when no rule matches.

    Raises
    ------
    ValueError
        if nothing is left of `word` after cleaning.
    """

    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    results = []

    if len(word) > 1:

        # consonant + trailing 'a' -> trailing 'e'
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        # f...r -> p...r
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        # consonant + trailing vowel -> append 'k'
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        # vowel + trailing 'h' -> drop the 'h'
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        # consonant + trailing 'ar' -> 'o'
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        # leading 'h' + vowel + consonant -> drop the 'h'
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        # consonant + trailing 'ng' -> 'g'
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        # 'ng' right after the first letter -> drop the 'n'.
        # This line used to slice an undefined name and raised NameError
        # for every word that reached it.
        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    # sorted so the candidate order does not depend on string hash randomisation
    return sorted(set(results))

socialmedia_form('juga')

['jugak', 'juge']

In [6]:
def random_slide(string, min_n = 2):
    """
    Return a random contiguous window of whitespace-separated tokens.

    Parameters
    ----------
    string: str
    min_n: int, optional (default=2)
        minimum number of tokens in the returned window.

    Returns
    -------
    result: str
        the selected tokens joined by single spaces. When `string` holds fewer
        than `min_n` tokens there is nothing to slide over, so all of its
        tokens are returned (an empty string for blank input).
    """
    splitted = string.split()
    # random.randint raises ValueError on an empty range, which is what happens
    # when the text is shorter than the minimum window; return it whole instead.
    if len(splitted) < min_n:
        return ' '.join(splitted)
    n = random.randint(min_n, len(splitted))
    i = random.randint(0, len(splitted) - n)
    return ' '.join(splitted[i: i + n])

random_slide('Husein makan ayam di kampung Jawa juga')

'Husein makan ayam di kampung Jawa'

In [7]:
'word'.split('-')

['word']

In [8]:
# Module-level tokenizer callable, built with the `duration` and `date`
# options switched off.
tokenizer = Tokenizer(duration = False, date = False).tokenize

def augment(string):
    """
    Inject synthetic spelling noise into a sentence, token by token.

    Title-case / upper-case tokens are only ever swapped for a known
    shortform from `replace_normalizer`. Every other token is split at its
    first hyphen: the head receives a social media form (or, failing that, a
    known shortform) followed by the vowel / consonant perturbations imported
    from `malaya.augmentation`, and the part after the hyphen is re-attached
    untouched. If anything raises while a token is being augmented, that
    token is emitted exactly as it was tokenized.

    Parameters
    ----------
    string: str

    Returns
    -------
    result: str
        the processed tokens joined by single spaces.
    """

    r = []
    for word in tokenizer(string):
        original_word = word
        word_lower = word.lower()
        try:
            if word.istitle() or word.isupper():
                # .get() instead of indexing: indexing the defaultdict inserts an
                # empty list for every unknown word, and those phantom keys would
                # later pass the membership test and break random.choice.
                shortforms = replace_normalizer.get(word_lower)
                if shortforms and random.random() >= 0.3:
                    word = case_of(word)(random.choice(shortforms))
            else:
                head, sep, after = word_lower.partition('-')
                if sep:
                    word = head

                # look up the head only, so the hyphenated tail is never folded
                # into the rewritten head and then appended a second time
                forms = socialmedia_form(head)
                if len(forms):
                    word = case_of(word)(random.choice(forms))
                else:
                    shortforms = replace_normalizer.get(head)
                    if shortforms and random.random() >= 0.3:
                        word = case_of(word)(random.choice(shortforms))

                word = case_of(word)(vowel_alternate(word, 0.7))
                word = case_of(word)(replace_similar_consonants(word, 0.95))
                word = case_of(word)(replace_similar_vowels(word, 0.8))

                if len(after):
                    word = f'{word}-{after}'

        except Exception:
            # best effort: tokens that cannot be augmented (e.g. punctuation
            # rejected by socialmedia_form) are kept unchanged
            word = original_word

        r.append(word)
    return ' '.join(r)

augment('abad ke-14-14-14-14')

'abad kek-14-14-14-14'

In [9]:
# Demo: split a sample text into sentences and augment the first one.
string = """
Husein makan ayam di kampung Jawa juga
"""
splitted = malaya.text.function.split_into_sentences(string)
augment(splitted[0])

'Husein makn ayam dik kg Jawa juge .'

In [10]:
# Demo: same check on a second sample; `string` and `splitted` are overwritten.
string = """
Husein makan ayam di kampung Jawa
"""
splitted = malaya.text.function.split_into_sentences(string)
augment(splitted[0])

'Husein makan ayam dk kg Jawa .'

In [11]:
splitted[0]

'Husein makan ayam di kampung Jawa.'

In [12]:
# Input corpora: files[0] is read first and written out as the "wiki" TSV,
# files[1] is read later and written out as the "news" TSV.
# NOTE(review): absolute, machine-specific paths -- adjust before re-running elsewhere.
files = ['/home/husein/pure-text/filtered-dumping-wiki.txt',
        '/home/husein/pure-text/dumping-cleaned-news.txt',]

In [13]:
# Load the first corpus, one entry per line, keeping only lines that hold
# at least two characters (this also discards empty lines).
with open(files[0]) as fopen:
    data = [line for line in fopen.read().split('\n') if len(line) >= 2]

len(data)

2037249

In [14]:
# Language-detection model from malaya; `loop` uses its predictions to skip
# rows labelled 'other'.
fast_text = malaya.language_detection.fasttext()




In [15]:
fast_text.predict(['តើប្រព័ន្ធប្រតិបត្តិការណាដែលត្រូវគ្នាជាមួយកម្មវិធីធនាគារអេប៊ីអេ។'])

['other']

In [16]:
from tqdm import tqdm

def loop(strings):
    """
    Build (noisy, clean) sentence pairs from raw text rows.

    For every row: rows that `fast_text` labels 'other' are skipped; with
    probability ~0.4 the row is replaced by a random token window from
    `random_slide`; the clean side is the tokenized text joined by spaces; and
    with probability ~0.8 the noisy side comes from `augment`, otherwise the
    pair is (clean, clean).

    Parameters
    ----------
    strings: List[str]

    Returns
    -------
    result: List[Tuple[str, str]]
        (noisy, clean) pairs; rows that raise during processing are omitted.
    """
    results = []
    for string in tqdm(strings):
        try:
            if fast_text.predict([string])[0] == 'other':
                continue
            s = string
            if random.random() > 0.6:
                # fall back to the full row when the window comes back empty
                s = random_slide(string) or string
            t = ' '.join(tokenizer(s))
            if random.random() >= 0.2:
                results.append((augment(s), t))
            else:
                results.append((t, t))
        except Exception:
            # best effort: drop rows that fail, but let KeyboardInterrupt /
            # SystemExit propagate so a run can still be stopped
            continue
    return results

In [17]:
socialmedia_form('serba')

['serbak', 'serbe']

In [18]:
loop(data[:10])

100%|██████████| 10/10 [00:00<00:00, 1856.54it/s]


[('Dirk Jan Klaas " Klaas-Jan " Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bolak sepk Belanda yg nermain',
  'Dirk Jan Klaas " Klaas-Jan " Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bola sepak Belanda yang bermain'),
 ('Beliau kinok bermain tuk klab Ajax .',
  'Beliau kini bermain untuk kelab Ajax .'),
 ('Hypo-Arena .', 'Hypo-Arena .'),
 ('dikenali sebagai ) ialah sebuah', 'dikenali sebagai ) ialah sebuah'),
 ('Ia merupakn stadiim team Austria Karnten .',
  'Ia merupakan stadium pasukan Austria Karnten .'),
 ('Stadium lama dikenali sebagai Wortherseestadion , dibina pada 1960 dan mempunyai kapasiti sebanyak 10,900 .',
  'Stadium lama dikenali sebagai Wortherseestadion , dibina pada 1960 dan mempunyai kapasiti sebanyak 10,900 .'),
 ('Ia dirobohkan pada 2005 dan digantikan dengan Hypo-Arena yang baru , juga dikenali sehingga 30 Jun 2007 dengan nama " Wortherseestadion " .',
  'Ia dirobohkan pada 2005 dan digantikan dengan Hypo-Arena yang baru , juga dikenali sehingga 30 Jun

In [19]:
# `cleaning` is a helper module that is not part of this notebook; it has to be
# importable from the working directory.
import cleaning

# Run `loop` over the whole corpus through cleaning.multiprocessing
# (presumably chunked across worker processes, judging by the repeated
# progress bars below -- confirm against the module).
results1 = cleaning.multiprocessing(data, loop)

100%|██████████| 127328/127328 [01:14<00:00, 1702.91it/s]
100%|██████████| 1/1 [00:00<00:00, 1219.63it/s]60.03it/s]
100%|██████████| 127328/127328 [01:16<00:00, 1669.30it/s]
100%|██████████| 127328/127328 [01:16<00:00, 1667.24it/s]
100%|██████████| 127328/127328 [01:17<00:00, 1653.37it/s]
100%|██████████| 127328/127328 [01:16<00:00, 1653.89it/s]
100%|██████████| 127328/127328 [01:17<00:00, 1650.10it/s]
100%|██████████| 127328/127328 [01:21<00:00, 1565.66it/s]
100%|██████████| 127328/127328 [01:22<00:00, 1544.89it/s]
100%|██████████| 127328/127328 [01:22<00:00, 1536.30it/s]
100%|██████████| 127328/127328 [01:23<00:00, 1524.38it/s]
100%|██████████| 127328/127328 [01:23<00:00, 1527.30it/s]
100%|██████████| 127328/127328 [01:23<00:00, 1522.07it/s]
100%|██████████| 127328/127328 [01:23<00:00, 1518.42it/s]
100%|██████████| 127328/127328 [01:24<00:00, 1502.69it/s]
100%|██████████| 127328/127328 [01:24<00:00, 1501.38it/s]
100%|██████████| 127328/127328 [01:26<00:00, 1471.03it/s]


In [20]:
# Share of pairs whose noisy side actually differs from the clean side.
not_same = sum(1 for pair in tqdm(results1) if pair[0] != pair[1])

not_same / len(results1)

100%|██████████| 2033099/2033099 [00:00<00:00, 2255652.40it/s]


0.7365666895709456

In [21]:
# Write the pairs as a two-column TSV (noisy<TAB>clean); both sides go through
# cleaning_row so embedded tabs / newlines cannot break the row layout.
with tf.compat.v1.io.gfile.GFile('spelling-correction-wiki.tsv', "w") as outfile:
    for noisy, clean in tqdm(results1):
        left = cleaning_row(noisy)
        right = cleaning_row(clean)
        outfile.write("%s\t%s\n" % (left, right))

100%|██████████| 2033099/2033099 [00:34<00:00, 58728.25it/s]


In [22]:
# Load the second corpus the same way; `data` is overwritten.
# Lines shorter than two characters (including empty lines) are dropped.
with open(files[1]) as fopen:
    data = [line for line in fopen.read().split('\n') if len(line) >= 2]

len(data)

3483869

In [23]:
# Same pipeline on the second corpus; this overwrites the `results1` produced
# by the earlier run.
results1 = cleaning.multiprocessing(data, loop)

100%|██████████| 217741/217741 [02:26<00:00, 1488.10it/s]
100%|██████████| 13/13 [00:00<00:00, 1667.41it/s].34it/s]
100%|██████████| 217741/217741 [02:27<00:00, 1479.33it/s]
100%|██████████| 217741/217741 [02:26<00:00, 1487.89it/s]
100%|██████████| 217741/217741 [02:26<00:00, 1485.05it/s]
100%|██████████| 217741/217741 [02:28<00:00, 1462.91it/s]
100%|██████████| 217741/217741 [02:29<00:00, 1461.21it/s]
100%|██████████| 217741/217741 [02:28<00:00, 1462.65it/s]
100%|██████████| 217741/217741 [02:30<00:00, 1445.70it/s]
100%|██████████| 217741/217741 [02:30<00:00, 1451.15it/s]
100%|██████████| 217741/217741 [02:30<00:00, 1451.52it/s]
100%|██████████| 217741/217741 [02:30<00:00, 1447.91it/s]
100%|██████████| 217741/217741 [02:31<00:00, 1440.86it/s]
100%|██████████| 217741/217741 [02:30<00:00, 1444.52it/s]
100%|██████████| 217741/217741 [02:30<00:00, 1448.46it/s]
100%|██████████| 217741/217741 [02:31<00:00, 1434.80it/s]
100%|██████████| 217741/217741 [02:32<00:00, 1429.16it/s]


In [24]:
# Share of pairs whose noisy side actually differs from the clean side.
not_same = sum(1 for pair in tqdm(results1) if pair[0] != pair[1])

not_same / len(results1)

100%|██████████| 3414470/3414470 [00:01<00:00, 2221148.74it/s]


0.7739716559231741

In [25]:
# Write the pairs as a two-column TSV (noisy<TAB>clean); both sides go through
# cleaning_row so embedded tabs / newlines cannot break the row layout.
with tf.compat.v1.io.gfile.GFile('spelling-correction-news.tsv', "w") as outfile:
    for noisy, clean in tqdm(results1):
        left = cleaning_row(noisy)
        right = cleaning_row(clean)
        outfile.write("%s\t%s\n" % (left, right))

100%|██████████| 3414470/3414470 [01:05<00:00, 51997.72it/s]


In [26]:
from google.cloud import storage

# Cloud Storage client and the destination bucket for the generated TSV files.
# Presumably authenticates through GOOGLE_APPLICATION_CREDENTIALS set at the
# top of the notebook -- confirm.
client = storage.Client()
bucket = client.bucket('mesolitica-tpu-general')

In [27]:
# Upload the local wiki TSV to the bucket under the t5-data-v2/ prefix.
blob = bucket.blob('t5-data-v2/spelling-correction-wiki.tsv')
blob.upload_from_filename('spelling-correction-wiki.tsv')

In [28]:
# Upload the local news TSV to the bucket under the t5-data-v2/ prefix.
blob = bucket.blob('t5-data-v2/spelling-correction-news.tsv')
blob.upload_from_filename('spelling-correction-news.tsv')

In [29]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Parse the gin configuration file; it is expected in the working directory.
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# SentencePiece model file; also passed to the task registration as
# `sentencepiece_model_path`.
vocab = 'sp10m.cased.ms-en.model'
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [30]:
def spelling_dataset(split, shuffle_files = False):
    """
    Dataset function for the spelling-correction task.

    Reads 'spelling-correction-wiki.tsv' line by line, parses every line as two
    tab-separated string columns (quote characters are not treated as
    delimiters) and yields dicts keyed 'question' and 'answer'.

    Parameters
    ----------
    split: str
        not used; the same file is read for every split.
    shuffle_files: bool, optional (default=False)
        ignored.

    Returns
    -------
    result: tf.data.Dataset
    """
    del shuffle_files

    parse_row = functools.partial(
        tf.compat.v1.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    lines = tf.compat.v1.data.TextLineDataset(['spelling-correction-wiki.tsv'])
    parsed = lines.map(
        parse_row,
        num_parallel_calls = tf.compat.v1.data.experimental.AUTOTUNE,
    )
    return parsed.map(lambda *ex: dict(zip(['question', 'answer'], ex)))

def spelling_preprocessor(ds):
    """
    Turn question / answer examples into text-to-text inputs / targets.

    'inputs' is the question prefixed with 'ejaan: '; 'targets' is the answer
    as-is.

    Parameters
    ----------
    ds: tf.data.Dataset
        examples holding 'question' and 'answer' entries.

    Returns
    -------
    result: tf.data.Dataset
    """
    def to_inputs_and_targets(ex):
        prefixed = tf.compat.v1.strings.join(['ejaan: ', ex['question']])
        return {'inputs': prefixed, 'targets': ex['answer']}

    autotune = tf.compat.v1.data.experimental.AUTOTUNE
    return ds.map(to_inputs_and_targets, num_parallel_calls = autotune)

In [31]:
# Drop any previous registration first so this cell can be re-run
# (presumably a no-op when the name is not registered yet -- confirm for the
# installed t5 version), then register the task with a single 'train' split.
t5.data.TaskRegistry.remove('spelling_dataset')
t5.data.TaskRegistry.add(
    'spelling_dataset',
    dataset_fn = spelling_dataset,
    splits = ['train'],
    text_preprocessor = [spelling_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [32]:
# Fetch the registered task and build its dataset for a quick sanity check.
# The split must be one of the names the task was registered with ('train');
# the previous value was a file name left over from another dataset.
nq_task = t5.data.TaskRegistry.get("spelling_dataset")
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 256, "targets": 256})
# numpy iterator over the tokenized examples
r = tfds.as_numpy(ds)

In [33]:
next(r)

{'inputs_plaintext': b'ejaan: " Nota : " Perkataan deep krungan hndakla dnyanyion ole kuit , yng lainnye dnyanyokan secark slok .',
 'inputs': array([28699,    31,    13,     6, 21387,    13,    31,    13,     6,
        22653,  4081,  8092,  9756,    13,  7999,  7131,   472,    13,
           79,    38,  2721,  1186,    13,  9134,    13,   848,   545,
           13,    14,    13,   128,  2460,   116,  2721,    81,    13,
           79,    38,  2721,   162,   103,    13, 13384, 11680,    13,
           16, 12330,    13,     3,     1]),
 'targets_plaintext': b'" Nota : " Perkataan dalam kurungan hendaklah dinyanyikan oleh koir , yang lainnya dinyanyikan secara solo .',
 'targets': array([   13,     6, 21387,    13,    31,    13,     6, 22653,    36,
        25840,  9339, 28420,    60,    13,  1232,  1022,    13,    14,
           17, 11301, 28420,   156,  7186,    13,     3,     1])}