# The WikiLarge Dataset

In [16]:
import io
import os
import re

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

## Load

In [29]:
def load_wiki(dataset='wikismall', data_dir='../data', keep_splits=False, encoding='utf-8'):
    """Load the parallel sentences of a Wiki text-simplification corpus.

    Reads the six files ``<prefix>.<split>.<side>`` (``split`` in
    train/valid/test, ``side`` in src/dst) found in
    ``<data_dir>/raw/data-simplification/<dataset>``.

    Args:
        dataset: Name of the corpus directory. ``'wikismall'`` selects the
            ``PWKP_108016.tag.80.aner.ori`` file prefix; any other value
            selects ``wiki.full.aner.ori``.
        data_dir: Root directory that contains ``raw/data-simplification``.
        keep_splits: When true, return the per-file line lists unmerged.
        encoding: Text encoding used to decode the files.

    Returns:
        If ``keep_splits`` is true, a list of six lists of lines ordered
        ``[src_train, dst_train, src_valid, dst_valid, src_test, dst_test]``.
        Otherwise a ``(src, dst)`` tuple in which the train, valid and test
        lines are concatenated in that order.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    wiki_dir = os.path.join(data_dir, 'raw/data-simplification', dataset)

    prefix = 'PWKP_108016.tag.80.aner.ori' if dataset == 'wikismall' else 'wiki.full.aner.ori'
    data = []
    for split in ('train', 'valid', 'test'):
        for loc in ('src', 'dst'):
            file_name = '.'.join([prefix, split, loc])
            file_path = os.path.join(wiki_dir, file_name)
            # Context manager closes the handle; explicit encoding keeps the
            # decoded text independent of the platform's locale default.
            with open(file_path, encoding=encoding) as stream:
                # Split on '\n' only: str.splitlines() also breaks on other
                # Unicode line boundaries, which could desynchronise the
                # src/dst line pairing.
                lines = stream.read().split('\n')
            # A file that ends with a newline yields a trailing '' that is not
            # a sentence; drop it so it does not end up inside the merged lists.
            if lines[-1] == '':
                lines.pop()
            data.append(lines)

    if keep_splits:
        return data

    src_train, dst_train, src_valid, dst_valid, src_test, dst_test = data
    src = src_train + src_valid + src_test
    dst = dst_train + dst_valid + dst_test
    return src, dst

In [43]:
# Load WikiLarge with train/valid/test merged into one source list and one
# simplified list (keep_splits is left at its default).
src, dest = load_wiki('wikilarge')

## Preprocess

In [17]:
def preprocess(sentence):
    """Space out sentence punctuation and wrap the sentence in boundary markers.

    Every ``?``, ``.`` and ``!`` is surrounded by spaces so that it becomes a
    separate whitespace-delimited token, runs of two or more whitespace
    characters are collapsed to a single space, and the stripped result is
    wrapped as ``<START> ... <END>``.

    Args:
        sentence: The raw sentence text.

    Returns:
        The normalised sentence with ``<START>`` and ``<END>`` markers.
    """
    sentence = re.sub(r'([?.!])', r' \1 ', sentence)
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    sentence = re.sub(r'\s{2,}', ' ', sentence)
    return f'<START> {sentence.strip()} <END>'

In [44]:
# Spot-check the preprocessing on a single source sentence.
print(preprocess(src[10]))

<START> Many still refer to 25 , 50 and 75 paise as 4 , 8 and 12 annas respectively , not unlike the usage of '' bit '' in American English for â <END>


In [33]:
# Apply preprocess() to every sentence on both sides of the corpus.
# NOTE: this rebinds src/dest to the processed text, so re-running the cell
# without reloading the data wraps each sentence in a second pair of
# <START>/<END> markers.
src, dest = (list(map(preprocess, corpus)) for corpus in (src, dest))

## Tokenize

### Number of tokens

In [34]:
# Tokenizer without a vocabulary cap, used to count the distinct tokens.
# filters='' turns off the default character filtering, so punctuation tokens
# and the angle brackets of the <START>/<END> markers are kept.
tokenizer = Tokenizer(filters='')

In [68]:
# Build the word index from the source-side sentences only (dest is not used).
tokenizer.fit_on_texts(src)

In [69]:
# Number of distinct tokens the tokenizer saw in the source sentences.
len(tokenizer.word_index)

178938

### Encode with a capped vocabulary

In [60]:
# Fresh tokenizer whose vocabulary is capped through num_words; filtering
# stays disabled, as for the vocabulary count.
# NOTE(review): no oov_token is set — words outside the cap will presumably be
# dropped when texts are encoded; confirm that this is intended.
tokenizer = Tokenizer(num_words=50_000, filters='')

In [64]:
# fit_on_texts() only builds the vocabulary in place and returns None, so the
# integer sequences must come from a separate texts_to_sequences() call.
tokenizer.fit_on_texts(src)
seqs = tokenizer.texts_to_sequences(src)