# The WikiLarge Dataset

In [1]:
import io
import os
import re

import numpy as np
import tensorflow as tf
import tensorflow.keras.preprocessing as preprocessing

## Load

In [2]:
def load_wiki(dataset='wikismall', data_dir='../data', keep_splits=False):
    wiki_dir = os.path.join(data_dir, 'raw/data-simplification', dataset)

    prefix = 'PWKP_108016.tag.80.aner.ori' if dataset == 'wikismall' else 'wiki.full.aner.ori'
    data = []
    for split in ['train', 'valid', 'test']:
        for loc in ['src', 'dst']:
            file_name = '.'.join([prefix, split, loc])
            file_path = os.path.join(wiki_dir, file_name)
            stream = io.open(file_path)
            lines = stream.read().split('\n')
            data.append(lines)

    if keep_splits:
        return data
    
    src_train, dst_train, src_valid, dst_valid, src_test, dst_test = data
    src = src_train + src_valid + src_test
    dst = dst_train + dst_valid + dst_test
    return src, dst

In [3]:
src, dest = load_wiki(dataset='wikilarge')

## Preprocess

Add space around punctuation (note: punctuation can be dropped by tokenizer) and remove redundant white space.

In [4]:
def preprocess(sentence):
    sentence = re.sub(r'([?.!])', r' \1 ', sentence)
    sentence = re.sub('\s{2,}', ' ', sentence)
    sentence = sentence.strip()
    return f'<START> {sentence} <END>'

In [5]:
print(preprocess(src[10]))

<START> Many still refer to 25 , 50 and 75 paise as 4 , 8 and 12 annas respectively , not unlike the usage of '' bit '' in American English for â <END>


In [6]:
src, dest = [preprocess(s) for s in src], [preprocess(s) for s in dest]

## Sequence

### Tokenize

Actually perform tokenization, limiting total number of tokens.

In [7]:
NUM_TOKENS = 50_000

In [9]:
tokenizer = preprocessing.text.Tokenizer(num_words=NUM_TOKENS, oov_token='<UNK>')
tokenizer.fit_on_texts(src)

In [10]:
src_tokens = tokenizer.texts_to_sequences(src)
dest_tokens = tokenizer.texts_to_sequences(dest)

Identify longest sequence.

In [11]:
max_src_len = max([len(seq) for seq in src_tokens])
max_dest_len = max([len(seq) for seq in dest_tokens])
max_len = max([max_src_len, max_dest_len])

### Pad

In [12]:
src_tokens = preprocessing.sequence.pad_sequences(src_tokens, maxlen=max_len, padding='post')
dest_tokens = preprocessing.sequence.pad_sequences(dest_tokens, maxlen=max_len, padding='post')

## Embedding

Currently mapping '<UNK>' to all zeros vector. See https://datascience.stackexchange.com/questions/26943/how-to-initialize-word-embeddings-for-out-of-vocabulary-word

In [13]:
embed_index = dict()
with open('../data/raw/glove.6B.50d.txt') as f:
    for line in f:
        word, vec = line.split(maxsplit=1)
        vec = np.asarray(vec.split(), dtype='float32')
        embed_index[word] = vec

In [14]:
def build_embed_matrix(embed_index, tokenizer):
    vec = next(x for x in embed_index.values())
    dim = vec.shape[0]
    
    # add 1 for padding token
    vocab_size = NUM_TOKENS + 1
    matrix = np.zeros((vocab_size, dim))
    for word, row in tokenizer.word_index.items():
        if row >= vocab_size:
            continue
        vec = embed_index.get(word)
        if vec is not None:
            matrix[row] = vec

    return matrix

In [15]:
embed_matrix = build_embed_matrix(embed_index, tokenizer)

In [16]:
embed_matrix.shape

(50001, 50)

## Modeling

In [17]:
from tensorflow.keras import layers

In [26]:
embed = layers.Embedding(NUM_TOKENS + 1, 50, weights=[embed_matrix], trainable=False, mask_zero=True)

In [28]:
enc_in = layers.Input(shape=(max_len,))
enc_embed = embed(enc_in)
_, enc_hidden, enc_cell = layers.LSTM(32, return_state=True)(enc_embed)

Note decoder internal state to be used in inference (how?)

In [29]:
dec_in = layers.Input(shape=(max_len,))
dec_embed = embed(dec_in)
dec_out, _, _ = layers.LSTM(32, return_sequences=True, return_state=True)(dec_embed, initial_state=[enc_hidden, enc_cell])

Wasn't there a wrapper to do this? Do we include padding?

In [30]:
out = layers.Dense(NUM_TOKENS + 1, activation='softmax')(dec_out)

In [31]:
model = tf.keras.Model([enc_in, dec_in], out)

In [32]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [34]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 106)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 106)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 106, 50)      2500050     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 32), (None,  10624       embedding_1[0][0]          

### Train

In [33]:
model.fit([src_tokens, dest_tokens], dest_tokens, batch_size=1)

    51/297756 [..............................] - ETA: 38:00:00 - loss: 1.7784

KeyboardInterrupt: 