# The WikiLarge Dataset

In [1]:
import io
import os
import re
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.preprocessing as preprocessing

## Load

In [2]:
def load_wiki(dataset='wikismall', data_dir='../data', keep_splits=False):
    """Load the aligned source/target sentences of a Wiki simplification corpus.

    Args:
        dataset: Corpus name. 'wikismall' selects the 'PWKP_108016.tag.80.aner.ori'
            file prefix; any other value selects 'wiki.full.aner.ori'.
        data_dir: Root data directory. Files are read from
            `<data_dir>/raw/data-simplification/<dataset>`.
        keep_splits: When True, return the three splits separately.

    Returns:
        A `(train, valid, test)` tuple of DataFrames when `keep_splits` is True,
        otherwise their concatenation (original per-split indices are kept).
        Every frame has the columns 'source' and 'target'.
    """
    wiki_dir = os.path.join(data_dir, 'raw/data-simplification', dataset)
    prefix = 'PWKP_108016.tag.80.aner.ori' if dataset == 'wikismall' else 'wiki.full.aner.ori'

    frames = []
    for split in ('train', 'valid', 'test'):
        sides = []
        for loc in ('src', 'dst'):
            file_path = os.path.join(wiki_dir, '.'.join([prefix, split, loc]))
            # Context manager closes the handle; explicit UTF-8 keeps the result
            # independent of the platform's default encoding.
            with io.open(file_path, encoding='utf-8') as stream:
                lines = stream.read().split('\n')
            # A file ending in '\n' yields a trailing '' that is not a sentence.
            if lines and lines[-1] == '':
                lines.pop()
            sides.append(lines)
        src_lines, dst_lines = sides
        # zip pairs line i of the source with line i of the target and stops at
        # the shorter file.
        frames.append(pd.DataFrame(zip(src_lines, dst_lines), columns=['source', 'target']))

    train, valid, test = frames
    if keep_splits:
        return train, valid, test

    return pd.concat([train, valid, test])

In [32]:
# Load every split of the 'wikismall' corpus as one source/target frame.
data = load_wiki('wikismall')

## Preprocess

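Normalize each sentence to Unicode NFC, add spaces around sentence punctuation (`?`, `.`, `!`), collapse runs of whitespace, and wrap the result in `<START>` / `<END>` markers. Note: the punctuation can still be dropped later by the tokenizer.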

In [33]:
def preprocess(sentence):
    """Normalize one sentence and wrap it in start/end markers.

    The text is converted to Unicode NFC, `?`, `.` and `!` are surrounded by
    spaces, runs of two or more whitespace characters are collapsed to a single
    space, and leading/trailing whitespace is removed.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence as '<START> ... <END>'.
    """
    sentence = unicodedata.normalize('NFC', sentence)
    sentence = re.sub(r'([?.!])', r' \1 ', sentence)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    sentence = re.sub(r'\s{2,}', ' ', sentence)
    sentence = sentence.strip()
    return f'<START> {sentence} <END>'

In [34]:
# Clean every sentence in both columns. Per-column Series.map replaces
# DataFrame.applymap, which is deprecated in recent pandas releases.
data = data.apply(lambda column: column.map(preprocess))

## Sequence

### Tokenize

Actually perform tokenization, limiting total number of tokens.

In [35]:
# Vocabulary limit: passed to the Tokenizer as `num_words` and used (plus one)
# to size the output softmax layer.
NUM_TOKENS = 10_000

In [36]:
# Word-level tokenizer limited via num_words=NUM_TOKENS; unknown words map to '<UNK>'.
# NOTE(review): with the Tokenizer's default `filters`/`lower` settings the
# '<START>'/'<END>' markers are presumably reduced to 'start'/'end' and the
# spaced-out punctuation dropped -- confirm this is intended.
# NOTE(review): the vocabulary is fitted on the source column only -- confirm
# that target-only words should become '<UNK>'.
tokenizer = preprocessing.text.Tokenizer(
    oov_token='<UNK>',
    num_words=NUM_TOKENS,
)
tokenizer.fit_on_texts(data['source'])

In [37]:
# Replace each sentence by its list of integer token ids, one column at a time.
data = data.apply(tokenizer.texts_to_sequences, axis=0)

Identify longest sequence.

In [38]:
# Longest token sequence across both columns. Per-column Series.map replaces
# the deprecated DataFrame.applymap.
max_len = max(data.apply(lambda column: column.map(len)).max(axis=0))

In [39]:
# Display the longest sequence length found above.
max_len

80

### Pad

In [40]:
def _pad_column(column):
    """Pad every sequence of one column to `max_len` with trailing zeros."""
    # One batched pad_sequences call per column instead of one call per cell.
    padded = preprocessing.sequence.pad_sequences(column.tolist(), maxlen=max_len, padding='post')
    # Keep one 1-D array per row so the frame layout matches the per-cell version.
    return pd.Series(list(padded), index=column.index)


data = data.apply(_pad_column)

In [41]:
# Sanity check: the first padded source sequence should have length max_len.
len(data.iat[0, 0])

80

## Split

In [42]:
train_frac = 0.8

num_rows = len(data)
train_size = int(train_frac * num_rows)

# Shuffle with a fixed seed so the train/dev split is reproducible across runs.
data = data.sample(frac=1, random_state=42)
# Positional slices: the first train_size shuffled rows train, the rest are dev.
train, dev = data.iloc[:train_size], data.iloc[train_size:]

## Embedding

Currently, `<UNK>` (like any token without a pretrained vector) is mapped to the all-zeros vector. See https://datascience.stackexchange.com/questions/26943/how-to-initialize-word-embeddings-for-out-of-vocabulary-word

In [46]:
# Map each word in the GloVe text file to its float32 vector.
embed_index = dict()
# Explicit UTF-8: the platform default encoding may not decode non-ASCII tokens.
with open('../data/raw/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines instead of failing on the unpack below.
            continue
        # First whitespace-separated field is the word, the rest are the components.
        word, vec = line.split(maxsplit=1)
        embed_index[word] = np.asarray(vec.split(), dtype='float32')

In [47]:
def build_embed_matrix(embed_index, tokenizer, vocab_size=None):
    """Build an embedding matrix whose row i is the vector of the word with id i.

    Args:
        embed_index: Mapping from word to a 1-D vector; all vectors are taken
            to have the dimension of the first one.
        tokenizer: Object exposing `word_index` (word -> integer id) and,
            optionally, `num_words`.
        vocab_size: Number of rows of the matrix. Defaults to
            `tokenizer.num_words + 1`, or `len(tokenizer.word_index) + 1` when
            `num_words` is unset.

    Returns:
        A `(vocab_size, dim)` array. Rows for ids without a pretrained vector
        (including id 0) stay all zeros.

    Raises:
        ValueError: If `embed_index` is empty.
    """
    if not embed_index:
        raise ValueError('embed_index is empty; cannot infer the embedding dimension')
    dim = next(iter(embed_index.values())).shape[0]

    if vocab_size is None:
        # Read the limit from the tokenizer instead of a notebook-level global.
        num_words = getattr(tokenizer, 'num_words', None)
        limit = num_words if num_words else len(tokenizer.word_index)
        # add 1 for padding token
        vocab_size = limit + 1

    matrix = np.zeros((vocab_size, dim))
    for word, row in tokenizer.word_index.items():
        # word_index also lists words beyond the vocabulary limit; skip them.
        if row >= vocab_size:
            continue
        vec = embed_index.get(word)
        if vec is not None:
            matrix[row] = vec

    return matrix

In [48]:
# Pretrained vectors arranged by token id, ready to seed the Embedding layer.
embed_matrix = build_embed_matrix(embed_index=embed_index, tokenizer=tokenizer)

In [49]:
# Display (number of rows, embedding dimension) of the matrix.
embed_matrix.shape

(10001, 50)

## Modeling

In [43]:
from tensorflow.keras import layers

In [50]:
# Frozen embedding shared by encoder and decoder. Its dimensions are read from
# the pretrained matrix rather than repeated as literals, so the layer always
# matches the weights it is given. mask_zero=True treats id 0 as padding.
embed = layers.Embedding(
    input_dim=embed_matrix.shape[0],
    output_dim=embed_matrix.shape[1],
    weights=[embed_matrix],
    trainable=False,
    mask_zero=True,
)

In [51]:
# Encoder: embed the source ids and keep only the LSTM's final hidden/cell states.
enc_in = layers.Input(shape=(max_len,))
enc_embed = embed(enc_in)
enc_lstm = layers.LSTM(128, return_state=True)
_, enc_hidden, enc_cell = enc_lstm(enc_embed)

Note: the decoder's internal state will be needed at inference time (TODO: work out how).

In [53]:
# Decoder: embed the target ids and run an LSTM seeded with the encoder states,
# emitting one output vector per timestep.
dec_in = layers.Input(shape=(max_len,))
dec_embed = embed(dec_in)
dec_lstm = layers.LSTM(128, return_sequences=True, return_state=True)
dec_out, _, _ = dec_lstm(dec_embed, initial_state=[enc_hidden, enc_cell])

Wasn't there a wrapper to do this? Do we include padding?

In [54]:
# Per-timestep softmax over NUM_TOKENS + 1 token ids.
softmax_head = layers.Dense(NUM_TOKENS + 1, activation='softmax')
out = softmax_head(dec_out)

In [55]:
# Training model: (encoder ids, decoder ids) -> per-timestep token distribution.
model = tf.keras.Model(inputs=[enc_in, dec_in], outputs=out)

In [56]:
# Sparse cross-entropy: the targets are integer token ids, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
)

In [57]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 80)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 80, 50)       500050      input_4[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 128), (None, 91648       embedding_2[0][0]          

### Train

In [63]:
# Stack the per-row padded sequences into 2-D integer arrays (one row per
# example) so Keras receives dense arrays rather than Python lists of arrays.
src_tokens = np.stack(train['source'].tolist())
dest_tokens = np.stack(train['target'].tolist())

In [64]:
# Teacher forcing: at step t the decoder is fed target token t and must predict
# token t + 1. Feeding the same unshifted sequence as both input and label
# would let the model learn to copy its input.
decoder_inputs = np.asarray(dest_tokens)
# Labels are the inputs shifted left by one step; the freed last column stays 0,
# the padding id.
decoder_targets = np.zeros_like(decoder_inputs)
decoder_targets[:, :-1] = decoder_inputs[:, 1:]
model.fit([np.asarray(src_tokens), decoder_inputs], decoder_targets, batch_size=1)

KeyboardInterrupt: 