# The WikiLarge Dataset

In [1]:
import io
import os
import re
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text

from pathlib import Path

In [2]:
def get_proj_root():
    """Return the root directory of the cleartext project.

    The location can be overridden with the ``CLEARTEXT_ROOT`` environment
    variable so the notebook can run on machines other than the author's.
    When the variable is unset or empty, the original hard-coded path is
    returned, so existing setups keep working unchanged.

    Returns
    -------
    str
        Path of the project root directory.
    """
    return os.environ.get('CLEARTEXT_ROOT') or '/home/ben/repos/cleartext'

## Load

In [3]:
def load_wiki(dataset='wikismall', keep_splits=False):
    """Load the aligned source/target sentence files of a Wiki simplification dataset.

    Files are read from ``<project root>/data/raw/data-simplification/<dataset>``
    and are named ``<prefix>.<split>.<src|dst>``.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-directory. ``'wikismall'`` selects the
        ``PWKP_108016.tag.80.aner.ori`` file prefix; any other value selects
        ``wiki.full.aner.ori``.
    keep_splits : bool
        If True, return the train, valid and test frames separately;
        otherwise return them concatenated (in that order).

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        Frame(s) with one row per sentence pair and the columns ``source``
        and ``target``.
    """
    proj_root = get_proj_root()
    wiki_dir = os.path.join(proj_root, 'data/raw/data-simplification', dataset)

    prefix = 'PWKP_108016.tag.80.aner.ori' if dataset == 'wikismall' else 'wiki.full.aner.ori'
    data = []
    for split in ['train', 'valid', 'test']:
        for loc in ['src', 'dst']:
            file_name = '.'.join([prefix, split, loc])
            file_path = os.path.join(wiki_dir, file_name)
            # Context manager closes the handle (the original leaked one per
            # file); explicit UTF-8 avoids depending on the platform locale.
            with io.open(file_path, encoding='utf-8') as stream:
                lines = stream.read().split('\n')
            # A file terminated by a newline yields a trailing '' that is not
            # a sentence; drop it so no empty example is produced.
            if lines and lines[-1] == '':
                lines.pop()
            data.append(lines)

    src_train, dst_train, src_valid, dst_valid, src_test, dst_test = data
    # zip() pairs line i of the source file with line i of the target file
    # and stops at the shorter of the two.
    train = pd.DataFrame(zip(src_train, dst_train), columns=['source', 'target'])
    valid = pd.DataFrame(zip(src_valid, dst_valid), columns=['source', 'target'])
    test = pd.DataFrame(zip(src_test, dst_test), columns=['source', 'target'])

    if keep_splits:
        return train, valid, test

    return pd.concat([train, valid, test])

In [4]:
# Load the default dataset ('wikismall') with train/valid/test concatenated into one frame.
data_raw = load_wiki()

In [5]:
# Cap on the number of sentence pairs kept for the rest of the notebook.
NUM_EXAMPLES = 1000

# Keep only the first NUM_EXAMPLES rows (positional slice).
data_raw = data_raw[:NUM_EXAMPLES]

## Preprocess

In [105]:
# Vocabulary limit handed to the Keras tokenizer as `num_words`.
vocab_size = 10_000
# Out-of-vocabulary words are represented by the '<UNK>' token.
tokenizer = text.Tokenizer(num_words=vocab_size, oov_token='<UNK>')

def prepare(df, tokenizer, pad=True):
    """Clean, tokenize and optionally pad every sentence in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of raw sentences; must contain a ``source`` column, which is
        the only column the tokenizer vocabulary is fitted on.
    tokenizer : keras Tokenizer
        Fitted in place on the cleaned ``source`` column, then used to
        encode every column.
    pad : bool
        If True, post-pad every sequence to the length of the longest
        sequence found in any column.

    Returns
    -------
    pandas.DataFrame
        Frame of the same layout whose cells hold token-id sequences.

    NOTE(review): `preprocess` is not defined in any cell visible here;
    confirm it is defined before this cell runs on a fresh kernel.
    """
    cleaned = df.applymap(preprocess)

    # The vocabulary is built from the source side only.
    tokenizer.fit_on_texts(cleaned['source'])
    encoded = cleaned.apply(tokenizer.texts_to_sequences, axis=0)

    if not pad:
        return encoded

    # Longest sequence across all columns determines the common width.
    lengths = encoded.applymap(len)
    longest = max(lengths.max(axis=0))

    def pad_one(seq):
        return sequence.pad_sequences([seq], maxlen=longest, padding='post')[0]

    return encoded.applymap(pad_one)

# Tokenize and pad the raw sentence pairs; this also fits the module-level `tokenizer` in place.
tokens = prepare(data_raw, tokenizer)

## Sequence

### Tokenize

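Convert the tokenized (and padded) columns to NumPy arrays. Tokenization itself, including the limit on the total number of tokens, is performed by `prepare` above.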

In [107]:
# Materialize each column of token sequences as a NumPy array.
source_array = np.array(tokens['source'].tolist())
target_array = np.array(tokens['target'].tolist())

In [108]:
# Stack along a new leading axis: index 0 holds the source side, index 1 the target side.
tokens_array = np.stack([source_array, target_array])

## Split

In [109]:
# Fraction of examples used for training; the remainder forms the dev set.
train_frac = 0.8
# Fixed seed so the shuffle (and therefore the split) is reproducible.
split_seed = 42

# Count the rows of the frame that is actually split below. The original
# used `data_prep`, which is not defined in any visible cell and raises
# NameError on a fresh kernel.
num_rows = len(tokens)
train_size = int(train_frac * num_rows)

# Shuffle the rows before splitting.
tokens = tokens.sample(frac=1, random_state=split_seed)

# Rebuild the arrays from the shuffled frame: axis 0 is source/target,
# axis 1 the example, axis 2 the position within the padded sequence.
source_array = np.array(tokens['source'].tolist())
target_array = np.array(tokens['target'].tolist())
tokens_array = np.stack([source_array, target_array])

# Split along the example axis.
train, dev = tokens_array[:, :train_size, :], tokens_array[:, train_size:, :]

## Embedding

In [12]:
def build_embed_matrix(embed_vectors, tokenizer):
    """Build an embedding matrix whose row ``i`` is the vector of the word with index ``i``.

    Parameters
    ----------
    embed_vectors : dict
        Maps a word to a 1-D NumPy vector; all vectors are expected to share
        the length of the first one.
    tokenizer : object
        Must expose ``num_words`` (int or None) and ``word_index`` (a mapping
        from word to integer row index).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, dim)``. Rows for words without a
        pre-trained vector, and rows that no word maps to, stay all-zero.

    Raises
    ------
    ValueError
        If ``embed_vectors`` is empty, since the dimension cannot be inferred.
    """
    if not embed_vectors:
        raise ValueError('embed_vectors is empty; cannot infer the embedding dimension')
    dim = next(iter(embed_vectors.values())).shape[0]

    num_words = tokenizer.num_words
    if num_words is None:
        # No vocabulary cap configured: one row per known word plus row 0.
        # (The original raised TypeError here on `None + 1`.)
        vocab_size = len(tokenizer.word_index) + 1
    else:
        # add 1 for padding token
        vocab_size = num_words + 1

    matrix = np.zeros((vocab_size, dim))
    for word, row in tokenizer.word_index.items():
        # Words ranked beyond the vocabulary cap have no row in the matrix.
        if row >= vocab_size:
            continue
        vec = embed_vectors.get(word)
        if vec is not None:
            matrix[row] = vec

    return matrix


def load_glove(dim):
    """Load GloVe 6B word vectors of the given dimensionality.

    Reads ``<project root>/data/raw/glove.6B.<dim>d.txt``, where each line
    holds a word followed by whitespace-separated vector components.

    Parameters
    ----------
    dim : int
        Dimensionality used to select the file name.

    Returns
    -------
    dict
        Maps each word to a ``float32`` NumPy vector.
    """
    proj_root = get_proj_root()
    glove_dir = os.path.join(proj_root, 'data/raw/')
    glove_path = os.path.join(glove_dir, f'glove.6B.{dim}d.txt')

    embed_vectors = dict()
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            # Skip blank lines, which would break the two-way unpacking below.
            if not line.strip():
                continue
            word, vec = line.split(maxsplit=1)
            embed_vectors[word] = np.asarray(vec.split(), dtype='float32')
    return embed_vectors

The `'<UNK>'` token is currently mapped to the all-zeros vector. For alternative ways to initialize embeddings of out-of-vocabulary words, see https://datascience.stackexchange.com/questions/26943/how-to-initialize-word-embeddings-for-out-of-vocabulary-word

In [13]:
# Load the 50-dimensional GloVe vectors and align them with the tokenizer's word indices.
embed_vectors = load_glove(50)
embed_matrix = build_embed_matrix(embed_vectors, tokenizer)

## Modeling

In [14]:
def lstm(vocab_size, seq_len, units, weights=None, embed_dim=None):
    """Build an LSTM encoder-decoder model with a shared embedding layer.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table and size of the softmax output.
    seq_len : int
        Length of both the encoder and the decoder input sequences.
    units : int
        Number of units in the encoder LSTM and in the decoder LSTM.
    weights : numpy.ndarray, optional
        Pre-trained embedding matrix of shape ``(vocab_size, dim)``. When
        given, the embedding is initialized from it and frozen. When None,
        a trainable, randomly initialized embedding is used instead.
    embed_dim : int, optional
        Embedding dimension used only when ``weights`` is None; defaults to
        ``units``.

    Returns
    -------
    tf.keras.Model
        Model taking ``[encoder_input, decoder_input]`` and returning a
        softmax over the vocabulary for every decoder time step.

    Raises
    ------
    ValueError
        If ``weights`` has a different number of rows than ``vocab_size``.
    """
    # shared embedding layer
    if weights is None:
        # The original dereferenced `weights.shape` unconditionally, so the
        # default `weights=None` raised AttributeError. Fall back to a
        # trainable embedding instead.
        embed = layers.Embedding(vocab_size,
                                 units if embed_dim is None else embed_dim,
                                 input_length=seq_len,
                                 mask_zero=True)
    else:
        if weights.shape[0] != vocab_size:
            raise ValueError(
                f'weights has {weights.shape[0]} rows but vocab_size is {vocab_size}')
        embed = layers.Embedding(vocab_size,
                                 weights.shape[1],
                                 weights=[weights],
                                 input_length=seq_len,
                                 trainable=False,
                                 mask_zero=True)

    # encoder: only the final hidden and cell states are kept
    enc_in = layers.Input(shape=(seq_len,))
    enc_embed = embed(enc_in)
    enc_lstm = layers.LSTM(units, return_state=True)
    _, enc_hidden, enc_cell = enc_lstm(enc_embed)

    # decoder: starts from the encoder's final states and emits one output
    # per time step
    dec_in = layers.Input(shape=(seq_len,))
    dec_embed = embed(dec_in)
    dec_lstm = layers.LSTM(units, return_sequences=True, return_state=True)
    dec_out, _, _ = dec_lstm(dec_embed, initial_state=[enc_hidden, enc_cell])

    # output: distribution over the vocabulary at every decoder step
    out = layers.Dense(vocab_size, activation='softmax')(dec_out)

    # model
    model = tf.keras.Model(inputs=[enc_in, dec_in], outputs=out)
    return model

In [137]:
# vocab_size + 1 matches the row count produced by build_embed_matrix (num_words + 1);
# the sequence length is the last axis of `train`; 32 is the number of LSTM units.
model = lstm(vocab_size + 1, train.shape[-1], 32, embed_matrix)

In [138]:
# Sparse categorical cross-entropy: targets are passed as token ids rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

### Train

In [140]:
# Target-side token sequences of the training split (index 1 of the stacked array).
array = train[1]

In [145]:
# One column of zeros with a row per training example.
right_col = np.zeros((array.shape[0], 1))
right_col.shape

(800, 1)

In [146]:
# Training targets: each target sequence shifted left by one position, with a zero appended on the right.
# NOTE(review): this rebinds `target_array`, which earlier held the unshifted target tokens of all rows.
target_array = np.hstack([array[:, 1:], right_col])

In [149]:
# Show the layer-by-layer structure and parameter counts of the model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 52)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 52)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 52, 50)       500050      input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 32), (None,  10624       embedding_1[0][0]          

In [148]:
# Encoder input is the source side, decoder input is the target side, and the
# label is the target side shifted left by one step.
# NOTE(review): batch_size=1 means one gradient step per example; the recorded run was interrupted.
model.fit([train[0], train[1]], target_array, batch_size=1)

 85/800 [==>...........................] - ETA: 1:56 - loss: 0.9309

KeyboardInterrupt: 