In [None]:
import sys
import re
import pickle
import numpy as np
from os import path
from math import exp
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import Input
from keras.layers import TimeDistributed
from keras.callbacks import History
from keras.callbacks import EarlyStopping
from keras.callbacks import Callback
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import ModelCheckpoint

In [4]:
## text preprocessing for cross sentence training
def cross_sentence_preproc(text):
    """Return ``text`` with every newline replaced by the marker `` <eos> ``.

    The marker is surrounded by one space on each side, so a trailing newline
    yields a trailing `` <eos> `` in the result.
    """
    eos_separator = ' <eos> '
    ## splitting on '\n' and re-joining substitutes every newline
    lines = text.split('\n')
    return eos_separator.join(lines)

In [None]:
## tokenize textfile and save as pickle file
def tokenize_txt_cross_sentence(textfile):
    """Fit a Keras ``Tokenizer`` on a text file and save it as a pickle file.

    The file is read as UTF-8, run through ``cross_sentence_preproc`` and used to
    fit a ``Tokenizer`` created with ``filters=''`` and ``lower=False``. The fitted
    tokenizer is pickled to ``tokenizer/cs-<name>.pickle``, where ``<name>`` is the
    basename of ``textfile`` without its extension. Short samples and token counts
    are printed along the way.

    Args:
        textfile: path of the text file to tokenize.

    Note:
        The ``tokenizer/`` directory is not created here; it must already exist.
    """
    ## read all text; the context manager closes the file even if reading fails
    with open(textfile, 'r', encoding='UTF-8') as file:
        text = file.read()
    print('\nSample of original txt:\n\n', text[:300])

    ## run text preprocessing
    text_proc = cross_sentence_preproc(text)
    print('\nSample of processed txt:\n\n', text_proc[:300])
    ## split once and reuse the token list for both statistics
    tokens = text_proc.split()
    print('\nTotal tokens in text: %d' % len(tokens))
    print('Unique tokens in text: %d' % len(set(tokens)))

    ## fit tokenizer
    tokenizer = Tokenizer(filters='', lower=False)
    tokenizer.fit_on_texts([text_proc])
    ## saving tokenizer
    tokenizer_fn = 'tokenizer/cs-{0}.pickle'.format(path.splitext(path.basename(textfile))[0])
    with open(tokenizer_fn, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## Converting text to lm training tensor for cross sentence LSTM
def txt_to_tensor_cross_sent_LSTM(textfile, tokenizer, batch_size=20, seq_length=35):
    """Convert a text file into input/target tensors for the cross sentence LSTM.

    The text is read as UTF-8, preprocessed with ``cross_sentence_preproc``, coded
    with a pickled tokenizer and zero-padded (post) to a multiple of
    ``batch_size * seq_length``. The target sequence is the input sequence shifted
    left by one position (the last target wraps around to the first input id).

    Both sequences are reshaped to ``(batch_size, -1)`` and cut into pieces of
    ``seq_length`` columns, which are stacked along axis 0. As a result row ``i``
    of one batch is directly continued by row ``i`` of the next batch.

    Args:
        textfile: path of the text file to convert.
        tokenizer: path of the pickled tokenizer; the loaded object must provide
            ``word_index`` and ``texts_to_sequences``.
        batch_size: number of rows per batch.
        seq_length: number of time steps per row.

    Returns:
        Tuple ``(X, y, vocab_size)`` where ``X`` has shape
        ``(n_batches * batch_size, seq_length)``, ``y`` has shape
        ``(n_batches * batch_size, seq_length, 1)`` and ``vocab_size`` is
        ``len(word_index) + 1``.
    """
    ## read all text; the context manager closes the file even if reading fails
    with open(textfile, 'r', encoding='UTF-8') as file:
        text = file.read()
    print('\nSample of original txt:\n\n', text[:300])

    ## run text preprocessing
    text_proc = cross_sentence_preproc(text)
    print('\nSample of processed txt:\n\n', text_proc[:300])
    ## split once and reuse the token list for both statistics
    tokens = text_proc.split()
    print('\nTotal tokens in text: %d' % len(tokens))
    print('Unique tokens in text: %d' % len(set(tokens)))

    ## load tokenizer
    ## NOTE: pickle.load can execute arbitrary code -- only load tokenizer files you trust
    with open(tokenizer, 'rb') as handle:
        fitted_tokenizer = pickle.load(handle)

    ## vocabulary size (+1 because index 0 is used for padding below)
    vocab_size = len(fitted_tokenizer.word_index) + 1
    print('Vocab size: %d' % vocab_size)

    ## coding text
    text_coded = fitted_tokenizer.texts_to_sequences([text_proc])
    text_coded_len = len(text_coded[0])
    print('Coded text length:', text_coded_len)

    ## pad according to current batch size and seq length (stateful training requirement!)
    ## the "+ 1" always rounds up to the next full block, so at least one pad id is appended
    block_size = batch_size * seq_length
    padding_length = block_size * ((text_coded_len // block_size) + 1)
    input_array = pad_sequences(text_coded, padding='post', maxlen=padding_length)[0,:]
    print('Padded input array shape:', input_array.shape)

    ## create target array from input array: shift left by one, last target wraps to first input
    target_array = np.roll(input_array, -1)

    ## reshaping input and target array to fit stateful training
    ## reshaping according to batch_size
    input_array = input_array.reshape((batch_size, -1))
    target_array = target_array.reshape((batch_size, -1))
    ## creating list of batches: each piece has shape (batch_size, seq_length)
    n_batches = input_array.shape[1] // seq_length
    x_batches = np.split(input_array, n_batches, axis=1)
    y_batches = np.split(target_array, n_batches, axis=1)

    ## concatenating list of batches (fit instead of fit generator)
    X = np.concatenate(x_batches)
    y = np.concatenate(y_batches)
    ## additional rank for y array (Keras requirement)
    y = y.reshape(y.shape[0], y.shape[1], 1)

    print('Input tensor shape:' , X.shape)
    print('Target tensor shape:' , y.shape)

    return X, y, vocab_size

In [None]:
## Evaluation of cross sentence LSTMs
def cross_sentence_LSTM_eval(model_path, testfile, tokenizer, batch_size=20, seq_length=35):
    """Load a saved model, evaluate it on ``testfile`` and print loss and perplexity.

    Args:
        model_path: path of the saved Keras model.
        testfile: path of the text file to evaluate on.
        tokenizer: path of the pickled tokenizer, forwarded to
            ``txt_to_tensor_cross_sent_LSTM``.
        batch_size: batch size used for tensor creation and evaluation.
        seq_length: sequence length used for tensor creation.
    """
    lm = load_model(model_path)
    inputs, targets, _ = txt_to_tensor_cross_sent_LSTM(testfile, tokenizer, batch_size, seq_length)
    loss = lm.evaluate(inputs, targets, batch_size=batch_size)
    print(testfile)
    ## perplexity is the exponential of the evaluation loss
    perplexity = exp(loss)
    print('Loss: %f\nPerplexity: %f\n\n' % (loss, perplexity))
    del lm

In [None]:
## stateful 2-layer LSTM for cross sentence modeling
def LSTM_stateful_model(vocab_size, embedding_dim, batch_size, seq_length, dropout_rate, LSTM_hidden_size,
                        embedding_matrix = 'None'):
    """Build the stateful 2-layer LSTM language model (not compiled).

    Layer stack: Embedding (``mask_zero=True``) -> Dropout -> stateful LSTM ->
    Dropout -> stateful LSTM -> Dropout -> TimeDistributed softmax over the
    vocabulary. The model summary is printed before the model is returned.

    Args:
        vocab_size: size of the embedding input and of the softmax output.
        embedding_dim: embedding vector size.
        batch_size: fixed batch size (part of ``batch_input_shape``).
        seq_length: fixed number of time steps (part of ``batch_input_shape``).
        dropout_rate: rate used by all three Dropout layers.
        LSTM_hidden_size: number of units in each LSTM layer.
        embedding_matrix: initial embedding weights, or the string ``'None'``
            (default) / ``None`` for a randomly initialised embedding.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    ## Do not compare an array with == 'None': on current NumPy that comparison is
    ## elementwise, and using its result in an `if` raises ValueError.
    no_pretrained = embedding_matrix is None or (
        isinstance(embedding_matrix, str) and embedding_matrix == 'None')

    model = Sequential()
    if no_pretrained:
        model.add(Embedding(vocab_size, embedding_dim, batch_input_shape=(batch_size, seq_length), mask_zero=True))
    else:
        model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], batch_input_shape=(batch_size, seq_length),
                  trainable=True, mask_zero=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(LSTM_hidden_size, recurrent_activation='sigmoid', return_sequences=True, stateful=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(LSTM_hidden_size, recurrent_activation='sigmoid', return_sequences=True, stateful=True))
    model.add(Dropout(dropout_rate))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    ## summary() prints the table itself and returns None; wrapping it in print()
    ## additionally printed a stray "None"
    model.summary()
    return model

In [None]:
## text generation with cross sentence LSTM
def _apply_temperature(preds, temperature):
    """Rescale a probability vector by ``temperature`` and renormalise it to sum to 1.

    Approximately ``p_i ** (1 / temperature)`` renormalised; a small epsilon keeps
    the logarithm finite for zero probabilities.
    """
    logits = np.log(np.asarray(preds, dtype='float64') + 1e-12) / temperature
    ## subtract the maximum before exponentiating for numerical stability
    scaled = np.exp(logits - np.max(logits))
    return scaled / np.sum(scaled)


def cross_sentence_LSTM_generate(model_path, tokenizer, seed_text='', num_words=100, temperature=1.0,
                                 batch_size=20, seq_length=35, random_seed=None):
    """Generate text with a saved cross sentence LSTM and write it to stdout.

    Words are written one by one, separated by spaces; the ``<eos>`` token is
    written as a newline. Nothing is returned.

    Args:
        model_path: path of the saved Keras model.
        tokenizer: path of the pickled tokenizer (must provide ``word_index``).
        seed_text: whitespace separated seed words; every word must be in the
            tokenizer vocabulary, otherwise ``KeyError`` is raised.
        num_words: number of words to generate.
        temperature: sampling temperature. ``1.0`` passes the model output to the
            sampler unchanged; other positive values rescale the distribution first.
        batch_size: batch size the model expects.
        seq_length: sequence length the model expects.
        random_seed: seed passed to ``np.random.seed``.

    Raises:
        ValueError: if ``temperature`` is not positive.

    Note:
        ``sample_pred_simple`` is not defined in the visible part of this notebook;
        it is called with the predicted probability vector and is presumably
        expected to return a sampled token id -- TODO confirm.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got %r' % (temperature,))

    ## seed numpy random
    np.random.seed(random_seed)

    ## load model
    model = load_model(model_path)
    ## load tokenizer
    ## NOTE: pickle.load can execute arbitrary code -- only load tokenizer files you trust
    with open(tokenizer, 'rb') as handle:
        tokenizer = pickle.load(handle)

    ## create id to word mapping dictionary
    word_to_id = tokenizer.word_index
    id_to_word = {index: word for word, index in word_to_id.items()}
    ## add key 0 to dictionary if it does not exist
    id_to_word.setdefault(0, '<mask>')

    ## coding seed text
    sentence = [word_to_id[word] for word in seed_text.split()]

    for _ in range(num_words):
        ## pad the sentence so that it is seq_length long
        sentence_padded = pad_sequences([sentence], maxlen=seq_length)
        ## pad the row to seq_length*batch_size, so that the later rows are all zeros
        sentence_padded_postzero = pad_sequences(sentence_padded, maxlen=seq_length*batch_size, padding='post')
        ## arrange it so that only the first row of the batch carries values, the rest is zero
        sentence_array = np.reshape(sentence_padded_postzero, (-1, seq_length))
        ## prediction (we look at the last time step of the first row)
        preds = model.predict(sentence_array, batch_size=batch_size)[0,-1]
        ## honour the temperature argument; 1.0 keeps the distribution untouched
        if temperature != 1.0:
            preds = _apply_temperature(preds, temperature)
        ## sample the distribution
        next_index = sample_pred_simple(preds)
        ## translate the sampled id to a word
        next_word = id_to_word[next_index]
        ## the next input word is the current output word (the stateful network remembers the history!)
        sentence = [next_index]
        ## write out the next word
        sys.stdout.write((next_word if next_word != '<eos>' else '\n') + ' ')
        sys.stdout.flush()

In [None]:
## training function for cross sentence LSTM
def cross_sentence_LSTM_train(train_txt, valid_txt, eval_txt, tokenizer, optimizer, callbacks, pretrained_embedding = 'None',
                              batch_size = 20, epochs = 100, dropout_rate = 0.5, embedding_dim = 650, LSTM_hidden_size = 650,
                              seq_length = 35):
    """Train the cross sentence LSTM and evaluate the best checkpoint.

    Training and validation tensors are built with ``txt_to_tensor_cross_sent_LSTM``,
    the model is built with ``LSTM_stateful_model`` and compiled with
    ``sparse_categorical_crossentropy``. A ``ModelCheckpoint`` monitoring
    ``val_loss`` (best only) saves to
    ``model/cs-LSTM_<train file>_BS-<batch_size>_EMB-<embedding file>.h5``; that
    file is afterwards evaluated on ``valid_txt`` and ``eval_txt``.

    Args:
        train_txt: path of the training text.
        valid_txt: path of the validation text.
        eval_txt: path of the test text.
        tokenizer: path of the pickled tokenizer.
        optimizer: Keras optimizer passed to ``compile``.
        callbacks: list of Keras callbacks (or ``None``); the list is not modified.
        pretrained_embedding: path of a pretrained embedding, or the string
            ``'None'`` (default) to train the embedding from scratch.
        batch_size: batch size for tensors, model and fitting.
        epochs: maximum number of epochs.
        dropout_rate: dropout rate of the model.
        embedding_dim: embedding vector size.
        LSTM_hidden_size: number of units per LSTM layer.
        seq_length: number of time steps per training row.

    Note:
        ``create_embedding_matrix`` is not defined in the visible part of this
        notebook; it must be available when ``pretrained_embedding`` is given.
        NOTE(review): this function never calls ``reset_states`` itself, either
        between epochs or between training and validation -- confirm the LSTM
        state handling during ``fit`` is intended.
    """
    ## extract train txt filename
    train_txt_fn = path.splitext(path.basename(train_txt))[0]
    ## Creating tensors
    X, y, vocab_size = txt_to_tensor_cross_sent_LSTM(train_txt, tokenizer, batch_size, seq_length)
    X_valid, y_valid, _ = txt_to_tensor_cross_sent_LSTM(valid_txt, tokenizer, batch_size, seq_length)
    ## define model
    if pretrained_embedding == 'None':
        model = LSTM_stateful_model(vocab_size, embedding_dim, batch_size, seq_length, dropout_rate, LSTM_hidden_size)
    else:
        ## creating pretrained embedding matrix
        embedding_matrix = create_embedding_matrix(pretrained_embedding, tokenizer, embedding_dim)
        model = LSTM_stateful_model(vocab_size, embedding_dim, batch_size, seq_length, dropout_rate, LSTM_hidden_size,
                                    embedding_matrix = embedding_matrix)
    ## compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)
    ## extract pretrained embedding filename
    pretrained_embedding_fn = path.splitext(path.basename(pretrained_embedding))[0]
    ## compile model name
    model_fn = 'model/cs-LSTM_{0}_BS-{1}_EMB-{2}.h5'.format(train_txt_fn, batch_size, pretrained_embedding_fn)
    ## add model saving to a COPY of the callbacks: appending to the caller's list
    ## would stack one more ModelCheckpoint every time the training cell is re-run
    fit_callbacks = list(callbacks) if callbacks is not None else []
    fit_callbacks.append(ModelCheckpoint(model_fn, monitor='val_loss', save_best_only=True))
    ## fit model (shuffle=False keeps the batch order produced by the tensor conversion)
    model.fit(X, y, epochs=epochs, callbacks=fit_callbacks, verbose=2, validation_data=(X_valid, y_valid), batch_size=batch_size, shuffle=False)
    ## evaluation
    cross_sentence_LSTM_eval(model_fn, valid_txt, tokenizer, batch_size, seq_length)
    cross_sentence_LSTM_eval(model_fn, eval_txt, tokenizer, batch_size, seq_length)

In [None]:
## global variables
## batching: rows per batch and time steps per row
batch_size = 20
seq_length = 35
## model size
embedding_dim = 650
LSTM_hidden_size = 650
## training: dropout rate and maximum number of epochs
dropout_rate = 0.5
epochs = 100

In [None]:
## training and validation data
## paths are relative to the notebook's working directory
train_txt = 'data/ptb/ptb_train.txt'
valid_txt = 'data/ptb/ptb_valid.txt'
## held-out test split, only used for the final evaluation
eval_txt = 'data/ptb/ptb_test.txt'

In [None]:
## tokenizer
## path of the pickled tokenizer
## NOTE(review): tokenize_txt_cross_sentence in this notebook writes 'tokenizer/cs-<name>.pickle',
## while this path uses the 'si-' prefix -- confirm this is the intended tokenizer file
tokenizer = 'tokenizer/si-ptb_train.pickle'

## optimizer
optimizer = optimizers.SGD(lr=1.0, momentum=0.9)

## stop training when the validation loss has not improved for 3 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
## halve the learning rate whenever the validation loss does not improve (down to min_lr)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=0, min_lr=0.001, verbose=1)

## callbacks handed to the training function
callbacks = [early_stopping, reduce_lr]

In [None]:
## run training with the settings defined in the configuration cells above
cross_sentence_LSTM_train(
    train_txt,
    valid_txt,
    eval_txt,
    tokenizer,
    optimizer,
    callbacks,
    batch_size=batch_size,
    epochs=epochs,
    dropout_rate=dropout_rate,
    embedding_dim=embedding_dim,
    LSTM_hidden_size=LSTM_hidden_size,
    seq_length=seq_length,
)