In [1]:
# Author Bereket Kebede, Graduate Student
# Neural Networks -  Assignment #4 - University of Memphis. Fall 2021
# Question #1, Stacked RNN, LSTM, GRU
# Last updated - Oct 25, 2021


In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
#####################################################################################
# Import necessary libraries

import numpy as np
from sklearn.metrics import accuracy_score
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import os

from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Activation
from tensorflow.keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.text import Tokenizer


from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [42]:
#####################################################################################
# Load Data


def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()
 
    return data.split('\n')

In [43]:
#####################################################################################
# Row Data Reading

# Load English data
english_sentences = load_data('small_vocab_en.txt')
# Load French data
french_sentences = load_data('small_vocab_fr.txt')

print('Dataset Loaded')

Dataset Loaded


In [44]:
#####################################################################################
# Display Sample data

for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les Ã©tats-unis est gÃ©nÃ©ralement froid en juillet , et il gÃ¨le habituellement en novembre .


In [45]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    # TODO: Implement
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [46]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # TODO: Implement
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')

#tests.test_pad(pad)

# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [47]:
def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)
    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


In [48]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [49]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model


# Reshaping the input to work with a basic RNN
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
print(tmp_x.shape[1:])

(21, 1)


In [11]:
#####################################################################################
# Train the Neural Network

simple_rnn_model = simple_model(tmp_x.shape,max_french_sequence_length,english_vocab_size,french_vocab_size)

simple_rnn_model.summary()

simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=16, epochs=10, validation_split=0.2)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 21, 1)]           0         
_________________________________________________________________
gru (GRU)                    (None, 21, 64)            12864     
_________________________________________________________________
time_distributed (TimeDistri (None, 21, 345)           22425     
_________________________________________________________________
activation (Activation)      (None, 21, 345)           0         
Total params: 35,289
Trainable params: 35,289
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14691419ca0>

In [12]:
prediction = simple_rnn_model.predict(tmp_x[:1])
print(logits_to_text(prediction[0], french_tokenizer))

new jersey est parfois humide en l' de il est il est en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [73]:
def stacked_rnn(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # TODO: Build the layers
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    
    print(input_shape)
    
    
    #model = Model(input_seq, Activation('softmax')(logits))

    model = Sequential()
   
    model.add(SimpleRNN(50, input_shape = input_shape[1:], return_sequences = True))   # return_sequences parameter has to be set True to stack
    model.add(SimpleRNN(50, return_sequences = True))
    model.add(Dense(french_vocab_size, activation='softmax'))

    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model


#tests.test_simple_model(simple_model)


In [74]:
stacked_rnn_model = stacked_rnn(tmp_x.shape,max_french_sequence_length,english_vocab_size,french_vocab_size)

stacked_rnn_model.summary()

stacked_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=16, epochs=10, validation_split=0.2)

(137861, 21, 1)
Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_57 (SimpleRNN)    (None, 21, 50)            2600      
_________________________________________________________________
simple_rnn_58 (SimpleRNN)    (None, 21, 50)            5050      
_________________________________________________________________
dense_40 (Dense)             (None, 21, 345)           17595     
Total params: 25,245
Trainable params: 25,245
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14691c46400>

In [135]:
from numpy import zeros, newaxis
b = tmp_x[newaxis, :, :, :]
for i in range(1,11):
    prediction = stacked_rnn_model.predict(b[:,i,:,:])
    print(logits_to_text(prediction[0], french_tokenizer))

les ã©tats unis est gã©nã©ralement pluvieux en avril et il est gã©nã©ralement agrã©able en ã©tã© <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
californie est gã©nã©ralement habituellement en l' et il est gã©nã©ralement relaxant en l' <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
les ã©tats unis est parfois humide en printemps et il est est en juillet <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
votre fruit est moins aimã© la raisin mais moins moins aimã© est la <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
son fruit prã©fã©rã© est la citron mais votre favori est la raisin <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
paris est calme en dã©cembre mais il est gã©nã©ralement pluvieux en juillet <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est occupã© en juin et il est jamais tranquille en mars <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
notre fruit aimã© moins est la chaux mais moins moins aimã© est la pomme <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
les ã©tats unis 

In [143]:
from keras.layers import LSTM
def stacked_lstm(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    learning_rate = 1e-3
    
    print("Input Shape: ")
    print(input_shape)

    model = Sequential()
   
    model.add(LSTM(50, input_shape = input_shape[1:], return_sequences = True))   # return_sequences parameter has to be set True to stack
    model.add(LSTM(50, return_sequences = True))
    model.add(Dense(french_vocab_size, activation='softmax'))

    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model



In [144]:
stacked_lstm_model = stacked_lstm(tmp_x.shape,max_french_sequence_length,english_vocab_size,french_vocab_size)

stacked_lstm_model.summary()

stacked_lstm_model.fit(tmp_x, preproc_french_sentences, batch_size=16, epochs=10, validation_split=0.2)

(137861, 21, 1)
Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 21, 50)            10400     
_________________________________________________________________
lstm_5 (LSTM)                (None, 21, 50)            20200     
_________________________________________________________________
dense_42 (Dense)             (None, 21, 345)           17595     
Total params: 48,195
Trainable params: 48,195
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14691c46e80>

In [146]:
for i in range(1,11):
    prediction = stacked_lstm_model.predict(b[:,i,:,:])
    print(logits_to_text(prediction[0], french_tokenizer))

la ã©tats unis est gã©nã©ralement froid en juillet et il est gã©nã©ralement agrã©able en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
californie est gã©nã©ralement chaud en mois et il est gã©nã©ralement est en juin <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
la ã©tats unis est parfois doux en juin et il est merveilleux en juillet <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
votre fruit est moins aimã© la raisin mais mon moins aimã© est la pomme <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
son fruit prã©fã©rã© est la mais mais prã©fã©rã© prã©fã©rã© est la <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
paris est relaxant en fã©vrier mais il est gã©nã©ralement froid en juillet <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est occupã© en l' et il est jamais chaud en l' <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
notre fruit aimã© fruit est la pomme mais moins moins aimã© est la pomme <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
la ã©tats unis est parfois froid e

In [151]:
from keras.layers import GRU
def stacked_gru(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    learning_rate = 1e-3
    
    print("Input Shape: ")
    print(input_shape)

    model = Sequential()
   
    model.add(GRU(50, input_shape = input_shape[1:], return_sequences = True))   # return_sequences parameter has to be set True to stack
    model.add(GRU(50, return_sequences = True))
    model.add(Dense(french_vocab_size, activation='softmax'))

    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model



In [152]:
stacked_gru_model = stacked_gru(tmp_x.shape,max_french_sequence_length,english_vocab_size,french_vocab_size)

stacked_gru_model.summary()

stacked_gru_model.fit(tmp_x, preproc_french_sentences, batch_size=16, epochs=10, validation_split=0.2)

Input Shape: 
(137861, 21, 1)
Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_35 (GRU)                 (None, 21, 50)            7950      
_________________________________________________________________
gru_36 (GRU)                 (None, 21, 50)            15300     
_________________________________________________________________
dense_43 (Dense)             (None, 21, 345)           17595     
Total params: 40,845
Trainable params: 40,845
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1468f465ee0>

In [153]:
for i in range(1,11):
    prediction = stacked_gru_model.predict(b[:,i,:,:])
    print(logits_to_text(prediction[0], french_tokenizer))

les ã©tats unis est gã©nã©ralement pluvieux en juillet et il est gã©nã©ralement agrã©able en novembre <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
californie est gã©nã©ralement calme en l' et il est gã©nã©ralement calme en printemps <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
les ã©tats unis est parfois chaud en printemps et il est froid en septembre <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
votre fruit aimã© fruit est la pomme mais votre moins aimã© est la <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
son fruit prã©fã©rã© est la pomme mais votre prã©fã©rã© est la pomme <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
paris est calme au mois mais il est gã©nã©ralement gã©nã©ralement en juillet <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est occupã© au printemps et il est jamais tranquille en mars <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
notre fruit moins fruit est la pomme mais votre moins aimã© est la <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
les ã©