In [None]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt
import importlib
from keras.utils import to_categorical
import tensorboardcolab
import os
from keras.preprocessing.text import text_to_word_sequence
import collections
import pandas as pd
import string
from string import digits

from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import plot_model

import warnings
warnings.filterwarnings("ignore")

# using gpu
# Create a TF1-style session whose config asks TensorFlow to log the device
# each op is placed on.
# NOTE(review): `sess` is not registered with Keras anywhere in this cell
# (no K.set_session call is visible) — confirm whether that is intended.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from keras import backend as K
# Ask the Keras TensorFlow backend which GPUs it can see; as the last
# expression of the cell, the result is what the notebook displays.
K.tensorflow_backend._get_available_gpus()

### Mounting Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; force_remount re-mounts even if a
# previous mount is still present in this runtime.
drive.mount('/content/drive', force_remount = True)

### Number of Different Characters

In [None]:
# Build the character vocabulary from the whole corpus.
with open('drive/My Drive/Colab Notebooks/Problem1_HW4/ferdosi.txt', 'r') as file:
  # text_to_word_sequence tokenises the text into words; joining the words
  # back together leaves only the characters that occur inside words.
  whole_list = text_to_word_sequence(file.read())

# Sort the unique characters so that every run (and every saved checkpoint)
# sees the same character -> id mapping; plain set order is not stable.
chars = sorted(set(''.join(whole_list)))
vocab_size = len(chars)
# +1 accounts for the space character, which is added as a special token below.
print('data has {0} unique characters.'.format(vocab_size + 1))

vocab_to_int = {ch: i for i, ch in enumerate(chars)}

# Special tokens take the ids immediately after the regular characters, so
# they can never collide with a character id or leave unused gaps. With a
# 36-character corpus this yields 36, 37, 38 and 39.
for offset, token in enumerate([' ', '<PAD>', '<GO>', '<END>']):
  vocab_to_int[token] = vocab_size + offset

# The reverse mapping is derived from the forward one so the two cannot drift.
int_to_vocab = {i: ch for ch, i in vocab_to_int.items()}

del(whole_list)
del(chars)

#### Report:
The whole text is read and split into a list of words. The number of unique characters is calculated, and the 'vocab_to_int' and 'int_to_vocab' dictionaries are generated.

In [None]:
# Display the character -> integer id mapping.
vocab_to_int

In [None]:
# Display the integer id -> character mapping.
int_to_vocab

### Preparing Dataset 

In [None]:
# TODO: space at the end of each input
input_text = []  # text of every input (part of the line before the first comma)
target_text = []  # text of every target (part of the line after the first comma)

inputs = []  # integer seq of every input
targets = []  # integer seq of every target

with open('drive/My Drive/Colab Notebooks/Problem1_HW4/ferdosi.txt', 'r') as file:
  text_line = file.readlines()

print(len(text_line))

skipped_lines = 0
for line in text_line:
  # Split once and reuse the pieces for both halves of the line.
  parts = line.split(',')
  if len(parts) < 2:
    # No comma means there is no "input, target" pair on this line (for
    # example a blank trailing line); indexing parts[1] would raise.
    skipped_lines += 1
    continue

  source = parts[0].rstrip()
  # Drop trailing whitespace, then the single character right after the
  # comma (presumably the separating space — verify against the data file).
  target = parts[1].rstrip()[1:]

  input_text.append(source)
  target_text.append(target)
  inputs.append([vocab_to_int[ch] for ch in source])
  targets.append([vocab_to_int[ch] for ch in target])

if skipped_lines:
  print('Skipped {0} line(s) without a comma separator.'.format(skipped_lines))


#### Report:
First, the lines of the text are read. We save the first part of every line in 'input_text' and the second part in 'target_text'. We also save their numeric sequences in 'inputs' and 'targets' (extra spaces at the end of the first part and at the beginning of the second part are excluded).

### Finding Maximum Number of Characters in a Sequence

In [None]:
# Length of the longest sequence on each side; used as the padding width.
max_len_inputs = max(map(len, inputs))
max_len_targets = max(map(len, targets))

print('Maximum Length of Input Sequences : {0}'.format(max_len_inputs))
print('Maximum Length of Target Sequences : {0}'.format(max_len_targets))

#### Report:
Now we find maximum length of both input and target sequences.

### Padding Inputs and Targets And Performing One-Hot Encoding

In [None]:
# Special-token ids are looked up from vocab_to_int rather than assumed.
pad_id = vocab_to_int['<PAD>']
go_id = vocab_to_int['<GO>']
end_id = vocab_to_int['<END>']

# The padded sequences are rebuilt from the raw text on every run instead of
# being modified in place, so re-executing this cell gives the same result
# rather than stacking another <GO>/<END> pair onto already padded rows.

# Inputs are padded on the left:  <GO> <PAD>... chars <END>
inputs = [
  [go_id]
  + [pad_id] * (max_len_inputs - len(text))
  + [vocab_to_int[ch] for ch in text]
  + [end_id]
  for text in input_text
]

# Targets are padded on the right: <GO> chars <PAD>... <END>
# NOTE(review): <END> comes after the padding, so the decoder is trained to
# emit the <PAD> tokens before <END> — confirm this is intended.
targets = [
  [go_id]
  + [vocab_to_int[ch] for ch in text]
  + [pad_id] * (max_len_targets - len(text))
  + [end_id]
  for text in target_text
]

# Final sequence lengths, including the <GO> and <END> markers.
len_inputs = max(len(seq) for seq in inputs)
len_targets = max(len(seq) for seq in targets)

#### Report:
First, we use padding to equalize the length of all input sequences and, likewise, of all target sequences. We also add the GO and END characters to all input and target sequences.

### Inputs And Targets of Model

In [None]:
# Encoder consumes the full padded input sequence.
encoder_inputs_data = np.array(inputs)

# Teacher forcing: the decoder is fed the target without its last token and
# is asked to predict the same target shifted one step left (without the
# first token), one-hot encoded.
target_ids = np.array(targets)
decoder_inputs_data = target_ids[:, :-1]
decoder_targets_data = np.array(to_categorical(targets)[:, 1:])

#### Report:
encoder_inputs_data is obtained by converting inputs to a NumPy array. decoder_inputs_data and decoder_targets_data are obtained from targets by excluding the END and GO characters, respectively.

### Splitting Test and Train data

In [None]:
# Index of the first test row: the leading 90% of the rows (in file order,
# no shuffling) form the training set, the rest the test set.
frac = round(0.9 * len(inputs))

tr_en_inputs, ts_en_inputs = encoder_inputs_data[:frac], encoder_inputs_data[frac:]
tr_de_inputs, ts_de_inputs = decoder_inputs_data[:frac], decoder_inputs_data[frac:]
tr_de_targets, ts_de_targets = decoder_targets_data[:frac], decoder_targets_data[frac:]

# Report the shape of the full dataset and of each split.
for _caption, _arr in (
    ('Shape of Dataset: \t\t{0}', encoder_inputs_data),
    ('Shape of Training Set Inputs: \t{0}', tr_en_inputs),
    ('Shape of Training Set Targets: \t{0}', tr_de_targets),
    ('Shape of Test Set Inputs: \t{0}', ts_en_inputs),
    ('Shape of Test Set Targets: \t{0}', ts_de_targets),
):
  print(_caption.format(_arr.shape))

#### Report:
We split test and train datasets. You can see their shapes above.

### Building The Encoder

In [None]:
# ---- Hyper-parameters -------------------------------------------------
vec_len = 300        # size of each embedding vector
latent_dim = 1024    # number of units in every LSTM layer
dropout_rate = 0.2   # rate used by the dropout layers
batch_size = 64      # mini-batch size for training
epochs = 6           # number of training epochs

# ---- Encoder graph ----------------------------------------------------
# Variable-length sequence of character ids.
encoder_input = Input(shape=(None,))

# Layers are instantiated in the order they appear in the network, then wired.
encoder_embedding_layer = Embedding(input_dim=len(vocab_to_int), output_dim=vec_len)
encoder_dropout_layer = TimeDistributed(Dropout(rate=dropout_rate))
encoder_LSTM_layer = CuDNNLSTM(latent_dim, return_sequences=True)
encoder_LSTM2_layer = CuDNNLSTM(latent_dim, return_state=True)

encoder_embedding = encoder_embedding_layer(encoder_input)
encoder_dropout = encoder_dropout_layer(encoder_embedding)
encoder_LSTM = encoder_LSTM_layer(encoder_dropout)

# The last LSTM also returns its final hidden and cell state.
encoder_outputs, state_h, state_c = encoder_LSTM2_layer(encoder_LSTM)

# Only the final states are passed on to the decoder; `encoder_outputs`
# is not used any further.
encoder_states = [state_h, state_c]

#### Report:
In this part, we build our encoder. Input layer is constructed and after that we use Embeddings of size 300. We use Dropout and then we feed the outputs to our LSTM layer of size 1024. At last, another LSTM layer is provided and we save encoder outputs and encoder states.

### Building The Decoder

In [None]:
# Decoder for training: it reads the shifted target sequence and starts its
# first LSTM from `encoder_states`.
decoder_input = Input(shape=(None,))

# Every layer is kept in its own variable because the inference model
# reuses these exact layer objects (and therefore their trained weights).
decoder_embedding_layer = Embedding(input_dim=len(vocab_to_int), output_dim=vec_len)
decoder_dropout_layer = TimeDistributed(Dropout(rate=dropout_rate))
decoder_LSTM_layer = CuDNNLSTM(latent_dim, return_sequences=True)
decoder_LSTM_2_layer = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
# Softmax over the whole vocabulary at every time step.
decoder_dense = Dense(len(vocab_to_int), activation='softmax')

# Wire the layers together.
decoder_embedding = decoder_embedding_layer(decoder_input)
decoder_dropout = decoder_dropout_layer(decoder_embedding)
decoder_LSTM = decoder_LSTM_layer(decoder_dropout, initial_state=encoder_states)
# The states returned by the second LSTM are not needed during training.
decoder_LSTM_2, _, _ = decoder_LSTM_2_layer(decoder_LSTM)
decoder_outputs = decoder_dense(decoder_LSTM_2)

#### Report:
Here we build our decoder. At first, an input layer is generated and after that we use the embedding layer of size 300. Just like encoder model, we have Dropouts and 2 consecutive LSTM layers of size 1024. At last, we use a dense layer of size len(vocab_to_int) to convert the outputs to a domain of our dictionary size to make decisions.

### Bringing The Encoder And Decoder Together Into One Model

In [None]:
# Training model: maps (encoder input, decoder input) to the per-step
# character distribution produced by the decoder.
model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_outputs)
model.summary()

# Save the weights whenever the validation loss improves; the file name
# records the epoch number and the validation loss.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

#### Report:
Here you can see the summary of our model.

### Training The Model

In [None]:
# Compile and fit on the complete training split.
num_train_samples = len(tr_en_inputs)

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# The variable name `hsitory` (sic) is kept because the plotting cell reads it.
# 8% of the training rows are held out by Keras for validation.
hsitory = model.fit(
    [tr_en_inputs, tr_de_inputs],
    tr_de_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.08,
    callbacks=callbacks_list,
)

#### Report:
We train our model using 'rmsprop' as the optimizer and 'categorical_crossentropy' as the loss function. You can see that the loss is decreasing and that the accuracy on both the validation and training datasets is increasing gradually.

### Plotting Model

In [None]:
# Render the training model's layer graph (with shapes and layer names) to a PNG.
plot_model(
    model,
    to_file='model_plot4a.png',
    show_shapes=True,
    show_layer_names=True,
)

#### Report:
Here you can see the structure of our model.

### Loss & Accuracy Plots

In [None]:
def _plot_history_curve(train_key, val_key, title, ylabel, legend_loc):
  """Plot one training-vs-validation curve from the recorded fit history.

  train_key / val_key are the keys of `hsitory.history` to draw; title,
  ylabel and legend_loc control the figure decoration.
  """
  plt.figure(figsize=(8, 8))
  shared_style = dict(linestyle='dashed', marker='o', markersize=10)
  plt.plot(hsitory.history[train_key], color='red', markerfacecolor='green', **shared_style)
  plt.plot(hsitory.history[val_key], color='blue', markerfacecolor='yellow', **shared_style)
  plt.grid()
  plt.title(title)
  plt.ylabel(ylabel)
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Validation'], loc=legend_loc)
  plt.show()


# Training & validation accuracy per epoch
_plot_history_curve('acc', 'val_acc', 'Model accuracy', 'Accuracy', 'upper left')

# Training & validation loss per epoch
_plot_history_curve('loss', 'val_loss', 'Model loss', 'Loss', 'upper right')

#### Report:
The loss and accuracy for both the validation and training samples are plotted. You can see that the loss of our model decreases over the epochs, while the accuracy on both the training and validation datasets increases gradually; we reach accuracies of more than 70% on both datasets.

### Test Loss & Accuracies

In [None]:
# Evaluate on the held-out test split first, then on the training split.
test_inputs = [ts_en_inputs, ts_de_inputs]
test_loss, test_accuracy = model.evaluate(test_inputs, ts_de_targets)

train_inputs = [tr_en_inputs, tr_de_inputs]
train_loss, train_accuracy = model.evaluate(train_inputs, tr_de_targets)

# Summarise both results side by side.
table = pd.DataFrame({
    'Dataset': ['Training Dataset', 'Test Dataset'],
    'Accuracy': [train_accuracy, test_accuracy],
    'Loss': [train_loss, test_loss],
})
display(table)

#### Report:
The accuracy and loss values for both the test and training samples are provided in the table above. As you can see, the test accuracy is very close to the validation and training accuracies, so we do not have an overfitting problem and the results are satisfactory.

### Inference Phase

In [None]:
# Inference-time encoder: maps an input sequence to the final LSTM states.
encoder_model = Model(encoder_input, encoder_states)

# Placeholders for the recurrent state that is fed into the decoder at each
# inference step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Data flows through decoder
# One token per call; the trained decoder layers are reused, so the
# inference model shares their weights.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding_layer(decoder_inputs_single)
decoder_inputs_single_x = decoder_dropout_layer(decoder_inputs_single_x)
decoder_preoutputs = decoder_LSTM_layer(decoder_inputs_single_x,
                                  initial_state = decoder_states_inputs)
# NOTE(review): the states returned here (h, c) belong to the SECOND decoder
# LSTM, while `decoder_states_inputs` initialises the FIRST one. The second
# LSTM gets no initial state in this graph, and the first LSTM's own state is
# not returned. When predict_sentence feeds [h, c] back in, this differs from
# how the layers were run during training — confirm whether this is intended.
# `decoder_outputs` is rebound here; it previously named the training output.
decoder_outputs, h, c = decoder_LSTM_2_layer(decoder_preoutputs)
decoder_outputs = decoder_dense(decoder_outputs)

decoder_states = [h, c]

# defining new decoder model
# Inputs: [token, state_h, state_c]; outputs: [token distribution, h, c].
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

#### Report:
Here we build our inference model. There is a small difference between inference and training model. In inference model we only feed GO character to decoder inputs and other inputs are the outputs of previous time steps.

### Plotting Modified Decoder Model

In [None]:
# Render the single-step inference decoder's layer graph to a PNG.
plot_model(
    decoder_model,
    to_file='model_plot_dec.png',
    show_shapes=True,
    show_layer_names=True,
)

#### Report:
We plot the structure of our modified decoder.

### Making Predictions

In [None]:
def predict_sentence(input_seq):
    """Greedily decode a response for one encoded input sequence.

    Args:
        input_seq: integer character ids of a single input, either 1-D
            (sequence,) or already batched as (1, sequence).

    Returns:
        The decoded tokens joined into one string. The token list is
        reversed before joining, and the special tokens that were emitted
        (including the final '<END>') are part of the string.
    """
    # Model.predict treats the first axis as the batch axis, so a bare 1-D
    # sequence would be read as many one-character samples. Promote it to a
    # batch holding exactly one sequence; 2-D input passes through unchanged.
    input_seq = np.atleast_2d(input_seq)

    # Encode the input once; its final states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # The decoder is driven one token at a time, starting from <GO>.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = vocab_to_int['<GO>']
    eos = vocab_to_int['<END>']
    output_sentence = []

    # At most len_targets - 1 steps, matching the decoder targets in training.
    for _ in range(len_targets - 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Greedy choice: take the most probable next token.
        idx = int(np.argmax(output_tokens[0, 0, :]))
        output_sentence.append(int_to_vocab[idx])

        if idx == eos:
            break

        # Feed the chosen token and the updated states into the next step.
        target_seq[0, 0] = idx
        states_value = [h, c]

    # NOTE(review): the decoded tokens are reversed before joining; the
    # reason is not visible in this notebook — confirm it is intended.
    output_sentence.reverse()
    return ''.join(output_sentence)
# Show the model's response for ten randomly chosen training examples.
for _ in range(10):
  i = np.random.choice(len(tr_en_inputs))
  # Slice with i:i + 1 instead of indexing with i so the sample keeps its
  # leading batch axis: Model.predict treats the first axis as the batch, and
  # a bare 1-D row would be read as many one-character sequences.
  input_seq = tr_en_inputs[i:i + 1]
  prediction = predict_sentence(input_seq)
  # Row i of the training split is row i of the original text lists, because
  # the training split is the leading slice of the dataset.
  print('Input: ', input_text[i])
  print('Response: ', prediction)
  print('Correct Response: ', target_text[i])
  print('------------------------------------------------------')