<a href="https://colab.research.google.com/github/ankurshrivastav/Seq2Seq/blob/master/Seq2SeqModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem Statement: Summarize product reviews using Seq2Seq models

**Learning Objectives**


1.   Understand how Seq2Seq models work
2.   Understand how Keras works with the latest TensorFlow (v2.0) backend
3. Apply this architecture to other similar problems



**Approach**



1.   Download glove embeddings (50d) and amazon product reviews (to be used for training & test data)
2.   Define following dictionaries

> a. Word to Index - Maps each word (words sorted alphabetically) in the glove embedding to an index

> b. Index to Word - Reverse mapping of index to words

> c. Word_to_vec - Glove Embeddings for each word


3. Define global variables



> m = total number of reviews to be read (training + test set)

> Tx = Maximum number of words in a review (time steps); longer reviews are truncated

> Ty = Maximum number of words in a summary (time steps); longer summaries are truncated

> vocab_size = Number of words in the dictionary (glove embeddings)


4. Load the reviews (X) and corresponding summaries (Y)

5. Convert X & Y into indices using Word to Index. Truncate or pad them based on Tx & Ty

6. One Hot encode X & Y (Do we need to do this?)

7. Create the Model
> a. # Base Code From: https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/

> b. Use an embedding layer (with glove embeddings pre-fed to it) as the input layer to the model



















In [0]:
# Base Code From: https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/

# Import packages
import numpy as np
import time
import importlib
import re
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
import csv
import matplotlib.pyplot as plt
%matplotlib inline

#import pickle
#import sklearn.model_selection 


#importlib.reload(nmt_utils)
#importlib.reload(emo_utils)


try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf




from tensorflow import keras
import keras.backend as K



#from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model

from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation, Embedding, RepeatVector, Concatenate, Dot, Bidirectional
#from tensorflow.keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import *
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model


from tensorflow.keras.utils import to_categorical


from datetime import datetime
#from emo_utils import *
#from nmt_utils import *

print(tf.keras.__version__)
print(keras.__version__)



In [0]:
# Util Functions

def read_glove_vecs(glove_file):
    """
    Load GloVe vectors from a text file and build the vocabulary lookup tables.

    Every non-blank line is expected to contain a word followed by the
    components of its vector, separated by whitespace.

    Arguments:
    glove_file -- path of the GloVe text file on the local file system

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based position in sorted word order
    index_to_words -- dictionary with the reverse mapping (index -> word)
    word_to_vec_map -- dictionary mapping each word to its vector (numpy array, dtype float64)
    """
    word_to_vec_map = {}
    # Decode explicitly as UTF-8 (the vocabulary contains non-ASCII tokens)
    # instead of relying on the platform-dependent default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file): nothing to parse.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Numbering starts at 1, so index 0 is never assigned to a word here.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map


# Read data file to get training & test data
def read_review_data(filename,m):
    """
    Read up to m (review, summary) pairs from a CSV file.

    Column 1 of each row is taken as the review and column 0 as the summary.
    Rows with fewer than two columns (for example blank lines) are skipped and
    do not count towards m. No header handling is done here: if the file
    starts with a header row, it is returned like any other row.

    Arguments:
    filename -- path of the CSV file
    m -- maximum number of rows to load; nothing is read when m <= 0

    Returns:
    numpy array of strings with one [review, summary] row per loaded record
    (shape (n, 2) with n <= m; an empty array when nothing was loaded)
    """
    review_summary_data = []
    if m > 0:
        # Report progress roughly every 10% of the requested rows; the integer
        # step avoids the float modulo that silently never matched for some m.
        progress_step = max(1, m // 10)
        # newline='' lets the csv module do its own newline handling, so
        # line breaks embedded in quoted fields are parsed correctly.
        with open(filename, newline='') as csvDataFile:
            for row in csv.reader(csvDataFile):
                if len(row) < 2:
                    continue
                review_summary_data.append((row[1], row[0]))
                loaded = len(review_summary_data)
                if loaded >= m:
                    break
                if loaded % progress_step == 0:
                    print("Number of reviews loaded ", loaded)

    return np.array(review_summary_data)

def softmax(x, axis=1):
    """Apply the softmax activation to a tensor.

    For a 2D tensor the backend softmax is called with its default axis, so
    `axis` only takes effect for tensors with more than two dimensions.

    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied
            (used for tensors of rank > 2).
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    rank = K.ndim(x)
    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if rank == 2:
        return K.softmax(x)
    # Subtract the per-axis maximum before exponentiating for numerical stability.
    exponentials = K.exp(x - K.max(x, axis=axis, keepdims=True))
    normalizer = K.sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normalizer

def string_to_int(string, length, vocab):
    """
    Converts a piece of text into a fixed-length list of vocabulary indices, one per word.

    The text is lower-cased, quote characters are removed, every other run of
    non-alphanumeric characters becomes a word separator, and the text is split
    into words. At most `length` words are kept. Words that are missing from
    "vocab" (or whose index is falsy, such as 0) are dropped, and the result is
    then padded on the right with the index of '<pad>'.

    Arguments:
    string -- input text (converted with str() first), e.g. a review or a summary
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary mapping each word to its index; must contain '<pad>' whenever padding is needed

    Returns:
    rep -- list of integers representing the indices of the words in the vocabulary;
           its size is `length` when `length` >= 0
    """
    text = str(string).lower()

    # Drop quote characters entirely so that e.g. "isn't" becomes "isnt".
    for quote in ('\'', '"'):
        text = text.replace(quote, '')

    # Truncation happens before the vocabulary lookup, so dropped words are not
    # replaced by later words of the text.
    words = re.sub(r"[^a-zA-Z0-9]+", ' ', text).split()[:length]

    looked_up = [vocab.get(word) for word in words]
    rep = [idx for idx in looked_up if idx]

    # Pad the remaining places in the sentence with <pad>
    missing = length - len(rep)
    if missing > 0:
        rep.extend([vocab['<pad>']] * missing)

    return rep

def preprocess_data(dataset, word_to_index, Tx, Ty,m):
    """
    Convert (review, summary) text pairs into fixed-size matrices of word indices.

    Arguments:
    dataset -- iterable of (review, summary) pairs
    word_to_index -- dictionary mapping each word to its index
    Tx -- number of time steps (words) kept per review
    Ty -- number of time steps (words) kept per summary
    m -- number of rows in the returned matrices

    Returns:
    X -- float64 numpy array of shape (m, Tx) with the review word indices
    Y -- float64 numpy array of shape (m, Ty) with the summary word indices

    Only the first m pairs of the dataset are used. When the dataset holds
    fewer than m pairs, the remaining rows contain only the padding index
    (previously they were left uninitialized).
    """
    # Pre-fill with the padding index so rows that receive no data are
    # well-defined; falls back to 0 when the vocabulary has no '<pad>' entry.
    pad_index = word_to_index.get('<pad>', 0)
    X = np.full((m, Tx), pad_index, dtype=np.float64)
    Y = np.full((m, Ty), pad_index, dtype=np.float64)

    for row, (review, summary) in enumerate(dataset):
        if row >= m:
            # The output has exactly m rows; extra pairs are ignored.
            break
        X[row] = string_to_int(review, Tx, word_to_index)
        Y[row] = string_to_int(summary, Ty, word_to_index)

    return X, Y

def convert_indices_to_words(idx_array,dictionary):
    """
    Translate a sequence of indices back into words.

    Arguments:
    idx_array -- iterable of indices; each one is converted with int() before the lookup
    dictionary -- dictionary mapping an integer index to a word

    Returns:
    list with the words found, in order; indices missing from the dictionary are skipped
    """
    looked_up = (dictionary.get(int(word_idx)) for word_idx in idx_array)
    return [word for word in looked_up if word is not None]

In [0]:
# Build the GloVe lookup tables: word -> index, index -> word and word -> vector
#word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../../glove.6B/glove.6B.50d.txt')
# NOTE(review): read_glove_vecs hands its argument straight to open(), which expects a
# local file path. A Google Drive share link is presumably not readable that way —
# download the file first and pass the local path instead (TODO confirm).
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('https://drive.google.com/file/d/1Mnw2oyYdh8FI7ufj8RaGGXCkNiUJ4sbW/view?usp=sharing')


In [0]:
# Sanity check: show the vector stored for the token "3"
print(word_to_vec_map["3"])

In [0]:
# Initialize global variables

# NOTE(review): the remark below assumes the header record is popped after loading, but the
# header-removal code in the data loading cell is commented out — confirm whether the header row is still part of the data.
m = 20 # Remember that the 1 record (header record) from the raw data file will be popped. So set m to be 1 more that the train + test records required
Tx = 40 # Maximum of 40 words in a review
Ty = 5 # Maximum 5 words in the summary of the review

# Add the "<pad>" and "<unk>" tokens to all the lookup tables
index_to_word[0]="<pad>" # Padding token, stored at index 0
word_to_index["<pad>"]=0
word_to_vec_map["<pad>"] = np.zeros((1,50)) # All-zero vector; note its shape is (1,50)

# In the original glove.6B file, index = 4 had "!!!!" (per the original author — not verifiable from this file).
# Since it is extremely rare & unused, this index position is reused for "<unk>".
# The word that previously had index 4 is not removed from word_to_index, so two words now share index 4.

index_to_word[4]="<unk>" # Unknown-word token, takes over index 4
word_to_index["<unk>"]=4
word_to_vec_map["<unk>"] = np.zeros((1,50)) # All-zero vector; note its shape is (1,50)

vocab_size = len(word_to_index)+1 # Number of entries in word_to_index (now including "<pad>" and "<unk>") plus 1

#print(word_to_index["None"])




In [0]:
# Unit testing review & summary pre-processing

dset = read_review_data('file_for_testing_pre_processing.csv',3)

print("len(dset) ", len(dset))

# Size the index matrices by the number of rows actually loaded instead of the global m:
# preprocess_data returns m rows, so asking for more rows than dset holds would add
# rows that do not correspond to any review.
X_Indices, Y_Indices = preprocess_data(dset, word_to_index, Tx, Ty, len(dset))

print("X_Indices.shape ", X_Indices.shape)
print("Y_Indices.shape ", Y_Indices.shape)

# print & check the source & target conversion to indices
index = 2

print("dataset[0] ", dset[index][0])
print("dataset[1] ", dset[index][1])
print()
print("Source after preprocessing (indices):", X_Indices[index])
print("Target after preprocessing (indices):", Y_Indices[index])
print()

# Map the indices back to words to eyeball the round trip
print("sentence x is ", convert_indices_to_words(X_Indices[index],index_to_word))
print("sentence y is ", convert_indices_to_words(Y_Indices[index],index_to_word))

In [0]:
# Load test & training data

# read_review_data returns at most m rows; the file may hold fewer
dataset = read_review_data('product_reviews_modified.csv',m)

print("len(dataset) ", len(dataset))

# Size the index matrices by the number of rows actually loaded, so that every
# row of X_Indices / Y_Indices corresponds to a real record even when the file
# has fewer than m rows.
X_Indices, Y_Indices = preprocess_data(dataset, word_to_index, Tx, Ty, len(dataset))

print("X_Indices.shape ", X_Indices.shape)
print("Y_Indices.shape ", Y_Indices.shape)

# print & check the source & target conversion to indices for the last loaded record
index = len(dataset)-1

print("dataset[0] ", dataset[index][0])
print("dataset[1] ", dataset[index][1])
print()
print("Source after preprocessing (indices):", X_Indices[index])
print("Target after preprocessing (indices):", Y_Indices[index])
print()

# Map the indices back to words to eyeball the round trip
print("sentence x is ", convert_indices_to_words(X_Indices[index],index_to_word))
print("sentence y is ", convert_indices_to_words(Y_Indices[index],index_to_word))

In [0]:
# generate a sequence of random integers
def generate_sequence(length, n_unique):
    """
    Draw a list of random integers.

    Arguments:
    length -- number of integers to draw
    n_unique -- exclusive upper bound; every value lies in [1, n_unique-1]

    Returns:
    list of `length` random integers
    """
    upper = n_unique-1
    sequence = []
    for _ in range(length):
        sequence.append(randint(1, upper))
    return sequence

# prepare data for the LSTM
def get_dataset(Tx, Ty, vocab_size, m, X_Indices, Y_Indices):
    """
    Assemble the three arrays used to train the encoder-decoder model.

    Arguments:
    Tx, Ty, vocab_size, m -- accepted for signature compatibility; not used here
    X_Indices -- 2D array of source (review) word indices
    Y_Indices -- 2D array of target (summary) word indices

    Returns:
    X1 -- the source indices (X_Indices itself)
    X2 -- the decoder input: Y_Indices shifted one step to the right, with 0 in
          the first column and the last column dropped (same shape as Y_Indices)
    y -- the target indices (Y_Indices itself)
    """
    n_target_steps = Y_Indices.shape[1]

    # Prepend a column of zeros (the start value), then cut the result back to
    # the original width so the decoder input lags the target by one step.
    with_start_column = np.insert(Y_Indices, 0, 0, axis=-1)
    decoder_input = with_start_column[:, 0:n_target_steps]

    return X_Indices, decoder_input, Y_Indices

def get_dataset_one_hot(Tx, Ty, vocab_size, m, X_Indices, Y_Indices):
    """
    Build the training arrays with the decoder input and the target one-hot encoded.

    Arguments:
    Same as get_dataset; vocab_size is used as the number of one-hot classes.

    Returns:
    X1 -- source word indices exactly as returned by get_dataset (not one-hot encoded)
    X2_one_hot -- one-hot encoding of the decoder input
    y_one_hot -- one-hot encoding of the target
    """
    encoder_in, decoder_in, targets = get_dataset(Tx, Ty, vocab_size, m, X_Indices, Y_Indices)

    # Only the decoder-side arrays are expanded; the encoder input stays as indices.
    targets_one_hot = to_categorical(targets, num_classes=vocab_size)
    decoder_in_one_hot = to_categorical(decoder_in, num_classes=vocab_size)

    return encoder_in, decoder_in_one_hot, targets_one_hot
    

# load common Keras layers as objects
def load_layers (n_input, n_output, n_units):
    """
    Instantiate the Keras layers used to build the seq2seq models.

    Arguments:
    n_input -- accepted for signature compatibility; not used here
    n_output -- number of units of the softmax output layer
    n_units -- number of units of both LSTM layers

    Returns:
    tuple (encoder LSTM, decoder LSTM, decoder Dense softmax layer)
    """
    # The tuple elements are evaluated left to right, so the layers are created
    # in the order encoder LSTM, decoder LSTM, Dense.
    layers = (
        LSTM(n_units, return_state=True),
        LSTM(n_units, return_sequences=True, return_state=True),
        Dense(n_output, activation='softmax'),
    )
    return layers


def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation; must hold
                       a vector for every word of word_to_index
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- pretrained layer Keras instance

    Raises:
    ValueError -- if word_to_vec_map is empty, because the vector size cannot be inferred
    """
    if not word_to_vec_map:
        raise ValueError('word_to_vec_map is empty; cannot infer the embedding dimension')

    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)

    # Infer the vector size from a stored vector instead of relying on one
    # particular word ("cucumber") being present. The last axis is used so that
    # both (d,) and (1, d) shaped vectors give d.
    sample_vector = next(iter(word_to_vec_map.values()))
    emb_dim = np.asarray(sample_vector).shape[-1]

    # Row "index" of the matrix holds the vector of the word with that index;
    # rows without a word stay all-zero.
    emb_matrix = np.zeros((vocab_len,emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # The pre-trained vectors are kept frozen during training.
    embedding_layer = Embedding(vocab_len,emb_dim,trainable=False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer



# returns train, inference_encoder and inference_decoder models
def define_models(n_input, n_output, n_units,Ty, word_to_vec_map, word_to_index):
    """
    Build the training model together with the encoder and decoder models used for inference.

    All three models share the same layer objects (one embedding layer, the
    encoder LSTM, the decoder LSTM and the softmax Dense layer).

    Arguments:
    n_input -- length of the encoder input sequence (shape of the first model input)
    n_output -- number of units of the softmax output layer; also used in the decoder input shape
    n_units -- number of units of the encoder and decoder LSTM layers
    Ty -- number of decoder time steps (first dimension of the decoder input shape)
    word_to_vec_map -- dictionary mapping words to their vectors, used to initialise the embedding layer
    word_to_index -- dictionary mapping words to their indices, used to initialise the embedding layer

    Returns:
    model -- training model taking [encoder input, decoder input] and producing the decoder softmax output
    encoder_model -- inference model mapping the encoder input to the encoder LSTM states [h, c]
    decoder_model -- inference model taking [decoder input, state h, state c] and producing
                     [decoder softmax output, new state h, new state c]
    """

    # Layers shared by the training and the inference models
    encoder,decoder_lstm,decoder_dense = load_layers(n_input, n_output, n_units)

    # Encoder input: one value per time step (presumably word indices — TODO confirm against the caller)
    sentence_indices = Input(shape=(n_input,))

    # Create the embedding layer pretrained with the word vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate sentence_indices through the embedding layer to get the embeddings
    embeddings = embedding_layer(sentence_indices)   

    # define training encoder; only its final states are used further down
    encoder_inputs = embeddings
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    # define training decoder
    # NOTE(review): this input is declared with shape (Ty, n_output) and is then passed through
    # the embedding layer, which presumably appends an embedding axis and yields a 4-D tensor,
    # while an LSTM expects 3-D input. Confirm whether the decoder input should instead be a
    # sequence of word indices of shape (Ty,), like the encoder input.
    decoder_sentence_indices = Input(shape=(Ty,n_output,))

    # 2nd input, the one being given to the decoder, is also run through the same embedding layer
    decoder_embeddings = embedding_layer(decoder_sentence_indices)

    decoder_inputs = decoder_embeddings

    # The decoder LSTM starts from the final encoder states
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([sentence_indices, decoder_sentence_indices], decoder_outputs)

    # define inference encoder
    encoder_model = Model(sentence_indices, encoder_states)

    # define inference decoder: same decoder layers, but the initial states are fed in explicitly
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = Model([decoder_sentence_indices] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    # return all models
    return model, encoder_model, decoder_model

# generate target given source sequence
def predict_sequence(infenc, infdec, source, n_steps, cardinality):
    """
    Generate a target sequence for a source sequence, one step at a time.

    Arguments:
    infenc -- inference encoder; its predict(source) result is used as the initial decoder state list
    infdec -- inference decoder; predict([input] + state) must return (output, h, c)
    source -- encoder input passed to infenc.predict
    n_steps -- number of decoding steps to run
    cardinality -- size of the last axis of the all-zero start input

    Returns:
    numpy array with one row per step, each row being output[0, 0, :] of that step
    """
    # encode the source into the initial decoder state
    state = infenc.predict(source)

    # start of sequence input: all zeros, shape (1, 1, cardinality)
    decoder_input = array([0.0] * cardinality).reshape(1, 1, cardinality)

    step_outputs = []
    for _ in range(n_steps):
        yhat, h, c = infdec.predict([decoder_input] + state)
        step_outputs.append(yhat[0, 0, :])
        # the raw decoder output and the new states feed the next step
        state = [h, c]
        decoder_input = yhat
    return array(step_outputs)

# decode a one hot encoded string
def one_hot_decode(encoded_seq):
    """
    Decode a sequence of one-hot (or score) vectors.

    Arguments:
    encoded_seq -- iterable of vectors

    Returns:
    list with the argmax position of every vector
    """
    return list(map(argmax, encoded_seq))

In [0]:
# Unit Test for Embedding layer set up

embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
# Check the function by printing component 3 of the embedding row at index 1.
# Expected output should be **weights[0][1][3] =**	-0.3403
# NOTE(review): that expected value presumably depends on which GloVe file was loaded — confirm.
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

In [0]:
# configure problem
n_features = vocab_size
n_steps_in = Tx
n_steps_out = Ty

# generate training dataset: X1 holds word indices, X2 and y are one-hot encoded
X1, X2,y = get_dataset_one_hot(Tx, Ty, vocab_size, m, X_Indices, Y_Indices)
print(X1.shape,X2.shape,y.shape)

# define model
train, infenc, infdec = define_models(Tx, vocab_size, 128,Ty,word_to_vec_map, word_to_index)
# y is one-hot encoded by get_dataset_one_hot, so the loss has to be
# 'categorical_crossentropy'; 'sparse_categorical_crossentropy' expects integer
# class ids as targets instead.
train.compile(optimizer='adam', loss='categorical_crossentropy')#, metrics=['acc'])
train.summary()

infenc.summary()

infdec.summary()

# Save a diagram of each model, including tensor shapes
plot_model(train, to_file='model_train.png',show_shapes=True)
plot_model(infenc, to_file='model_infenc.png',show_shapes=True)
plot_model(infdec, to_file='model_infdec.png',show_shapes=True)

In [0]:
# Train for a single epoch in mini-batches of 10 samples, with shuffling enabled
history = train.fit([X1, X2], y, epochs=1, batch_size=10, shuffle=True)

# Alternative: load a previously saved model instead of training
#train = load_model("seq2seq_model14_09_2019_20_19_06.h5")

In [0]:
# Timestamped file name for saving the trained model
model_name = "seq2seq_model" + datetime.now().strftime("%d_%m_%Y_%H_%M_%S")+".h5"

# Save the model
#train.save(model_name)

# evaluate LSTM
total, correct = 100, 0

# Spot check one example.
# X1 comes from get_dataset_one_hot un-encoded: it is a 2D array of word indices
# (one row per sample), so it is sliced with two dimensions and printed as
# indices rather than run through one_hot_decode. The same sample is used for
# the source, the expected target and the prediction.
sample = 3
target = predict_sequence(infenc, infdec, X1[sample:sample+1,:], n_steps_out, n_features)
print('X=%s y=%s, yhat=%s' % (X1[sample].astype(int).tolist(), one_hot_decode(y[sample]), one_hot_decode(target)))

In [0]:
# Look up the word stored at index 50320
print(index_to_word[50320])

In [0]:
# Print sample 5 next to the current prediction.
# NOTE(review): `target` is whatever the evaluation cell predicted, which was not computed
# from sample 5, and X1 holds word indices rather than one-hot vectors, so
# one_hot_decode(X1[5]) presumably does not recover the review — confirm.
print('X=%s y=%s, yhat=%s' % (one_hot_decode(X1[5]), one_hot_decode(y[5]), one_hot_decode(target)))

In [0]:
# Shape of the prediction array returned by predict_sequence
print(target.shape)