#### Connect to Kaggle

The data is available on the Kaggle website, so we will first connect Colab to Kaggle. Instructions for downloading Kaggle data to Colab can be found [in this post](https://towardsdatascience.com/setting-up-kaggle-in-google-colab-ebb281b61463).

In [None]:
!pip install kaggle --quiet

In [None]:
#Make a directory for Kaggle
!mkdir .kaggle

#Connect Google drive to colab
from google.colab import drive
drive.mount('/gdrive')

#Copy kaggle.json file. Change gdrive folder based on where you have saved your json file from Kaggle
!cp '/gdrive/My Drive/AI-ML/Machine-Learning/Code/Utilities/kaggle.json' /content/.kaggle/kaggle.json

#Check if json file is there
!ls -l /content/.kaggle

!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!kaggle config set -n path -v{/content}
!chmod 600 /root/.kaggle/kaggle.json

#### Download Amazon Reviews Dataset

Here is the [link](https://www.kaggle.com/snap/amazon-fine-food-reviews) for the Amazon Fine Food Reviews dataset. You may need to accept the dataset's terms on Kaggle before the download is allowed.

In [None]:
!kaggle datasets download -d snap/amazon-fine-food-reviews -p /content

In [None]:
!unzip amazon-fine-food-reviews.zip

In [None]:
!ls -l

#### Read Data

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('Reviews.csv', nrows=50000)
df.shape

In [None]:
df.head()

In [None]:
#Drop rows that have a missing value in any column
df.dropna(axis=0,inplace=True)

#Renumber the remaining rows 0..n-1 so that label lookups such as df.loc[i, 'Text'] work.
#drop=True discards the old index instead of inserting it as an extra 'index' column
df.reset_index(drop=True, inplace=True)

In [None]:
df.shape

Review paragraph and summary

In [None]:
#numpy is needed here for random sampling; up to this point only pandas has been imported
import numpy as np

#Select a random record
rec_num = np.random.randint(0, df.shape[0])
print('--------')
print ('Paragraph: ', df.loc[rec_num, 'Text'])
print ('Summary: ', df.loc[rec_num, 'Summary'] )

### Separate Source and Target pairs

In [None]:
#Paragraph list (encoder side): one entry per review, in row order.
#Taking the whole column at once avoids a slow per-row df.loc lookup and
#does not depend on the index being exactly 0..n-1
encoder_text = df['Text'].tolist()

#Summary list (decoder side): same row order, wrapped in start and end markers.
#IMPORTANT: Make sure 'startseq' and 'endseq' are not part of regular words
decoder_text = ['startseq ' + text_summary + ' endseq'
                for text_summary in df['Summary']]

### Review Source and Target pairs

In [None]:
#Review some paragraphs
encoder_text[100:105]

In [None]:
#Corresponding Summary Text
decoder_text[100:105]

### Tokenize Text data

In [None]:
import tensorflow as tf

In [None]:
#Vocabulary cap handed to the tokenizer as num_words
vocab_size=30000

#A single tokenizer is shared by paragraphs and summaries,
#so it is fitted on each text list in turn
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
for corpus in (encoder_text, decoder_text):
    tokenizer.fit_on_texts(corpus)

In [None]:
print(tokenizer.word_index)

In [None]:
#Convert paragraph text to indexes
encoder_seq = tokenizer.texts_to_sequences(encoder_text)

In [None]:
encoder_text[100:105]

In [None]:
print(encoder_seq[100:105]) #Display some converted sentences

In [None]:
#Convert summary text to indexes
decoder_seq = tokenizer.texts_to_sequences(decoder_text)

In [None]:
#Longest paragraph, counted in tokens
max_encoder_seq_length = max(map(len, encoder_seq))

#Longest summary, counted in tokens (the start and end markers are included)
max_decoder_seq_length = max(map(len, decoder_seq))

print('Maximum sentence length for paragraph: ', max_encoder_seq_length)
print('Maximum sentence length for summary: ', max_decoder_seq_length)

### Compare lengths of different sentences

In [None]:
#Source Language sentences
print('Length for sentence number 100: ', len(encoder_seq[100]))
print('Length for sentence number 150: ', len(encoder_seq[150]))

In [None]:
#Target Language sentences
print('Length for sentence number 100: ', len(decoder_seq[100]))
print('Length for sentence number 150: ', len(decoder_seq[150]))

### How do we make them the same length?

### Padding the sentences

In [None]:
#Short alias for the Keras padding helper
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

#Paragraphs: fixed length of 500 tokens, padding added at the front,
#anything beyond 500 tokens cut from the end
encoder_input_data = pad_sequences(encoder_seq, maxlen=500, padding='pre', truncating='post')

#Summaries: padded at the end up to the longest summary
decoder_input_data = pad_sequences(decoder_seq, maxlen=max_decoder_seq_length, padding='post')

In [None]:
print('Source data shape: ', encoder_input_data.shape)
print('Target data shape: ', decoder_input_data.shape)

In [None]:
encoder_text[100]

In [None]:
encoder_input_data[100]

In [None]:
decoder_text[100]

In [None]:
decoder_input_data[100]

#### Integer to Word converter for Decoder data

In [None]:
#Reverse lookup of the tokenizer vocabulary: word index -> word
int_to_word_decoder = {index: word for word, index in tokenizer.word_index.items()}

In [None]:
print(int_to_word_decoder)

#### Build Batch Generator

In [None]:
import numpy as np

In [None]:
def batch_generator(encoder_input_data, decoder_input_data, batch_size=64, num_classes=None):
    """Yield random training batches for the seq2seq model, forever.

    Each batch pairs randomly drawn paragraphs with their own summaries and
    builds the decoder target as the summary shifted one step to the left
    (teacher forcing), one-hot encoded.

    Args:
        encoder_input_data: 2-D array of padded paragraph indexes,
            one row per sample.
        decoder_input_data: 2-D array of padded summary indexes, row-aligned
            with ``encoder_input_data``.
        batch_size: Number of samples per batch (drawn with replacement).
        num_classes: Width of the one-hot target. When omitted, the
            notebook-level ``vocab_size + 1`` is used.

    Yields:
        ``([paragraph_batch, summary_batch], decoder_output)`` where
        ``decoder_output`` has shape
        ``(batch_size, decoder_length, num_classes)``.
    """
    encoder_input_data = np.asarray(encoder_input_data)
    decoder_input_data = np.asarray(decoder_input_data)

    if num_classes is None:
        num_classes = vocab_size + 1

    #Sizes come from the data itself instead of hard-coded/global values
    num_samples = encoder_input_data.shape[0]
    decoder_length = decoder_input_data.shape[1]

    #Index grids used to write the one-hot targets in a single assignment
    sample_pos = np.arange(batch_size)[:, None]
    step_pos = np.arange(decoder_length)[None, :]

    while True:

        #Rows that make up this batch. These indexes must actually be used to
        #pick the samples; otherwise every batch is just the first rows of the data
        batch_idx = np.random.randint(0, num_samples, batch_size)

        paragraph_batch = encoder_input_data[batch_idx].astype(np.float64)
        summary_batch = decoder_input_data[batch_idx].astype(np.float64)

        #Target at step j is the summary word at step j+1; the last step stays 0 (padding)
        target_ids = np.zeros((batch_size, decoder_length), dtype=np.int64)
        target_ids[:, :-1] = decoder_input_data[batch_idx, 1:]

        #One-hot encode the targets (padding is encoded as class 0)
        decoder_output = np.zeros((batch_size, decoder_length, num_classes))
        decoder_output[sample_pos, step_pos, target_ids] = 1

        yield [paragraph_batch, summary_batch], decoder_output

In [None]:
a = batch_generator(encoder_input_data, decoder_input_data, batch_size=2)
b,c = next(a)

In [None]:
c.shape

### Building the Training Model

In [None]:
#Define config parameters
encoder_embedding_size = 50
decoder_embedding_size = 50
rnn_units = 256 #Memory size for LSTM

#### Build Encoder

In [None]:
tf.keras.backend.clear_session()

In [None]:
#Encoder input: batches of word-index sequences, sequence length left open
encoder_inputs = tf.keras.layers.Input(shape=(None,))

#Embedding layer: vocab_size+1 possible indexes, each mapped to
#encoder_embedding_size numbers
encoder_embedding = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                              output_dim=encoder_embedding_size)
encoder_embedding_output = encoder_embedding(encoder_inputs)

#LSTM over the embedded paragraph; return_state=True also returns
#the 'h' and 'c' states of the last time step
encoder_rnn = tf.keras.layers.LSTM(rnn_units, return_state=True)
x, state_h, state_c = encoder_rnn(encoder_embedding_output)

#The two states are what gets passed to the Decoder - Sentence Embedding
encoder_states = [state_h, state_c]

In [None]:
encoder_states

#### Build Decoder

In [None]:
#Decoder input - padded summary sequences, sequence length left open
decoder_inputs = tf.keras.layers.Input(shape=(None,))

#Decoder Embedding layer and its output
decoder_embedding = tf.keras.layers.Embedding(input_dim=vocab_size + 1,
                                              output_dim=decoder_embedding_size)
decoder_embedding_output = decoder_embedding(decoder_inputs)

#Decoder RNN: returns the output of every time step plus the last 'h' and 'c' states
decoder_rnn = tf.keras.layers.LSTM(rnn_units, return_state=True, return_sequences=True)

#Run the Decoder RNN, starting from the Encoder states.
#Only the per-step outputs are needed for training; the two states are discarded
x, _, _ = decoder_rnn(decoder_embedding_output, initial_state=encoder_states)

#Output layer: one softmax score per possible word index at every time step
#(+1 to make sure one-hot encoding works for highest index value)
decoder_dense = tf.keras.layers.Dense(units=vocab_size + 1, activation='softmax')
decoder_outputs = decoder_dense(x)

### Build Model using both Encoder and Decoder

In [None]:
#Build a Seq2Seq model -> Encoder + Decoder
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], #2 Inputs to the model
                              decoder_outputs) #Output of the model

In [None]:
model.output

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
model.summary()

### Train the model

In [None]:
train_generator = batch_generator(encoder_input_data, decoder_input_data, batch_size=64)

In [None]:
#train_generator already yields complete batches, so batch_size must not be
#passed to fit(); steps_per_epoch decides how many batches make one epoch
model.fit(train_generator,
          steps_per_epoch = encoder_input_data.shape[0]//64,
          epochs=10)

Save the model

In [None]:
model.save('seq2seq_text_summarization.h5')

In [None]:
!ls -l

#### Building Model for Prediction

##### Build the Encoder Model to predict Encoder States

In [None]:
encoder_model = tf.keras.models.Model(encoder_inputs, #Padded input sequences
                                      encoder_states) #Hidden state and Cell state at last time step

In [None]:
encoder_model.output

##### Build the Decoder Model 
<p/>

<ol><li>Define Input for both 'h' state and 'c' state initialization </li>
<li>Get Decoder RNN outputs along with h and c state</li>
<li>Get Decoder Dense layer output</li>
        <li>Build Model</li></ol>

##### Step 1 - Define Input for both 'h' state and 'c' state initialization

In [None]:
#Hidden state input
decoder_state_input_h = tf.keras.layers.Input(shape=(rnn_units,))

#Cell state input
decoder_state_input_c = tf.keras.layers.Input(shape=(rnn_units,))

#Putting it together
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

##### Step 2 - Get Decoder RNN outputs along with h and c state

In [None]:
#Get Embedding layer output (same embedding layer that was used in the training model)
x = decoder_embedding(decoder_inputs)

#We will use the layer which we trained earlier
rnn_outputs, state_h, state_c = decoder_rnn(x, initial_state=decoder_states_inputs)

#Keep the updated states: the decoder model returns them so that the prediction
#loop can feed them back in as the initial state when predicting the next word
decoder_states = [state_h, state_c]

##### Step 3 - Get Decoder Dense layer output

In [None]:
decoder_outputs = decoder_dense(rnn_outputs)

##### Step 4 - Build Decoder Model

In [None]:
decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs,  #Model inputs
                                      [decoder_outputs] + decoder_states)

#### Predicting output from Seq2Seq model

Build a prediction function

In [None]:
tokenizer.word_index['startseq']

In [None]:
int_to_word_decoder[1]

In [None]:
def summarize_paragraph(input_sequence):
    """Generate a summary for one padded paragraph, one word at a time.

    The paragraph is run through ``encoder_model`` to get the initial decoder
    states. Starting from 'startseq', ``decoder_model`` is then called
    repeatedly; at each step the word with the highest probability is kept and
    fed back in as the next decoder input.

    Args:
        input_sequence: Padded paragraph indexes with a leading batch
            dimension of 1, e.g. ``encoder_input_data[i : i+1]``.

    Returns:
        The predicted words joined by single spaces (may be empty).
    """
    #Get the encoder state values - Sentence embedding.
    #Uses the function argument, not a notebook-level variable
    decoder_states_value = list(encoder_model.predict(input_sequence))

    #Build a sequence with 'startseq' - starting sequence for Decoder
    target_seq = np.zeros((1,1))
    target_seq[0][0] = tokenizer.word_index['startseq']

    predicted_words = []

    #Never generate more words than the longest summary seen in training
    for _ in range(max_decoder_seq_length):

        predicted_outputs, h, c = decoder_model.predict([target_seq] +
                                                        decoder_states_value)

        #Get the predicted word index with highest probability
        predicted_output = int(np.argmax(predicted_outputs[0,-1,:]))

        #Index 0 is padding and has no entry in the vocabulary, so .get() returns None for it
        predicted_word = int_to_word_decoder.get(predicted_output)

        #Stop on the end marker or on padding
        if predicted_word is None or predicted_word == 'endseq':
            break

        predicted_words.append(predicted_word)

        #The predicted word becomes the next decoder input
        target_seq[0][0] = predicted_output

        #Continue from the states the decoder just produced
        decoder_states_value = [h, c]

    return ' '.join(predicted_words)

##### Call Prediction function on a random sentence

In [None]:
#Generate a random starting position; the upper bound leaves room so that
#start_num + 5 stays inside the data
start_num = np.random.randint(0, high=len(encoder_text) - 10)

#Predict model output for 5 sentences
for i in range(start_num, start_num + 5):
    #Slice i : i+1 (instead of indexing with i) keeps the leading batch dimension
    input_seq = encoder_input_data[i : i+1]
    predicted_summary = summarize_paragraph(input_seq)
    print('--------')
    print ('Input paragraph: ', encoder_text[i])
    print ('Predicted summary: ', predicted_summary )

##### Save encoder and decoder model

In [None]:
#Compile models to avoid error
encoder_model.compile(optimizer='adam',loss='categorical_crossentropy')
decoder_model.compile(optimizer='adam',loss='categorical_crossentropy')

#Save the models. Keras picks the file format from the extension; '.h5' gives a
#single HDF5 file, whereas '.hd5' is not a recognised HDF5 extension
#NOTE(review): 'eng_hin' in the file names looks left over from another notebook - confirm before renaming
encoder_model.save('seq2seq_encoder_eng_hin.h5')  #Encoder model
decoder_model.save('seq2seq_decoder_eng_hin.h5')  #Decoder model

##### Save Tokenizer

In [None]:
import pickle

#Write the fitted tokenizer to disk. The context manager makes sure the file
#is flushed and closed once the dump is finished
with open('tokenizer_summarize','wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)