In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf 
import numpy as np
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords #provides list of english stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive inside the Colab VM so the dataset and the saved model
# weights can be read through ordinary file paths. Running this cell prompts
# for an interactive authorization.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Load the complete reviews CSV from the mounted Drive folder.
# For a quick experiment on a subset, pass e.g. `nrows=1000` to read_csv.
REVIEWS_CSV_PATH = '/content/drive/My Drive/NLP Project/reviews.csv'
train = pd.read_csv(REVIEWS_CSV_PATH)

In [None]:
# Preview the first rows of the raw frame to check which columns were loaded.
train.head() 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Keep only the model target ('Summary') and the model input ('Text').
# .copy() makes the result an independent frame: the following cells add new
# columns to `train`, and assigning into a bare column selection is what
# triggers pandas' SettingWithCopyWarning.
train = train[['Summary', 'Text']].copy()

In [None]:
# Normalise the review body: lowercase it, then delete every character that is
# neither a word character nor whitespace.
train['text_lower'] = train['Text'].str.lower()
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave all punctuation in place.
# The raw string avoids the invalid '\w' / '\s' escape sequences.
train['text_no_punctuation'] = train['text_lower'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Normalise the summary the same way as the review body (lowercase, strip
# punctuation) and wrap it in the `_start_` / `_end_` decoder delimiters.
train['summary_lower'] = train["Summary"].str.lower()
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default; the raw string avoids invalid escape sequences.
# NOTE(review): a missing Summary stays NaN after the concatenation, so such a
# row carries no delimiters (and becomes the text 'nan' once cast with
# astype(str) later) - confirm whether the data contains null summaries.
train['summary_no_punctuation'] = (
    '_start_' + ' '
    + train['summary_lower'].str.replace(r'[^\w\s]', '', regex=True)
    + ' ' + '_end_'
)

**Very important trick: we add `_start_` and `_end_` at the very beginning and the very end of each summary so that the decoder has explicit delimiters telling it when to start and when to finish, because natural language has no general start and finish signals. In effect, `_end_` marks the point at which the output sentence is most likely to end.**

In [None]:
# The raw and lower-cased helper columns are no longer needed; only the two
# cleaned columns are kept for tokenization.
intermediate_columns = ['Summary', 'Text', 'text_lower', 'summary_lower']
train = train.drop(columns=intermediate_columns)
train

Unnamed: 0,text_no_punctuation,summary_no_punctuation
0,i have bought several of the vitality canned d...,_start_ good quality dog food _end_
1,product arrived labeled as jumbo salted peanut...,_start_ not as advertised _end_
2,this is a confection that has been around a fe...,_start_ delight says it all _end_
3,if you are looking for the secret ingredient i...,_start_ cough medicine _end_
4,great taffy at a great price there was a wide...,_start_ great taffy _end_
...,...,...
568449,great for sesame chickenthis is a good if not ...,_start_ will not do without _end_
568450,im disappointed with the flavor the chocolate ...,_start_ disappointed _end_
568451,these stars are small so you can give 1015 of ...,_start_ perfect for our maltipoo _end_
568452,these are the best treats for training and rew...,_start_ favorite training and reward treat _end_


In [None]:
# `num_words` caps handed to the two Keras tokenizers:
# max_features1 is used for the review text, max_features2 for the summaries.
max_features1, max_features2 = 5000, 5000

In [None]:
# Encoder-side tokenizer: fit on the cleaned review bodies, convert each review
# to a sequence of integer ids, then pad all sequences to a common length
# (pad_sequences' default length is that of the longest sequence).
review_texts = train['text_no_punctuation'].astype(str).tolist()

tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1)
tok1.fit_on_texts(review_texts)

tf_train_text = tf.keras.preprocessing.sequence.pad_sequences(
    tok1.texts_to_sequences(review_texts)
)

In [None]:
# Decoder-side tokenizer. `filters='*'` replaces the tokenizer's default filter
# set - presumably so the underscores in `_start_` / `_end_` are preserved.
# Summaries are padded at the end ('post') so every row begins with `_start_`.
summary_texts = train['summary_no_punctuation'].astype(str).tolist()

tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters='*')
tok2.fit_on_texts(summary_texts)

tf_train_summary = tf.keras.preprocessing.sequence.pad_sequences(
    tok2.texts_to_sequences(summary_texts),
    padding='post',
)

# Prepare Encoder and Decoder Data

In [None]:
# Encoder side: the padded review sequences are fed to the encoder unchanged;
# their common length fixes the encoder input size.
vectorized_text = tf_train_text
encoder_input_data = vectorized_text
doc_length = encoder_input_data.shape[1]

# Decoder side (teacher forcing): the input is the summary without its last
# position, and the target is the same summary shifted left by one step, so at
# each position the decoder learns to predict the next token.
vectorized_summary = tf_train_summary
decoder_input_data = vectorized_summary[:, :-1]
decoder_target_data = vectorized_summary[:, 1:]

Shape of decoder input: (568454, 39)
Shape of decoder target: (568454, 39)
Shape of encoder input: (568454, 2961)


In [None]:
# Vocabulary sizes used to dimension the embedding layers and the output layer:
# one slot per entry in each tokenizer's word_index, plus one for index 0
# (the padding value).
vocab_size_encoder = 1 + len(tok1.word_index)
vocab_size_decoder = 1 + len(tok2.word_index)

### Define Model Architecture

In [None]:
# Arbitrarily chosen latent dimension, shared by the embedding layers and the
# GRU hidden units.
# So far each word is only an integer id, which carries no meaning for the model.
# The embedding layers learn a dense vector for every word so that the model can
# work with a representation of the word rather than with its raw id.
# latent_dim is the length of that vector: each word is represented as a
# 300-dimensional vector.
latent_dim = 300



In [None]:
#### Encoder Model ####
# Fixed-length sequences of review token ids (length = doc_length).
encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for the encoder (review body).
x = tf.keras.layers.Embedding(vocab_size_encoder, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)

# Batch normalization keeps the distribution of the inputs to the next layer
# from drifting during training.
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

# The encoder's per-step output is not needed, only its final hidden state.
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate model so that text can be encoded
# without running the decoder (used at inference time).
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

#### Decoder Model ####
# Variable-length sequences of summary token ids (teacher forcing).
decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')

# Word embedding for the decoder (summary), followed by batch normalization.
dec_emb = tf.keras.layers.Embedding(vocab_size_decoder, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# The decoder GRU starts from the encoder's final hidden state and returns an
# output for every time step plus its own final state.
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense softmax layer: a probability for every decoder vocabulary entry at
# each time step.
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

#### Seq2Seq Model ####
seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras versions reject.
seq2seq_Model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
)

**Examine Model Architecture Summary**

In [None]:
# Print the layer-by-layer overview (output shapes, parameter counts and
# connections) of the combined encoder-decoder model.
seq2seq_Model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 300)    12858600    Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      [(None, 2961)]       0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 300)    1200        Decoder-Word-Embedding[0][0]     
______________________________________________________________________________________________

# Train Model

In [None]:
# Training is switched off by default because the following cell loads
# previously trained weights. Set TRAIN_MODEL = True to train the model here.
# (An explicit flag keeps this code syntax-checked, unlike a string literal,
# and stops the cell from echoing the code as its output.)
TRAIN_MODEL = False

if TRAIN_MODEL:
    batch_size = 64
    epochs = 3
    # The integer targets get a trailing axis before being passed to fit;
    # the last 12% of the samples are held out for validation.
    history = seq2seq_Model.fit(
        [encoder_input_data, decoder_input_data],
        np.expand_dims(decoder_target_data, -1),
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.12,
    )

'\nbatch_size = 64\nepochs = 3 \nhistory = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),\n          batch_size=batch_size,  epochs=epochs ,  validation_split=0.12) \n'

In [None]:
# Load the previously trained weights into the freshly built model.
# The path is absolute (Drive is mounted at /content/drive) so that it does not
# depend on the notebook's current working directory.
seq2seq_Model.load_weights('/content/drive/My Drive/Colab Notebooks/1-model/final_model_weights.h5')

In [None]:
# ---- Summarize one piece of text with the trained seq2seq model ----
sam = "Give your Sample text here to summarize it"

# Clean the sample exactly like the training reviews (lowercase, strip
# punctuation) so its tokens line up with the vocabulary tok1 was fitted on.
test_text = (
    pd.Series([sam])
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .tolist()
)

# tok1 is deliberately NOT re-fitted on the sample: fitting again updates the
# tokenizer's word counts and rebuilds its word index, which can change the
# ids the model weights were trained with.
raw_tokenized = tok1.texts_to_sequences(test_text)
raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(raw_tokenized, maxlen=len(tf_train_text[0]))

# Encode the review into the encoder's final hidden state.
body_encoding = encoder_model.predict(raw_tokenized)

# Rebuild the decoder as a stand-alone model that reuses the trained layers
# and takes the GRU state as an explicit input, so it can be stepped one
# token at a time.
latent_dim = seq2seq_Model.get_layer('Decoder-Word-Embedding').output_shape[-1]
decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input
dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)
gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')
gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)
decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],
                               [dense_out, gru_state_out])

# Untouched reference to the encoder state; body_encoding itself is
# overwritten with the decoder state on every step below.
original_body_encoding = body_encoding

# Greedy decoding: start from `_start_`, feed each predicted token back in.
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)
vocabulary_inv = {index: word for word, index in tok2.word_index.items()}
decoded_sentence = []

# Bound the loop by the padded training-summary length so that decoding always
# terminates, even if the model never emits `_end_`.
max_summary_len = tf_train_summary.shape[1]
for _ in range(max_summary_len):
    preds, st = decoder_model.predict([state_value, body_encoding])

    # Ignore ids 0 and 1 when picking the most probable token. Id 0 is never
    # assigned to a word by the tokenizer (it is the padding value);
    # id 1 is assumed to be `_start_` - TODO confirm via tok2.word_index.
    pred_idx = int(np.argmax(preds[:, :, 2:])) + 2
    pred_word_str = vocabulary_inv[pred_idx]
    if pred_word_str == '_end_':
        break
    print(pred_word_str)
    decoded_sentence.append(pred_word_str)

    # Carry the decoder state and the predicted token into the next step.
    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)