<a href="https://colab.research.google.com/github/bhaveshsingh0206/Text-Summmariser./blob/master/text_summariser_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, Bidirectional, RepeatVector, Concatenate, Dot, Lambda
from keras.callbacks import ModelCheckpoint
from keras.models import Input, Model
import keras.backend as K

Using TensorFlow backend.


In [0]:
# Loading file from Drive
import pydrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate this Colab session, then build a PyDrive client from the
# application-default credentials.
# NOTE(review): presumably the credentials lookup relies on the
# authenticate_user() call having run first - keep this order.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  # client used by the following cells to fetch files by Drive id

In [0]:
# Fetch the GloVe vectors from Drive (looked up by file id) into a local file.
GLOVE_DRIVE_ID = "1FfO8fsNXnenTfsq7skCgZpv6PBBQhjpa"
downloaded = drive.CreateFile({'id': GLOVE_DRIVE_ID})
downloaded.GetContentFile('glove.6B.100d.txt')

In [0]:
# Fetch the headline/article CSV from Drive (looked up by file id) into a local file.
DATASET_DRIVE_ID = "13JdWTJ4hSKpUQnOlAxmzTdjNVUjco4fV"
downloaded = drive.CreateFile({'id': DATASET_DRIVE_ID})
downloaded.GetContentFile('DF.csv')

In [0]:
# Parse the GloVe text file into {word: vector}. Each line holds a word
# followed by its vector components, separated by single spaces.
word2Vec = {}
# Open with an explicit encoding so parsing does not depend on the platform's
# default locale encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
  for line in f:
    word, *components = line.rstrip().split(' ')
    # Store real numeric vectors rather than lists of strings: smaller in
    # memory and no implicit string-to-float casting when they are copied
    # into the embedding matrices later.
    word2Vec[word] = np.asarray(components, dtype='float32')

In [0]:
# Load the headline/article pairs and preview the first five rows.
dataset = pd.read_csv('DF.csv')
dataset.head(5)


Unnamed: 0,headlines,article
0,"India reports more than 5,000 coronavirus case...",The total number of coronavirus cases in India...
1,"Because of few jokers, COVID-19 is spreading: ...",Salman Khan has condemned the attack on medica...
2,"45% districts without a COVID-19 case, 27 dist...",The government on Thursday announced that 45% ...
3,68-yr-old UP man suffering from cold declared ...,"A 68-year-old man from Uttar Pradesh's Amroha,..."
4,"Lockdown is like pause button, will not defeat...","Addressing the media via video on Thursday, Co..."


In [0]:
# Hyperparameters shared by the cells below.
MAX_LEN = 100       # not referenced anywhere else in the visible notebook
MAX_VOCAB = 30000   # passed as num_words to both Tokenizers and caps the vocabulary sizes
LSTM_UNITS = 256    # units of each encoder LSTM direction and of the decoder LSTM
DIMENSIONS = 100    # embedding width; matches the 100-d GloVe file loaded above

In [0]:

# Build three parallel lists of raw text, one entry per dataset row:
#   input_sequences        - the article
#   target_input_sequences - the headline prefixed with '<sos> '
#   target_sequences       - the headline suffixed with ' <eos>'
input_sequences, target_input_sequences, target_sequences = [], [], []
for _, record in dataset.iterrows():
  headline = record['headlines']
  input_sequences.append(record['article'])
  target_input_sequences.append('<sos> ' + headline)
  target_sequences.append(headline + ' <eos>')

In [0]:
# Sanity check: show the first entry of each of the three text lists.
for sample_list in (input_sequences, target_input_sequences, target_sequences):
  print(sample_list[0])

In [0]:
# Longest article and longest target headline, counted in space-separated
# tokens; used below as the padded sequence lengths.
max_input_len = max(map(lambda s: len(s.split(' ')), input_sequences))
max_target_len = max(map(lambda y: len(y.split(' ')), target_sequences))

In [0]:
# Report both maximum lengths on one line.
print(
  "target length max is ", max_target_len,
  " input length max is ", max_input_len,
  sep=' ',
)

In [0]:
# Fit a word-level tokenizer on the article texts (num_words=MAX_VOCAB caps
# the vocabulary), then replace each article string with its list of word ids.
# NOTE: input_sequences is rebound from text to id lists here, so re-running
# this cell on its own does not reproduce the same result.
tokenizer_input = Tokenizer(num_words=MAX_VOCAB)
tokenizer_input.fit_on_texts(input_sequences)
input_sequences = tokenizer_input.texts_to_sequences(input_sequences)

In [0]:
# Fit one tokenizer on both headline variants so the decoder input and the
# decoder target share a single vocabulary, then convert both lists to ids.
# The Tokenizer is created with its default filters and lower-casing, which
# strip '<' and '>'; the markers therefore end up in the vocabulary as 'sos'
# and 'eos' (the keys prediction() looks up).
tokenizer_output = Tokenizer(num_words=MAX_VOCAB)
tokenizer_output.fit_on_texts(target_input_sequences+target_sequences)
target_input_sequences = tokenizer_output.texts_to_sequences(target_input_sequences)
target_sequences = tokenizer_output.texts_to_sequences(target_sequences)

In [0]:
# Word -> id maps of both tokenizers, plus the vocabulary sizes used for the
# embedding layers (one extra slot for id 0, capped at MAX_VOCAB).
words_input, words_output = tokenizer_input.word_index, tokenizer_output.word_index
input_num_words = min(len(words_input) + 1, MAX_VOCAB)
target_num_words = min(len(words_output) + 1, MAX_VOCAB)

In [0]:
# Pad every id sequence to a fixed length: articles are padded at the front,
# both headline variants at the back.
input_sequences = pad_sequences(input_sequences, maxlen=max_input_len, padding='pre')
target_input_sequences = pad_sequences(
  target_input_sequences, maxlen=max_target_len, padding='post'
)
target_sequences = pad_sequences(
  target_sequences, maxlen=max_target_len, padding='post'
)

In [0]:
# Row i of the matrix holds the GloVe vector of the article-vocabulary word
# with id i; words without a GloVe entry (and row 0) stay all-zero.
input_embedding_matrix = np.zeros((input_num_words, DIMENSIONS))
for token, row_idx in words_input.items():
  if row_idx >= input_num_words:
    continue  # id falls outside the capped vocabulary
  vector = word2Vec.get(token)
  if vector is None:
    continue  # no pre-trained vector for this word
  input_embedding_matrix[row_idx] = vector

In [0]:
# Row i of the matrix holds the GloVe vector of the headline-vocabulary word
# with id i; words without a GloVe entry (and row 0) stay all-zero.
target_embedding_matrix = np.zeros((target_num_words, DIMENSIONS))
# Iterate the *output* tokenizer's vocabulary: this matrix initialises the
# decoder embedding, whose ids come from tokenizer_output. Using the input
# tokenizer's word index here would place vectors of unrelated words in the rows.
for word, k in words_output.items():
  if k < target_num_words:
    embedding_vector = word2Vec.get(word)
    if embedding_vector is not None:
      target_embedding_matrix[k] = embedding_vector

In [0]:
# Encoder input: one padded article id sequence of length max_input_len per example.
embedding_input = Input(shape=(max_input_len,))
# Embedding initialised from input_embedding_matrix and left trainable.
embedding_input_layer = Embedding(input_num_words, DIMENSIONS, weights=[input_embedding_matrix], trainable=True)
x = embedding_input_layer(embedding_input)

In [0]:
# Two stacked bidirectional LSTMs. Both return the full sequence so that the
# attention mechanism can look at every encoder timestep.
input_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
input_lstm1_output = input_lstm1(x)

# The second layer's output is what the decoder attends over; the inference
# cell below feeds it back in with a feature size of LSTM_UNITS * 2.
input_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
encoder_output = input_lstm2(input_lstm1_output)


In [0]:
def softmax_over_time(x):
  """Softmax normalised along axis 1 instead of the last axis.

  The maximum along axis 1 is subtracted before exponentiating so the
  exponentials cannot overflow.

  Args:
    x: backend tensor with more than two dimensions.

  Returns:
    Tensor of the same shape as `x` whose values sum to 1 along axis 1.
  """
  assert K.ndim(x) > 2
  shifted = x - K.max(x, axis=1, keepdims=True)
  exps = K.exp(shifted)
  return exps / K.sum(exps, axis=1, keepdims=True)

In [0]:
# Layers used by attention_procedure(). They are created once here so every
# decoder step (and the inference model) shares the same weights.
atten_layer_repeat = RepeatVector(max_input_len)  # repeats the decoder state once per encoder timestep
atten_concatenate = Concatenate(axis=-1)  # joins encoder output and repeated state on the last axis
atten_dense1 = Dense(30, activation='tanh')  # hidden scoring layer
atten_dense2 = Dense(1, activation=softmax_over_time)  # one weight per timestep, normalised along axis 1
attn_dot = Dot(axes=1)  # weighted sum of encoder outputs over axis 1

In [0]:
def attention_procedure(h, st_1):
  """Compute the attention context for one decoder step.

  Args:
    h: encoder output sequence to attend over.
    st_1: previous decoder hidden state.

  Returns:
    The result of `attn_dot` applied to the attention weights and `h`.
  """
  repeated_state = atten_layer_repeat(st_1)
  scores = atten_dense1(atten_concatenate([h, repeated_state]))
  weights = atten_dense2(scores)
  return attn_dot([weights, h])

In [0]:
# The decoder's initial hidden and cell states are model inputs; the training
# and prediction code below passes zero arrays for both.
st_0 = Input(shape=(LSTM_UNITS,))
c_0 = Input(shape=(LSTM_UNITS,))
# Joins the attention context and the current word embedding along axis 2.
context_last_word_concat_layer = Concatenate(axis=2)

In [0]:
# Teacher-forced decoder. The whole decoder-input sequence is embedded once,
# then max_target_len steps are unrolled by hand so that each step can attend
# over the encoder outputs using the previous decoder state.
embedding_decoder_input = Input(shape=(max_target_len,))
embedding_decoder_layer = Embedding(target_num_words, DIMENSIONS, weights=[target_embedding_matrix], trainable=True)
decoder_x = embedding_decoder_layer(embedding_decoder_input)

s = st_0
c = c_0
outputs = []
# Created once and shared by every unrolled step; the inference model built
# later in the notebook reuses these same layer objects.
decoder_lstm = LSTM(LSTM_UNITS, return_state=True)
decoder_dense_layer = Dense(target_num_words, activation='softmax')
for i in range(max_target_len):
  # Attention context computed from the encoder outputs and the previous state.
  context = attention_procedure(encoder_output, s)

  # Bind the step index as a default argument. A plain `lambda x: x[:, i:i+1]`
  # closes over the loop variable, so if the function is ever invoked again
  # after the loop has finished, every step would slice the last position.
  selector = Lambda(lambda x, t=i: x[:, t:t+1])
  xt = selector(decoder_x)
  decoder_lstm_input = context_last_word_concat_layer([context, xt])

  # One LSTM step; the returned states feed the next iteration.
  decoder_lstm_output, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s,c])

  # Distribution over the target vocabulary for this step.
  decoder_output = decoder_dense_layer(decoder_lstm_output)
  outputs.append(decoder_output)

In [0]:
def stack_and_transpose(x):
  """Stack a list of tensors and swap the first two axes of the result.

  Args:
    x: list of equally shaped tensors.

  Returns:
    The stacked tensor with axes reordered by the pattern (1, 0, 2).
  """
  return K.permute_dimensions(K.stack(x), pattern=(1, 0, 2))


In [0]:
# Merge the per-step outputs into one tensor and assemble the training model.
stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)

# Inputs: article ids, decoder-input ids, initial hidden state, initial cell state.
training_inputs = [embedding_input, embedding_decoder_input, st_0, c_0]
model = Model(inputs=training_inputs, outputs=outputs)

In [0]:
# model.summary()

In [0]:
# Earlier checkpoint configuration, kept commented out:
# my_callbacks = [
#     ModelCheckpoint(filepath = 'my_model.h5',
#     verbose=1, save_best_only=True, save_weights_only=False)
#     ]
# Resume from a checkpoint previously saved on Google Drive.
# NOTE(review): this path lives under /content/gdrive, which is mounted by a
# cell further down the notebook; on a fresh kernel that cell has to run
# before this one - confirm the intended cell order.
model.load_weights('/content/gdrive/My Drive/model/epochs:011-val_acc(part2):0.687.hdf5')

In [0]:
from keras.callbacks import *
# Write a checkpoint to Drive each time validation accuracy reaches a new
# maximum; the epoch number and accuracy are formatted into the file name.
filepath = "/content/gdrive/My Drive/model/epochs:{epoch:03d}-val_acc(part2):{val_acc:.3f}.hdf5"
checkpoint = ModelCheckpoint(
  filepath,
  monitor='val_acc',
  mode='max',
  save_best_only=True,
  verbose=1,
)
callbacks_list = [checkpoint]

In [0]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])

# Zero arrays fed as the initial hidden and cell state, one row per example.
z = np.zeros((len(target_sequences), LSTM_UNITS))

# Integer targets with an extra trailing axis of size 1.
num_examples, num_steps = target_sequences.shape[0], target_sequences.shape[1]
sparse_targets = target_sequences.reshape(num_examples, num_steps, 1)

r = model.fit(
  [input_sequences, target_input_sequences, z, z],
  sparse_targets,
  batch_size=128,
  epochs=20,
  validation_split=0.2,
  callbacks=callbacks_list,
  verbose=1,
)

In [0]:
# Mount Google Drive so that paths under /content/gdrive resolve.
# The module is imported under an alias because the bare name `drive` is
# already bound to the PyDrive client created near the top of the notebook;
# rebinding it would break re-running the file-download cells.
from google.colab import drive as colab_drive
colab_drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
Epoch 00007: val_acc improved from 0.65365 to 0.65503, saving model to /content/gdrive/My Drive/model/epochs:007-val_acc:0.655.hdf5
Epoch 8/40
51275/51275 [==============================] - 3589s 70ms/step - loss: 2.6328 - acc: 0.6448 - val_loss: 2.9198 - val_acc: 0.6557


In [0]:
# Inference: build single-step encoder/decoder models and generate headlines

In [0]:
# Single-step decoder for inference. Inputs: the previous word id, the encoder
# output sequence, and the previous hidden/cell states. Outputs: the next-word
# distribution plus the updated states.
encoder_outputs_as_input = Input(shape=(max_input_len, LSTM_UNITS * 2,))
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = embedding_decoder_layer(decoder_inputs_single)

# Reuse the layers built for training, applied to one timestep.
context = attention_procedure(encoder_outputs_as_input, st_0)
decoder_lstm_input = context_last_word_concat_layer([context, decoder_inputs_single_x])
o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[st_0, c_0])
decoder_outputs = decoder_dense_layer(o)

decoder_model = Model(
  inputs=[decoder_inputs_single, encoder_outputs_as_input, st_0, c_0],
  outputs=[decoder_outputs, s, c],
)


In [0]:
# Inference-time encoder: maps padded article ids to the encoder output sequence.
encoder_model = Model(embedding_input, encoder_output)

In [0]:
# Reverse lookup for the headline vocabulary: word id -> word.
idx2word = {index: token for token, index in words_output.items()}

In [0]:
def prediction(input_seq):
  """Greedily decode a headline for one padded article id sequence.

  The article is encoded once; the decoder is then run one word at a time,
  feeding back the most probable word id, until the 'eos' id is produced or
  max_target_len steps have been taken.

  Args:
    input_seq: a single padded sequence of article word ids.

  Returns:
    The generated words joined by single spaces (id 0 is never emitted).
  """
  encoded = encoder_model.predict([[input_seq]])

  # The decoder input holds a single word id; start from the 'sos' marker.
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = words_output['sos']
  eos = words_output['eos']

  # Zero initial hidden and cell states.
  s = np.zeros((1, LSTM_UNITS))
  c = np.zeros((1, LSTM_UNITS))

  output_sentence = []
  for _ in range(max_target_len):
    o, s, c = decoder_model.predict([target_seq, encoded, s, c])

    # Greedy choice: the most probable word id.
    idx = np.argmax(o.flatten())
    if idx == eos:
      break
    if idx > 0:
      output_sentence.append(idx2word[idx])

    # Feed the chosen id back in as the next decoder input.
    target_seq[0, 0] = idx

  return ' '.join(output_sentence)

In [0]:
import random

# Pick NUM_SAMPLES rows at random (with replacement) from the first rows of
# the dataset and keep each article together with its reference headline.
NUM_SAMPLES = 5
# Cap the upper bound at the last available row so that datasets with fewer
# than 101 rows do not cause a failed lookup.
last_row = min(100, len(dataset) - 1)

predictions_inputs = []
true_outputs = []
for _ in range(NUM_SAMPLES):
  # `row_id` rather than `id`, so the builtin is not shadowed for later cells.
  row_id = random.randint(0, last_row)
  predictions_inputs.append(dataset['article'][row_id])
  true_outputs.append(dataset['headlines'][row_id])

# Convert the sampled articles to padded id sequences of length max_input_len,
# using the tokenizer that was fitted on the articles.
predictions_inputs = tokenizer_input.texts_to_sequences(predictions_inputs)
predictions_inputs = pad_sequences(predictions_inputs, maxlen=max_input_len)

In [0]:
# For every sampled article print its padded length, the reference headline
# and the headline generated by prediction().
for text, reference_headline in zip(predictions_inputs, true_outputs):
  print(len(text))
  print('ARTICLE IS')
  print('---------------------')
  print('ORIGINAL HEADLINE IS')
  print(reference_headline)
  print('---------------------')
  print('PREDICTED HEADLINE IS')
  print(prediction(text))

87
ARTICLE IS
---------------------
ORIGINAL HEADLINE IS
Odisha's Ganjam imposes ₹500 fine for spitting in public places
---------------------
PREDICTED HEADLINE IS
himachal pradesh bans coronavirus fear in public ban public
87
ARTICLE IS
---------------------
ORIGINAL HEADLINE IS
Uttar Pradesh reports 67 new coronavirus cases, state total rises to 727
---------------------
PREDICTED HEADLINE IS
coronavirus cases in new york state total reaches 9 to cases today
87
ARTICLE IS
---------------------
ORIGINAL HEADLINE IS
Consulted 12-13 doctors in family: Kumaraswamy defends son's wedding tomorrow
---------------------
PREDICTED HEADLINE IS
10 yr old my family even even even day night or vote on coronavirus lockdown
87
ARTICLE IS
---------------------
ORIGINAL HEADLINE IS
UK thanks India as country gets 1st batch of 28 lakh paracetamol packs
---------------------
PREDICTED HEADLINE IS
uk thanks india as country gets 1st 28 lakh lakh united of 14 years
87
ARTICLE IS
---------------------
OR

In [0]:
# Length of the first padded prediction input (padded above with maxlen=max_input_len).
len(predictions_inputs[0])

87