In [1]:
# import the dependencies and load the model

import pandas as pd
import contractions
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle
import random
import numpy as np
from seq2seq import Seq2Seq

# Load the trained Keras model from the local "s2s.h5" file; a later cell
# passes it to the Seq2Seq helper to build the inference models.
model = load_model("s2s.h5")
# spaCy pipeline that create_input_seq uses to tokenize input text.
spacy_model = spacy.load('en_core_web_md')

# Print the layer-by-layer architecture of the loaded model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 74)]         0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, 74, 100)      3098000     ['input_1[0][0]']                
                                                                                                  
 encoder_0 (LSTM)               [(None, 74, 200),    240800      ['encoder_embedding[0][0]']      
                                 (None, 200),                                                     
                                 (None, 200)]                                                     
                                                                                              

In [2]:
# Load the pickled tokenizers.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load "xtk.pkl" / "ytk.pkl" from a trusted source.

# xtk converts input text to integer sequences (see create_input_seq).
with open("xtk.pkl","rb") as f:
    xtk = pickle.load(f)
    
# ytk supplies the word_index / index_word lookups used while decoding
# (see seq2seq_prediction).
with open("ytk.pkl","rb") as f:
    ytk = pickle.load(f)

In [3]:

def create_input_seq(text, maxlen=74):
    """Convert raw text into a padded integer sequence for the encoder.

    The text is lower-cased and split on whitespace, contractions are
    expanded word by word, and the result is tokenized with ``spacy_model``.
    Punctuation tokens, whitespace tokens and the token "'s" are dropped
    before ``xtk`` maps the remaining words to integer indices.

    Parameters
    ----------
    text : str
        Raw input text.
    maxlen : int, optional
        Length of the returned sequence. Shorter sequences are zero-padded
        at the end; longer ones are cut by ``pad_sequences`` using its
        default truncation mode. Defaults to 74, the input length shown in
        the model summary.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, maxlen)``.
    """
    expanded = ' '.join(contractions.fix(word) for word in text.lower().split())

    # One pass over the spaCy doc instead of three separate filtering passes.
    tokens = [
        str(tok)
        for tok in spacy_model(expanded)
        if not tok.is_punct and not tok.is_space and str(tok) != "'s"
    ]

    seq = xtk.texts_to_sequences([' '.join(tokens)])
    seq = pad_sequences(seq, maxlen=maxlen, padding='post')
    return seq.reshape((1, maxlen))

def seq2seq_prediction(in_seq, encoder_inf, decoder_inf, max_len=18):
    """Greedily decode an output sentence for one encoded input sequence.

    Decoding starts from the '<sos>' token of ``ytk`` and repeatedly feeds
    the most probable token (argmax) back into the decoder, carrying the
    decoder's returned states forward, until '<eos>' is produced or
    ``max_len`` tokens have been generated.

    Parameters
    ----------
    in_seq : array-like
        Input passed to ``encoder_inf.predict`` (e.g. the output of
        ``create_input_seq``).
    encoder_inf : model
        Inference encoder; its ``predict`` must return three values
        (encoder output and two states).
    decoder_inf : model
        Inference decoder; its ``predict`` takes
        ``[target_token, encoder_output, state_h, state_c]`` and returns the
        token scores plus the two updated states.
    max_len : int, optional
        Maximum number of tokens to generate. Defaults to 18.

    Returns
    -------
    str
        The generated tokens, each preceded by a single space (so a
        non-empty result starts with a space, as before); an empty string
        if nothing was generated.
    """
    # verbose=0 keeps Keras from printing a progress bar on every step.
    e_out, st_h, st_c = encoder_inf.predict(in_seq, verbose=0)

    idx = ytk.word_index['<sos>']
    tokens = []

    while len(tokens) < max_len:
        # The decoder is fed one token at a time as a (1, 1) array.
        tar_seq = np.zeros((1, 1))
        tar_seq[0, 0] = idx

        out_tok, st_h, st_c = decoder_inf.predict(
            [tar_seq, e_out, st_h, st_c], verbose=0
        )
        idx = int(np.argmax(out_tok[0, -1, :]))

        # An index with no word (e.g. the padding index 0) has no entry in
        # index_word; treat it as end of sequence instead of a KeyError.
        sampled_tok = ytk.index_word.get(idx)
        if sampled_tok is None or sampled_tok == '<eos>':
            break

        tokens.append(sampled_tok)

    return "".join(" " + tok for tok in tokens)

In [4]:
# Instantiate the helper class imported from the local seq2seq module.
s2s = Seq2Seq()

# Build separate encoder and decoder inference models from the loaded
# model; these are the objects seq2seq_prediction calls .predict on.
encoder_inf = s2s.encoder_inference_model(model)
decoder_inf = s2s.decoder_inference_model(model)

# Print the architecture of both inference models.
encoder_inf.summary()
decoder_inf.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 74)]         0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, 74, 100)      3098000     ['input_1[0][0]']                
                                                                                                  
 encoder_0 (LSTM)               [(None, 74, 200),    240800      ['encoder_embedding[0][0]']      
                                 (None, 200),                                                     
                                 (None, 200)]                                                     
                                                                                              

In [9]:
# Load the evaluation data; later cells read its `text` and `headlines`
# columns.
data = pd.read_csv("news_summaries.csv")


In [20]:
# Spot-check one random row: print the reference headline, then the model's
# predicted summary for the matching article text. The columns are read
# straight from `data` so this cell works on a fresh kernel.
i = random.randint(0,len(data)-1)
print(data.headlines.iloc[i])
print()
seq = create_input_seq(data.text.iloc[i])
print(seq2seq_prediction(seq,encoder_inf,decoder_inf))

SC dismisses plea over illegal excavation, construction at Puri temple

 sc orders probe into tirumala temple shrine to be


In [23]:
def print_predictions(df, num_pred=5):
    """Print text, reference headline and predicted summary for random rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column (model input) and a ``headlines``
        column (reference summary).
    num_pred : int, optional
        Number of random rows to print. Rows are drawn independently, so
        the same row may be shown more than once. Defaults to 5.

    Returns
    -------
    None
        Output is written with ``print``.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to sample from; random.randrange(0) would raise.
        return

    for _ in range(num_pred):
        # Sample from the frame that was passed in (not the global `data`)
        # and index by position so any index labelling works.
        i = random.randrange(n_rows)
        text = df.text.iloc[i]
        print(f"Text : {text}")
        print(f"Actual Summary : {df.headlines.iloc[i]}")
        seq = create_input_seq(text)
        pred = seq2seq_prediction(seq, encoder_inf, decoder_inf).strip()
        print(f"Predicted Summary : {pred}")
        print()

In [26]:
# Show the default number of random predictions (num_pred=5) from `data`.
print_predictions(data)

Text : A film on the life of environmentalist Dr Binish Desai, known as the 'Recycle Man of India', is under development. "[It'll] be...world's first mainstream Bollywood movie which will revolve around a change maker's journey working towards eliminating the idea of waste," Desai said. The biopic will aim at promoting sustainability with a carbon-negative approach throughout its making, makers said.  
Actual Summary : Film on India's Recycle Man Binish Desai in the works
Predicted Summary : first film to be a star wars in india

Text : A 13-year-old boy, Sai Sudhir Kawade, from Pune, climbed the Kala Patthar mountain in Nepal at an elevation of 5644.5 metres. Sai was the youngest mountaineer to participate in the Tenzing Hillary Everest Marathon. Reports said that 45 countries participated in the annual marathon. Sai unfurled a 175-feet Tricolour after climbing the peak.
Actual Summary : 13-yr-old Pune boy climbs Kala Patthar in Nepal, unfurls Tricolour
Predicted Summary : indian man 