In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.models import Model

In [2]:
data = pd.read_csv("news_summary_more.csv")

In [3]:
import re
def clean(text):
    text = text.lower()
    newString = re.sub(r'[^a-z0-9.\s]','',text)
    return newString

In [4]:
data['headlines'].apply(clean)
data['text'].apply(clean)

0        saurav kant an alumnus of upgrad and iiitbs pg...
1        kunal shahs credit card bill payment platform ...
2        new zealand defeated india by 8 wickets in the...
3        with aegon life iterm insurance plan customers...
4        speaking about the sexual harassment allegatio...
                               ...                        
98396    a crpf jawan was on tuesday axed to death with...
98397    uff yeh the first song from the sonakshi sinha...
98398    according to reports a new version of the 1999...
98399    a new music video shows rapper snoop dogg aimi...
98400    madhesi morcha an alliance of seven political ...
Name: text, Length: 98401, dtype: object

In [5]:
MAX_TEXT_LEN = 0
for i in data["text"]:
    if len(i.split())>MAX_TEXT_LEN:
        MAX_TEXT_LEN = len(i.split())
MAX_TEXT_LEN

66

In [6]:
data['headlines']=data['headlines'].apply(lambda x: 'sostok '+x+' eostok')

In [7]:
MAX_SUM_LEN = 0
for i in data["headlines"]:
    if len(i.split())>MAX_SUM_LEN:
        MAX_SUM_LEN = len(i.split())
MAX_SUM_LEN

20

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_vocab = 15000
x_tokenizer = Tokenizer(num_words=x_vocab)
x_tokenizer.fit_on_texts(data["text"].values)
X = x_tokenizer.texts_to_sequences(data["text"].values)
X = pad_sequences(X, maxlen=MAX_TEXT_LEN, padding="post")
X.shape

(98401, 66)

In [9]:
y_vocab = 10000
y_tokenizer = Tokenizer(num_words=y_vocab)
y_tokenizer.fit_on_texts(data["headlines"].values)
Y = y_tokenizer.texts_to_sequences(data["headlines"].values)
Y = pad_sequences(Y,maxlen=MAX_SUM_LEN, padding="post")
Y.shape

(98401, 20)

In [11]:
import pickle
pickle_open = open("y_tokenizer.pkl","wb")
pickle.dump(y_tokenizer,pickle_open)

In [14]:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(X, Y, train_size=0.2, random_state=42, shuffle=True)

In [16]:

pickle_open = open("x_val.pkl","wb")
pickle.dump(x_val,pickle_open)

In [11]:
#making encoder model
embedding_dim = 100
latent_dim = 300

encoder_input = Input(shape=(MAX_TEXT_LEN,))
encoder_emb_layer = Embedding(x_vocab, embedding_dim)
encoder_emb = encoder_emb_layer(encoder_input)

encoder_lstm = LSTM(latent_dim, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output, state_h, state_c = encoder_lstm(encoder_emb)

#making decoder

decoder_input = Input(shape=(None,))
decoder_emb_layer = Embedding(y_vocab, embedding_dim)
decoder_emb = decoder_emb_layer(decoder_input)

decoder_lstm = LSTM(latent_dim,return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
decoder_output, decoder_fw, decoder_bw = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

decoder_dense = TimeDistributed(Dense(y_vocab, activation="softmax"))
decoder_output = decoder_dense(decoder_output)

model = Model([encoder_input, decoder_input], decoder_output)

model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 66)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 66, 100)      1500000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    1000000     input_2[0][0]                    
______________________________________________________________________________________________

In [12]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [35]:
history=model.fit([x_tr,y_tr[:,:-1]], y_tr.reshape(y_tr.shape[0],y_tr.shape[1], 1)[:,1:] ,epochs=50,batch_size=128, validation_data=([x_val,y_val[:,:-1]], y_val.reshape(y_val.shape[0],y_val.shape[1], 1)[:,1:]))

Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 

In [36]:
encoder_model = Model(inputs=encoder_input,outputs=[encoder_output, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(latent_dim,))

# Get the embeddings of the decoder sequence
dec_emb2= decoder_emb_layer(decoder_input) 
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2) 

# Final decoder model
decoder_model = Model(
    [decoder_input] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [37]:
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [38]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    
    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]
        
        if(sampled_token!='eostok'):
            decoded_sentence += ' '+sampled_token

        # Exit condition: either hit max length or find stop word.
        if (sampled_token == 'eostok'  or len(decoded_sentence.split()) >= (MAX_SUM_LEN-1)):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c

    return decoded_sentence

In [None]:
for i in range(0,100):
    print("Predicted summary:",decode_sequence(x_val[i].reshape(1,MAX_TEXT_LEN)))
    print("\n")

Predicted summary:  delhi cm orders to get cow from delhi cm


Predicted summary:  us man who killed for being from road in us


Predicted summary:  mumbai airport to get in delhi


Predicted summary:  govt to get up to be for for delhi govt


Predicted summary:  us woman who shot dead in front of killing


Predicted summary:  tesla ceo becomes world's first time in 2017


Predicted summary:  india to play in 1st t20i cricket for 1st time


Predicted summary:  us prez orders to pay to trump


Predicted summary:  n korea to be down if korea n korea


Predicted summary:  india to be down in india for 1st time


Predicted summary:  india to be down as india china


Predicted summary:  up govt to get free to be held for


Predicted summary:  govt to give â¹1 crore crore for aadhaar in delhi govt


Predicted summary:  us court orders to be down for


Predicted summary:  rohit sharma to be down in a odi cricket


Predicted summary:  i was a life in my life on my life


Predicted summary:  m