In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.models import Model

In [2]:
data = pd.read_csv("news_summary_more.csv")

In [3]:
import re
def clean(text):
    text = text.lower()
    newString = re.sub(r'[^a-z0-9.\s]','',text)
    return newString

In [4]:
data['headlines'].apply(clean)
data['text'].apply(clean)

0        saurav kant an alumnus of upgrad and iiitbs pg...
1        kunal shahs credit card bill payment platform ...
2        new zealand defeated india by 8 wickets in the...
3        with aegon life iterm insurance plan customers...
4        speaking about the sexual harassment allegatio...
                               ...                        
98396    a crpf jawan was on tuesday axed to death with...
98397    uff yeh the first song from the sonakshi sinha...
98398    according to reports a new version of the 1999...
98399    a new music video shows rapper snoop dogg aimi...
98400    madhesi morcha an alliance of seven political ...
Name: text, Length: 98401, dtype: object

In [5]:
MAX_TEXT_LEN = 0
for i in data["text"]:
    if len(i.split())>MAX_TEXT_LEN:
        MAX_TEXT_LEN = len(i.split())
MAX_TEXT_LEN

66

In [6]:
data['headlines']=data['headlines'].apply(lambda x: 'sostok '+x+' eostok')

In [7]:
MAX_SUM_LEN = 0
for i in data["headlines"]:
    if len(i.split())>MAX_SUM_LEN:
        MAX_SUM_LEN = len(i.split())
MAX_SUM_LEN

20

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_vocab = 15000
x_tokenizer = Tokenizer(num_words=x_vocab)
x_tokenizer.fit_on_texts(data["text"].values)
X = x_tokenizer.texts_to_sequences(data["text"].values)
X = pad_sequences(X, maxlen=MAX_TEXT_LEN, padding="post")
X.shape

(98401, 66)

In [9]:
y_vocab = 10000
y_tokenizer = Tokenizer(num_words=y_vocab)
y_tokenizer.fit_on_texts(data["headlines"].values)
Y = y_tokenizer.texts_to_sequences(data["headlines"].values)
Y = pad_sequences(Y,maxlen=MAX_SUM_LEN, padding="post")
Y.shape

(98401, 20)

In [10]:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(X, Y, train_size=0.2, random_state=42, shuffle=True)

In [11]:
#making encoder model
embedded_dim = 100
latent_dim = 300

encoder_input = Input(shape=(MAX_TEXT_LEN,))
encoder_emb_layer = Embedding(x_vocab, embedded_dim)
encoder_emb = encoder_emb_layer(encoder_input)
encoder_lstm = LSTM(latent_dim, recurrent_dropout=0.4, dropout=0.4, return_sequences=True, return_state=True)
encoder_output, state_h, state_c = encoder_lstm(encoder_emb)

#decoding layer
decoder_input = Input(shape=(None,))
decoder_emb_layer = Embedding(y_vocab, embedded_dim)
decoder_emb = decoder_emb_layer(decoder_input)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_output, decoder_state_h, decoder_state_c = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

decoder_dense = TimeDistributed(Dense(y_vocab, activation="softmax"))
decoder_output = decoder_dense(decoder_output)

model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)



In [12]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

In [None]:
model.fit([x_tr,y_tr[:,:-1]],y_tr.reshape(y_tr.shape[0],y_tr.shape[1],1)[:,1:], epochs=50, batch_size=128, validation_data=([x_val,y_val[:,:-1]],y_val.reshape(y_val.shape[0],y_val.shape[1],1)[:,1:]))

Epoch 1/50
Epoch 2/50

In [21]:
#making inference model for prediction

#for encoder model
encoder_model = Model(inputs=[encoder_input], outputs=[encoder_output,state_h,state_c])

decoder_internal_state_h = Input(shape=(latent_dim,))
decoder_internal_state_c = Input(shape=(latent_dim,))
decoder_internal_states = [decoder_internal_state_h, decoder_internal_state_c]


dec_emb_2 = decoder_emb_layer(decoder_input)
decoder_output, state_h2, state_c2 = decoder_lstm(dec_emb_2, initial_state=[decoder_internal_state_h, decoder_internal_state_c])
decoder_output = decoder_dense(decoder_output)
#for decoder model
decoder_model = Model(inputs=[decoder_input,decoder_internal_states], outputs=[decoder_output,state_h2,state_c2])


In [22]:
encoder_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 66)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 66, 100)           1500000   
_________________________________________________________________
lstm (LSTM)                  [(None, 66, 300), (None,  481200    
Total params: 1,981,200
Trainable params: 1,981,200
Non-trainable params: 0
_________________________________________________________________


In [23]:
decoder_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    1000000     input_2[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 300)]        0                                            
____________________________________________________________________________________________

In [25]:
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [27]:
def decode_sequence():
    e_out, e_h, e_c = encoder_model.predict(x_val[1].reshape(1, MAX_TEXT_LEN))
    
    target_seq = np.zeros((1,1))
    
    target_seq[0,0] = target_word_index['sostok']
    
    stop_condition = False
    decoded_seq = ''
    count = 0
    while not stop_condition:
        
        output_tokens,state_h2,state_c2 = decoder_model.predict([target_seq,e_h,e_c])
        
        sampled_token_index = np.argmax(output_tokens)
        sampled_token = reverse_target_word_index[sampled_token_index]
        
        if sampled_token!='eostok':
            decoded_seq+=' '+sampled_token
            
        if sampled_token == 'eostok' or len(decoded_seq.split()) >= MAX_SUM_LEN:
            stop_condition = True
        
        target_seq = np.zeros((1,1))
        target_seq[0,0] = sampled_token_index
        
        e_h, e_c = state_h2, state_c2
    return decoded_seq

In [28]:
print(decode_sequence())

 us to to to to to to in in
