In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

porter = PorterStemmer()

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation

In [3]:
def cleanText(text):
    text = text.lower()
    text = re.sub('[^\w\s]','',text)
    
    text.strip()
    
    txt=[]
    for w in text.split():
        stemWord = porter.stem(w)
        txt.append(stemWord)
    txt = ' '.join(txt)
    return txt

In [4]:
train_df = pd.read_csv('/kaggle/input/textsummarization-data/train.csv')
test_df = pd.read_csv('/kaggle/input/textsummarization-data/test.csv')
val_df = pd.read_csv('/kaggle/input/textsummarization-data/val.csv')
train_df.shape

(20000, 2)

In [5]:
pre = pd.DataFrame()
pre['text'] = pd.concat([train_df['document'], val_df['document'], test_df['document']], ignore_index=True)
pre['summary'] = pd.concat([train_df['summary'], val_df['summary'], test_df['summary']], ignore_index=True)
pre.head()

Unnamed: 0,text,summary
0,jason blake of the islanders will miss the res...,blake missing rest of season
1,the u.s. military on wednesday captured a wife...,u.s. arrests wife and daughter of saddam deput...
2,craig bellamy 's future at west ham appeared i...,west ham drops bellamy amid transfer turmoil
3,cambridge - when barack obama sought advice be...,in search for expertise harvard looms large
4,"wall street held on to steep gains on monday ,...",wall street ends a three-day losing streak


In [6]:
pre['text'] = pre['text'].apply(cleanText)
pre['summary'] = pre['summary'].apply(cleanText)

In [7]:
pre.shape


(22000, 2)

In [8]:

pre.isnull().sum()

text       0
summary    0
dtype: int64

# 

In [9]:
cnt = 0
for i in pre['text']:
    if len(i.split()) <= 70:
        cnt = cnt + 1
print(cnt / len(pre['text']))

1.0


In [10]:
cnt = 0
for i in pre['summary']:
    if len(i.split()) <= 20:
        cnt = cnt + 1
print(cnt / len(pre['summary']))

0.9997272727272727


In [11]:
max_text_len = 70
max_summary_len = 20

In [12]:

import numpy as np

cleaned_text = np.array(pre['text'])
cleaned_summary = np.array(pre['summary'])

short_text = []
short_summary = []

for i in range(len(train_df)):
    if len(cleaned_summary[i].split()) <= max_summary_len and len(cleaned_text[i].split()) <= max_text_len:
        short_text.append(cleaned_text[i])
        short_summary.append(cleaned_summary[i])
        
post_pre = pd.DataFrame({'text': short_text,'summary': short_summary})

post_pre.head(2)

Unnamed: 0,text,summary
0,jason blake of the island will miss the rest o...,blake miss rest of season
1,the us militari on wednesday captur a wife and...,us arrest wife and daughter of saddam deputi t...


In [13]:
post_pre['summary'] = post_pre['summary'].apply(lambda x: 'sostok ' + x \
        + ' eostok')

post_pre.head(2)


Unnamed: 0,text,summary
0,jason blake of the island will miss the rest o...,sostok blake miss rest of season eostok
1,the us militari on wednesday captur a wife and...,sostok us arrest wife and daughter of saddam d...


In [14]:
from sklearn.model_selection import train_test_split

x_train ,x_test, y_train, y_test = train_test_split(
    np.array(post_pre["text"]),
    np.array(post_pre["summary"]),
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(list(x_train))

In [16]:
x_tokenizer.word_counts.items()

odict_items([('the', 28632), ('second', 422), ('round', 206), ('climat', 58), ('meet', 413), ('of', 14935), ('major', 379), ('economi', 275), ('will', 1655), ('be', 1395), ('conclud', 47), ('with', 3215), ('a', 15583), ('chairman', 156), ('s', 7190), ('summari', 3), ('an', 2797), ('offici', 1692), ('attend', 85), ('told', 161), ('xinhua', 81), ('on', 7875), ('thursday', 2134), ('tivo', 2), ('inc', 165), ('and', 9165), ('nero', 1), ('ag', 57), ('germani', 205), ('were', 1026), ('set', 379), ('to', 13707), ('announc', 553), ('monday', 2035), ('that', 3514), ('they', 659), ('launch', 231), ('packag', 49), ('turn', 151), ('window', 30), ('pc', 7), ('into', 786), ('tv', 68), ('record', 263), ('just', 185), ('like', 207), ('settop', 1), ('box', 33), ('three', 622), ('us', 2132), ('soldier', 239), ('injur', 240), ('tuesday', 2244), ('when', 529), ('their', 1304), ('militari', 529), ('vehicl', 83), ('roll', 45), ('over', 1203), ('while', 297), ('patrol', 38), ('in', 12800), ('southeast', 76), 

In [17]:
cnt = 0
tot_cnt = 0

for key, value in x_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    if value < 5:
        cnt = cnt + 1
    
print("% of rare words in vocabulary: ", (cnt / tot_cnt) * 100)

% of rare words in vocabulary:  70.7153189305212


In [18]:

x_tokenizer = Tokenizer(num_words = tot_cnt - cnt) 
x_tokenizer.fit_on_texts(list(x_train))


x_train_seq = x_tokenizer.texts_to_sequences(x_train) 
x_test_seq = x_tokenizer.texts_to_sequences(x_test)


x_train = pad_sequences(x_train_seq,  maxlen=max_text_len, padding='post')
x_test = pad_sequences(x_test_seq, maxlen=max_text_len, padding='post')


x_voc = x_tokenizer.num_words + 1

print("Size of vocabulary in X = {}".format(x_voc))

Size of vocabulary in X = 6277


In [19]:

y_tokenizer = Tokenizer()   
y_tokenizer.fit_on_texts(list(y_train))

cnt = 0
tot_cnt = 0

for key, value in y_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    if value < 5:
        cnt = cnt + 1
    
print("% of rare words in vocabulary:",(cnt / tot_cnt) * 100)


y_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
y_tokenizer.fit_on_texts(list(y_train))

 
y_train_seq = y_tokenizer.texts_to_sequences(y_train) 
y_test_seq = y_tokenizer.texts_to_sequences(y_test) 


y_train = pad_sequences(y_train_seq, maxlen=max_summary_len, padding='post')
y_test = pad_sequences(y_test_seq, maxlen=max_summary_len, padding='post')


y_voc = y_tokenizer.num_words + 1

print("Size of vocabulary in Y = {}".format(y_voc))

% of rare words in vocabulary: 71.4763185782995
Size of vocabulary in Y = 3500


In [20]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [21]:
latent_dim = 300
embedding_dim = 200


encoder_inputs = Input(shape=(max_text_len, ))


enc_emb = Embedding(x_voc, embedding_dim, trainable=True)(encoder_inputs)


encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)


encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)


encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)


decoder_inputs = Input(shape=(None, ))


dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)


decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])


decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


In [22]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [23]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [25]:
history = model.fit(
    [x_train, y_train[:, :-1]],
    y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
    epochs=5,
    callbacks=[es],
    batch_size=128,
    validation_data=([x_test, y_test[:, :-1]],y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:, 1:]),
    )

Epoch 1/5
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 299ms/step - loss: 3.9273 - val_loss: 2.7501
Epoch 2/5
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 292ms/step - loss: 2.7791 - val_loss: 2.7298
Epoch 3/5
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 291ms/step - loss: 2.7541 - val_loss: 2.7183
Epoch 4/5
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 290ms/step - loss: 2.7337 - val_loss: 2.7021
Epoch 5/5
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 289ms/step - loss: 2.7165 - val_loss: 2.6917


In [26]:
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index


In [27]:

encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs,
                      state_h, state_c])


decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim))


dec_emb2 = dec_emb_layer(decoder_inputs)


(decoder_outputs2, state_h2, state_c2) = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])


decoder_outputs2 = decoder_dense(decoder_outputs2)


decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [28]:
def decode_sequence(input_seq):

    
    (e_out, e_h, e_c) = encoder_model.predict(input_seq)

    
    target_seq = np.zeros((1, 1))

    
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c])

        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        
        if sampled_token == 'eostok' or len(decoded_sentence.split()) \
            >= max_summary_len - 1:
            stop_condition = True

        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        
        (e_h, e_c) = (h, c)

    return decoded_sentence

In [29]:
def seq2summary(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i \
            != target_word_index['eostok']:
            newString = newString + reverse_target_word_index[i] + ' '

    return newString


def seq2text(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0:
            newString = newString + reverse_source_word_index[i] + ' '

    return newString

In [30]:
for i in range(0, 19):
    print ('Review:', seq2text(x_train[i]))
    print ('Original summary:', seq2summary(y_train[i]))
    print ('Predicted summary:', decode_sequence(x_train[i].reshape(1,max_text_len)))
    print ('\n')

Review: the second round climat meet of major economi will be conclud with a chairman s an offici attend the meet told xinhua on thursday 
Original summary: climat chang meet to be conclud with chairman summari 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Predicted summary:  us s unk to the


Review: inc and ag of germani were set to announc monday that they will be launch a packag that turn a window pc into a tv record just like a box 
Original summary: new softwar turn pc into tv record 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[