In [92]:
# importing libiray
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

porter = PorterStemmer()

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation

In [93]:
# normalization (Data Preprocessing)
def cleanText(text):
    text = text.lower()
    text = re.sub('[^\w\s]','',text)
    
    #Remove spaces at the beginning and at the end of the string
    text.strip()
    
    txt=[]
    for w in text.split():
        stemWord = porter.stem(w)
        txt.append(stemWord)
    txt = ' '.join(txt)
    return txt

In [94]:
# reading the datasets (training - testing - validation)
train_df = pd.read_csv('Dataset/train.csv')
test_df = pd.read_csv('Dataset/test.csv')
val_df = pd.read_csv('Dataset/val.csv')
# train_df.head()
train_df.shape

(20000, 2)

In [95]:
# Combine data from the three CSV files into a single DataFrame
pre = pd.DataFrame()
pre['text'] = pd.concat([train_df['document'], val_df['document'], test_df['document']], ignore_index=True)
pre['summary'] = pd.concat([train_df['summary'], val_df['summary'], test_df['summary']], ignore_index=True)
pre.head()

Unnamed: 0,text,summary
0,jason blake of the islanders will miss the res...,blake missing rest of season
1,the u.s. military on wednesday captured a wife...,u.s. arrests wife and daughter of saddam deput...
2,craig bellamy 's future at west ham appeared i...,west ham drops bellamy amid transfer turmoil
3,cambridge - when barack obama sought advice be...,in search for expertise harvard looms large
4,"wall street held on to steep gains on monday ,...",wall street ends a three-day losing streak


In [96]:
pre['text'] = pre['text'].apply(cleanText)
pre['summary'] = pre['summary'].apply(cleanText)

In [97]:
pre.shape
# pre.head()

(22000, 2)

In [98]:
#check for null values
pre.isnull().sum()

text       0
summary    0
dtype: int64

# 

In [160]:
pip install spacy

Collecting spacy
  Downloading spacy-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 495 kB/s eta 0:00:01
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.7-cp39-cp39-macosx_10_9_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 940 kB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp39-cp39-macosx_10_9_x86_64.whl (32 kB)
Collecting wasabi<1.1.0,>=0.9.1
  Downloading wasabi-0.9.1-py3-none-any.whl (26 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp39-cp39-macosx_10_9_x86_64.whl (457 kB)
[K     |████████████████████████████████| 457 kB 1.8 MB/s eta 0:00:01
Collecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.16-cp39-cp39-macosx_10_9_x86_64.whl (645 kB)
[K     |████████████████████████████████| 645 kB 1.8 MB/s eta 0:00:01
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloadi

In [163]:
# This ensures that all pieces of text and summaries possess the string data type.
import spacy
from time import time

nlp = spacy.blank('en') 

# Process text as batches and yield Doc objects in order
text = [str(doc) for doc in nlp.pipe(pre['text'], batch_size=5000)]

summary = ['_START_ '+ str(doc) + ' _END_' for doc in nlp.pipe(pre['summary'], batch_size=5000)]

In [164]:
text[0]

'jason blake of the island will miss the rest of the season so he can be with hi wife who ha thyroid cancer and is to give birth april'

In [165]:
summary[0]

'_START_ blake miss rest of season _END_'

In [166]:
pre['text'] = pd.Series(text)
pre['summary'] = pd.Series(summary)

In [167]:
# Check on the most number of word in text
# Check how much % of text have 0-60 words
cnt = 0
for i in pre['text']:
    if len(i.split()) <= 70:
        cnt = cnt + 1
print(cnt / len(pre['text']))

1.0


In [168]:
# Check on the most number of word in summary
# Check how much % of summary have 0-20 words
cnt = 0
for i in pre['summary']:
    if len(i.split()) <= 20:
        cnt = cnt + 1
print(cnt / len(pre['summary']))

0.9992727272727273


In [169]:
# Model to summarize the text between 0-20 words for Summary and 0-70 words for Text
max_text_len = 70
max_summary_len = 20

In [170]:
# Select the Summaries and Text which fall below max length 

import numpy as np

cleaned_text = np.array(pre['text'])
cleaned_summary = np.array(pre['summary'])

short_text = []
short_summary = []

for i in range(len(train_df)):
    if len(cleaned_summary[i].split()) <= max_summary_len and len(cleaned_text[i].split()) <= max_text_len:
        short_text.append(cleaned_text[i])
        short_summary.append(cleaned_summary[i])
        
post_pre = pd.DataFrame({'text': short_text,'summary': short_summary})

post_pre.head(2)

Unnamed: 0,text,summary
0,jason blake of the island will miss the rest o...,_START_ blake miss rest of season _END_
1,the us militari on wednesday captur a wife and...,_START_ us arrest wife and daughter of saddam ...


In [171]:
# Add sostok(start of the sequence) and eostok(end of the sequence)

post_pre['summary'] = post_pre['summary'].apply(lambda x: 'sostok ' + x \
        + ' eostok')

post_pre.head(2)


Unnamed: 0,text,summary
0,jason blake of the island will miss the rest o...,sostok _START_ blake miss rest of season _END_...
1,the us militari on wednesday captur a wife and...,sostok _START_ us arrest wife and daughter of ...


In [172]:
# split the data into train and test data chunks.

from sklearn.model_selection import train_test_split

x_train ,x_val, y_train, y_val = train_test_split(
    np.array(post_pre["text"]),
    np.array(post_pre["summary"]),
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

In [173]:
# Tokenize the text to get the vocab count 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare a tokenizer on training data
x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(list(x_train))

# print(x_tokenizer.word_index)

In [174]:
#Find the percentage of occurrence of rare words (say, occurring less than 5 times) in the text.
thresh = 5

cnt = 0
tot_cnt = 0

for key, value in x_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    if value < thresh:
        cnt = cnt + 1
    
print("% of rare words in vocabulary: ", (cnt / tot_cnt) * 100)

% of rare words in vocabulary:  70.78782787267805


In [175]:
# Prepare a tokenizer, again -- by not considering the rare words
x_tokenizer = Tokenizer(num_words = tot_cnt - cnt) 
x_tokenizer.fit_on_texts(list(x_train))

# Convert text sequences to integer sequences 
x_tr_seq = x_tokenizer.texts_to_sequences(x_train) 
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

# Pad zero upto maximum length
x_train = pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
x_val = pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

# Size of vocabulary (+1 for padding token)
x_voc = x_tokenizer.num_words + 1

print("Size of vocabulary in X = {}".format(x_voc))

Size of vocabulary in X = 6260


In [176]:
# Prepare a tokenizer on testing data
y_tokenizer = Tokenizer()   
y_tokenizer.fit_on_texts(list(y_train))

thresh = 5

cnt = 0
tot_cnt = 0

for key, value in y_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    if value < thresh:
        cnt = cnt + 1
    
print("% of rare words in vocabulary:",(cnt / tot_cnt) * 100)

# Prepare a tokenizer, again -- by not considering the rare words
y_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
y_tokenizer.fit_on_texts(list(y_train))

# Convert text sequences to integer sequences 
y_tr_seq = y_tokenizer.texts_to_sequences(y_train) 
y_val_seq = y_tokenizer.texts_to_sequences(y_val) 

# Pad zero upto maximum length
y_tr = pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# Size of vocabulary (+1 for padding token)
y_voc = y_tokenizer.num_words + 1

print("Size of vocabulary in Y = {}".format(y_voc))

% of rare words in vocabulary: 71.19436019345848
Size of vocabulary in Y = 3515


In [177]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [178]:
latent_dim = 300
embedding_dim = 200

# Encoder
encoder_inputs = Input(shape=(max_text_len, ))

# Embedding layer
enc_emb = Embedding(x_voc, embedding_dim, trainable=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)

# Encoder LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)

# Encoder LSTM 3
encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)

# Set up the decoder, using encoder_states as the initial state
decoder_inputs = Input(shape=(None, ))

# Embedding layer
dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Dense layer
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 70, 200)      1252000     ['input_3[0][0]']                
                                                                                                  
 lstm_4 (LSTM)                  [(None, 70, 300),    601200      ['embedding_2[0][0]']            
                                 (None, 300),                                                     
                                 (None, 300)]                                                     
                                                                                            

In [179]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [180]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [181]:
# y_train[0][0:-7]
# y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:,1:]
y_train.shape[0

SyntaxError: unexpected EOF while parsing (263454914.py, line 3)

In [182]:

history = model.fit([x_train, y_train[0][0:-7] ], epochs=50, callbacks=[es], batch_size=128)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'str'>", "<class 'numpy.ndarray'>"}), <class 'NoneType'>

In [130]:
# y_train
y_train = y_train[:, -1].reshape(y_train.shape[0], y_train.shape[1], 1)
y_train
# history = model.fit(
#     [x_train, y_train[:, :-1]],
#     y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
#     epochs=50,
#     callbacks=[es],
#     batch_size=128,
#     validation_data=([x_val, y_val[:, :-1]],
#                      y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:
#                      , 1:]),
#     )

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [183]:
# from matplotlib import pyplot

# pyplot.plot(history.history['loss'], label='train')
# pyplot.plot(history.history['val_loss'], label='test')
# pyplot.legend()
# pyplot.show()

In [None]:
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index


In [None]:
# Inference Models

# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs,
                      state_h, state_c])

# Decoder setup

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
(decoder_outputs2, state_h2, state_c2) = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [None]:
def decode_sequence(input_seq):

    # Encode the input as state vectors.
    (e_out, e_h, e_c) = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) \
            >= max_summary_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    return decoded_sentence

In [None]:
# To convert sequence to summary
def seq2summary(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i \
            != target_word_index['eostok']:
            newString = newString + reverse_target_word_index[i] + ' '

    return newString


# To convert sequence to text
def seq2text(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0:
            newString = newString + reverse_source_word_index[i] + ' '

    return newString