In [1]:
# importing libiray
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

porter = PorterStemmer()

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation

In [2]:
# normalization (Data Preprocessing)
def cleanText(text):
    text = text.lower()
    text = re.sub('[^\w\s]','',text)
    
    #Remove spaces at the beginning and at the end of the string
    text.strip()
    
    txt=[]
    for w in text.split():
        stemWord = porter.stem(w)
        txt.append(stemWord)
    txt = ' '.join(txt)
    return txt

In [3]:
# reading the datasets (training - testing - validation)
train_df = pd.read_csv('Dataset/train.csv')
test_df = pd.read_csv('Dataset/test.csv')
val_df = pd.read_csv('Dataset/val.csv')
# train_df.head()
train_df.shape

(20000, 2)

In [4]:
# Combine data from the three CSV files into a single DataFrame
pre = pd.DataFrame()
pre['text'] = pd.concat([train_df['document'], val_df['document'], test_df['document']], ignore_index=True)
pre['summary'] = pd.concat([train_df['summary'], val_df['summary'], test_df['summary']], ignore_index=True)
pre.head()

Unnamed: 0,text,summary
0,jason blake of the islanders will miss the res...,blake missing rest of season
1,the u.s. military on wednesday captured a wife...,u.s. arrests wife and daughter of saddam deput...
2,craig bellamy 's future at west ham appeared i...,west ham drops bellamy amid transfer turmoil
3,cambridge - when barack obama sought advice be...,in search for expertise harvard looms large
4,"wall street held on to steep gains on monday ,...",wall street ends a three-day losing streak


In [5]:
pre['text'] = pre['text'].apply(cleanText)
pre['summary'] = pre['summary'].apply(cleanText)

In [6]:
pre.shape
# pre.head()

(22000, 2)

In [7]:
#check for null values
pre.isnull().sum()

text       0
summary    0
dtype: int64

# 

In [8]:
# Tokenize the text to get the vocab count 
from tensorflow.keras.preprocessing.text import Tokenizer 

x_tokenizer = Tokenizer(lower=True, split=' ') 
x_tokenizer.fit_on_texts(pre['text'].values)

In [9]:
# print(x_tokenizer.word_index)

In [10]:
X = x_tokenizer.texts_to_sequences(pre['text'].values)
print(X[0])

[3321, 8048, 3, 1, 348, 37, 601, 1, 1358, 3, 1, 323, 555, 51, 425, 39, 13, 19, 969, 53, 20, 9750, 1210, 6, 28, 4, 415, 2116, 964]


In [11]:
# make all text same size
from tensorflow.keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X)
print(X[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0 3321 8048    3
    1  348   37  601    1 1358    3    1  323  555   51  425   39   13
   19  969   53   20 9750 1210    6   28    4  415 2116  964]


In [12]:
X.shape

(22000, 68)

In [13]:
# Tokenize the text to get the vocab count 
from tensorflow.keras.preprocessing.text import Tokenizer 

y_tokenizer = Tokenizer(lower=True, split=' ') 
y_tokenizer.fit_on_texts(pre['summary'].values)

In [14]:
# print(y_tokenizer.word_index)

In [15]:
Y = y_tokenizer.texts_to_sequences(pre['summary'].values)
print(Y[1])

[7, 64, 854, 18, 1124, 3, 831, 703, 130, 582, 4, 4588]


In [16]:
# make all text same size
Y = pad_sequences(Y)
print(Y[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    7
   64  854   18 1124    3  831  703  130  582    4 4588]


In [17]:
Y.shape

(22000, 25)

In [18]:
# Y = pd.get_dummies(pre['summary'])
# Y[0]
Y[:4]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0, 4587,  227,
        1696,    3,  796],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    7,   64,  854,   18, 1124,    3,  831,  703,  130,
         582,    4, 4588],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,  439, 3947,  223, 7425,
         242, 1201, 2481],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    2,  505,    4, 5545,
        5546, 1202, 1385]], dtype=int32)

In [19]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(17600, 68) (17600, 25)
(4400, 68) (4400, 25)


In [20]:
# Check on the most number of word in text
# Check how much % of text have 0-60 words
cnt = 0
for i in pre['text']:
    if len(i.split()) <= 70:
        cnt = cnt + 1
print(cnt / len(pre['text']))

1.0


In [21]:
# Check on the most number of word in summary
# Check how much % of summary have 0-20 words
cnt = 0
for i in pre['summary']:
    if len(i.split()) <= 20:
        cnt = cnt + 1
print(cnt / len(pre['summary']))

0.9997272727272727


In [22]:
# Model to summarize the text between 0-20 words for Summary and 0-70 words for Text
max_text_len = 70
max_summary_len = 20

In [23]:
# # Select the Summaries and Text which fall below max length 

# import numpy as np

# cleaned_text = np.array(pre['text'])
# cleaned_summary = np.array(pre['summary'])

# short_text = []
# short_summary = []

# for i in range(len(train_df)):
#     if len(cleaned_summary[i].split()) <= max_summary_len and len(cleaned_text[i].split()) <= max_text_len:
#         short_text.append(cleaned_text[i])
#         short_summary.append(cleaned_summary[i])
        
# post_pre = pd.DataFrame({'text': short_text,'summary': short_summary})

# post_pre.head(2)

In [24]:
# # Add sostok(start of the sequence) and eostok(end of the sequence)

# post_pre['summary'] = post_pre['summary'].apply(lambda x: 'sostok ' + x \
#         + ' eostok')

# post_pre.head(2)


In [25]:
# # split the data into train and test data chunks.

# from sklearn.model_selection import train_test_split

# x_train ,x_val, y_train, y_val = train_test_split(
#     np.array(post_pre["text"]),
#     np.array(post_pre["summary"]),
#     test_size=0.1,
#     random_state=0,
#     shuffle=True,
# )

In [26]:
# # Tokenize the text to get the vocab count 
# from tensorflow.keras.preprocessing.text import Tokenizer 
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Prepare a tokenizer on training data
# x_tokenizer = Tokenizer() 
# x_tokenizer.fit_on_texts(list(x_train))

In [27]:
# #Find the percentage of occurrence of rare words (say, occurring less than 5 times) in the text.
# thresh = 5

# cnt = 0
# tot_cnt = 0

# for key, value in x_tokenizer.word_counts.items():
#     tot_cnt = tot_cnt + 1
#     if value < thresh:
#         cnt = cnt + 1
    
# print("% of rare words in vocabulary: ", (cnt / tot_cnt) * 100)

# # Prepare a tokenizer, again -- by not considering the rare words
# x_tokenizer = Tokenizer(num_words = tot_cnt - cnt) 
# x_tokenizer.fit_on_texts(list(x_train))

# # Convert text sequences to integer sequences 
# x_tr_seq = x_tokenizer.texts_to_sequences(x_train) 
# x_val_seq = x_tokenizer.texts_to_sequences(x_val)

# # Pad zero upto maximum length
# x_train = pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
# x_val = pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

# # Size of vocabulary (+1 for padding token)
# x_voc = x_tokenizer.num_words + 1

# print("Size of vocabulary in X = {}".format(x_voc))

In [28]:
# # Prepare a tokenizer on testing data
# y_tokenizer = Tokenizer()   
# y_tokenizer.fit_on_texts(list(y_train))

# thresh = 5

# cnt = 0
# tot_cnt = 0

# for key, value in y_tokenizer.word_counts.items():
#     tot_cnt = tot_cnt + 1
#     if value < thresh:
#         cnt = cnt + 1
    
# print("% of rare words in vocabulary:",(cnt / tot_cnt) * 100)

# # Prepare a tokenizer, again -- by not considering the rare words
# y_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
# y_tokenizer.fit_on_texts(list(y_train))

# # Convert text sequences to integer sequences 
# y_tr_seq = y_tokenizer.texts_to_sequences(y_train) 
# y_val_seq = y_tokenizer.texts_to_sequences(y_val) 

# # Pad zero upto maximum length
# y_tr = pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
# y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# # Size of vocabulary (+1 for padding token)
# y_voc = y_tokenizer.num_words + 1

# print("Size of vocabulary in Y = {}".format(y_voc))

In [29]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [30]:
latent_dim = 300
embedding_dim = 200

# Encoder
encoder_inputs = Input(shape=(max_text_len, ))

# Embedding layer
enc_emb = Embedding(x_voc, embedding_dim, trainable=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)

# Encoder LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)

# Encoder LSTM 3
encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)

# Set up the decoder, using encoder_states as the initial state
decoder_inputs = Input(shape=(None, ))

# Embedding layer
dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Dense layer
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


NameError: name 'x_voc' is not defined

In [106]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [130]:
# y_train
y_train = y_train[:, -1].reshape(y_train.shape[0], y_train.shape[1], 1)
y_train
# history = model.fit(
#     [x_train, y_train[:, :-1]],
#     y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
#     epochs=50,
#     callbacks=[es],
#     batch_size=128,
#     validation_data=([x_val, y_val[:, :-1]],
#                      y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:
#                      , 1:]),
#     )

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
# from matplotlib import pyplot

# pyplot.plot(history.history['loss'], label='train')
# pyplot.plot(history.history['val_loss'], label='test')
# pyplot.legend()
# pyplot.show()