# Text Summarization || Deep Learning || NLP

# Import Libraries

In [1]:
#import all the required libraries
import numpy as np
import pandas as pd
import pickle
from statistics import mode
import nltk
from nltk import word_tokenize
from nltk.stem import LancasterStemmer
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from tensorflow.keras.models import Model
from tensorflow.keras import models
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,Attention
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to /Users/buddha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/buddha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/buddha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Parse the Data

In [2]:
#read the dataset file for text Summarizer
df=pd.read_csv("data/Reviews.csv",nrows=100000)
#drop the duplicate and na values from the records
df.drop_duplicates(subset=['Text'],inplace=True)
df.dropna(axis=0,inplace=True)
input_data = df.loc[:,'Text']
target_data = df.loc[:,'Summary']
target_data.replace('', np.nan, inplace=True)

# Preprocessing

In [3]:
input_texts=[]
target_texts=[]
input_words=[]
target_words=[]
contractions=pickle.load(open("contractions.pkl","rb"))['contractions']
#initialize stop words and LancasterStemmer
stop_words=set(stopwords.words('english'))
stemm=LancasterStemmer()

In [4]:
input_texts

[]

# Data Cleaning

In [5]:
def clean(texts,src):
  #remove the html tags
  texts = BeautifulSoup(texts, "lxml").text
  #tokenize the text into words 
  words=word_tokenize(texts.lower())
  #filter words which contains \ 
  #integers or their length is less than or equal to 3
  words= list(filter(lambda w:(w.isalpha() and len(w)>=3),words))
  #contraction file to expand shortened words
  words= [contractions[w] if w in contractions else w for w in words ]
  #stem the words to their root word and filter stop words
  if src=="inputs":
    words= [stemm.stem(w) for w in words if w not in stop_words]
  else:
    words= [w for w in words if w not in stop_words]
  return words

In [6]:
#pass the input records and taret records
for in_txt,tr_txt in zip(input_data,target_data):
  in_words= clean(in_txt,"inputs")
  input_texts+= [' '.join(in_words)]
  input_words+= in_words
  #add 'sos' at start and 'eos' at end of text
  tr_words= clean("sos "+tr_txt+" eos","target")
  target_texts+= [' '.join(tr_words)]
  target_words+= tr_words

In [7]:
#store only unique words from input and target list of words
input_words = sorted(list(set(input_words)))
target_words = sorted(list(set(target_words)))
num_in_words = len(input_words) #total number of input words
num_tr_words = len(target_words) #total number of target words
 
#get the length of the input and target texts which appears most often  
max_in_len = mode([len(i) for i in input_texts])
max_tr_len = mode([len(i) for i in target_texts])
 
print("number of input words : ",num_in_words)
print("number of target words : ",num_tr_words)
print("maximum input length : ",max_in_len)
print("maximum target length : ",max_tr_len)

number of input words :  32198
number of target words :  14171
maximum input length :  74
maximum target length :  17


# Split it

In [8]:
#split the input and target text into 80:20 ratio or testing size of 20%.
x_train,x_test,y_train,y_test=train_test_split(input_texts,target_texts,test_size=0.2,random_state=0) 

In [9]:
#train the tokenizer with all the words
in_tokenizer = Tokenizer()
in_tokenizer.fit_on_texts(x_train)
tr_tokenizer = Tokenizer()
tr_tokenizer.fit_on_texts(y_train)
 
#convert text into sequence of integers
#where the integer will be the index of that word
x_train= in_tokenizer.texts_to_sequences(x_train) 
y_train= tr_tokenizer.texts_to_sequences(y_train)

In [10]:
#pad array of 0's if the length is less than the maximum length 
en_in_data= pad_sequences(x_train,  maxlen=max_in_len, padding='post') 
dec_data= pad_sequences(y_train,  maxlen=max_tr_len, padding='post')
 
#decoder input data will not include the last word 
#i.e. 'eos' in decoder input data
dec_in_data = dec_data[:,:-1]
#decoder target data will be one time step ahead as it will not include
# the first word i.e 'sos'
dec_tr_data = dec_data.reshape(len(dec_data),max_tr_len,1)[:,1:]

# Model Building

In [11]:
K.clear_session() 
latent_dim = 500
 
#create input object of total number of encoder words
en_inputs = Input(shape=(max_in_len,)) 
en_embedding = Embedding(num_in_words+1, latent_dim)(en_inputs) 

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2021-12-20 21:20:36.585684: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-20 21:20:36.592648: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-20 21:20:36.593660: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
#create 3 stacked LSTM layer with the shape of hidden dimension for text summarizer using deep learning
#LSTM 1
en_lstm1= LSTM(latent_dim, return_state=True, return_sequences=True) 
en_outputs1, state_h1, state_c1= en_lstm1(en_embedding) 
 
#LSTM2
en_lstm2= LSTM(latent_dim, return_state=True, return_sequences=True) 
en_outputs2, state_h2, state_c2= en_lstm2(en_outputs1) 
 
#LSTM3
en_lstm3= LSTM(latent_dim,return_sequences=True,return_state=True)
en_outputs3 , state_h3 , state_c3= en_lstm3(en_outputs2)
 
#encoder states
en_states= [state_h3, state_c3]

In [13]:
#create 3 stacked LSTM layer with the shape of hidden dimension for text summarizer using deep learning
#LSTM 1
en_lstm1= LSTM(latent_dim, return_state=True, return_sequences=True) 
en_outputs1, state_h1, state_c1= en_lstm1(en_embedding) 
 
#LSTM2
en_lstm2= LSTM(latent_dim, return_state=True, return_sequences=True) 
en_outputs2, state_h2, state_c2= en_lstm2(en_outputs1) 
 
#LSTM3
en_lstm3= LSTM(latent_dim,return_sequences=True,return_state=True)
en_outputs3 , state_h3 , state_c3= en_lstm3(en_outputs2)
 
#encoder states
en_states= [state_h3, state_c3]

# Decoder

In [15]:
# Decoder. 
dec_inputs = Input(shape=(None,)) 
dec_emb_layer = Embedding(num_tr_words+1, latent_dim) 
dec_embedding = dec_emb_layer(dec_inputs) 
 
#initialize decoder's LSTM layer with the output states of encoder
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_outputs, *_ = dec_lstm(dec_embedding,initial_state=en_states) 

# Attention Layer

In [16]:
#Attention layer
attention =Attention()
attn_out = attention([dec_outputs,en_outputs3])
 
#Concatenate the attention output with the decoder outputs
merge=Concatenate(axis=-1, name='concat_layer1')([dec_outputs,attn_out])

In [17]:
#Dense layer (output layer)
dec_dense = Dense(num_tr_words+1, activation='softmax') 
dec_outputs = dec_dense(merge) 

# Train the Model

In [21]:
#Model class and model summary for text Summarizer
model = Model([en_inputs, dec_inputs], dec_outputs) 
model.summary()
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 74)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 74, 500)      16099500    input_1[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, 74, 500), (N 2002000     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
____________________________________________________________________________________________

In [22]:
model.compile( 
    optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"] ) 
model.fit( 
    [en_in_data, dec_in_data],
    dec_tr_data, 
    batch_size=512, 
    epochs=10, 
    validation_split=0.1,
    )
 
#Save model
model.save("s2s")

Epoch 1/10


InvalidArgumentError: Cannot assign a device for operation model_2/embedding/embedding_lookup: Could not satisfy explicit device specification '/job:localhost/replica:0/task:0/device:GPU:0' because no supported kernel for GPU devices is available.
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=2 requested_device_name_='/job:localhost/replica:0/task:0/device:GPU:0' assigned_device_name_='/job:localhost/replica:0/task:0/device:GPU:0' resource_device_name_='/job:localhost/replica:0/task:0/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
RealDiv: GPU CPU 
Sqrt: GPU CPU 
GatherV2: GPU CPU 
AssignVariableOp: GPU CPU 
UnsortedSegmentSum: GPU CPU 
Identity: GPU CPU 
StridedSlice: CPU 
Const: GPU CPU 
NoOp: GPU CPU 
Mul: GPU CPU 
Shape: GPU CPU 
_Arg: GPU CPU 
ResourceScatterAdd: GPU CPU 
Unique: CPU 
ReadVariableOp: GPU CPU 
AddV2: GPU CPU 
ResourceGather: GPU CPU 

Colocation members, user-requested devices, and framework assigned devices, if any:
  model_2_embedding_embedding_lookup_17013 (_Arg)  framework assigned device=/job:localhost/replica:0/task:0/device:GPU:0
  rmsprop_rmsprop_update_readvariableop_resource (_Arg)  framework assigned device=/job:localhost/replica:0/task:0/device:GPU:0
  model_2/embedding/embedding_lookup (ResourceGather) /job:localhost/replica:0/task:0/device:GPU:0
  model_2/embedding/embedding_lookup/Identity (Identity) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/Unique (Unique) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/Shape (Shape) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/strided_slice/stack (Const) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/strided_slice/stack_1 (Const) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/strided_slice/stack_2 (Const) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/strided_slice (StridedSlice) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/UnsortedSegmentSum (UnsortedSegmentSum) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/mul (Mul) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/mul_1 (Mul) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/ReadVariableOp (ReadVariableOp) 
  RMSprop/RMSprop/update/mul_2 (Mul) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/AssignVariableOp (AssignVariableOp) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/ResourceScatterAdd (ResourceScatterAdd) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/ReadVariableOp_1 (ReadVariableOp) 
  RMSprop/RMSprop/update/GatherV2/axis (Const) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/GatherV2 (GatherV2) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/mul_3 (Mul) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/Sqrt (Sqrt) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/add (AddV2) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/truediv (RealDiv) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/ResourceScatterAdd_1 (ResourceScatterAdd) /job:localhost/replica:0/task:0/device:GPU:0
  RMSprop/RMSprop/update/ReadVariableOp_2 (ReadVariableOp) 
  RMSprop/RMSprop/update/group_deps (NoOp) /job:localhost/replica:0/task:0/device:GPU:0

Op: ResourceGather
Node attrs: _class=["loc:@model_2/embedding/embedding_lookup/17013"], batch_dims=0, Tindices=DT_INT32, dtype=DT_FLOAT, validate_indices=true
Registered kernels:
  device='XLA_CPU_JIT'; Tindices in [DT_INT32, DT_INT64]; dtype in [DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, DT_INT8, DT_COMPLEX64, DT_INT64, DT_BOOL, DT_QINT8, DT_QUINT8, DT_QINT32, DT_BFLOAT16, DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64]
  device='GPU'; dtype in [DT_INT64]; Tindices in [DT_INT32]
  device='GPU'; dtype in [DT_INT64]; Tindices in [DT_INT64]
  device='GPU'; dtype in [DT_FLOAT]; Tindices in [DT_INT32]
  device='GPU'; dtype in [DT_FLOAT]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_UINT64]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_UINT64]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_INT64]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_INT64]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_UINT32]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_UINT32]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_UINT16]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_UINT16]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_INT16]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_INT16]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_UINT8]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_UINT8]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_INT8]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_INT8]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_INT32]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_INT32]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_HALF]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_HALF]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_BFLOAT16]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_BFLOAT16]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_FLOAT]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_FLOAT]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_DOUBLE]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_DOUBLE]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_COMPLEX64]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_COMPLEX64]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_COMPLEX128]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_COMPLEX128]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_BOOL]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_BOOL]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_STRING]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_STRING]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_RESOURCE]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_RESOURCE]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_VARIANT]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_VARIANT]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_QINT8]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_QINT8]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_QUINT8]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_QUINT8]; Tindices in [DT_INT64]
  device='CPU'; dtype in [DT_QINT32]; Tindices in [DT_INT32]
  device='CPU'; dtype in [DT_QINT32]; Tindices in [DT_INT64]

	 [[{{node model_2/embedding/embedding_lookup}}]] [Op:__inference_train_function_21402]