# Recipe 6-1. Retrieving Information
Information retrieval is one of the highly used applications of NLP and it is
quite tricky. The meaning of the words or sentences not only depends on
the exact words used but also on the context and meaning. Two sentences
may be of completely different words but can convey the same meaning.
We should be able to capture that as well.
An information retrieval (IR) system allows users to efficiently
search documents and retrieve meaningful information based on a
search text/query.

## Problem

Information retrieval using word embeddings.

# Step 1-1 Import the libraries
Here are the libraries:

In [2]:
import gensim
from gensim.models import Word2Vec
import numpy as np 
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy 
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re

nltk.download('stopwords')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')  
print(stopword_list) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

# Step 1-2 Create/import documents
Randomly taking sentences from the internet:

In [3]:
# Randomly taking sentences from internet 

Doc1 = ["With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders." ] 
     
Doc2 = ["Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data."]

Doc3 = ["He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems."]

Doc4 = ["But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg."]

# Put all the documents in one list

fin= Doc1+Doc2+Doc3+Doc4

print(fin)


['With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.', 'Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.', 'He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems.', 'But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.']


# Step 1-3 Download word2vec
As mentioned earlier, we are going to use the word embeddings to solve
this problem. Download word2vec from the below link:

In [4]:
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
#!wget -P /root/input/ -c "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit/GoogleNews-vectors-negative300.bin.gz"

#load the model
from gensim.models.word2vec import Word2Vec
model = gensim.models.KeyedVectors.load_word2vec_format('/root/input/GoogleNews-vectors-negative300.bin.gz', binary=True)
#model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary = True);


--2020-08-26 19:25:41--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.32.198
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.32.198|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/root/input/GoogleNews-vectors-negative300.bin.gz’


2020-08-26 19:26:28 (34.2 MB/s) - ‘/root/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Step 1-4 Create IR [information retrieval] system
Now we build the information retrieval system:

In [5]:
#Preprocessing 

def remove_stopwords(text, is_lower_case=False):
    pattern = r'[^a-zA-z0-9\s]' 
    text = re.sub(pattern, '', ''.join(text))
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

# Function to get the embedding vector for n dimension, we have used "300"

def get_embedding(word):
    if word in model.wv.vocab:
        #return model[x]
        return model[word]
    else:
        return np.zeros(300)


For every document, we will get a lot of vectors based on the number of
words present. We need to calculate the average vector for the document
through taking a mean of all the word vectors.

In [7]:
nltk.download('punkt')

# Getting average vector for each document 
out_dict =  {}
for sen in fin:
    average_vector = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(remove_stopwords(sen))]), axis=0))
    dict = { sen : (average_vector) }
    out_dict.update(dict)

# Function to calculate the similarity between the query vector and document vector

def get_sim(query_embedding, average_vector_doc):
    sim = [(1 - scipy.spatial.distance.cosine(query_embedding, average_vector_doc))]
    return sim

# Rank all the documents based on the similarity to get relevant docs

def Ranked_documents(query):
    query_words =  (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(query.lower())],dtype=float), axis=0))
    rank = []
    for k,v in out_dict.items():
        rank.append((k, get_sim(query_words, v)))
    rank = sorted(rank,key=lambda t: t[1], reverse=True)
    print('Ranked Documents :')
    return rank


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




# Step 1-5 Results and applications
Let’s see how the information retrieval system we built is working with a
couple of examples.

In [8]:
# Call the IR function with a query
# If you see, doc4 (on top in result), this will be most relevant for the
# query “cricket” even though the word “cricket” is not even mentioned once
# with the similarity of 0.449.

Ranked_documents("cricket")


Ranked Documents :




[('But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.',
  [0.44954328830341783]),
 ('He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems.',
  [0.23973446930269127]),
 ('With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
  [0.1832371201201335]),
 ('Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.',
  [0.179950

In [9]:
#Let’s take one more example as may be driving.
# Again, since driving is connected to transport and the Motor Vehicles
# Act, it pulls out the most relevant documents on top. The first 2 documents
# are relevant to the query. 

Ranked_documents("driving")

Ranked Documents :




[('With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
  [0.3594728772380067]),
 ('But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.',
  [0.19042557661139026]),
 ('He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems.',
  [0.1706653724240128]),
 ('Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.',
  [0.0887230

# Recipe 6-2. Classifying Text with Deep Learning

# Problem
We want to build a text classification model using CNN, RNN, and LSTM.

# Step 2-1 Understanding/defining business problem
Email classification (spam or ham). We need to classify spam or ham email
based on email content.

# Step 2-2 Load data sources

In [26]:
# download spam.csv
import pandas as pd
import requests
import io
# csv_url = "https://github.com/alberwan/Test-Data/blob/master/spam.csv"
# csv_url = "https://drive.google.com/drive/folders/1ZOPzP0Id8B8d3xP4BMgLGoYLlPBRRKEs/spam.csv"
# csv_url = "https://www.kaggle.com/uciml/sms-spam-collection-dataset#spam.csv"
# csv_url = "spam.csv"

# csv_url = "https://www.kaggle.com/ishansoni/sms-spam-collection-dataset?select=spam.csv"
s = requests.get(csv_url).content
# file_content = pd.read_csv(io.StringIO(s.decode('utf-8')))
# file_content = pd.read_csv(csv_url, encoding="ISO-8859-1")
# file_content = pd.read_csv(csv_url)
file_content = pd.read_csv('/content/sample_data/spam.csv')
file_content.head()

Unnamed: 0,label,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Step 2-3 Text preprocessing

In [27]:
#check sample content in the email
file_content['text'][1]

#Import library
from nltk.corpus import stopwords
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Remove stop words
stop = stopwords.words('english')
# file_content['v2'] = file_content['v2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
file_content['text'] = file_content['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
# Delete unwanted columns
Email_Data = file_content[['label', 'text']]

# Rename column names
Email_Data = Email_Data.rename(columns={"label":"Target", "text":"Email"})
Email_Data.head()

#Delete punctuations, convert text in lower case and delete the double space 

Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', x.lower()))
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub(' ', ' ', x))
Email_Data['Email'].head(5)


0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4             nah i think goes usf lives around though
Name: Email, dtype: object

In [28]:
#Separating text(input) and target classes

list_sentences_rawdata = Email_Data["Email"].fillna("_na_").values
list_classes = ["Target"]
target = Email_Data[list_classes].values


To_Process=Email_Data[['Email', 'Target']]


# Step 2-4 Data preparation for model building

In [38]:
#Train and test split with 80:20 ratio
train, test = train_test_split(To_Process, test_size=0.2) 

# Define the sequence lengths, max number of words and embedding dimensions
# Sequence length of each sentence. If more, truncate. If less, pad with zeros

MAX_SEQUENCE_LENGTH = 300 

# Top 20000 frequently occurring words
MAX_NB_WORDS = 20000 
 
# Get the frequently occurring words
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 
tokenizer.fit_on_texts(train.Email) 
train_sequences = tokenizer.texts_to_sequences(train.Email)
test_sequences = tokenizer.texts_to_sequences(test.Email)

# dictionary containing words and their index
word_index = tokenizer.word_index 
# print(tokenizer.word_index) 
# total words in the corpus
print('Found %s unique tokens.' % len(word_index)) 

# get only the top frequent words on train
import tensorflow as tf
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH) 

# get only the top frequent words on test
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH) 

print(train_data.shape)
print(test_data.shape)


Found 8451 unique tokens.
(4457, 300)
(1115, 300)


In [39]:
train_labels = train['Target']
test_labels = test['Target']

#import library

from sklearn.preprocessing import LabelEncoder
# converts the character array to numeric array. Assigns levels to unique labels.

le = LabelEncoder() 
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))


['ham' 'spam']
(array([0, 1]), array([3862,  595]))
(array([0, 1]), array([963, 152]))


In [41]:
# changing data types
from keras.utils import to_categorical
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)


Shape of data tensor: (4457, 300)
Shape of label tensor: (4457, 2)
Shape of label tensor: (1115, 2)
300


# Step 2-5 Model building and predicting
We are building the models using different deep learning approaches
like CNN, RNN, LSTM, and Bidirectional LSTM and comparing the
performance of each model using different accuracy metrics.

We can now define our CNN model.

Here we define a single hidden layer with 128 memory units. The
network uses a dropout with a probability of 0.5. The output layer is a
dense layer using the softmax activation function to output a probability
prediction.

In [42]:
# Import Libraries 
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential



In [43]:
print('Training CNN 1D model.')

model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))


model.compile(loss='categorical_crossentropy',
 optimizer='rmsprop',
 metrics=['acc'])

model.fit(train_data, labels_train,
 batch_size=64,
 epochs=5,
 validation_data=(test_data, labels_test))


Training CNN 1D model.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5b47ce0ac8>

In [44]:
#predictions on test data

predicted=model.predict(test_data)
predicted

#model evaluation

import sklearn
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

print(sklearn.metrics.classification_report(labels_test, predicted.round()))


precision: [0.86678668 1.        ]
recall: [1.         0.02631579]
fscore: [0.92864031 0.05128205]
support: [963 152]
############################
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       963
           1       1.00      0.03      0.05       152

   micro avg       0.87      0.87      0.87      1115
   macro avg       0.93      0.51      0.49      1115
weighted avg       0.88      0.87      0.81      1115
 samples avg       0.87      0.87      0.87      1115



In [45]:
#Now define RNN model
#import library
from keras.layers.recurrent import SimpleRNN

#model training

print('Training SIMPLERNN model.')

model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
model.add(SimpleRNN(2, input_shape=(None,1)))

model.add(Dense(2,activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

model.fit(train_data, labels_train,
 batch_size=16,
 epochs=5,
 validation_data=(test_data, labels_test))


Training SIMPLERNN model.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5bb04a7518>

In [47]:
# prediction on test data
predicted_Srnn=model.predict(test_data)
predicted_Srnn

array([[0.98101735, 0.01898264],
       [0.9854204 , 0.01457962],
       [0.64395374, 0.35604626],
       ...,
       [0.99076146, 0.00923848],
       [0.639665  , 0.36033493],
       [0.95951307, 0.04048699]], dtype=float32)

In [48]:
#model evaluation

from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted_Srnn.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

print(sklearn.metrics.classification_report(labels_test, predicted_Srnn.round()))


precision: [0.93522267 0.69291339]
recall: [0.95950156 0.57894737]
fscore: [0.94720656 0.63082437]
support: [963 152]
############################
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       963
           1       0.69      0.58      0.63       152

   micro avg       0.91      0.91      0.91      1115
   macro avg       0.81      0.77      0.79      1115
weighted avg       0.90      0.91      0.90      1115
 samples avg       0.91      0.91      0.91      1115



# Below is LSTM (Long Short-Term Memory) model

In [51]:
#model training

print('Training LSTM model.')

model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
# model.add(LSTM(output_dim=16, activation='relu', inner_activation='hard_sigmoid',return_sequences=True))
model.add(LSTM(16, activation='relu', recurrent_activation='hard_sigmoid',return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Flatten()) 

model.add(Dense(2,activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

model.fit(train_data, labels_train,
 batch_size=16,
 epochs=5,
 validation_data=(test_data, labels_test))


Training LSTM model.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5baf4694e0>

In [52]:
#prediction on text data
predicted_lstm=model.predict(test_data)
predicted_lstm

array([[9.3131465e-01, 6.8685375e-02],
       [9.9458694e-01, 5.4129958e-03],
       [9.8875856e-01, 1.1241476e-02],
       ...,
       [9.9999952e-01, 4.7185071e-07],
       [9.9999356e-01, 6.4308206e-06],
       [1.0000000e+00, 3.1326636e-08]], dtype=float32)

In [53]:
#model evaluation 

from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted_lstm.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

print(sklearn.metrics.classification_report(labels_test, predicted_lstm.round()))


precision: [0.98564103 0.98571429]
recall: [0.99792316 0.90789474]
fscore: [0.99174407 0.94520548]
support: [963 152]
############################
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       963
           1       0.99      0.91      0.95       152

   micro avg       0.99      0.99      0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115
 samples avg       0.99      0.99      0.99      1115



# Finally, let’s see what is Bidirectional LSTM and implement the same.
As we know, LSTM preserves information from inputs using the
hidden state. In bidirectional LSTMs, inputs are fed in two ways: one
from previous to future and the other going backward from future to
past, helping in learning future representation as well. Bidirectional
LSTMs are known for producing very good results as they are capable of
understanding the context better.

In [54]:
#model training

print('Training Bidirectional LSTM model.')

model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(16, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))

model.add(Dense(2,activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

model.fit(train_data, labels_train,
 batch_size=16,
 epochs=3,
 validation_data=(test_data, labels_test))


Training Bidirectional LSTM model.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5ba1d956a0>

In [55]:
# prediction on test data

predicted_blstm=model.predict(test_data)
predicted_blstm

array([[9.8216712e-01, 1.7832812e-02],
       [9.9993086e-01, 6.9191941e-05],
       [9.9416107e-01, 5.8389390e-03],
       ...,
       [9.9994600e-01, 5.3950647e-05],
       [9.9996638e-01, 3.3666583e-05],
       [9.9999869e-01, 1.3665087e-06]], dtype=float32)

In [56]:
#model evaluation
#We will see that Bidirectional LSTM outperforms the rest of the algorithms.

from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted_blstm.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

print(sklearn.metrics.classification_report(labels_test, predicted_blstm.round()))


precision: [0.9825998  0.97826087]
recall: [0.99688474 0.88815789]
fscore: [0.98969072 0.93103448]
support: [963 152]
############################
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       963
           1       0.98      0.89      0.93       152

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115
 samples avg       0.98      0.98      0.98      1115



Recipe 6-3. Next word/sequence of words suggestion – Next word prediction

In [None]:
file_content = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

# Just selecting emails and connverting it into list
Email_Data = file_content[[ 'v2']]

list_data = Email_Data.values.tolist()
list_data 


In [None]:
import numpy as np
import random
import pandas as pd
import sys
import os
import time
import codecs
import collections
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy 
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer = ToktokTokenizer()


In [None]:
#Converting list to string
from collections import Iterable


def flatten(items):
    """Yield items from any nested iterable"""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten(x):
                yield sub_x
        else:
            yield x


TextData=list(flatten(list_data))  
TextData = ''.join(TextData) 

# Remove unwanted lines and converting into lower case
TextData = TextData.replace('\n','')
TextData = TextData.lower() 

pattern = r'[^a-zA-z0-9\s]' 
TextData = re.sub(pattern, '', ''.join(TextData)) 

# Tokenizing

tokens = tokenizer.tokenize(TextData)
tokens = [token.strip() for token in tokens] 

# get the distinct words and sort it

word_counts = collections.Counter(tokens)
word_c = len(word_counts)
print(word_c)

distinct_words = [x[0] for x in word_counts.most_common()]
distinct_words_sorted = list(sorted(distinct_words)) 


# Generate indexing for all words

word_index = {x: i for i, x in enumerate(distinct_words_sorted)} 


# decide on sentence lenght

sentence_length = 25


In [None]:
#prepare the dataset of input to output pairs encoded as integers
# Generate the data for the model

#input = the input sentence to the model with index 
#output = output of the model with index

InputData = []
OutputData = []

for i in range(0, word_c - sentence_length, 1):
    X = tokens[i:i + sentence_length]
    Y = tokens[i + sentence_length]
    InputData.append([word_index[char] for char in X])
    OutputData.append(word_index[Y])

print (InputData[:1])
print ("\n")
print(OutputData[:1]) 


In [None]:
# Generate  X 
X = numpy.reshape(InputData, (len(InputData), sentence_length, 1))


# One hot encode the output variable
Y = np_utils.to_categorical(OutputData) 

Y


In [None]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

 
#define the checkpoint
file_name_path="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(file_name_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint] 

#fit the model
model.fit(X, Y, epochs=5, batch_size=128, callbacks=callbacks) 


In [None]:
# load the network weights
file_name = "weights-improvement-05-6.8213.hdf5"
model.load_weights(file_name)
model.compile(loss='categorical_crossentropy', optimizer='adam') 


In [None]:
# Generating random sequence
start = numpy.random.randint(0, len(InputData))
input_sent = InputData[start]

# Generate index of the next word of the email 

X = numpy.reshape(input_sent, (1, len(input_sent), 1))
predict_word = model.predict(X, verbose=0)
index = numpy.argmax(predict_word)

print(input_sent)
print ("\n")
print(index)


In [None]:
# Convert these indexes back to words

word_index_rev = dict((i, c) for i, c in enumerate(tokens))
result = word_index_rev[index]
sent_in = [word_index_rev[value] for value in input_sent]

print(sent_in)
print ("\n")
print(result)
