In [22]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [23]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data

data = load_doc('./new_intents.json')
data

{'intents': [{'tag': 'start_conversation',
   'patterns': ['Hi there', 'Is anyone there?', 'Hey', 'Hola', 'Hello', 'Hi'],
   'responses': ['Hello',
    'Happy to have you here',
    'Good to see you again',
    'Hi there, how can I help?'],
   'context': ['']},
  {'tag': 'what_are_you',
   'patterns': ['What is your name?',
    'what are you?',
    'who are you?',
    'your name pls?'],
   'responses': ["Hi, I'm  Bowhead Bot",
    "I'm Bowhead Bot ",
    'Call me Bowhead Bot'],
   'context': ['']},
  {'tag': 'end_conversation',
   'patterns': ['Bye',
    'See you later',
    'Good bye',
    'Goodbye',
    'Nice chatting with you, bye'],
   'responses': ['Have a lovely Day!', 'Bye', 'Happy to help you'],
   'context': ['']},
  {'tag': 'thanks',
   'patterns': ['Thanks',
    'Thank you',
    'That is helpful',
    'Awesome, thanks',
    'Thanks for helping me'],
   'responses': ['Happy to help!', 'Any time!'],
   'context': ['']},
  {'tag': 'not_satisfied',
   'patterns': ['That is not h

In [25]:
def frame_data(feat_1,feat_2,is_pattern):
    is_pattern = is_pattern
    df = pd.DataFrame(columns=[feat_1,feat_2])
    for intent in data['intents']:
        if is_pattern:
            for pattern in intent['patterns']:
                w = pattern
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
        else:
            for response in intent['responses']:
                w = response
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
    return df

In [26]:
questions_df = frame_data('questions','labels',True)
questions_df

Unnamed: 0,questions,labels
0,Hi there,start_conversation
1,Is anyone there?,start_conversation
2,Hey,start_conversation
3,Hola,start_conversation
4,Hello,start_conversation
5,Hi,start_conversation
6,What is your name?,what_are_you
7,what are you?,what_are_you
8,who are you?,what_are_you
9,your name pls?,what_are_you


In [27]:
responses_df = frame_data('responses','labels',False)
responses_df

Unnamed: 0,responses,labels
0,Hello,start_conversation
1,Happy to have you here,start_conversation
2,Good to see you again,start_conversation
3,"Hi there, how can I help?",start_conversation
4,"Hi, I'm Bowhead Bot",what_are_you
5,I'm Bowhead Bot,what_are_you
6,Call me Bowhead Bot,what_are_you
7,Have a lovely Day!,end_conversation
8,Bye,end_conversation
9,Happy to help you,end_conversation


# For creating tokens and vocab of training data;this vocab is later used when testing new user input(testing the bot)

In [28]:
lemmatizer = WordNetLemmatizer()

vocab = Counter()
labels = []
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords = []
    for entry in df[feature]:
        tokens = tokenizer(entry)
        joblib.dump(tokens,'tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return 

def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens = tokenizer(entry)   
        vocab.update(tokens)
    joblib.dump(vocab,'vocab.pkl')
    return 


create_vocab(tokenizer,questions_df,'questions')
remove_stop_words(tokenizer,questions_df,'questions')

# Creating training and test data and storing the indexes

In [29]:
# Creating a test list containing only the first question of each group after grouping them according to their labels.
test_list = list(questions_df.groupby(by='labels',as_index=False).first()['questions'])
test_list

# The index of the above test observations taken from the dataframe df1
test_index = []
for i,_ in enumerate(test_list):
    idx = questions_df[questions_df.questions == test_list[i]].index[0]
    test_index.append(idx)
test_index

# Train index
train_index = [i for i in questions_df.index if i not in test_index]
train_index

['bye',
 'that is not helpful',
 'how can you help me',
 'what service do you provide',
 'hi there',
 'can you help me find survey',
 'thanks',
 'what can you tell me about covid trial',
 'what is your name']

[10, 20, 27, 29, 0, 47, 15, 35, 6]

[1,
 2,
 3,
 4,
 5,
 7,
 8,
 9,
 11,
 12,
 13,
 14,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 28,
 30,
 31,
 32,
 33,
 34,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56]

# Initializing a tokenizer, fitting it on to questions df and encoding the questions_df

In [30]:
def encoder(df,feature):
#     text = ' '.join(list(vocab.keys()))
#     from tensorflow.keras.preprocessing.text import text_to_word_sequence
    t = Tokenizer()
    entries = [entry for entry in df[feature]]
    print(entries)
    #step1 To create dictionary having 'keys' as words from each entry(sentences) and 'values' are randomly assigned by tokenizer.
    t.fit_on_texts(entries)
    print(t.word_index)
    joblib.dump(t,'tokenizer_t.pkl')
    vocab_size = len(t.word_index) + 1
    print(vocab_size)
    entries = [entry for entry in df[feature]]
    max_length = max([len(s.split()) for s in entries])
    print(max_length)
    #setp2 To see the output of the dataframe when dictionary 'values' from the above created dictionary in step1 are used in the place of the words in the sentences
    encoded = t.texts_to_sequences(entries)
    print(encoded)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, vocab_size

In [31]:
X,vocab_size = encoder(questions_df,'questions')
X
questions_df_encoded = pd.DataFrame(X)
questions_df_encoded['labels'] = questions_df.labels
questions_df_encoded

['hi there', 'is anyone there', 'hey', 'hola', 'hello', 'hi', 'what is your name', 'what are you', 'who are you', 'your name pls', 'bye', 'see you later', 'good bye', 'goodbye', 'nice chatting with you bye', 'thanks', 'thank you', 'that is helpful', 'awesome thanks', 'thanks for helping me', 'that is not helpful', 'you dont understand', 'that is not what want', 'it is not what asked for', 'you are bad', 'you do not seem to be of much help', 'you are not helpful', 'how can you help me', 'what can you do', 'what service do you provide', 'which service', 'what service', 'what other service do you offer', 'tell me few service', 'how can you help other than trial', 'what can you tell me about covid trial', 'need some information about covid trial', 'covid trial information needed', 'want to know about covid trial', 'show me covid trial info', 'want to find hepatitis trial', 'find heart trial detail', 'find medical trial info', 'diabetes trial', 'find study for breast trial', 'want to find s

array([[25, 26,  0,  0,  0,  0,  0,  0,  0],
       [ 8, 38, 26,  0,  0,  0,  0,  0,  0],
       [39,  0,  0,  0,  0,  0,  0,  0,  0],
       [40,  0,  0,  0,  0,  0,  0,  0,  0],
       [41,  0,  0,  0,  0,  0,  0,  0,  0],
       [25,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  8, 27, 28,  0,  0,  0,  0,  0],
       [ 5, 16,  1,  0,  0,  0,  0,  0,  0],
       [42, 16,  1,  0,  0,  0,  0,  0,  0],
       [27, 28, 43,  0,  0,  0,  0,  0,  0],
       [19,  0,  0,  0,  0,  0,  0,  0,  0],
       [44,  1, 45,  0,  0,  0,  0,  0,  0],
       [46, 19,  0,  0,  0,  0,  0,  0,  0],
       [47,  0,  0,  0,  0,  0,  0,  0,  0],
       [48, 49, 50,  1, 19,  0,  0,  0,  0],
       [20,  0,  0,  0,  0,  0,  0,  0,  0],
       [51,  1,  0,  0,  0,  0,  0,  0,  0],
       [21,  8, 22,  0,  0,  0,  0,  0,  0],
       [52, 20,  0,  0,  0,  0,  0,  0,  0],
       [20, 17, 53,  3,  0,  0,  0,  0,  0],
       [21,  8, 10, 22,  0,  0,  0,  0,  0],
       [ 1, 54, 55,  0,  0,  0,  0,  0,  0],
       [21

Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
0,25,26,0,0,0,0,0,0,0,start_conversation
1,8,38,26,0,0,0,0,0,0,start_conversation
2,39,0,0,0,0,0,0,0,0,start_conversation
3,40,0,0,0,0,0,0,0,0,start_conversation
4,41,0,0,0,0,0,0,0,0,start_conversation
5,25,0,0,0,0,0,0,0,0,start_conversation
6,5,8,27,28,0,0,0,0,0,what_are_you
7,5,16,1,0,0,0,0,0,0,what_are_you
8,42,16,1,0,0,0,0,0,0,what_are_you
9,27,28,43,0,0,0,0,0,0,what_are_you


In [32]:
#Creating 2 observations with 'confused' label and appending them to dataframe.
#max_length should be used to create to observations with arrays of max_length size
for i in range(0,2):
    dt = [0]*9
    dt.append('confused')
    dt = [dt]
    pd.DataFrame(dt).rename(columns = {9:'labels'})
    questions_df_encoded = questions_df_encoded.append(pd.DataFrame(dt).rename(columns = {9:'labels'}),ignore_index=True)
    
questions_df_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
0,0,0,0,0,0,0,0,0,0,confused


Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
0,0,0,0,0,0,0,0,0,0,confused


Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
0,25,26,0,0,0,0,0,0,0,start_conversation
1,8,38,26,0,0,0,0,0,0,start_conversation
2,39,0,0,0,0,0,0,0,0,start_conversation
3,40,0,0,0,0,0,0,0,0,start_conversation
4,41,0,0,0,0,0,0,0,0,start_conversation
5,25,0,0,0,0,0,0,0,0,start_conversation
6,5,8,27,28,0,0,0,0,0,what_are_you
7,5,16,1,0,0,0,0,0,0,what_are_you
8,42,16,1,0,0,0,0,0,0,what_are_you
9,27,28,43,0,0,0,0,0,0,what_are_you


In [33]:
# Appending one of the confused labeled observation to train index by appending the correspoding index number
train_index.append(57)
# Appending one of the confused labeled observation to test index
test_index.append(58)

# Encoding the labels of questions_df_encoded and then later responses_df 

In [34]:
# Encoding the 'labels' with values choosen(random) by labelEncoder 
from sklearn.preprocessing import LabelEncoder
lable_enc = LabelEncoder()

labl = lable_enc.fit_transform(questions_df_encoded.labels)
labl

mapper = {}
for index,key in enumerate(questions_df_encoded.labels):
    if key not in mapper.keys():
        mapper[key] = labl[index]
mapper

array([5, 5, 5, 5, 5, 5, 9, 9, 9, 9, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 2, 2,
       2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0])

{'start_conversation': 5,
 'what_are_you': 9,
 'end_conversation': 1,
 'thanks': 7,
 'not_satisfied': 2,
 'options': 3,
 'services': 4,
 'trials': 8,
 'surveys': 6,
 'confused': 0}

In [35]:
# Encoding the labels of responses dataframe df2 with values choosen from questions dataframe df1.
responses_df.labels = responses_df.labels.map(mapper).astype({'labels': 'int32'})
responses_df
responses_df.to_csv('./responses.csv')

Unnamed: 0,responses,labels
0,Hello,5
1,Happy to have you here,5
2,Good to see you again,5
3,"Hi there, how can I help?",5
4,"Hi, I'm Bowhead Bot",9
5,I'm Bowhead Bot,9
6,Call me Bowhead Bot,9
7,Have a lovely Day!,1
8,Bye,1
9,Happy to help you,1


# Creating train and test for the model based on the above calculated indexes

In [36]:
train = questions_df_encoded.loc[train_index]
test = questions_df_encoded.loc[test_index]
train
test

X_train = train.drop(columns=['labels'],axis=1)
X_train
y_train = train.labels
y_train
X_test = test.drop(columns=['labels'],axis=1)
X_test
y_test = test.labels
y_test
y_train =pd.get_dummies(y_train).values
y_train
y_test =pd.get_dummies(y_test).values
y_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
1,8,38,26,0,0,0,0,0,0,start_conversation
2,39,0,0,0,0,0,0,0,0,start_conversation
3,40,0,0,0,0,0,0,0,0,start_conversation
4,41,0,0,0,0,0,0,0,0,start_conversation
5,25,0,0,0,0,0,0,0,0,start_conversation
7,5,16,1,0,0,0,0,0,0,what_are_you
8,42,16,1,0,0,0,0,0,0,what_are_you
9,27,28,43,0,0,0,0,0,0,what_are_you
11,44,1,45,0,0,0,0,0,0,end_conversation
12,46,19,0,0,0,0,0,0,0,end_conversation


Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
10,19,0,0,0,0,0,0,0,0,end_conversation
20,21,8,10,22,0,0,0,0,0,not_satisfied
27,29,7,1,13,3,0,0,0,0,options
29,5,14,9,1,63,0,0,0,0,services
0,25,26,0,0,0,0,0,0,0,start_conversation
47,7,1,13,3,2,6,0,0,0,surveys
15,20,0,0,0,0,0,0,0,0,thanks
35,5,7,1,32,3,23,15,4,0,trials
6,5,8,27,28,0,0,0,0,0,what_are_you
58,0,0,0,0,0,0,0,0,0,confused


Unnamed: 0,0,1,2,3,4,5,6,7,8
1,8,38,26,0,0,0,0,0,0
2,39,0,0,0,0,0,0,0,0
3,40,0,0,0,0,0,0,0,0
4,41,0,0,0,0,0,0,0,0
5,25,0,0,0,0,0,0,0,0
7,5,16,1,0,0,0,0,0,0
8,42,16,1,0,0,0,0,0,0
9,27,28,43,0,0,0,0,0,0
11,44,1,45,0,0,0,0,0,0
12,46,19,0,0,0,0,0,0,0


1     start_conversation
2     start_conversation
3     start_conversation
4     start_conversation
5     start_conversation
7           what_are_you
8           what_are_you
9           what_are_you
11      end_conversation
12      end_conversation
13      end_conversation
14      end_conversation
16                thanks
17                thanks
18                thanks
19                thanks
21         not_satisfied
22         not_satisfied
23         not_satisfied
24         not_satisfied
25         not_satisfied
26         not_satisfied
28               options
30              services
31              services
32              services
33              services
34              services
36                trials
37                trials
38                trials
39                trials
40                trials
41                trials
42                trials
43                trials
44                trials
45                trials
46                trials
48               surveys


Unnamed: 0,0,1,2,3,4,5,6,7,8
10,19,0,0,0,0,0,0,0,0
20,21,8,10,22,0,0,0,0,0
27,29,7,1,13,3,0,0,0,0
29,5,14,9,1,63,0,0,0,0
0,25,26,0,0,0,0,0,0,0
47,7,1,13,3,2,6,0,0,0
15,20,0,0,0,0,0,0,0,0
35,5,7,1,32,3,23,15,4,0
6,5,8,27,28,0,0,0,0,0
58,0,0,0,0,0,0,0,0,0


10      end_conversation
20         not_satisfied
27               options
29              services
0     start_conversation
47               surveys
15                thanks
35                trials
6           what_are_you
58              confused
Name: labels, dtype: object

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0,

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint8)

In [37]:
y_train[0].shape,y_test[0].shape
X_train.shape

((10,), (10,))

(49, 9)

In [38]:
max_length = X_train.shape[1]
max_length

#output should be max_length + 1 (This output is used in Dense layer)
output = 10

9

# Model construction

In [39]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
checkpoint = ModelCheckpoint("model-v3.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]


def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,300, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
    model.add(Dense(output, activation='softmax'))
    
    
    # compile network
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss = 'categorical_crossentropy',
              # optimizer = Adam(lr=0.001),
              optimizer = 'adam',
              metrics = ['accuracy'])
    
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [40]:
model = define_model(vocab_size, max_length)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 9, 300)            23700     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 6, 64)             76864     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                650       
Total params: 101,214
Trainable params: 101,214
Non-trainable params: 0
_________________________________________________________________


In [41]:
history = model.fit(X_train, y_train, epochs=500, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

Train on 49 samples, validate on 10 samples
Epoch 1/500
Epoch 00001: val_loss improved from inf to 2.28076, saving model to model-v3.h5
Epoch 2/500
Epoch 00002: val_loss improved from 2.28076 to 2.26280, saving model to model-v3.h5
Epoch 3/500
Epoch 00003: val_loss improved from 2.26280 to 2.24478, saving model to model-v3.h5
Epoch 4/500
Epoch 00004: val_loss improved from 2.24478 to 2.22905, saving model to model-v3.h5
Epoch 5/500
Epoch 00005: val_loss improved from 2.22905 to 2.21363, saving model to model-v3.h5
Epoch 6/500
Epoch 00006: val_loss improved from 2.21363 to 2.19761, saving model to model-v3.h5
Epoch 7/500
Epoch 00007: val_loss improved from 2.19761 to 2.18140, saving model to model-v3.h5
Epoch 8/500
Epoch 00008: val_loss improved from 2.18140 to 2.16494, saving model to model-v3.h5
Epoch 9/500
Epoch 00009: val_loss improved from 2.16494 to 2.14622, saving model to model-v3.h5
Epoch 10/500
Epoch 00010: val_loss improved from 2.14622 to 2.12601, saving model to model-v3.h5

Epoch 29/500
Epoch 00029: val_loss improved from 1.57668 to 1.54931, saving model to model-v3.h5
Epoch 30/500
Epoch 00030: val_loss improved from 1.54931 to 1.52229, saving model to model-v3.h5
Epoch 31/500
Epoch 00031: val_loss improved from 1.52229 to 1.49554, saving model to model-v3.h5
Epoch 32/500
Epoch 00032: val_loss improved from 1.49554 to 1.46800, saving model to model-v3.h5
Epoch 33/500
Epoch 00033: val_loss improved from 1.46800 to 1.43901, saving model to model-v3.h5
Epoch 34/500
Epoch 00034: val_loss improved from 1.43901 to 1.41176, saving model to model-v3.h5
Epoch 35/500
Epoch 00035: val_loss improved from 1.41176 to 1.38088, saving model to model-v3.h5
Epoch 36/500
Epoch 00036: val_loss improved from 1.38088 to 1.35115, saving model to model-v3.h5
Epoch 37/500
Epoch 00037: val_loss improved from 1.35115 to 1.32169, saving model to model-v3.h5
Epoch 38/500
Epoch 00038: val_loss improved from 1.32169 to 1.29275, saving model to model-v3.h5
Epoch 39/500
Epoch 00039: val_

Epoch 00056: val_loss improved from 0.96661 to 0.95745, saving model to model-v3.h5
Epoch 57/500
Epoch 00057: val_loss improved from 0.95745 to 0.94798, saving model to model-v3.h5
Epoch 58/500
Epoch 00058: val_loss improved from 0.94798 to 0.93913, saving model to model-v3.h5
Epoch 59/500
Epoch 00059: val_loss improved from 0.93913 to 0.93177, saving model to model-v3.h5
Epoch 60/500
Epoch 00060: val_loss improved from 0.93177 to 0.92589, saving model to model-v3.h5
Epoch 61/500
Epoch 00061: val_loss improved from 0.92589 to 0.92104, saving model to model-v3.h5
Epoch 62/500
Epoch 00062: val_loss improved from 0.92104 to 0.91718, saving model to model-v3.h5
Epoch 63/500
Epoch 00063: val_loss improved from 0.91718 to 0.91381, saving model to model-v3.h5
Epoch 64/500
Epoch 00064: val_loss improved from 0.91381 to 0.91180, saving model to model-v3.h5
Epoch 65/500
Epoch 00065: val_loss improved from 0.91180 to 0.91035, saving model to model-v3.h5
Epoch 66/500
Epoch 00066: val_loss improved

Epoch 84/500
Epoch 00084: val_loss improved from 0.88139 to 0.88019, saving model to model-v3.h5
Epoch 85/500
Epoch 00085: val_loss improved from 0.88019 to 0.87935, saving model to model-v3.h5
Epoch 86/500
Epoch 00086: val_loss improved from 0.87935 to 0.87866, saving model to model-v3.h5
Epoch 87/500
Epoch 00087: val_loss improved from 0.87866 to 0.87796, saving model to model-v3.h5
Epoch 88/500
Epoch 00088: val_loss improved from 0.87796 to 0.87622, saving model to model-v3.h5
Epoch 89/500
Epoch 00089: val_loss improved from 0.87622 to 0.87464, saving model to model-v3.h5
Epoch 90/500
Epoch 00090: val_loss improved from 0.87464 to 0.87306, saving model to model-v3.h5
Epoch 91/500
Epoch 00091: val_loss improved from 0.87306 to 0.87168, saving model to model-v3.h5
Epoch 92/500
Epoch 00092: val_loss improved from 0.87168 to 0.87093, saving model to model-v3.h5
Epoch 93/500
Epoch 00093: val_loss improved from 0.87093 to 0.87026, saving model to model-v3.h5
Epoch 94/500
Epoch 00094: val_

Epoch 112/500
Epoch 00112: val_loss improved from 0.86247 to 0.86244, saving model to model-v3.h5
Epoch 113/500
Epoch 00113: val_loss improved from 0.86244 to 0.86243, saving model to model-v3.h5
Epoch 114/500
Epoch 00114: val_loss improved from 0.86243 to 0.86230, saving model to model-v3.h5
Epoch 115/500
Epoch 00115: val_loss improved from 0.86230 to 0.86218, saving model to model-v3.h5
Epoch 116/500
Epoch 00116: val_loss improved from 0.86218 to 0.86175, saving model to model-v3.h5
Epoch 117/500
Epoch 00117: val_loss improved from 0.86175 to 0.86113, saving model to model-v3.h5
Epoch 118/500
Epoch 00118: val_loss improved from 0.86113 to 0.86048, saving model to model-v3.h5
Epoch 119/500
Epoch 00119: val_loss improved from 0.86048 to 0.85981, saving model to model-v3.h5
Epoch 120/500
Epoch 00120: val_loss improved from 0.85981 to 0.85914, saving model to model-v3.h5
Epoch 121/500
Epoch 00121: val_loss improved from 0.85914 to 0.85876, saving model to model-v3.h5
Epoch 122/500
Epoch 

# Testing the model

In [42]:
[np.argmax(i) for i in model.predict(X_test)][:8]

[1, 2, 7, 4, 5, 6, 7, 3]

In [43]:
[np.argmax(i) for i in y_test][:8]

[1, 2, 3, 4, 5, 6, 7, 8]

# Posing new question to our model

In [50]:
def get_text():
    input_text  = ['find breast studies']
    df_input = pd.DataFrame(input_text,columns=['questions'])
    print(df_input)
    a = df_input.iloc[0]['questions']
    print(type(a))
    if not a:
        print('true')
    return df_input 

#load artifacts 
from tensorflow.keras.models import load_model
model = load_model('model-v3.h5')
# tokenizer_t to be used in encode_input_text
tokenizer_t = joblib.load('tokenizer_t.pkl')
# vocab to be used in bot_precaution
vocab = joblib.load('vocab.pkl')

def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

def remove_stop_words_for_input(tokenizer,df,feature):
    doc_without_stopwords = []
    entry = df[feature][0]
    tokens = tokenizer(entry)
    doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return df

def encode_input_text(tokenizer_t,df,feature):
    t = tokenizer_t
    entry = entry = [df[feature][0]]
    encoded = t.texts_to_sequences(entry)
    padded = pad_sequences(encoded, maxlen=9, padding='post')
    return padded

def get_pred(model,encoded_input):
    pred = np.argmax(model.predict(encoded_input))
    return pred

def bot_precausion(df_input,pred):
    words = df_input.questions[0].split()
    if len([w for w in words if w in vocab])==0 :
        pred = 0
    return pred

def get_response(df2,pred):
    upper_bound = df2.groupby('labels').get_group(pred).shape[0]
    r = np.random.randint(0,upper_bound)
    responses = list(df2.groupby('labels').get_group(pred).responses)
    return responses[r]

def bot_response(response,):
    print(response)
    

In [51]:
df_input = get_text()

#load artifacts 
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
input = df_input.iloc[0]['questions']
input
encoded_input = encode_input_text(tokenizer_t,df_input,'questions')
encoded_input

pred = get_pred(model,encoded_input)
pred
pred = bot_precausion(df_input,pred)
pred

response = get_response(responses_df,pred)
bot_response(response)


             questions
0  find breast studies
<class 'str'>


'find breast study'

array([[ 2, 73, 24,  0,  0,  0,  0,  0,  0]])

8

8

Gathering the trials information 


# If we change intents.json, we need to make few modifications to this file.
# 1. In cell 46, change the indexes for appending confused labels accordingly.
# 2. In cell 56 of model construction, change the number of output nodes in Dense layer to match with the 'maxlength + 1' (If not already matched)
# And also change maxlen in encode_input_text in testing and in preprocessor.py file.
# Change model name in load_model testing when posing new question