In [25]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [26]:
nltk.download('wordnet')
nltk.download('geograpy')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

[nltk_data] Error loading geograpy: Package 'geograpy' not found in
[nltk_data]     index


False

In [3]:
def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data

data = load_doc('./intents.json')
data

{'intents': [{'tag': 'start_conversation',
   'patterns': ['Hi there',
    'Is anyone there?',
    'Hey',
    'Hola',
    'Hello',
    'Good day',
    'Hi'],
   'responses': ['Hello',
    'Happy to have you here',
    'Good to see you again',
    'Hi there, how can I help?'],
   'context': ['']},
  {'tag': 'what_are_you',
   'patterns': ['What is your name?',
    'what are you?',
    'who are you?',
    'your name pls?'],
   'responses': ["Hi, I'm  Bowhead Bot",
    "I'm Bowhead Bot ",
    'Call me Bowhead Bot'],
   'context': ['']},
  {'tag': 'end_conversation',
   'patterns': ['Bye',
    'See you later',
    'Goodbye',
    'Nice chatting to you, bye',
    'Till next time'],
   'responses': ['Have a lovely Day!',
    'Bye',
    'Buhbyee !',
    'Happy to help you'],
   'context': ['']},
  {'tag': 'thanks',
   'patterns': ['Thanks',
    'Thank you',
    "That's helpful",
    'Awesome, thanks',
    'Thanks for helping me'],
   'responses': ['Happy to help!', 'Any time!', 'My pleasure', 

In [4]:
def frame_data(feat_1,feat_2,is_pattern):
    is_pattern = is_pattern
    df = pd.DataFrame(columns=[feat_1,feat_2])
    for intent in data['intents']:
        if is_pattern:
            for pattern in intent['patterns']:
                w = pattern
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
        else:
            for response in intent['responses']:
                w = response
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
    return df

In [5]:
questions_df = frame_data('questions','labels',True)
questions_df

Unnamed: 0,questions,labels
0,Hi there,start_conversation
1,Is anyone there?,start_conversation
2,Hey,start_conversation
3,Hola,start_conversation
4,Hello,start_conversation
5,Good day,start_conversation
6,Hi,start_conversation
7,What is your name?,what_are_you
8,what are you?,what_are_you
9,who are you?,what_are_you


In [6]:
responses_df = frame_data('responses','labels',False)
responses_df

Unnamed: 0,responses,labels
0,Hello,start_conversation
1,Happy to have you here,start_conversation
2,Good to see you again,start_conversation
3,"Hi there, how can I help?",start_conversation
4,"Hi, I'm Bowhead Bot",what_are_you
5,I'm Bowhead Bot,what_are_you
6,Call me Bowhead Bot,what_are_you
7,Have a lovely Day!,end_conversation
8,Bye,end_conversation
9,Buhbyee !,end_conversation


# For creating tokens and vocab of training data;this vocab is later used when testing new user input(testing the bot)

In [7]:
lemmatizer = WordNetLemmatizer()

vocab = Counter()
labels = []
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords = []
    for entry in df[feature]:
        tokens = tokenizer(entry)
        joblib.dump(tokens,'tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return 

def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens = tokenizer(entry)   
        vocab.update(tokens)
    joblib.dump(vocab,'vocab.pkl')
    return 


create_vocab(tokenizer,questions_df,'questions')
remove_stop_words(tokenizer,questions_df,'questions')

# Creating training and test data and storing the indexes

In [8]:
# Creating a test list containing only the first question of each group after grouping them according to their labels.
test_list = list(questions_df.groupby(by='labels',as_index=False).first()['questions'])
test_list

# The index of the above test observations taken from the dataframe df1
test_index = []
for i,_ in enumerate(test_list):
    idx = questions_df[questions_df.questions == test_list[i]].index[0]
    test_index.append(idx)
test_index

# Train index
train_index = [i for i in questions_df.index if i not in test_index]
train_index

['what can you tell me about covid trial',
 'bye',
 'how can you help me',
 'hi there',
 'thanks',
 'what is your name']

[29, 11, 21, 0, 16, 7]

[1,
 2,
 3,
 4,
 5,
 6,
 8,
 9,
 10,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 20,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 30,
 31,
 32,
 33]

# Initializing a tokenizer, fitting it on to questions df and encoding the questions_df

In [9]:
def encoder(df,feature):
#     text = ' '.join(list(vocab.keys()))
#     from tensorflow.keras.preprocessing.text import text_to_word_sequence
    t = Tokenizer()
    entries = [entry for entry in df[feature]]
    print(entries)
    #step1 To create dictionary having 'keys' as words from each entry(sentences) and 'values' are randomly assigned by tokenizer.
    t.fit_on_texts(entries)
    print(t.word_index)
    joblib.dump(t,'tokenizer_t.pkl')
    vocab_size = len(t.word_index) + 1
    print(vocab_size)
    entries = [entry for entry in df[feature]]
    max_length = max([len(s.split()) for s in entries])
    print(max_length)
    #setp2 To see the output of the dataframe when dictionary 'values' from the above created dictionary in step1 are used in the place of the words in the sentences
    encoded = t.texts_to_sequences(entries)
    print(encoded)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, vocab_size

In [10]:
X,vocab_size = encoder(questions_df,'questions')
X
questions_df_encoded = pd.DataFrame(X)
questions_df_encoded['labels'] = questions_df.labels
questions_df_encoded

['hi there', 'is anyone there', 'hey', 'hola', 'hello', 'good day', 'hi', 'what is your name', 'what are you', 'who are you', 'your name pls', 'bye', 'see you later', 'goodbye', 'nice chatting to you bye', 'till next time', 'thanks', 'thank you', 'thats helpful', 'awesome thanks', 'thanks for helping me', 'how can you help me', 'what can you do', 'what help you provide', 'how can you be helpful', 'what support do you offered', 'what do you know', 'what help', 'what are you about', 'what can you tell me about covid trial', 'need some info about covid trial', 'covid trial info needed', 'want to know about covid trial', 'show me covid trial info']
{'you': 1, 'what': 2, 'covid': 3, 'trial': 4, 'me': 5, 'can': 6, 'about': 7, 'are': 8, 'thanks': 9, 'help': 10, 'do': 11, 'info': 12, 'hi': 13, 'there': 14, 'is': 15, 'your': 16, 'name': 17, 'bye': 18, 'to': 19, 'helpful': 20, 'how': 21, 'know': 22, 'anyone': 23, 'hey': 24, 'hola': 25, 'hello': 26, 'good': 27, 'day': 28, 'who': 29, 'pls': 30, 's

array([[13, 14,  0,  0,  0,  0,  0,  0],
       [15, 23, 14,  0,  0,  0,  0,  0],
       [24,  0,  0,  0,  0,  0,  0,  0],
       [25,  0,  0,  0,  0,  0,  0,  0],
       [26,  0,  0,  0,  0,  0,  0,  0],
       [27, 28,  0,  0,  0,  0,  0,  0],
       [13,  0,  0,  0,  0,  0,  0,  0],
       [ 2, 15, 16, 17,  0,  0,  0,  0],
       [ 2,  8,  1,  0,  0,  0,  0,  0],
       [29,  8,  1,  0,  0,  0,  0,  0],
       [16, 17, 30,  0,  0,  0,  0,  0],
       [18,  0,  0,  0,  0,  0,  0,  0],
       [31,  1, 32,  0,  0,  0,  0,  0],
       [33,  0,  0,  0,  0,  0,  0,  0],
       [34, 35, 19,  1, 18,  0,  0,  0],
       [36, 37, 38,  0,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0],
       [39,  1,  0,  0,  0,  0,  0,  0],
       [40, 20,  0,  0,  0,  0,  0,  0],
       [41,  9,  0,  0,  0,  0,  0,  0],
       [ 9, 42, 43,  5,  0,  0,  0,  0],
       [21,  6,  1, 10,  5,  0,  0,  0],
       [ 2,  6,  1, 11,  0,  0,  0,  0],
       [ 2, 10,  1, 44,  0,  0,  0,  0],
       [21,  6, 

Unnamed: 0,0,1,2,3,4,5,6,7,labels
0,13,14,0,0,0,0,0,0,start_conversation
1,15,23,14,0,0,0,0,0,start_conversation
2,24,0,0,0,0,0,0,0,start_conversation
3,25,0,0,0,0,0,0,0,start_conversation
4,26,0,0,0,0,0,0,0,start_conversation
5,27,28,0,0,0,0,0,0,start_conversation
6,13,0,0,0,0,0,0,0,start_conversation
7,2,15,16,17,0,0,0,0,what_are_you
8,2,8,1,0,0,0,0,0,what_are_you
9,29,8,1,0,0,0,0,0,what_are_you


In [11]:
#Creating 2 observations with 'confused' label and appending them to dataframe.
for i in range(0,2):
    dt = [0]*8
    dt.append('confused')
    dt = [dt]
    pd.DataFrame(dt).rename(columns = {8:'labels'})
    questions_df_encoded = questions_df_encoded.append(pd.DataFrame(dt).rename(columns = {8:'labels'}),ignore_index=True)
    
questions_df_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,labels
0,0,0,0,0,0,0,0,0,confused


Unnamed: 0,0,1,2,3,4,5,6,7,labels
0,0,0,0,0,0,0,0,0,confused


Unnamed: 0,0,1,2,3,4,5,6,7,labels
0,13,14,0,0,0,0,0,0,start_conversation
1,15,23,14,0,0,0,0,0,start_conversation
2,24,0,0,0,0,0,0,0,start_conversation
3,25,0,0,0,0,0,0,0,start_conversation
4,26,0,0,0,0,0,0,0,start_conversation
5,27,28,0,0,0,0,0,0,start_conversation
6,13,0,0,0,0,0,0,0,start_conversation
7,2,15,16,17,0,0,0,0,what_are_you
8,2,8,1,0,0,0,0,0,what_are_you
9,29,8,1,0,0,0,0,0,what_are_you


In [12]:
# Appending one of the confused labeled observation to train index
train_index.append(34)
# Appending one of the confused labeled observation to test index
test_index.append(35)

# Encoding the labels of questions_df_encoded and then later responses_df 

In [13]:
# Encoding the 'labels' with values choosen(random) by labelEncoder 
from sklearn.preprocessing import LabelEncoder
lable_enc = LabelEncoder()

labl = lable_enc.fit_transform(questions_df_encoded.labels)
labl

mapper = {}
for index,key in enumerate(questions_df_encoded.labels):
    if key not in mapper.keys():
        mapper[key] = labl[index]
mapper

array([4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 3,
       3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 0, 0])

{'start_conversation': 4,
 'what_are_you': 6,
 'end_conversation': 2,
 'thanks': 5,
 'options': 3,
 'covid_trials': 1,
 'confused': 0}

In [14]:
# Encoding the labels of responses dataframe df2 with values choosen from questions dataframe df1.
responses_df.labels = responses_df.labels.map(mapper).astype({'labels': 'int32'})
responses_df
responses_df.to_csv('./responses.csv')

Unnamed: 0,responses,labels
0,Hello,4
1,Happy to have you here,4
2,Good to see you again,4
3,"Hi there, how can I help?",4
4,"Hi, I'm Bowhead Bot",6
5,I'm Bowhead Bot,6
6,Call me Bowhead Bot,6
7,Have a lovely Day!,2
8,Bye,2
9,Buhbyee !,2


# Creating train and test for the model based on the above calculated indexes

In [15]:
train = questions_df_encoded.loc[train_index]
test = questions_df_encoded.loc[test_index]


X_train = train.drop(columns=['labels'],axis=1)
y_train = train.labels
X_test = test.drop(columns=['labels'],axis=1)
y_test = test.labels

y_train =pd.get_dummies(y_train).values
y_test =pd.get_dummies(y_test).values

In [16]:
y_train[0].shape,y_test[0].shape
X_train.shape

((7,), (7,))

(29, 8)

In [17]:
max_length = X_train.shape[1]
max_length
output = 9

8

# Model construction

In [18]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
checkpoint = ModelCheckpoint("model-v1.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]


def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,300, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
    model.add(Dense(7, activation='softmax'))
    
    
    # compile network
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss = 'categorical_crossentropy',
              # optimizer = Adam(lr=0.001),
              optimizer = 'adam',
              metrics = ['accuracy'])
    
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [19]:
model = define_model(vocab_size, max_length)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 300)            16200     
_________________________________________________________________
conv1d (Conv1D)              (None, 5, 64)             76864     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1, 64)             0         
_________________________________________________________________
flatten (Flatten)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 7)                 455       
Total params: 93,5

In [20]:
history = model.fit(X_train, y_train, epochs=500, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

Train on 29 samples, validate on 7 samples
Epoch 1/500

Epoch 00001: val_loss improved from inf to 1.92745, saving model to model-v1.h5
Epoch 2/500

Epoch 00002: val_loss improved from 1.92745 to 1.91414, saving model to model-v1.h5
Epoch 3/500

Epoch 00003: val_loss improved from 1.91414 to 1.90205, saving model to model-v1.h5
Epoch 4/500

Epoch 00004: val_loss improved from 1.90205 to 1.89077, saving model to model-v1.h5
Epoch 5/500

Epoch 00005: val_loss improved from 1.89077 to 1.87997, saving model to model-v1.h5
Epoch 6/500

Epoch 00006: val_loss improved from 1.87997 to 1.86958, saving model to model-v1.h5
Epoch 7/500

Epoch 00007: val_loss improved from 1.86958 to 1.85888, saving model to model-v1.h5
Epoch 8/500

Epoch 00008: val_loss improved from 1.85888 to 1.84790, saving model to model-v1.h5
Epoch 9/500

Epoch 00009: val_loss improved from 1.84790 to 1.83673, saving model to model-v1.h5
Epoch 10/500

Epoch 00010: val_loss improved from 1.83673 to 1.82616, saving model to mo

Epoch 39/500

Epoch 00039: val_loss improved from 1.64542 to 1.63993, saving model to model-v1.h5
Epoch 40/500

Epoch 00040: val_loss improved from 1.63993 to 1.63396, saving model to model-v1.h5
Epoch 41/500

Epoch 00041: val_loss improved from 1.63396 to 1.62738, saving model to model-v1.h5
Epoch 42/500

Epoch 00042: val_loss improved from 1.62738 to 1.62044, saving model to model-v1.h5
Epoch 43/500

Epoch 00043: val_loss improved from 1.62044 to 1.61268, saving model to model-v1.h5
Epoch 44/500

Epoch 00044: val_loss improved from 1.61268 to 1.60416, saving model to model-v1.h5
Epoch 45/500

Epoch 00045: val_loss improved from 1.60416 to 1.59574, saving model to model-v1.h5
Epoch 46/500

Epoch 00046: val_loss improved from 1.59574 to 1.58790, saving model to model-v1.h5
Epoch 47/500

Epoch 00047: val_loss improved from 1.58790 to 1.57848, saving model to model-v1.h5
Epoch 48/500

Epoch 00048: val_loss improved from 1.57848 to 1.56791, saving model to model-v1.h5
Epoch 49/500

Epoch 

Epoch 76/500

Epoch 00076: val_loss did not improve from 1.37603
Epoch 77/500

Epoch 00077: val_loss did not improve from 1.37603
Epoch 78/500

Epoch 00078: val_loss did not improve from 1.37603

Epoch 00078: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 79/500

Epoch 00079: val_loss did not improve from 1.37603
Epoch 80/500

Epoch 00080: val_loss did not improve from 1.37603
Epoch 81/500

Epoch 00081: val_loss did not improve from 1.37603

Epoch 00081: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 82/500

Epoch 00082: val_loss did not improve from 1.37603


# Testing the model

In [21]:
[np.argmax(i) for i in model.predict(X_test)][:8]

[3, 4, 3, 4, 5, 6, 0]

In [22]:
[np.argmax(i) for i in y_test][:8]

[1, 2, 3, 4, 5, 6, 0]

# Posing new question to our model

In [23]:
def get_text():
    input_text  = ['Good evening']
    df_input = pd.DataFrame(input_text,columns=['questions'])
    df_input
    return df_input 

#load artifacts 
from tensorflow.keras.models import load_model
model = load_model('model-v1.h5')
# tokenizer_t to be used in encode_input_text
tokenizer_t = joblib.load('tokenizer_t.pkl')
# vocab to be used in bot_precaution
vocab = joblib.load('vocab.pkl')

def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

def remove_stop_words_for_input(tokenizer,df,feature):
    doc_without_stopwords = []
    entry = df[feature][0]
    tokens = tokenizer(entry)
    doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return df

def encode_input_text(tokenizer_t,df,feature):
    t = tokenizer_t
    entry = entry = [df[feature][0]]
    encoded = t.texts_to_sequences(entry)
    padded = pad_sequences(encoded, maxlen=8, padding='post')
    return padded

def get_pred(model,encoded_input):
    pred = np.argmax(model.predict(encoded_input))
    return pred

def bot_precausion(df_input,pred):
    words = df_input.questions[0].split()
    if len([w for w in words if w in vocab])==0 :
        pred = 1
    return pred

def get_response(df2,pred):
    upper_bound = df2.groupby('labels').get_group(pred).shape[0]
    r = np.random.randint(0,upper_bound)
    responses = list(df2.groupby('labels').get_group(pred).responses)
    return responses[r]

def bot_response(response,):
    print(response)
    

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
df_input = get_text()

#load artifacts 
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
df_input
encoded_input = encode_input_text(tokenizer_t,df_input,'questions')
encoded_input

pred = get_pred(model,encoded_input)
pred
pred = bot_precausion(df_input,pred)

response = get_response(responses_df,pred)
bot_response(response)


Unnamed: 0,questions
0,good evening


array([[27,  0,  0,  0,  0,  0,  0,  0]])

4

Hello
