<a href="https://colab.research.google.com/github/abhayraghuwanshi/Chatbot_intent_classification/blob/main/cogno_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> Libraries Import</h1>

In [9]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

<h1>DATA IMPORTS</h1>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
path = "/content/drive/MyDrive/DATA/MachineLearningContest/"

In [4]:
train = pd.read_json(os.path.join(path, "intents.json"),  orient='Table')

In [5]:
test = pd.read_excel(os.path.join(path, "TestingData.xlsx"))

In [6]:
len(train)

203

In [7]:
possible_intent = train["intent"]

In [8]:
test.head()

Unnamed: 0,Test user queries
0,My money deduct my account but not credit
1,passbook delivery courier
2,No
3,It shows me existing customer
4,Pasward block bata raha hai


In [10]:
train.head()

Unnamed: 0,id,variations,intent
0,995,{'0': 'Is there a procedure to open AllinCall ...,Can I open AllinCall bank online?
1,996,{'0': 'Explain the features of AllinCall bank ...,What are the features of AllinCall bank account?
2,997,{'0': 'What are the features of AllinCall bank...,What are the features of AllinCall bank Video ...
3,998,{'0': 'Am i allowed to open a joint AllinCall ...,Can I open a joint AllinCall bank account?
4,999,"{'0': 'I don’t have a PAN', '1': 'how to open ...",I don’t have a PAN card


In [11]:
train["variations"] = train.variations.apply(lambda x: list(x.values()))

In [12]:
train.head()

Unnamed: 0,id,variations,intent
0,995,[Is there a procedure to open AllinCall bank o...,Can I open AllinCall bank online?
1,996,[Explain the features of AllinCall bank accoun...,What are the features of AllinCall bank account?
2,997,[What are the features of AllinCall bank Video...,What are the features of AllinCall bank Video ...
3,998,[Am i allowed to open a joint AllinCall bank a...,Can I open a joint AllinCall bank account?
4,999,"[I don’t have a PAN, how to open an account if...",I don’t have a PAN card


In [13]:
train = train.explode("variations")
train.head()

Unnamed: 0,id,variations,intent
0,995,Is there a procedure to open AllinCall bank on...,Can I open AllinCall bank online?
0,995,How to open AllinCall bank online,Can I open AllinCall bank online?
0,995,Can I open AllinCall bank online?,Can I open AllinCall bank online?
1,996,Explain the features of AllinCall bank account,What are the features of AllinCall bank account?
1,996,Tellme about the features of AllinCall bank ac...,What are the features of AllinCall bank account?


<h1> DATA CLEANING</h1>

In [14]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    #stemming
    words.append([i.lower() for i in w])
    
  return words

In [16]:
cleaned_words = cleaning(train["variations"])
print(len(cleaned_words))
print(cleaned_words[:2])

1563
[['is', 'there', 'a', 'procedure', 'to', 'open', 'allincall', 'bank', 'online'], ['how', 'to', 'open', 'allincall', 'bank', 'online']]


<h3> Tokenizing</h3>

In [17]:
#creating tokenizer
def create_tokenizer(words,
                  filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [18]:
#getting maximum length
def max_length(words):
  return(len(max(words, key = len)))

In [19]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 561 and Maximum length = 37


In [20]:
#encoding list of words
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [21]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)


<h2>Padding </h2>

In [22]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length,
                        padding =   "post"))

In [23]:
padded_doc = padding_doc(encoded_doc, max_length)

In [24]:
padded_doc[:5]

array([[ 14, 164,  24, 288,   2,  30,   6,  17,  49,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  7,   2,  30,   6,  17,  49,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  4,   1,  30,   6,  17,  49,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 26,   3,  50,  13,   6,  17,  10,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [384,   8,   3,  50,  13,   6,  17,  10,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,  

In [25]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (1563, 37)


<h3> Intent Data Encoding </h3>

In [30]:
out = pd.DataFrame(train.intent)
def one_hot(encode):
  o = OneHotEncoder(sparse = False).fit(encode)
  return(o.transform(encode), o.categories_)

In [31]:
output_one_hot, output_col = one_hot(out)

In [34]:
len(output_col[0])

203

<h3> Train_test_split</h3>

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [37]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (1250, 37) and train_Y = (1250, 203)
Shape of val_X = (313, 37) and val_Y = (313, 203)


<h1> Model </h1>

In [38]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
 # model.add(Dense(32, activation = "relu"))
  #model.add(Dropout(0.5))
  model.add(Dense(203, activation = "softmax"))
  
  return model

In [39]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 37, 128)           71808     
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 203)               52171     
Total params: 387,147
Trainable params: 315,339
Non-trainable params: 71,808
_________________________________________________________________


In [40]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Epoch 1/100
Epoch 00001: val_loss improved from inf to 5.28075, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 5.28075 to 5.18288, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 5.18288 to 4.98587, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 4.98587 to 4.91577, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss improved from 4.91577 to 4.83170, saving model to model.h5
Epoch 6/100
Epoch 00006: val_loss improved from 4.83170 to 4.81467, saving model to model.h5
Epoch 7/100
Epoch 00007: val_loss did not improve from 4.81467
Epoch 8/100
Epoch 00008: val_loss improved from 4.81467 to 4.76798, saving model to model.h5
Epoch 9/100
Epoch 00009: val_loss did not improve from 4.76798
Epoch 10/100
Epoch 00010: val_loss improved from 4.76798 to 4.75932, saving model to model.h5
Epoch 11/100
Epoch 00011: val_loss improved from 4.75932 to 4.72756, saving model to model.h5
Epoch 12/100
Epoch 00012: val_loss did 

In [41]:
model = load_model("model.h5")

In [59]:

def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred

In [60]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
  return ("%s has confidence = %s" % (classes[0], (predictions[0])))
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

In [61]:
pred = predictions("How do I apply for a consumer finance loan?")
get_final_output(pred, output_col[0])

'How do I apply for a consumer finance loan? has confidence = 0.97698784'

In [64]:
for text in output_col[0]:
  pred = predictions(text)
  print("for ", text, "pred is ", get_final_output(pred, output_col[0]))

for  Account related queries pred is  Account related queries has confidence = 0.80287427
for  AllinCall App pred is  AllinCall App has confidence = 0.22690672
for  Are you serious pred is  Are you serious has confidence = 0.8586412
for  Can I apply for AllinCall Gold Loan online? pred is  Can I apply for AllinCall Gold Loan online? has confidence = 0.97562534
for  Can I apply for Sovereign Gold Bond through AllinCall bank Account? How can I apply for Sovereign Gold Bond online? pred is  Can I apply for Sovereign Gold Bond through AllinCall bank Account? How can I apply for Sovereign Gold Bond online? has confidence = 0.99362034
for  Can I apply for life insurance policy through my AllinCall bank app? pred is  Can I apply for life insurance policy through my AllinCall bank app? has confidence = 0.98157066
for  Can I book a Fixed Deposit through my AllinCall bank Account? pred is  Do I get a free debit card with my AllinCall bank Account? has confidence = 0.5618386
for  Can I check my a