In [1]:
#connect to google drive
from google.colab import drive
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd '/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot'

/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot


# Load JSON file

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
with open("GL Bot.json") as file:
  Botpattern = json.load(file)

In [5]:
lsttag = []
lstpattern =[]
lstresponse =[]
dfs =[]

Lets read the JSON file and store the details in dataframe. 
The data in dataframe will be used for NLP processing and model buidling

In [6]:
for intent in Botpattern["intents"] :
  tagval = intent["tag"]                
  res = intent["responses"]

  for pattern in intent["patterns"] :
    lstpattern.append(pattern)
    lsttag.append(tagval)
    lstresponse.append(res[0])
  df = pd.DataFrame({'tag' : lsttag, 'response' : lstresponse, 'pattern' :lstpattern}) 
  dfs.append(df)
  lstpattern.clear()
  lsttag.clear()
  lstresponse.clear()

In [7]:
len(dfs)

8

In [21]:
#Lets merge data of all dataframes

In [8]:
finaldf = pd.concat(dfs)

In [9]:
finaldf.shape

(128, 3)

In [10]:
finaldf['tag'].value_counts()

SL         29
NN         24
Intro      20
Exit       16
Olympus    13
Ticket      9
Profane     9
Bot         8
Name: tag, dtype: int64

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
labels = le.fit_transform(finaldf['tag'])

In [12]:
finaldf['labels'] = labels

In [13]:
finaldf.sample(5)

Unnamed: 0,tag,response,pattern,labels
12,Intro,Hello! how can i help you ?,aifl batch,2
9,Intro,Hello! how can i help you ?,i am learner from,2
13,Exit,"I hope I was able to assist you, Good Bye",great help,1
11,Intro,Hello! how can i help you ?,aiml batch,2
4,Exit,"I hope I was able to assist you, Good Bye",later,1


In [14]:
#Save the dataframe
from numpy import asarray, save
save('/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/chatdata.npy',finaldf.to_numpy())

# Solution Approach

The chatbot data primary consists of
- pattern : This is corpus and will act as input feature
- tag : This is target variable i.e. output which we need to predict. 
- Response : This will be that value corresponding to predicted tag. The value will be extracted from the dataframe which will have mapping of tag -> Response

Model - Build and test differtn classfier like logistic regression, SVM etc and select that model which gives maximum accuracy. Save the model and load the in runtime. Also load dataframe (of tag-> responses). 

# Assumptions and Exclusions
Assumptions -
1) The corpus is will not contain accented characters
2) The size of corpus is limited.
Exclusions
1) Preprocessing methods like removing stop words, stemming or lemmatisation are not applied as most the patterns (questions) contain stop words e.g. "I am from from", "what is up", "is anyone there" etc. if we remove stop words then document will not have any words and thus features and also does not make sense
2) One of the goal of preprocessing stpes like remove stop words etc is to reduce dimentionality i.e. reduce number of words (features). In this case the corpus is limited and number of words are still managible even without removing stop words.
Assumption


# Preprocessing

In [15]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata

In [18]:
def Remove_SpecialChars(text, remove_digits=False):
  pattern = r'[^a-zA-z\s]' if not remove_digits else r'[^a-zA-z\s]'
  text = re.sub(pattern, ' ', text)  # remove special characters and numbers also
  return text

In [17]:
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def RemoveStopWords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [20]:
def Stemming(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [21]:
import en_core_web_sm
import spacy
nlp = en_core_web_sm.load()

In [22]:
def Lemmatization(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [23]:
# Lets bundle above functions in one function
def TextProcessing(text) :
  txt = text.lower()
  txt = RemoveStopWords(txt)
  txt = Remove_SpecialChars(txt)
  #txt = Stemming(txt)
  #txt = Lemmatization(txt)
  return txt

In [24]:
finaldf['Processedpattern'] = finaldf['pattern'].apply(lambda x : TextProcessing(x))

In [25]:
finaldf[['pattern','Processedpattern']].sample(5)

Unnamed: 0,pattern,Processedpattern
8,artificial intelligence,artificial intelligence
6,unable to understand neural nets,unable understand neural nets
28,hyper parameters,hyper parameters
14,too good,good
17,forward propagation,forward propagation


In [26]:
Nooflabels = pd.unique(finaldf['labels'])

In [27]:
len(Nooflabels)

8

In [28]:
classes = pd.unique(finaldf['tag'])

In [29]:
classes

array(['Intro', 'Exit', 'Olympus', 'SL', 'NN', 'Bot', 'Profane', 'Ticket'],
      dtype=object)

# Build Classifier Model - To preduct tag(response) for given pattern (questions) based

In [30]:
from sklearn.model_selection import train_test_split
#Split data into 80:20 :: train : test 
X_train, X_test, y_train, y_test = train_test_split(finaldf.Processedpattern.values,
                                                    finaldf.labels.values,
                                                    test_size=0.2,
                                                    random_state=24
                                                    )

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,2)) 
x_train1 = vect.fit_transform(X_train)
x_test1 = vect.transform(X_test)

In [62]:
len(vect.vocabulary_)

190

Regression Model

In [63]:
from sklearn.linear_model import LogisticRegression
logisticmodel = LogisticRegression(solver='liblinear')

In [64]:
logisticmodel.fit(x_train1,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
from sklearn.metrics import accuracy_score

Training Accuracy

In [66]:
predicted_labels_train = logisticmodel.predict(x_train1)

In [67]:
print(accuracy_score(y_train,predicted_labels_train))

0.9901960784313726


Test Accuracy

In [68]:
predicted_labels_test = logisticmodel.predict(x_test1)

In [69]:
print(accuracy_score(y_test,predicted_labels_test))

0.5769230769230769


In [70]:
# Save model
import pickle
pickle.dump(logisticmodel,open('/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/logisticmodel.sav','wb'))

SVM Model

In [71]:
from sklearn.svm import SVC

In [72]:
svcmodel = SVC()

In [73]:
svcmodel.fit(x_train1,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [74]:
predicted_labels_train = svcmodel.predict(x_train1)

In [75]:
print(accuracy_score(y_train,predicted_labels_train))

0.7941176470588235


In [76]:
predicted_labels_test = logisticmodel.predict(x_test1)
print(accuracy_score(y_test,predicted_labels_test))

0.5769230769230769


In [77]:
#Save model
pickle.dump(svcmodel,open('/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/svcmodel.sav','wb'))

DNN model

In [78]:
import tensorflow as tf

In [79]:
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

In [80]:
x_train1_arr = x_train1.toarray()
x_test1_arr = x_test1.toarray()

In [81]:
type(x_train1)

scipy.sparse.csr.csr_matrix

In [82]:
x_train1_arr.shape

(102, 190)

In [83]:
x_test1_arr.shape

(26, 190)

In [84]:
len(vect.vocabulary_)

190

In [85]:
y_train_onehot = tf.keras.utils.to_categorical(y_train)
y_test_onehot = tf.keras.utils.to_categorical(y_test)

In [86]:
y_train_onehot.shape

(102, 8)

In [87]:
#Add hidden layers
model.add(tf.keras.layers.Dense(100, activation='relu', input_shape=(len(vect.vocabulary_),)))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(30, activation='relu'))

#Add Output layer
#model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.add(tf.keras.layers.Dense(len(Nooflabels), activation='softmax'))

In [88]:
#Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [89]:
model_checkpoint=tf.keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/annmodel.h5', 
                                                    save_best_only=True, 
                                                    monitor='val_accuracy', 
                                                    mode='max', 
                                                    verbose=1)

In [90]:
model.fit(x_train1_arr, y_train_onehot,
           validation_data=(x_test1_arr, y_test_onehot), 
           epochs=80, batch_size=20,callbacks=[model_checkpoint])

Epoch 1/80

Epoch 00001: val_accuracy improved from -inf to 0.19231, saving model to /content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/annmodel.h5
Epoch 2/80

Epoch 00002: val_accuracy did not improve from 0.19231
Epoch 3/80

Epoch 00003: val_accuracy improved from 0.19231 to 0.23077, saving model to /content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/annmodel.h5
Epoch 4/80

Epoch 00004: val_accuracy improved from 0.23077 to 0.30769, saving model to /content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/annmodel.h5
Epoch 5/80

Epoch 00005: val_accuracy improved from 0.30769 to 0.34615, saving model to /content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/annmodel.h5
Epoch 6/80

Epoch 00006: val_accuracy did not improve from 0.34615
Epoch 7/80

Epoch 00007: val_accuracy did not improve from 0.34615
Epoch 8/80

Epoch 00008: val_accuracy did not improve from 0.34615
Epoch 9/80

Epoch 00009: val_accuracy did not improve from 0.34615
Epoch 10/80

Epoch 00010: val_accuracy d

<tensorflow.python.keras.callbacks.History at 0x7fde912a4750>

# Model Comparison

1) Logistic Regression 
   - Training Accuracy - 99%
   - Test Accuracy     - 57 %
2) SVM
   - Training Accuracy - 7986 %
   - Test Accuracy     - 57 %
3) Nueral Network 
   - Training Accuracy - 88% 
   - Test Accuracy     - 61%

# Chat bot flow

The text accuracy provided by neural netwrok model is higher...so lets use ANN model for chat bot flow

In [91]:
from tensorflow.keras.models import load_model
annmodelpredict = load_model('/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/annmodel.h5')
import pickle
#modelpredict = pickle.load(open('/content/drive/MyDrive/LablFiles/Stat_NL_Project/chatbot/svcmodel.sav','rb'))

In [92]:
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()

def clean_text(text): 
  tokens = tokenizer.tokenize(text)
  return tokens

In [93]:
def bag_of_words(text, vocab): 
  tokens = clean_text(TextProcessing(text))
  bow = [0] * len(vocab)
  for w in tokens: 
    for idx, word in enumerate(vocab):
      if word == w: 
        bow[idx] = 1
  return np.array(bow)

In [94]:
def pred_class(text, vocab, labels): 
  bow = bag_of_words(text, vocab)
  result = annmodelpredict.predict(np.array([bow]))[0] #ANN
  #result = modelpredict.predict(np.array([bow]))       #SVM model
  thresh = 0.2
  y_pred = [[idx, res] for idx, res in enumerate(result) if res > thresh]

  y_pred.sort(key=lambda x: x[1], reverse=True)
  return_list = []
  for r in y_pred:
    return_list.append(labels[r[0]])
  return return_list

In [204]:
def pred_class_modified(text, vocab, labels): 
  bow = bag_of_words(text, vocab)
  result = annmodelpredict.predict(np.array([bow]))[0] #ANN
  #result = modelpredict.predict(np.array([bow]))       #SVM model
  lst = list(result)
  max_val = max(lst)
  index = lst.index(max_val)
  return labels[index]

In [205]:
res = pred_class_modified("teach me olympus",vect.vocabulary_,classes)
print(res)

Olympus


In [197]:
def getResponse(tag) :
  df = finaldf[finaldf['tag']==tag]
  if (df.shape[0] >=1) :
    return df['response'].iloc[0:1]
  else :
    return "Can you please change your question"


In [None]:
# running the chatbot
while True:
    message = input("")
    tags = pred_class_modified(message, vect.vocabulary_,classes)
    result = getResponse(tags)
    print(result)