# Intelligent AI ChatBot in Python (NeuralNine)

In [None]:
# Intelligent AI ChatBot in Python
# this is going to dyamically understand and respond to greetings
# but the responses will be static

# our files will be : intents.json, responses.json

## TRAINING CHATBOT

In [1]:
# imports
import random
import json
import pickle   # for serialization
import numpy as np
import pandas as pd
from tqdm import tqdm
import sqlite3

import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import load_model

import spacy
from spacy.tokens import DocBin
from spacy.scorer import Scorer
import contextualSpellCheck

# lemmatizing our words
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package omw-1.4 to C:\Users\Angad
[nltk_data]     Sandhu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


### *loading our intents* (from colab)

In [None]:
# loading file
from google.colab import files
uploaded = files.upload()

In [None]:
print(uploaded)

In [None]:
intents =  json.loads(uploaded['intents_kan.json'])
print(intents['intents'][3]['tag'])

### *loading our intents* (from local)

In [2]:
file_path = "intents_kan.json"

with open(file_path, 'r', encoding="utf8") as j:
     intents = json.loads(j.read())
print(intents['intents'][3]['tag'])

name


### *Creating intermediate Data*

In [24]:
text = []
opt = []

# TODO : remove 'ignore letters'
ignore_letters = ["!", ".", ",", "?"]

# iterating over intents
for intent in intents['intents']:

    for pat in intent['pattern']:
        text.append(pat)
        opt.append(intent["tag"])

data = {
    "text" : text,
    "class" : opt,
}

df = pd.DataFrame(data)


In [25]:
print(df.head())

            text      class
0        ನಮಸ್ಕಾರ  greetings
1         ನಮಸ್ತೆ  greetings
2       ಶುಭೋದಯ  greetings
3  ಹಲೋ ಹೇಗಿದ್ದೀಯ  greetings
4          ವಿದಾಯ    goodbye


### *Shuffling Data*

In [32]:
# Shuffling a Pandas dataframe with .shuffle()
df = df.sample(frac=1)
print(df.head())

                                  text    class
12                              ವಯಸ್ಸು      age
30  ನಾನು ಷೇರುಗಳಲ್ಲಿ ಏನು ಮಾಡುತ್ತಿದ್ದೇನೆ   stocks
4                                ವಿದಾಯ  goodbye
11                  ನಿನ್ನ ವಯಸ್ಸು ಎಷ್ಟು      age
17  ನಾನು ಏನನ್ನಾದರೂ ಖರೀದಿಸಲು ಬಯಸುತ್ತೇನೆ     shop


### *Spliting Data Frame into Training and Validation Sets*

In [38]:
trainDF, validDF = np.split(df, [int(0.9*len(df))])

In [39]:
print(len(df))
print(len(trainDF))
print(len(validDF))

31
27
4


### *Creattig Training Data*

In [26]:
def preprocess(df, embed):
    '''
    Preprocess the dataframe into spacy pipeline for later classification
    ---
    Input:
    df (DataFrame): Pandas dataframe containing the raw text and outputs.
    embed (str): Name of pipeline embedding used

    Output:
    df (DataFrame): Preprocessed input dataframe
    docs (doc): SpaCy doc object that stores text data along with classification
    '''

    # Store the data into tuples
    data = tuple(zip(df.text.tolist(), df["class"].tolist())) 
    
    # Load English library from SpaCy
    nlp=spacy.load(embed)
    print("Sample Input Data: ", data[0])

    # Storage for docs
    docs = []

    # One-hot encoding for the classifications
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        doc.cats['greetings'] = 0
        doc.cats['goodbye'] = 0
        doc.cats['age'] = 0
        doc.cats['name'] = 0
        doc.cats['shop'] = 0
        doc.cats['hours'] = 0
        doc.cats['crop'] = 0
        doc.cats['stocks'] = 0
        
        if label=='greetings': doc.cats['greetings'] = 1
        elif label=='goodbye': doc.cats['goodbye'] = 1
        elif label=='age': doc.cats['age'] = 1
        elif label=='name': doc.cats['name'] = 1
        elif label=='shop': doc.cats['shop'] = 1
        elif label=='hours': doc.cats['hours'] = 1
        elif label=='crop': doc.cats['crop'] = 1
        elif label=='stocks': doc.cats['stocks'] = 1
        # print(doc.cats)
        
        docs.append(doc)
    return df, docs

In [40]:
# Covert the train and test dataframes to .spacy files for training

# Preprocess the dataframes for train data
train_data, train_docs = preprocess(trainDF,"en_core_web_sm")
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/textcat_train.spacy")

# Preprocess the dataframes for test data
train_data, train_docs = preprocess(validDF,"en_core_web_sm")
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/textcat_valid.spacy")

('ವಯಸ್ಸು', 'age')


100%|██████████| 27/27 [00:00<00:00, 1507.64it/s]


('ಕಾರ್ಯಾಚರಣೆಯ ಗಂಟೆಗಳ', 'hours')


100%|██████████| 4/4 [00:00<00:00, 572.29it/s]


### *Configuring Transformer*

To setup the configeration of the model we need to go to [THIS](https://spacy.io/usage/training#quickstart) page to setup basic parameters.

In [1]:
# filling config file
!python -m spacy init fill-config ./config/base_config.cfg ./config/config.cfg

✔ Auto-filled config with all values
✔ Saved config
config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2022-06-25 14:35:49.738546: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-06-25 14:35:49.738970: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### *Running Model*

In [41]:
# running spacy model
!python -m spacy train ./config/config.cfg --output ./output --paths.train ./data/textcat_train.spacy --paths.dev ./data/textcat_valid.spacy

ℹ Saving to output directory: output
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['textcat']
ℹ Initial learn rate: 0.001
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.11        0.00    0.00
200     200         14.86        0.00    0.00
400     400          4.03        0.00    0.00
600     600          1.11        0.00    0.00
800     800          0.52        0.00    0.00
1000    1000          0.30        0.00    0.00
1200    1200          0.20        0.00    0.00
1400    1400          0.14        0.00    0.00
1600    1600          0.11        0.00    0.00
✔ Saved pipeline to output directory
output\model-last


2022-06-24 23:26:45.137488: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-06-24 23:26:45.137892: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2022-06-24 23:26:49,102] [INFO] Set up nlp object from config
[2022-06-24 23:26:49,109] [INFO] Pipeline: ['textcat']
[2022-06-24 23:26:49,112] [INFO] Created vocabulary
[2022-06-24 23:26:49,113] [INFO] Finished initializing nlp object
[2022-06-24 23:26:49,140] [INFO] Initialized pipeline components: ['textcat']


### *Evaluate Model*

In [None]:
!python -m spacy evaluate output/model-best data/textcat_data.spacy --output result/metrics.json

## EXECUTING CHATBOT

### *Loading Our Crop Data*

In [3]:
# checking existing intents file
print(intents['intents'][3]['tag'])

name


### *loading our crops* (from colab)

In [None]:
# loading file
from google.colab import files
uploaded = files.upload()

In [None]:
print(uploaded)

In [None]:
crop_data =  json.loads(uploaded['crop_data_kan.json'])
print(crop_data["ಸಾಸಿವೆ"]["Min_Price"])

### *loading our crops* (from local)

In [4]:
file_path = "crop_data_kan.json"

with open(file_path, 'r', encoding="utf8") as j:
     crop_data = json.loads(j.read())
print(crop_data["ಸಾಸಿವೆ"]["Min_Price"])

6100


### *Creating Crop Data Data Structure by Querying dataset*

In [5]:
crop_names = list(crop_data.keys())

### *Creating Helper Functions*



In [6]:
def predict_class(sentence):

  # Verify model for English model
  nlp_model = spacy.load("./output/model-best")

  doc_valid = nlp_model(sentence)

  results = list(sorted(doc_valid.cats.items(), key = lambda kv:(kv[1], kv[0]), reverse=True))

  return results

def get_response(intents_list, intents_json):
  tag = intents_list[0][0]
  list_of_intents = intents_json['intents']
  for i in list_of_intents:
    if i['tag'] == tag:
      result = random.choice(i['responses'])
      break
  return result

def get_crop_data(msg):
  bow = list(msg.split())
  # bow = [string.lower() for string in bow]
  bow = [string for string in bow]
  # print("message words : ", bow)
  # print("crops : ", crop_names)

  common_crops = []
  for sub in bow:
    common_crops += list(s for s in crop_names if sub.lower() in s.lower())

  common_crops = list(set(bow) & set(crop_names))
  # print("common crops : ", common_crops)

  if not common_crops:
    return "No Crop Data Found, Please Try again"
  else:
    crop = crop_data[common_crops[0]]

    crop_res = "Here is the relevent information on the {} crop: \n".format(common_crops[0])
    crop_res += "Commodity : {}\n".format(crop["Commodity"])
    crop_res += "Variety : {}\n".format(crop["Variety"])
    crop_res += "Area : {}, {}, {}\n".format(crop["Market"], crop["District"], crop["State"])
    crop_res += "Maximum Price : {}\n".format(crop["Max_Price"])
    crop_res += "Average Price : {}\n".format(crop["Modal_Price"])
    crop_res += "Minimum Price : {}\n".format(crop["Min_Price"])
    return crop_res


## Running BOT

In [8]:
while True:
  message = input('Input Message : ')
  ints = predict_class(message)
  print(ints)
  print("predicted class : ", ints[0][0], " | accuracy : ", ints[0][1])

  if ints[0][0] == 'crop':
    res = get_crop_data(message)
  else:
    res = get_response(ints, intents)

  print("Output Message : ", res, "\n")

[('crop', 0.7351760268211365), ('stocks', 0.07618217170238495), ('greetings', 0.046774424612522125), ('goodbye', 0.046774424612522125), ('age', 0.042790431529283524), ('name', 0.019967371597886086), ('hours', 0.01809951476752758), ('shop', 0.014235621318221092)]
predicted class :  crop  | accuracy :  0.7351760268211365
Output Message :  Here is the relevent information on the ಅಕ್ಕಿ crop: 
Commodity : Rice
Variety : III
Area : Nautnava, Maharajganj, Uttar Pradesh
Maximum Price : 2400
Average Price : 2300
Minimum Price : 2200
 



KeyboardInterrupt: Interrupted by user

# Visualization