In [91]:
import json
import numpy as np
import spacy

In [92]:
# !python -m spacy download en_core_web_sm

In [93]:
# Load the spaCy "en_core_web_sm" pipeline once; tokenize() and lemma() below
# both call this shared `nlp` object. The model must already be installed
# (see the commented-out download command above).
nlp = spacy.load("en_core_web_sm")

In [94]:
# Sample sentence used to try out the tokenize() and lemma() helpers below.
s = "This is a apple and the apple is red."

<h2>01 Tokenization</h2>

In [96]:
def tokenize(text):
   """Split ``text`` into tokens.

   Runs the module-level ``nlp`` pipeline on ``text`` and returns the
   resulting token objects as a list, in the order they appear.
   """
   return list(nlp(text))

In [97]:
tokenize(s)

[This, is, a, apple, and, the, apple, is, red, .]

<h2>02 Lemmatization</h2>

In [99]:
def lemma(text):
   """Return the lower-cased lemma of every token in ``text``.

   The text is processed with the module-level ``nlp`` pipeline; the result
   is a list of strings with one entry per token, in order.
   """
   doc = nlp(text)
   return [tok.lemma_.lower() for tok in doc]

In [100]:
# Lemmatize the sample sentence; `l` is reused further down as the token list
# passed to bag_of_words().
l = lemma(s)
l

['this', 'be', 'a', 'apple', 'and', 'the', 'apple', 'be', 'red', '.']

<h2>03 Bag of words</h2>

In [136]:
def bag_of_words(lem, vocab):
   """Encode a tokenised sentence as a binary bag-of-words vector.

   Parameters
   ----------
   lem : sequence of str
      Tokens (e.g. lemmas) of the sentence to encode.
   vocab : sequence of str
      Known words; position ``i`` of the result corresponds to ``vocab[i]``.

   Returns
   -------
   numpy.ndarray
      ``float32`` array of length ``len(vocab)`` holding 1.0 where
      ``vocab[i]`` occurs in ``lem`` and 0.0 elsewhere. Matching is exact
      (case-sensitive). An empty ``vocab`` yields an empty array.
   """
   bag = np.zeros(len(vocab), dtype=np.float32)
   for i, w in enumerate(vocab):
      if w in lem:
         bag[i] = 1
   # The return must sit outside the loop: returning from inside it would stop
   # after the first vocabulary word and give None for an empty vocabulary.
   return bag

In [138]:
# Quick check of bag_of_words() on a hand-written vocabulary.
# NOTE(review): `t` is defined but never used. The call passes the lower-cased
# lemmas in `l`, none of which equal any entry of `v`, so an all-zero vector
# comes back whether or not bag_of_words() is correct. Passing `t` instead
# would make this a meaningful check - confirm the intent.
v = ["This","are","man","is"]
t = ["This","guava","is","red"]
bag_of_words(l,v)

array([0., 0., 0., 0.], dtype=float32)

In [103]:
# Load the intent definitions (tags, example patterns, responses) from disk.
# The encoding is given explicitly because open() otherwise falls back to a
# platform-dependent default, which can mis-decode non-ASCII text in the JSON.
with open('dataset.json', 'r', encoding='utf-8') as rf:
   intents = json.load(rf)
intents

{'intents': [{'tag': 'greeting',
   'patterns': ['hello', 'hi', 'hey', 'wake up', 'jarvis'],
   'responses': ['hello sir',
    'how are you sir',
    'always for you sir',
    'hello',
    "Here's your assistant"]},
  {'tag': 'bye',
   'patterns': ['bye', 'see you later', 'goodbye', 'bye bye', 'goodluck'],
   'responses': ['bye sir',
    'good bye sir',
    "it'll be nice to see you again",
    'see you later',
    'goodluck sir']},
  {'tag': 'health',
   'patterns': ['how are you', 'how are you feeling now', 'are you fine?'],
   'responses': ['fine sir', 'perfect', 'cool']}]}

In [104]:
# Empty accumulators, each bound to its own fresh list:
#   all_words - every lemma seen across the patterns
#   tags      - the intent tags
#   xy        - (lemmas, tag) pairs, one per pattern
all_words, tags, xy = [], [], []

In [105]:
# Walk every intent in the dataset and fill the three accumulators.
# Note: this appends to the existing lists, so re-running the cell without
# re-running the initialisation cell first duplicates every entry.
for intent in intents['intents']:
   tag = intent['tag']
   tags.append(tag)
   for pattern in intent['patterns']:
      # Lemmatize the example sentence into a list of lower-cased lemmas.
      w = lemma(pattern)
      # Flat list of all lemmas (duplicates included at this stage).
      all_words.extend(w)
      # Keep the lemmas paired with the tag of the intent they came from.
      xy.append((w,tag))

In [106]:
tags

['greeting', 'bye', 'health']

In [107]:
# Deduplicate the tags and sort them, so each tag has one fixed position in the list.
tags = sorted(set(tags))
tags

['bye', 'greeting', 'health']

In [108]:
all_words

['hello',
 'hi',
 'hey',
 'wake',
 'up',
 'jarvis',
 'bye',
 'see',
 'you',
 'later',
 'goodbye',
 'bye',
 'bye',
 'goodluck',
 'how',
 'be',
 'you',
 'how',
 'be',
 'you',
 'feel',
 'now',
 'be',
 'you',
 'fine',
 '?']

In [109]:
xy

[(['hello'], 'greeting'),
 (['hi'], 'greeting'),
 (['hey'], 'greeting'),
 (['wake', 'up'], 'greeting'),
 (['jarvis'], 'greeting'),
 (['bye'], 'bye'),
 (['see', 'you', 'later'], 'bye'),
 (['goodbye'], 'bye'),
 (['bye', 'bye'], 'bye'),
 (['goodluck'], 'bye'),
 (['how', 'be', 'you'], 'health'),
 (['how', 'be', 'you', 'feel', 'now'], 'health'),
 (['be', 'you', 'fine', '?'], 'health')]

In [110]:
# Collapse the collected lemmas into a sorted list of unique entries.
all_words = sorted(set(all_words))
all_words

['?',
 'be',
 'bye',
 'feel',
 'fine',
 'goodbye',
 'goodluck',
 'hello',
 'hey',
 'hi',
 'how',
 'jarvis',
 'later',
 'now',
 'see',
 'up',
 'wake',
 'you']

In [111]:
# Ignore punctuation / special characters: any entry of all_words listed in
# ignore_words is left out of the vocabulary.
ignore_words = [',','.','?','/','!']
vocab = [w for w in all_words if w not in ignore_words]
vocab

['be',
 'bye',
 'feel',
 'fine',
 'goodbye',
 'goodluck',
 'hello',
 'hey',
 'hi',
 'how',
 'jarvis',
 'later',
 'now',
 'see',
 'up',
 'wake',
 'you']

In [112]:
# Build the training set from the (lemmas, tag) pairs collected in xy:
#   x_train - one bag-of-words vector per pattern, indexed by vocab
#   y_train - the position of the pattern's tag within tags
x_train = []
y_train = []
for pattern_sentence,tag in xy:
   # Feature vector for this pattern over the vocabulary.
   bag = bag_of_words(pattern_sentence,vocab)
   x_train.append(bag)

   # Integer class label: index of the tag in the tags list.
   label = tags.index(tag)
   y_train.append(label)