# nltk_utils

In [1]:
import numpy as np
import nltk

from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance; stem() below delegates to it.
stemmer = PorterStemmer()


def tokenize(sentence):
    """
    Break a sentence into its individual tokens.

    The work is delegated to ``nltk.word_tokenize`` and its result is
    returned unchanged. Tokens may be words, numbers or punctuation, e.g.
    "Is anyone there?" -> ['Is', 'anyone', 'there', '?']
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens

def stem(word):
    """
    Reduce a word to its root form.

    The word is lower-cased first and then passed to the module-level
    Porter ``stemmer``.

    example:
    [stem(w) for w in ["Organize", "organizes", "organizing"]]
    -> ["organ", "organ", "organ"]
    """
    lowered = word.lower()
    return stemmer.stem(lowered)

def bag_of_words(tokenized_sentence, words):
    """
    Encode a tokenized sentence as a bag-of-words vector over ``words``.

    Every token is passed through ``stem`` (which also lower-cases it). The
    result holds 1 at each position whose vocabulary entry equals one of
    those stems and 0 everywhere else.

    Args:
        tokenized_sentence: iterable of raw string tokens.
        words: sequence of vocabulary entries; they are compared as given,
            so they should already be stemmed. Entries must be hashable
            (strings are).

    Returns:
        1-D numpy array of length ``len(words)`` with dtype float32.

    example:
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
    bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    """
    # Stem once into a set: each lookup below is then a constant-time hash
    # probe instead of a scan over the whole token list.
    stemmed_tokens = {stem(token) for token in tokenized_sentence}

    # Start from all zeros and switch on the positions of known words.
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, known_word in enumerate(words):
        if known_word in stemmed_tokens:
            bag[idx] = 1

    return bag

In [2]:
# Quick check of tokenize(): print a sample sentence, then its token list.
a= "Is anyone there?"
print(a)
a = tokenize(a)  # `a` is rebound from the raw string to the tokenizer's output
print(a)

Is anyone there?
['Is', 'anyone', 'there', '?']


In [3]:
# Quick check of stem(): three inflections of the same verb, one capitalised.
words = ["Organize", "organizes", "organizing"]
words = [stem(w) for w in words]  # rebinds `words` to the stemmed forms
print(words)

['organ', 'organ', 'organ']


# Preprocessing data

In [4]:
import random
import json

# Load the intent definitions. The encoding is given explicitly because
# open() otherwise falls back to a platform-dependent default, whereas JSON
# files are conventionally UTF-8.
with open('intents.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

print(intents)

{'intents': [{'tag': 'greeting', 'patterns': ['Hi', 'How are you?', 'Is anyone there?', 'Hello', 'Good day', "What's up", 'how are ya', 'heyy', 'whatsup', '??? ??? ??'], 'responses': ['Hello!', 'Good to see you again!', 'Hi there, how can I help?'], 'context_set': ''}, {'tag': 'goodbye', 'patterns': ['cya', 'see you', 'bye bye', 'See you later', 'Goodbye', 'I am Leaving', 'Bye', 'Have a Good day', 'talk to you later', 'ttyl', 'i got to go', 'gtg'], 'responses': ['Sad to see you go :(', 'Talk to you later', 'Goodbye!', 'Come back soon'], 'context_set': ''}, {'tag': 'creator', 'patterns': ['what is the name of your developers', 'what is the name of your creators', 'what is the name of the developers', 'what is the name of the creators', 'who created you', 'your developers', 'your creators', 'who are your developers', 'developers', 'you are made by', 'you are made by whom', 'who created you', 'who create you', 'creators', 'who made you', 'who designed you'], 'responses': ['College student

In [5]:
# Walk every intent once and gather three things:
#   tags      - the tag of each intent
#   all_words - every token of every pattern
#   xy        - one (pattern tokens, tag) pair per pattern
all_words, tags, xy = [], [], []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words += w
        xy.append((w, tag))

# Punctuation tokens are dropped, the rest are stemmed (which also
# lower-cases them), and the stems are de-duplicated and sorted.
# The tags are de-duplicated and sorted the same way.
ignore_words = ['?', '.', '!']
all_words = sorted({stem(w) for w in all_words if w not in ignore_words})
tags = sorted(set(tags))

print(len(xy), "patterns")
print(len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)


405 patterns
38 tags: ['admission', 'canteen', 'college intake', 'committee', 'computerhod', 'course', 'creator', 'document', 'event', 'extchod', 'facilities', 'fees', 'floors', 'goodbye', 'greeting', 'hod', 'hostel', 'hours', 'infrastructure', 'ithod', 'library', 'location', 'menu', 'name', 'number', 'placement', 'principal', 'ragging', 'random', 'salutaion', 'scholarship', 'sem', 'sports', 'swear', 'syllabus', 'task', 'uniform', 'vacation']
250 unique stemmed words: ["'s", '(', ')', 'a', 'about', 'ac', 'activ', 'address', 'admis', 'admiss', 'against', 'ai/ml', 'allot', 'am', 'an', 'and', 'ani', 'antirag', 'anyon', 'are', 'ass', 'asshol', 'at', 'attend', 'automobil', 'avail', 'averag', 'be', 'between', 'big', 'bitch', 'book', 'boy', 'branch', 'bring', 'build', 'by', 'bye', 'cafetaria', 'call', 'campu', 'can', 'canteen', 'capac', 'case', 'casual', 'ce', 'chat', 'chemic', 'civil', 'code', 'colleg', 'come', 'committ', 'committe', 'comp', 'compani', 'comput', 'conduct', 'contact', 'cours'

# create training data


In [6]:
# X: one bag-of-words vector per pattern, built against the full vocabulary.
X_train = np.array([bag_of_words(tokens, all_words) for tokens, _ in xy])

# y: the position of each pattern's tag in `tags`. PyTorch CrossEntropyLoss
# needs only these class labels, not one-hot vectors.
y_train = np.array([tags.index(pattern_tag) for _, pattern_tag in xy])

# Hyper-parameters 


In [7]:
from torch.utils.data import Dataset, DataLoader

num_epochs = 1000  # number of passes over train_loader in the training loop
batch_size = 8  # samples per batch drawn by the DataLoader
learning_rate = 0.001  # passed to the Adam optimizer as lr
input_size = len(X_train[0])  # length of one bag-of-words vector
hidden_size = 8  # width of the two hidden Linear layers in NeuralNet
output_size = len(tags)  # one model output per tag
print(input_size, output_size)

class ChatDataset(Dataset):
    """Dataset view over the module-level ``X_train`` / ``y_train`` arrays."""

    def __init__(self):
        # Keep references to the training arrays and cache the sample count.
        self.x_data, self.y_data = X_train, y_train
        self.n_samples = len(X_train)

    def __len__(self):
        """Number of samples, so that len(dataset) works."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label
    
    
# Wrap the training arrays and serve them in shuffled mini-batches.
dataset = ChatDataset()
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)


250 38


# Build neural network

In [8]:
import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    """
    Feed-forward classifier made of three Linear layers.

    A ReLU follows each of the first two layers; the output of the last
    layer is returned as is (raw scores, one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # input -> hidden -> hidden -> classes
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the three layers and return the raw scores."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        # no activation and no softmax at the end
        return self.l3(hidden)
    
    
    

# Seed PyTorch's global RNG before any weights are drawn, so that the initial
# parameters (and the DataLoader's shuffle order, which by default is derived
# from the same RNG) are the same on every Restart & Run All.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NeuralNet(input_size, hidden_size, output_size).to(device)

# Loss and optimizer


In [9]:
# Cross-entropy between the model's raw output scores and integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam updates every model parameter, using the configured learning_rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model


In [10]:
# Mini-batch training: num_epochs passes over train_loader, one optimizer
# step per batch.
for epoch in range(num_epochs):
    # NOTE: `words` is a batch of bag-of-words vectors here; this rebinds the
    # `words` name that the earlier stemming demo used for a list of strings.
    for (words, labels) in train_loader:
        words = words.to(device)
        # Cast the labels to long and move them to the same device as the model.
        labels = labels.to(dtype=torch.long).to(device)

        # Forward pass
        outputs = model(words)
        # if y would be one-hot, we must apply
        # labels = torch.max(labels, 1)[1]
        loss = criterion(outputs, labels)

        # Backward and optimize: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Every 100th epoch, report the loss of that epoch's LAST mini-batch
    # (`loss` is overwritten per batch, so this is not an epoch average).
    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')


# Likewise the loss of the very last mini-batch processed.
print(f'final loss: {loss.item()}')

Epoch [100/1000], Loss: 0.026848450303077698
Epoch [200/1000], Loss: 0.0010695930104702711
Epoch [300/1000], Loss: 0.0006935623241588473
Epoch [400/1000], Loss: 2.238697743450757e-05
Epoch [500/1000], Loss: 2.1457667287450022e-07
Epoch [600/1000], Loss: 9.536741885085576e-08
Epoch [700/1000], Loss: 0.0
Epoch [800/1000], Loss: 0.0
Epoch [900/1000], Loss: 2.384185648907078e-08
Epoch [1000/1000], Loss: 7.009383352851728e-06
final loss: 7.009383352851728e-06


# save model

In [11]:
# Bundle the trained weights together with the layer sizes, the vocabulary
# and the tag list, and write the whole bundle to a single file.
FILE = "data.pth"
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "all_words": all_words,
    "tags": tags,
}
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to data.pth
