In [5]:
import json
import pickle

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

#### This code snippet reads the content of the 'intents.json' file and stores it in the data variable.

In [None]:
# Read the intent definitions; `data` holds the parsed JSON document.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so parsing
# does not depend on the platform's default locale encoding.
with open('intents.json', encoding='utf-8') as intents_file:
    data = json.load(intents_file)

#### In this section, the code iterates over each intent in the data variable. For each intent, it retrieves the associated patterns and appends them to the training_sentences list. It also retrieves the intent's tag and appends it to the training_labels list. The responses for each intent are stored in the responses list.

#### The code then checks whether the current intent's tag is already in the `labels` list and appends it only if it is missing, so `labels` ends up holding each tag exactly once. After the loop, `num_classes` is set to the length of `labels`, i.e. the number of distinct classes (intent categories) in the dataset. In short, this cell walks the intents loaded from 'intents.json', extracts the training sentences, labels, and responses for each intent, and determines the number of distinct classes.

In [None]:
training_sentences, training_labels = [], []
labels, responses = [], []

for intent in data['intents']:
    patterns = intent['patterns']
    tag = intent['tag']

    # One (sentence, tag) pair per example pattern of this intent.
    training_sentences.extend(patterns)
    training_labels.extend([tag] * len(patterns))

    # Responses are kept grouped per intent (one entry per intent).
    responses.append(intent['responses'])

    # Record each tag once, in first-seen order.
    if tag not in labels:
        labels.append(tag)

# Number of distinct intent tags.
num_classes = len(labels)

In [12]:
# Sanity check: number of distinct intent tags collected in `labels`.
print(num_classes)

42


In [13]:
# Rebuild the per-pattern tag list straight from `data` instead of reading
# `training_labels`: this cell rebinds `training_labels` to the integer-encoded
# array, so fitting on it again after a re-run would train the encoder on
# integers and silently drop the tag names from `lbl_encoder.classes_`.
pattern_tags = [
    intent['tag']
    for intent in data['intents']
    for _ in intent['patterns']
]

lbl_encoder = LabelEncoder()
lbl_encoder.fit(pattern_tags)

# Integer class ids, one per training sentence, in the same order in which
# the patterns appear in `data`.
training_labels = lbl_encoder.transform(pattern_tags)

## Tokenization and Padding
#### This cell tokenizes and pads the text data. A `Tokenizer` limited to a vocabulary of `vocab_size` (1000) words, with `<OOV>` standing in for out-of-vocabulary words, is fitted on the training sentences and converts each sentence into a sequence of integers. The sequences are then padded or truncated to a fixed length of `max_len` (20) tokens, with over-long sentences cut at the end (`truncating='post'`). This preprocessing step puts the text into the fixed-size numeric form the neural network needs.

In [14]:
# Text-preprocessing hyperparameters.
vocab_size = 1000       # upper bound on the number of word ids the tokenizer emits
embedding_dim = 16      # width of each word vector in the embedding layer
max_len = 20            # every sequence is padded / cut to this many tokens
oov_token = "<OOV>"     # stand-in for words outside the fitted vocabulary

# Fit the vocabulary on the training sentences.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Sentences -> lists of word ids -> fixed-length integer matrix.
# Padding is added at the front (the Keras default, spelled out here), while
# over-long sentences lose their trailing tokens.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
    truncating='post',
)

## Text Classification Model

#### This code implements a text classification model using Keras and TensorFlow. It consists of an embedding layer, a global average pooling layer, two dense layers with ReLU activation, and a final dense layer with softmax activation. The model is compiled with the sparse categorical cross-entropy loss function and the Adam optimizer. It is trained on padded sequences and training labels for a specified number of epochs.


In [18]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling repeat across re-runs of the notebook
# (GPU kernels may still introduce small run-to-run differences).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Embedding -> average over the token axis -> two hidden layers -> one softmax
# output per intent tag.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Sparse loss: the targets are integer class ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

epochs = 500
history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 16)            16000     
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_3 (Dense)             (None, 64)                1088      
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 42)                2730      
                                                                 
Total params: 23,978
Trainable params: 23,978
Non-trainable params: 0
__________________________________________________

In [19]:
# Persist the three fitted artifacts: the trained model, the tokenizer and the
# label encoder. `pickle` is imported in the imports cell at the top of the
# notebook.

# Trained Keras model, written to the "chat_model" path.
tf.keras.models.save_model(model, "chat_model")

# Fitted tokenizer (vocabulary and OOV settings).
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Fitted label encoder (mapping between tags and integer class ids).
# NOTE: pickle files must only ever be loaded from a trusted location, since
# unpickling can execute arbitrary code.
with open('label_encoder.pickle', 'wb') as enc_file:
    pickle.dump(lbl_encoder, enc_file, protocol=pickle.HIGHEST_PROTOCOL)



INFO:tensorflow:Assets written to: chat_model\assets


INFO:tensorflow:Assets written to: chat_model\assets
