In [None]:
import json
import pickle as pk
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# data (used by word_tokenize) and the 'wordnet' corpus (used by
# WordNetLemmatizer).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the JSON data
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of depending on the platform's default text encoding.
with open('project_data.json', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Flatten the intents into two parallel lists (one example pattern and its
# tag per position) and build a tag -> responses lookup.
training_patterns = []
training_labels = []
responses = {}

for intent in data['intents']:
    patterns = intent['patterns']
    tag = intent['tag']
    response = intent['responses']

    # One label per pattern keeps both lists aligned index-for-index.
    training_patterns += patterns
    training_labels += [tag for _ in patterns]
    responses[tag] = response

In [None]:
# Clean and preprocess the text patterns using regular expressions and lemmatization
def clean_text(sentences):
    """Normalise a batch of sentences before they are fed to the tokenizer.

    Every sentence is lower-cased, split with ``word_tokenize``, lemmatized
    token by token with a ``WordNetLemmatizer``, and re-joined with single
    spaces. After that, every character that is not an ASCII letter, a digit
    or whitespace is removed, whitespace runs are collapsed to one space and
    the ends are trimmed.

    Args:
        sentences: Iterable of strings to clean.

    Returns:
        list[str]: One cleaned string per input sentence, in input order.
    """
    wnl = WordNetLemmatizer()

    def _normalise(raw):
        tokens = word_tokenize(raw.lower())
        rejoined = ' '.join(map(wnl.lemmatize, tokens))
        # Character filtering runs on the re-joined string, i.e. after
        # tokenizing and lemmatizing, not before.
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', rejoined)
        return re.sub(r'\s+', ' ', alnum_only).strip()

    return [_normalise(sentence) for sentence in sentences]


In [None]:
# Clean the training patterns
# training_patterns is rebound to the cleaned strings here; the untouched
# text is still reachable through `data`.
training_patterns = clean_text(training_patterns)

In [None]:
# Encode the training labels: fit the encoder on the string tags, then expand
# the resulting class ids into the categorical matrix the model trains on.
label_encoder = LabelEncoder()
training_labels = to_categorical(label_encoder.fit_transform(training_labels))


In [None]:
# Tokenize the training patterns
# The tokenizer is created with num_words=500 and an explicit '<OOV>' token,
# fitted on the cleaned patterns, and then used to convert every pattern into
# a sequence of integer word ids.
tokenizer = Tokenizer(num_words=500, oov_token='<OOV>')
tokenizer.fit_on_texts(training_patterns)
sequences = tokenizer.texts_to_sequences(training_patterns)

In [None]:
# Pad sequences to ensure uniform length
# Every sequence is brought to max_sequence_length entries. Sequences that are
# too long are cut at the end (truncating='post'); the padding side is left at
# the pad_sequences default. predict_intent reuses max_sequence_length, so the
# value must stay the same for training and inference.
max_sequence_length = 32
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_sequence_length)

In [None]:
# Display the vocabulary size: entries in the fitted word index plus one.
# NOTE(review): the +1 presumably accounts for the reserved padding id 0 — confirm.
len(tokenizer.word_index)+1

432

In [None]:
# Define and compile the model with LSTM layer
# The embedding's input_dim is derived from the fitted tokenizer instead of a
# number copied from an earlier run, so it cannot fall out of step when the
# intents file changes. Word ids run from 0 to len(word_index); when the
# tokenizer has a num_words cap, ids at or above the cap are never emitted, so
# the smaller of the two bounds is enough.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per intent tag known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          13824     
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense (Dense)               (None, 256)               33024     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 127)               32639     
                                                                 
Total params: 161919 (632.50 KB)
Trainable params: 161919 (632.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model on the padded pattern sequences and their encoded labels;
# the returned History object is kept in `history`.
history = model.fit(
    x=np.array(padded_sequences),
    y=training_labels,
    batch_size=5,
    epochs=50,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Persist the trained network, then rebind `model` to the copy read back from
# disk so the prediction cells below run against the saved artifact.
# load_model is imported with the other imports at the top of the notebook.
model.save('model.h5')  # save model
model = load_model('model.h5')  # load model

  saving_api.save_model(


In [None]:
# Function to preprocess input text using regular expressions and lemmatization
def preprocess_input(text):
    """Clean a single user message exactly as the training patterns were cleaned.

    Delegates to ``clean_text`` so that training-time and inference-time
    preprocessing share one implementation and cannot drift apart.

    Args:
        text: The raw message string.

    Returns:
        str: The cleaned message.
    """
    return clean_text([text])[0]

In [None]:
# Function to predict intent
def predict_intent(input_text):
    """Classify a user message and return the predicted intent tag.

    The message is cleaned with ``preprocess_input``, converted to word ids
    with the fitted ``tokenizer``, padded to ``max_sequence_length`` and passed
    to ``model``; the class with the highest score is mapped back to its tag
    through ``label_encoder``.

    Args:
        input_text: The raw message string.

    Returns:
        The entry of ``label_encoder.classes_`` with the highest model score.
    """
    cleaned = preprocess_input(input_text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    # Truncate over-long messages the same way the training sequences were
    # truncated (truncating='post'); the pad_sequences default would keep a
    # different part of the message than the model saw during training.
    padded = pad_sequences(encoded, truncating='post', maxlen=max_sequence_length)
    scores = model.predict(np.array(padded), verbose=0)
    return label_encoder.classes_[np.argmax(scores)]

In [None]:
# Function to get response
def get_response(predicted_intent):
    """Pick a reply for the given intent tag.

    Scans ``data['intents']`` for the first intent whose ``'tag'`` equals
    ``predicted_intent``.

    Args:
        predicted_intent: The intent tag to look up.

    Returns:
        A randomly chosen element when the intent's ``'responses'`` is a
        non-empty list; the ``'responses'`` value itself when it is not a
        list; ``None`` when the list is empty or no intent has that tag.
    """
    for intent in data['intents']:
        if intent['tag'] != predicted_intent:
            continue
        candidates = intent['responses']
        if not isinstance(candidates, list):
            return candidates
        if not candidates:
            # An empty list has nothing to sample from; report "no reply"
            # the same way an unknown tag does instead of raising.
            return None
        # Index into the list so the original element is returned unchanged.
        return candidates[np.random.randint(len(candidates))]
    return None

In [None]:
# Save the fitted label encoder; the with-block closes the file handle (and
# flushes the pickle to disk) as soon as the dump finishes.
with open("label_encoder.pkl", "wb") as encoder_file:
    pk.dump(label_encoder, encoder_file)

In [None]:
# Save the fitted tokenizer; the with-block closes the file handle (and
# flushes the pickle to disk) as soon as the dump finishes.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pk.dump(tokenizer, tokenizer_file)

In [None]:

# Function for user chat
def user_chat():
    """Run an interactive chat loop until the user asks to stop.

    Each turn reads one line from the user, classifies it with
    ``predict_intent``, looks up a reply with ``get_response`` and prints it.
    The loop ends when the user types ``quit`` (any letter case, surrounding
    whitespace ignored) or when input is closed or interrupted. Blank lines
    are skipped without calling the model.
    """
    while True:
        try:
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the chat quietly
            # instead of surfacing a traceback.
            break

        if user_input.lower() == 'quit':
            break
        if not user_input:
            # Nothing to classify; prompt again.
            continue

        predicted_intent = predict_intent(user_input)
        response = get_response(predicted_intent)
        print('ChatBot:', response)

# Start chatting (this cell keeps prompting for input until the user types 'quit')
user_chat()


User: quit
