In [None]:
import nltk

# Download required NLTK resources.
# NOTE: newer NLTK releases load the tokenizer tables from 'punkt_tab', so
# 'punkt' alone is not enough for nltk.word_tokenize on a fresh environment.
NLTK_RESOURCES = (
    'punkt',       # tokenizer models used by word_tokenize (older NLTK releases)
    'punkt_tab',   # tokenizer tables used by word_tokenize (newer NLTK releases)
    'wordnet',     # lexical database behind WordNetLemmatizer
    'omw-1.4',     # Open Multilingual WordNet data used alongside wordnet
)
for resource in NLTK_RESOURCES:
    # nltk.download is a no-op when the package is already up to date.
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import nltk

# Fetch the tokenizer and WordNet packages, in the same order as before.
for resource_name in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource_name)
# Kept as the final bare expression so the cell still displays its return value.
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
#chatbot
import json
import pickle
import random
import re

import nltk
import numpy as np
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed
from textblob import TextBlob

# NLTK setup: fetch the data packages this cell downloads itself, then build
# the shared stop-word set and lemmatizer used by clean_text().
for package in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(package)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
_CONTRACTIONS = {
    "i'm": "i am", "you're": "you are", "it's": "it is", "don't": "do not",
    "can't": "cannot", "won't": "will not", "i've": "i have", "isn't": "is not",
}


def clean_text(text):
    """Normalise a raw sentence into a space-joined string of lemmas.

    Steps, in order: lowercase, expand the known contractions, strip every
    character that is not a letter, digit or whitespace, spell-correct with
    TextBlob, tokenize, drop English stop words and lemmatize.

    Contractions are expanded *before* punctuation is stripped. Once the
    apostrophe has been removed, "don't" has become "dont" and would never
    match the contraction table.

    Args:
        text: Raw user utterance or training pattern.

    Returns:
        The cleaned sentence. It is an empty string when no token survives
        (for example when the input consists only of stop words).
    """
    # Treat typographic apostrophes like ASCII ones so "don’t" is expanded too.
    text = text.lower().replace("\u2019", "'")
    for contraction, expansion in _CONTRACTIONS.items():
        text = text.replace(contraction, expansion)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # remove punctuation
    # Spell correction
    text = str(TextBlob(text).correct())
    tokens = nltk.word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Synonym expansion for data augmentation
def synonym_expand(sentence, max_synonyms=2):
    """Create augmented variants of ``sentence`` by swapping in WordNet synonyms.

    For every word position, up to ``max_synonyms`` variants are produced in
    which only that word is replaced by one of its WordNet lemma names
    (underscores in lemma names are turned into spaces).

    Candidates are kept in the order WordNet yields them instead of being
    drawn from a ``set``. Iteration order of a set of strings changes between
    interpreter runs, so the previous implementation generated a different
    augmented dataset on every kernel restart.

    Args:
        sentence: Whitespace-separated input sentence.
        max_synonyms: Maximum number of replacements tried per word. Defaults
            to 2, the value that used to be hard-coded.

    Returns:
        A list whose first element is the unchanged ``sentence``, followed by
        the generated variants.
    """
    words = sentence.split()
    variants = [sentence]
    for position, word in enumerate(words):
        # A dict de-duplicates while remembering first-seen order.
        candidates = {}
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                candidate = lemma.name().replace("_", " ")
                if candidate != word:
                    candidates.setdefault(candidate, None)
        for candidate in list(candidates)[:max_synonyms]:
            replaced = words.copy()
            replaced[position] = candidate
            variants.append(" ".join(replaced))
    return variants

# Load dataset
with open('custom2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

texts = []
labels = []
# group_ids[i] identifies the cleaned source pattern that sample i was derived
# from. It lets the split below keep a pattern and all of its synonym variants
# on the same side of the train/test boundary.
group_ids = []
group_of_pattern = {}

for intent_data in data['intents']:
    intent_label = intent_data['intent']
    for pattern in intent_data['text']:
        cleaned = clean_text(pattern)
        # Identical cleaned patterns share one group so duplicates cannot straddle the split.
        group = group_of_pattern.setdefault(cleaned, len(group_of_pattern))
        # The cleaned pattern comes first, followed by its synonym variants.
        samples = [cleaned] + [ex for ex in synonym_expand(cleaned) if ex != cleaned]
        texts.extend(samples)
        labels.extend([intent_label] * len(samples))
        group_ids.extend([group] * len(samples))

print(f"Total samples after expansion: {len(texts)}")
print(f"Unique intents: {len(set(labels))}")

# Encode labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# Tokenize and pad
# The tokenizer is fitted on every sample so the pickled artifact covers the
# full vocabulary; only the model's train/test rows are separated below.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is used for padding

# Save preprocessing objects
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
with open("max_len.pkl", "wb") as f:
    pickle.dump(max_len, f)

# Train-test split, done per source pattern rather than per sample.
# Splitting the augmented samples at random would put near-identical synonym
# variants of one pattern into both sets and inflate the measured accuracy.
group_ids = np.asarray(group_ids)
train_groups, test_groups = train_test_split(
    np.unique(group_ids), test_size=0.2, random_state=42
)
test_mask = np.isin(group_ids, test_groups)
# Shuffle the training rows: the samples were appended intent by intent.
train_idx = np.random.default_rng(42).permutation(np.flatnonzero(~test_mask))
test_idx = np.flatnonzero(test_mask)
X_train, y_train = padded_sequences[train_idx], labels_encoded[train_idx]
X_test, y_test = padded_sequences[test_idx], labels_encoded[test_idx]
print(f"Train samples: {len(X_train)} | Test samples: {len(X_test)}")

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that initialisation, dropout and batch shuffling repeat across kernel restarts.
SEED = 42
set_random_seed(SEED)

# Build BiLSTM model: token ids -> 128-d embeddings -> bidirectional LSTM -> softmax over intents
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# The sparse loss is used because y_* hold integer class ids from LabelEncoder, not one-hot rows.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train model
# NOTE(review): the held-out split doubles as validation data, so the per-epoch
# val_* metrics and the final test accuracy are computed on the same samples.
history = model.fit(
    X_train, y_train,
    epochs=60,
    batch_size=16,
    validation_data=(X_test, y_test)
)

# Save trained model
model.save("Chatbot_BiLSTM_Model.h5")

# Prepare intent->responses dictionary
responses = {item['intent']: item['responses'] for item in data['intents']}

# Prepare TF-IDF vectorizer for fuzzy similarity
# Row i of tfidf_matrix corresponds to texts[i], and therefore to labels[i].
vectorizer = TfidfVectorizer().fit(texts)
tfidf_matrix = vectorizer.transform(texts)

# Evaluate test accuracy
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc*100:.2f}%")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Total samples after expansion: 6614
Unique intents: 24




Epoch 1/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 25ms/step - accuracy: 0.1644 - loss: 2.8239 - val_accuracy: 0.8919 - val_loss: 0.4249
Epoch 2/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9237 - loss: 0.2677 - val_accuracy: 0.9546 - val_loss: 0.1580
Epoch 3/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.9734 - loss: 0.0884 - val_accuracy: 0.9562 - val_loss: 0.1414
Epoch 4/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9832 - loss: 0.0556 - val_accuracy: 0.9758 - val_loss: 0.0777
Epoch 5/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.9918 - loss: 0.0281 - val_accuracy: 0.9856 - val_loss: 0.0603
Epoch 6/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9913 - loss: 0.0275 - val_accuracy: 0.9675 - val_loss: 0.1047
Epoch 7/60
[1m331/33




=== Manual Testing Mode ===
Type 'quit' to exit.

You: hi
Predicted Intent: Greeting (Confidence: 1.00)
Bot: Greetings! I'm ready to help. Please tell me what you're looking for.

You: pipe is leak who can repair it
Predicted Intent: plumber (Confidence: 1.00)
Bot: A plumber can fix that. You can search 'plumber' in our app to hire one according to your location.

You: hair are to long need hair cut
Predicted Intent: barber (Confidence: 1.00)
Bot: A barber or grooming expert can help. Search 'barber' or 'grooming' in our app.

You: needs car service and repairing
Predicted Intent: automotive (Confidence: 1.00)
Bot: Automotive services include car repair, bike repair, and maintenance. Look under 'automotive' to find providers.

You: need to repair wiring and wires
Predicted Intent: electrician (Confidence: 1.00)
Bot: You should hire an electrician. Search 'electrician' in our app to find one nearby.

You: needs pain therapy who can do it
Predicted Intent: health_wellness (Confidence: 1

KeyboardInterrupt: Interrupted by user

In [None]:
# Manual testing with fuzzy matching
CONFIDENCE_THRESHOLD = 0.5  # below this softmax probability, fall back to TF-IDF matching
FALLBACK_REPLY = "Sorry, I didn't understand that. Could you rephrase?"


def resolve_intent(cleaned):
    """Pick an intent for an already-cleaned utterance.

    The BiLSTM prediction is used when its top probability reaches
    ``CONFIDENCE_THRESHOLD``. Otherwise the training sample with the highest
    TF-IDF cosine similarity supplies the intent.

    Args:
        cleaned: Output of ``clean_text``; may be an empty string.

    Returns:
        A ``(intent_name, score, source)`` tuple. ``source`` is ``"model"``,
        ``"fuzzy"`` or ``"none"``. For ``"none"`` the intent is ``None``:
        the input was empty after cleaning, or it shared no term with any
        training sample.
    """
    if not cleaned:
        # An empty string would become an all-padding sequence and a zero
        # TF-IDF vector, so any intent returned for it would be arbitrary.
        return None, 0.0, "none"

    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')
    probabilities = model.predict(padded, verbose=0)[0]
    intent_idx = int(np.argmax(probabilities))
    confidence = float(probabilities[intent_idx])
    if confidence >= CONFIDENCE_THRESHOLD:
        return label_encoder.inverse_transform([intent_idx])[0], confidence, "model"

    # Fuzzy matching with cosine similarity
    query_vec = vectorizer.transform([cleaned])
    similarities = cosine_similarity(query_vec, tfidf_matrix)[0]
    best_idx = int(np.argmax(similarities))
    best_similarity = float(similarities[best_idx])
    if best_similarity <= 0.0:
        # No overlap at all: argmax would just return index 0.
        return None, 0.0, "none"
    return labels[best_idx], best_similarity, "fuzzy"


print("\n=== Manual Testing Mode ===")
print("Type 'quit' to exit.\n")

while True:
    try:
        text = input("You: ")
    except (KeyboardInterrupt, EOFError):
        # Interrupting the prompt ends the session without a traceback.
        print()
        break
    if text.strip().lower() == "quit":
        break

    intent_name, score, source = resolve_intent(clean_text(text))
    if source == "model":
        print(f"Predicted Intent: {intent_name} (Confidence: {score:.2f})")
    elif source == "fuzzy":
        print(f"Fuzzy matched intent: {intent_name} (similarity: {score:.2f})")
    else:
        print(f"Bot: {FALLBACK_REPLY}\n")
        continue
    print(f"Bot: {random.choice(responses[intent_name])}\n")



=== Manual Testing Mode ===
Type 'quit' to exit.

Predicted Intent: Greeting (Confidence: 1.00)
Bot: Hi there! What can I assist you with?

Predicted Intent: electrician (Confidence: 1.00)
Bot: Search 'electrician' in our app to find certified electrical service providers near you.



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


=== Manual Testing Mode ===
Type 'quit' to exit.

You: hello
Predicted Intent: Greeting (Confidence: 1.00)
Bot: Hi there! What can I assist you with?

You: help me wth pipe leak
Predicted Intent: plumber (Confidence: 0.62)
Bot: A plumber can fix that. You can search 'plumber' in our app to hire one according to your location.

You: i need hair cut
Predicted Intent: beauty_and_spa (Confidence: 0.98)
Bot: Beauty and spa services include facials, manicures, pedicures, hair styling, and makeup — search 'beauty_and_spa'.

You: i need to repair my vehicle
Predicted Intent: Greeting (Confidence: 0.38)
Bot: Greetings! I'm ready to help. Please tell me what you're looking for.

You: car repair
Predicted Intent: automotive (Confidence: 0.92)
Bot: Automotive services include car repair, bike repair, and maintenance. Look under 'automotive' to find providers.

You: mobile repair
Predicted Intent: barber (Confidence: 0.99)
Bot: Barber and grooming professionals are available under 'barber' services

KeyboardInterrupt: Interrupted by user