In [2]:
import json
import os
import random
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Intent definition files that are merged into a single training corpus.
# Each file is read as JSON and its top-level "intents" list is collected;
# a file without that key contributes nothing.
intent_files = [
    'data/intents.json',
    'data/python.json',
    'data/machine_learning.json',
    'data/data_science.json',
    'data/deep_learning.json',
    'data/ml_math.json',
    'data/frameworks.json'
]

# Load all intents from multiple JSON files.
# A file that cannot be opened, decoded, or parsed is reported and skipped so
# that one bad file does not abort the whole run.
intents = []
for file_path in intent_files:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            loaded_data = json.load(file)
    except OSError as e:
        # Missing or unreadable file (FileNotFoundError, PermissionError, ...).
        # Previously this escaped as an uncaught exception, unlike the decode
        # and parse errors below and unlike the hr.json loader.
        print(f"❌ Could not read file {file_path}: {e}")
        continue
    except UnicodeDecodeError as e:
        print(f"❌ Unicode error in file {file_path}: {e}")
        continue
    except json.JSONDecodeError as e:
        print(f"❌ JSON format error in file {file_path}: {e}")
        continue
    intents.extend(loaded_data.get('intents', []))

# hr.json is handled on its own because its records are question/answer
# pairs rather than intents. Every record becomes a one-pattern,
# one-response intent under the shared "hr_interview" tag.
# Any failure (missing file, bad JSON, record without the expected keys) is
# reported and the remaining records are skipped.
hr_file_path = 'data/hr.json'
try:
    with open(hr_file_path, 'r', encoding='utf-8') as file:
        hr_data = json.load(file)
    for qa in hr_data:
        hr_intent = dict(
            tag="hr_interview",
            patterns=[qa["question"]],
            responses=[qa["answer"]],
        )
        intents.append(hr_intent)
except Exception as e:
    print(f"❌ Error loading hr.json: {e}")

# Prepare training data: flatten the intents into two parallel lists, one
# entry per example pattern, where tags[i] is the tag of the intent that
# patterns[i] came from.
labelled_patterns = [
    (pattern, intent['tag'])
    for intent in intents
    for pattern in intent['patterns']
]
patterns = [pattern for pattern, _ in labelled_patterns]
tags = [tag for _, tag in labelled_patterns]

# Fit the TF-IDF vocabulary on every pattern and densify the result so it
# can be fed to the Keras model as a plain array.
tfidf_vectorizer = TfidfVectorizer()
pattern_matrix = tfidf_vectorizer.fit_transform(patterns)
X = pattern_matrix.toarray()

# Map each tag to an integer class id, then one-hot encode the ids to match
# the softmax / categorical cross-entropy setup used for training.
label_encoder = LabelEncoder()
tag_ids = label_encoder.fit_transform(tags)
y = to_categorical(tag_ids)

# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout masks and batch shuffling are repeatable between
# runs. This must happen before any layer is created.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Build model: a small feed-forward classifier over the TF-IDF features.
# The input width is the vocabulary size (X.shape[1]) and the output width is
# the number of distinct tags (y.shape[1]).
# The input is declared with an explicit Input object instead of passing
# `input_shape=` to the first Dense layer; the warning captured in this cell's
# recorded output appears to be Keras's notice about that argument.
model = Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y.shape[1], activation='softmax'),
])

# Compile and train.
# NOTE(review): the model is fitted on all examples with no validation data,
# so the reported accuracy is training accuracy only.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, batch_size=8, verbose=1)

# Save trained artifacts: the Keras network plus the fitted vectorizer and
# label encoder, all under models/ (created on demand).
os.makedirs('models', exist_ok=True)
model.save('models/chatbot_model_tf.h5')
for fitted_obj, target_path in (
    (tfidf_vectorizer, 'models/tfidf_vectorizer.pkl'),
    (label_encoder, 'models/label_encoder.pkl'),
):
    joblib.dump(fitted_obj, target_path)

print("✅ Model training complete and saved successfully.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0025 - loss: 5.8056
Epoch 2/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0136 - loss: 5.7698
Epoch 3/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0223 - loss: 5.5944
Epoch 4/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0305 - loss: 5.3092
Epoch 5/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0346 - loss: 5.0940
Epoch 6/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0877 - loss: 4.7368
Epoch 7/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1552 - loss: 4.2952
Epoch 8/200
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2081 - loss: 3.9205
Epoch 9/200
[1m145/145[0m [32



✅ Model training complete and saved successfully.


In [20]:
import json
import joblib
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [21]:
# Read the intent definitions for the scikit-learn pipeline.
# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, so non-ASCII patterns could be mis-decoded or
# fail to load on systems whose default is not UTF-8.
# NOTE(review): this reads "intents.json" from the working directory, while
# the first cell reads 'data/intents.json' - confirm which path is intended.
with open("intents.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten into parallel lists: sentences[i] is an example pattern and
# labels[i] is the tag of the intent it belongs to.
sentences = []
labels = []

for intent in data["intents"]:
    for pattern in intent["patterns"]:
        sentences.append(pattern)
        labels.append(intent["tag"])


In [22]:
# Turn the sentences into TF-IDF features and the tags into integer ids,
# then hold out 20% of the examples for testing (fixed seed for a stable split).
# NOTE(review): the vectorizer is fitted on all sentences before the split, so
# the held-out examples also contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
encoder = LabelEncoder()

X = vectorizer.fit_transform(sentences)
y = encoder.fit_transform(labels)

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [23]:
# Fit a logistic-regression intent classifier on the training split.
# Note: this rebinds `model`, replacing the Keras network assigned to the
# same name in the first cell.
model = LogisticRegression()
model.fit(X_train, y_train)

# Report both accuracies as percentages. Previously only the training figure
# was scaled by 100, so the two printed numbers were in different units.
print("Training Accuracy:", model.score(X_train, y_train)*100)
print("Testing Accuracy:", model.score(X_test, y_test)*100)

Training Accuracy: 92.85714285714286
Testing Accuracy: 0.0


In [None]:
# Persist the classifier and its fitted preprocessing objects to the working
# directory. The final dump is left as a bare expression so the cell still
# displays the list of written paths that joblib returns.
for artifact, filename in (
    (model, "chatbot_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)
joblib.dump(encoder, "label_encoder.pkl")

['label_encoder.pkl']