In [None]:
!pip install pandas numpy tensorflow scikit-learn nltk



In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. NLTK 3.9+ looks it up under the
# id 'punkt_tab' instead of 'punkt', and the NLTK version is not pinned at
# install time, so fetch both. On older releases the unknown id is only
# reported (download returns False), it does not raise.
nltk.download('punkt_tab')
# Kept last so the value displayed by this cell is still the 'punkt' result.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk
# Fetch the WordNet corpus; the preprocessing cell lemmatizes tokens with
# nltk.stem.WordNetLemmatizer, which reads from this data.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

# Load the dataset
# NOTE: '/content/...' is a Colab-style absolute path; adjust it when running elsewhere.
data = pd.read_csv('/content/questions_answers_dataset.csv')  # Ensure your CSV file is named correctly

# Rows with a missing question or answer cannot be tokenized or label-encoded
# (pandas represents them as float NaN), so drop them up front instead of
# failing later in the pipeline. This is a no-op when the CSV is complete.
data = data.dropna(subset=['Question', 'Answer']).reset_index(drop=True)

# Basic preprocessing
# astype(str) guards the .str accessor against a column that was parsed as numeric.
data['Question'] = data['Question'].astype(str).str.lower()  # Convert to lowercase

# Tokenization and Lemmatization
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a piece of text for vectorization.

    The text is lowercased, split into tokens with ``word_tokenize``, each
    token is passed through the module-level ``lemmatizer``, and the results
    are joined back together with single spaces.

    Lowercasing is done here, not only on the training column, so that text
    typed at inference time goes through exactly the same normalization as
    the training questions. Without it, capitalized tokens would reach the
    lemmatizer in a different form than the one seen during training.

    Args:
        text: The raw string to normalize.

    Returns:
        A single space-separated string of lemmatized, lowercase tokens.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Run every question through the tokenizer/lemmatizer helper.
data['Processed_Question'] = data['Question'].map(preprocess_text)

# Turn each distinct answer into an integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(data['Answer'])
data['Encoded_Answer'] = label_encoder.transform(data['Answer'])

# Features are the processed questions, targets are the encoded answers;
# hold out 20% of the rows for testing with a fixed seed.
X, y = data['Processed_Question'], data['Encoded_Answer']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training
# questions only, then project both splits into dense arrays with it.
vectorizer = TfidfVectorizer(max_features=5000).fit(X_train)
X_train_vectorized, X_test_vectorized = (
    vectorizer.transform(split).toarray() for split in (X_train, X_test)
)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout masks and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Define the model: a small feed-forward classifier over the TF-IDF features
# with one softmax output unit per distinct encoded answer.
model = models.Sequential()
model.add(layers.Input(shape=(X_train_vectorized.shape[1],)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
# The targets are integer class ids (not one-hot), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Fit the classifier: 30 epochs, mini-batches of 8, with 10% of the training
# rows held out for validation. The returned History object is kept.
# NOTE(review): the captured log shows val_loss rising from epoch 3 onward
# while training loss keeps falling; worth revisiting.
history = model.fit(
    x=X_train_vectorized,
    y=y_train,
    batch_size=8,
    epochs=30,
    validation_split=0.1,
)


Epoch 1/30
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0317 - loss: 6.5857 - val_accuracy: 0.0265 - val_loss: 5.6905
Epoch 2/30
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0630 - loss: 5.2593 - val_accuracy: 0.0619 - val_loss: 5.4748
Epoch 3/30
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1025 - loss: 4.8363 - val_accuracy: 0.0885 - val_loss: 5.4864
Epoch 4/30
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1328 - loss: 4.3058 - val_accuracy: 0.1150 - val_loss: 5.7520
Epoch 5/30
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1463 - loss: 4.1659 - val_accuracy: 0.1239 - val_loss: 6.1951
Epoch 6/30
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1514 - loss: 4.0167 - val_accuracy: 0.0973 - val_loss: 6.6959
Epoch 7/30
[1m127/127[0m 

In [None]:
# Score the trained network on the held-out test split and report its accuracy.
test_loss, test_accuracy = model.evaluate(
    x=X_test_vectorized,
    y=y_test,
)
print(f'Test Accuracy: {test_accuracy:.2f}')


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0947 - loss: 27.3457 
Test Accuracy: 0.10


In [None]:
def chatbot_response(question):
    """Return the answer the trained model predicts for a free-text question.

    The question goes through the same pipeline as the training data:
    ``preprocess_text``, then the fitted ``vectorizer``, then the model. The
    highest-scoring class index is mapped back to its original answer value
    with ``label_encoder``.

    Args:
        question: The user's question as a string.

    Returns:
        The decoded answer label for the most probable class.
    """
    processed_question = preprocess_text(question)
    vectorized_question = vectorizer.transform([processed_question]).toarray()
    # verbose=0 keeps Keras' per-call progress bar out of the chat transcript.
    prediction = model.predict(vectorized_question, verbose=0)
    # The batch holds a single row, so the argmax over the flattened output
    # is the predicted class index.
    predicted_answer_index = int(np.argmax(prediction))
    return label_encoder.inverse_transform([predicted_answer_index])[0]

# Example interaction
# Type 'exit' (any casing, surrounding whitespace ignored) to leave the loop.
while True:
    try:
        user_question = input("Ask a question: ").strip()
    except (KeyboardInterrupt, EOFError):
        # Interrupting the cell or closing the input is a normal way out;
        # end the loop quietly instead of leaving a traceback in the notebook.
        break
    if user_question.lower() == 'exit':
        break
    if not user_question:
        # Nothing to vectorize: ask again rather than querying the model
        # with an empty question.
        continue
    response = chatbot_response(user_question)
    print(f"Chatbot: {response}")


Ask a question: what is the salary of norma fisher
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Chatbot: 55306
Ask a question: what is the email of norma fisher
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Chatbot: tammy76@example.com


KeyboardInterrupt: Interrupted by user

In [None]:
# Persist the trained network to 'chatbot_model.h5' (relative path).
model.save('chatbot_model.h5')

# Persist the fitted TF-IDF vectorizer and the label encoder as well: both are
# required to turn a question into model input and to map a predicted class
# index back to its answer. The last dump's return value is what the cell displays.
import joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')




['label_encoder.pkl']

In [None]:
# Restore the three persisted artifacts: the Keras model first, then the
# vectorizer and label encoder from their joblib files.
# NOTE(review): joblib files are pickle-based; only load files you trust.
model = tf.keras.models.load_model('chatbot_model.h5')
vectorizer, label_encoder = (
    joblib.load(path) for path in ('tfidf_vectorizer.pkl', 'label_encoder.pkl')
)
