## Python Chatbot using NLP

In [1]:
import nltk
import random
import string
import json

from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Fetch the NLTK resources used by word_tokenize and the stop-word list.
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt', so both are requested; with the default settings a package that the
# installed NLTK index does not know is reported and skipped, not raised.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Intent catalogue. Each intent name maps to:
#   "patterns"  - example utterances used as training text for that intent
#   "responses" - canned replies the bot chooses from at random
intents = dict(
    greeting=dict(
        patterns=["Hi", "Hello", "Hey", "Good morning", "Good evening"],
        responses=["Hello!", "Hi there!", "Greetings!"],
    ),
    goodbye=dict(
        patterns=["Bye", "See you later", "Goodbye"],
        responses=["Goodbye!", "See you later!", "Take care!"],
    ),
    thanks=dict(
        patterns=["Thanks", "Thank you", "Much appreciated"],
        responses=["You're welcome!", "No problem!", "Anytime!"],
    ),
    about=dict(
        patterns=["What is this?", "Tell me about yourself", "Who are you?"],
        responses=[
            "I am a chatbot created to assist you.",
            "I'm here to help you with your questions.",
        ],
    ),
)

# Text preprocessing

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise raw text into a space-separated string of tokens.

    The text is lower-cased, ASCII punctuation is removed, the result is
    tokenised with NLTK and English stop words are dropped.

    If stop-word filtering would remove every token (a question built only
    from words such as "who", "are", "you"), the unfiltered tokens are kept
    instead. Otherwise such utterances collapse to an empty string and give
    the vectorizer nothing to learn from or match against.

    Args:
        text: Input string.

    Returns:
        The cleaned tokens joined by single spaces; "" only when the input
        contains no tokens at all.
    """
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    content_tokens = [word for word in tokens if word not in stop_words]
    # Fall back to the raw tokens when everything was a stop word
    return " ".join(content_tokens or tokens)


# Build the training set: one (cleaned pattern, intent name) pair per pattern
training_pairs = [
    (preprocess_text(pattern), intent)
    for intent, data in intents.items()
    for pattern in data['patterns']
]

# Feature texts and their target intent names, kept in matching order
X = [cleaned_pattern for cleaned_pattern, _ in training_pairs]

y = [intent_name for _, intent_name in training_pairs]

# Turn the cleaned patterns into TF-IDF vectors
vectorizer = TfidfVectorizer()
X_vectors = vectorizer.fit_transform(X)

# Fit the intent classifier on those vectors (fit() returns the estimator)
model = LogisticRegression().fit(X_vectors, y)

# Chatbot response function
def get_chatbot_response(user_input):
    """Pick a reply for ``user_input``.

    The input is cleaned with ``preprocess_text``, vectorised with the fitted
    ``vectorizer`` and classified by ``model``; one of the predicted intent's
    canned responses is then returned at random.
    """
    features = vectorizer.transform([preprocess_text(user_input)])
    # A single document goes in, so exactly one prediction comes back
    (intent_name,) = model.predict(features)
    candidate_replies = intents[intent_name]['responses']
    return random.choice(candidate_replies)

# Run the chatbot: keep answering until the user types 'exit' or input ends

while True:
    try:
        # Strip so that ' exit ' or a stray trailing space still matches
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin / Ctrl-C: leave cleanly instead of dumping a traceback
        print("\nChatbot: Goodbye!")
        break
    if not user_input:
        # Nothing to classify; ask again rather than answering an empty line
        continue
    if user_input.lower() == 'exit':
        print("Chatbot: Goodbye!")
        break
    response = get_chatbot_response(user_input)
    print(f"Chatbot: {response}")

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Chatbot: Greetings!
Chatbot: I'm here to help you with your questions.
Chatbot: No problem!
Chatbot: Goodbye!
Chatbot: Goodbye!


## Text Classification using SNIPS Dataset

In [None]:
# Install required packages for SNIPS dataset
# !pip install datasets transformers scikit-learn pandas numpy

import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================================
# Load SNIPS Dataset
# ============================================================
print("Loading SNIPS dataset...")
dataset = load_dataset("snips_built_in_intents")

# Display dataset information
print("Dataset structure:", dataset)
print("\nSample data:")
print(dataset['train'][0])

# ============================================================
# Data Preparation and Preprocessing
# ============================================================
print("\n" + "="*80)
print("Data Preparation")
print("="*80)

# Convert dataset to pandas DataFrame for easier manipulation
train_df = pd.DataFrame(dataset['train'])
if 'test' in dataset:
    test_df = pd.DataFrame(dataset['test'])
else:
    # The loaded dataset may provide only a 'train' split, in which case
    # dataset['test'] would raise KeyError. Hold out 20% of the training rows
    # as the evaluation set instead. Stratify by label when every label has
    # at least two rows (stratification requires that); otherwise fall back
    # to a plain random split.
    can_stratify = train_df['label'].value_counts().min() >= 2
    train_df, test_df = train_test_split(
        train_df,
        test_size=0.2,
        random_state=42,
        stratify=train_df['label'] if can_stratify else None,
    )
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

print("\nTrain dataset size:", len(train_df))
print("Test dataset size:", len(test_df))

print("Train DataFrame shape:", train_df.shape)
print("\nLabel distribution in training data:")
print(train_df['label'].value_counts())

# Human-readable names for the integer label ids (index == label id)
label_names = dataset['train'].features['label'].names
print("\nLabel names:")
for idx, name in enumerate(label_names):
    print(f"{idx}: {name}")

# Extract text and labels
X_train = train_df['text'].tolist()
y_train = train_df['label'].tolist()
X_test = test_df['text'].tolist()
y_test = test_df['label'].tolist()

print("\nTraining samples:", len(X_train))
print("Test samples:", len(X_test))
print("\nExample texts:")
# Slicing keeps this safe even if fewer than three samples are available
for number, (example_text, example_label) in enumerate(zip(X_train[:3], y_train[:3]), start=1):
    print(f"{number}. Text: '{example_text}' -> Label: {label_names[example_label]}")

# ============================================================
# Feature Extraction using TF-IDF
# ============================================================
print("\n" + "="*80, "Feature Extraction using TF-IDF", "="*80, sep="\n")

# TF-IDF over unigrams and bigrams, skipping terms that occur in fewer than
# two documents and capping the vocabulary at 5000 features
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
)

# Vocabulary and IDF weights are learned from the training texts only;
# the test texts are projected with that fitted vectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

for caption, matrix in (
    ("TF-IDF feature shape (train):", X_train_tfidf),
    ("TF-IDF feature shape (test):", X_test_tfidf),
):
    print(caption, matrix.shape)
print("Vocabulary size:", len(tfidf_vectorizer.vocabulary_))

# ============================================================
# Model Training - Logistic Regression
# ============================================================
print("\n" + "="*80)
print("Model Training - Logistic Regression")
print("="*80)

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test_tfidf)

# Calculate accuracy
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

# Classification report. The label ids are passed explicitly: when `labels`
# is omitted the classes are inferred from y_test/y_pred, and the call raises
# ValueError if that count differs from len(target_names) (e.g. a class that
# never appears in the test split).
print("\nClassification Report (Logistic Regression):")
print(classification_report(
    y_test,
    y_pred_lr,
    labels=list(range(len(label_names))),
    target_names=label_names,
))

# ============================================================
# Model Training - Naive Bayes
# ============================================================
print("\n" + "="*80)
print("Model Training - Naive Bayes")
print("="*80)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_nb = nb_model.predict(X_test_tfidf)

# Calculate accuracy
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

# Classification report. The label ids are passed explicitly: when `labels`
# is omitted the classes are inferred from y_test/y_pred, and the call raises
# ValueError if that count differs from len(target_names) (e.g. a class that
# never appears in the test split).
print("\nClassification Report (Naive Bayes):")
print(classification_report(
    y_test,
    y_pred_nb,
    labels=list(range(len(label_names))),
    target_names=label_names,
))

# ============================================================
# Model Training - Support Vector Machine (SVM)
# ============================================================
print("\n" + "="*80)
print("Model Training - Support Vector Machine (SVM)")
print("="*80)

# Train SVM model (linear kernel)
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test_tfidf)

# Calculate accuracy
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Classification report. The label ids are passed explicitly: when `labels`
# is omitted the classes are inferred from y_test/y_pred, and the call raises
# ValueError if that count differs from len(target_names) (e.g. a class that
# never appears in the test split).
print("\nClassification Report (SVM):")
print(classification_report(
    y_test,
    y_pred_svm,
    labels=list(range(len(label_names))),
    target_names=label_names,
))

# ============================================================
# Model Comparison
# ============================================================
print("\n" + "="*80)
print("Model Comparison")
print("="*80)

# Gather the three accuracies into a table, best score first
accuracy_by_model = {
    'Logistic Regression': lr_accuracy,
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
}
models_comparison = pd.DataFrame({
    'Model': list(accuracy_by_model),
    'Accuracy': list(accuracy_by_model.values()),
}).sort_values('Accuracy', ascending=False)
print("Model Comparison:")
print(models_comparison)

# Bar chart of the sorted accuracies, with each value written above its bar
plt.figure(figsize=(10, 6))
plt.bar(
    models_comparison['Model'],
    models_comparison['Accuracy'],
    color=['blue', 'green', 'red'],
)
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Comparison on SNIPS Dataset')
plt.ylim([0, 1])
for bar_position, accuracy in enumerate(models_comparison['Accuracy']):
    plt.text(bar_position, accuracy + 0.01, f'{accuracy:.4f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()

# ============================================================
# Confusion Matrix Visualization
# ============================================================
print("\n" + "="*80)
print("Confusion Matrix Visualization")
print("="*80)

# Confusion matrix for the Logistic Regression predictions. The label ids are
# passed explicitly so the matrix always has one row/column per entry of
# label_names; without `labels`, a class absent from both y_test and
# y_pred_lr would shrink the matrix and misalign the tick labels below.
cm = confusion_matrix(y_test, y_pred_lr, labels=list(range(len(label_names))))

# Visualize confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Logistic Regression')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# ============================================================
# Testing with Custom Inputs
# ============================================================
print("\n" + "="*80)
print("Testing with Custom Inputs")
print("="*80)

# Function to predict intent for custom text
def predict_intent(text, model=lr_model, vectorizer=tfidf_vectorizer):
    """
    Predict the intent of a given text

    Args:
        text: Input text string
        model: Trained classification model
        vectorizer: Fitted TF-IDF vectorizer

    Returns:
        Tuple ``(intent_name, confidence)``. ``confidence`` is the model's
        probability for the predicted class, or ``None`` when the model does
        not expose ``predict_proba``.
    """
    # Transform the input text
    text_tfidf = vectorizer.transform([text])

    # Predict intent (integer label id) and map it to its name
    predicted_label = model.predict(text_tfidf)[0]
    predicted_intent = label_names[predicted_label]

    # Models without probability estimates only yield the label
    if not hasattr(model, 'predict_proba'):
        return predicted_intent, None

    probabilities = model.predict_proba(text_tfidf)[0]
    # predict_proba columns are ordered like model.classes_, which is not
    # guaranteed to equal the raw label ids (a label missing from the training
    # data shifts the columns), so look the column up instead of indexing by id
    class_index = list(model.classes_).index(predicted_label)
    return predicted_intent, probabilities[class_index]

# Test with custom inputs
# NOTE(review): several of these utterances (music, playlist, movie, book
# rating) look aimed at intents that may not be among label_names for the
# dataset loaded above - confirm they match the available labels.
test_texts = [
    "Play some jazz music",
    "What's the weather like today?",
    "Book a table at an Italian restaurant",
    "Add this song to my workout playlist",
    "Find me a good action movie",
    "I want to rate this book 5 stars"
]

print("Custom Text Predictions:")
print("=" * 80)
for text in test_texts:
    intent, confidence = predict_intent(text)
    print(f"Text: '{text}'")
    # Compare against None explicitly so a confidence of 0.0 is still shown
    if confidence is not None:
        print(f"Predicted Intent: {intent} (Confidence: {confidence:.4f})")
    else:
        print(f"Predicted Intent: {intent}")
    print("-" * 80)

# ============================================================
# Interactive Intent Classifier
# ============================================================
print("\n" + "="*80)
print("Interactive Intent Classifier")
print("="*80)

# Interactive loop for intent classification
print("SNIPS Intent Classifier")
print("Available intents:", ", ".join(label_names))
print("Type 'quit' to exit\n")

while True:
    try:
        # Strip so that ' quit ' or a stray trailing space still matches
        user_text = input("Enter your text: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin / Ctrl-C: leave cleanly instead of dumping a traceback
        print("\nExiting intent classifier...")
        break

    if user_text.lower() in ('quit', 'exit', 'q'):
        print("Exiting intent classifier...")
        break

    if not user_text:
        # Blank line: nothing to classify, prompt again
        continue

    intent, confidence = predict_intent(user_text)
    # Check mark written as an escape; the original literal was mis-encoded
    print(f"\u2713 Predicted Intent: {intent}")
    # Compare against None explicitly so a confidence of 0.0 is still shown
    if confidence is not None:
        print(f"  Confidence: {confidence:.2%}")
    print()