### **Install Required Dependencies**

In [None]:
!pip install transformers datasets

In [None]:
pip install transformers[torch]

### **Load the dataset**

In [None]:
import json

dataset_path = "/content/drive/MyDrive/chatbot_models/ankita/intent.json"
with open(dataset_path, "r") as file:
    data = json.load(file)

intents = data["intents"]
questions = []
responses = []
for intent in intents:
    for pattern in intent["patterns"]:
        questions.append(pattern)
        # Append each response individually
        for response in intent["responses"]:
            responses.append(response)

### **Tokenization of Input and Model Training**

In [None]:
import json
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch
from torch.utils.data import DataLoader, Dataset

# Load dataset from JSON file
dataset_path = "/content/drive/MyDrive/chatbot_models/ankita/intent.json"
with open(dataset_path, "r") as file:
    data = json.load(file)

intents = data["intents"]
questions = []
responses = []
for intent in intents:
    for pattern in intent["patterns"]:
        questions.append(pattern)
        # Append each response individually
        responses.append(intent["responses"][0])

train_questions, val_questions, train_responses, val_responses = train_test_split(
    questions, responses, test_size=0.1, random_state=42
)

# Define a custom dataset class
class IntentDataset(Dataset):
    def __init__(self, questions, responses, tokenizer, max_length=128):
        self.questions = questions
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        response = self.responses[idx]

        encoding = self.tokenizer(question, response,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_length,
                                  return_tensors='pt')

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(1)  # We use a constant label since this is not a classification task
        }

# Load pre-trained RoBERTa model and tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Create train and validation datasets
train_dataset = IntentDataset(train_questions, train_responses, tokenizer)
val_dataset = IntentDataset(val_questions, val_responses, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Fine-tune the RoBERTa model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(25):  # Train for 3 epochs
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    val_loss /= len(val_loader)
    print(f'Epoch {epoch + 1}, Validation Loss: {val_loss:.4f}')

print("Training finished!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Validation Loss: 0.4162
Epoch 2, Validation Loss: 0.1126
Epoch 3, Validation Loss: 0.0605
Epoch 4, Validation Loss: 0.0140
Epoch 5, Validation Loss: 0.0066
Epoch 6, Validation Loss: 0.0032
Epoch 7, Validation Loss: 0.0028
Epoch 8, Validation Loss: 0.0020
Epoch 9, Validation Loss: 0.0013
Epoch 10, Validation Loss: 0.0009
Epoch 11, Validation Loss: 0.0008
Epoch 12, Validation Loss: 0.0007
Epoch 13, Validation Loss: 0.0006
Epoch 14, Validation Loss: 0.0005
Epoch 15, Validation Loss: 0.0005
Epoch 16, Validation Loss: 0.0005
Epoch 17, Validation Loss: 0.0004
Epoch 18, Validation Loss: 0.0004
Epoch 19, Validation Loss: 0.0004
Epoch 20, Validation Loss: 0.0004
Epoch 21, Validation Loss: 0.0004
Epoch 22, Validation Loss: 0.0004
Epoch 23, Validation Loss: 0.0003
Epoch 24, Validation Loss: 0.0003
Epoch 25, Validation Loss: 0.0003
Training finished!


In [None]:
# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/chatbot_models/roberta.pth")

### **Chatbot**

In [None]:
import random
import json

import torch

from nltk_utils import bag_of_words, tokenize

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with open('intent.json', 'r') as json_data:
    intents = json.load(json_data)

FILE = "/content/drive/MyDrive/chatbot_models/ankita/intent.json"
data = torch.load(FILE)

bot_name = "Chetana"
print("Let's chat! (type 'quit' to exit)")
while True:
    # sentence = "do you use credit cards?"
    sentence = input("You: ")
    if sentence == "quit":
        break

    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)

    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
    else:
        print(f"{bot_name}: I do not understand...")


Let's chat! (type 'quit' to exit)
You: Is anyone there
Chetana: Hello! What's new with you lately?
You: How can I contact customer supp
Chetana: I do not understand...
You: How can I contact customer support?
Chetana: Set preferences for order notifications by contacting our support team.
You: I need help with my account.
Chetana: Get information about initiating a chargeback for an order by contacting our support team.
You: Can you assist me with a problem?
Chetana: Purchase a gift card from our website with assistance from our support team.
You: My order has an issue, what should I do?
Chetana: Want to expedite shipping for your order? Contact our support team for options.
You: Is there someone I can speak to about my order?
Chetana: Our customer support team can be reached at [phone number] or [email].
You: I'm having trouble with your website, can you help?
Chetana: Concerned about the security of your payment information? Contact our support team for information.
You: Who can I ta

### **Evaluation**

In [3]:
from transformers import RobertaForSequenceClassification

# Load the saved model from the directory
model = RobertaForSequenceClassification.from_pretrained("/content/drive/MyDrive/chatbot_models/roberta.pth")


In [14]:
# Define a function to evaluate the model
import nltk
from nltk.tokenize import word_tokenize
import json
import numpy as np
import torch

nltk.download('punkt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def tokenize(sentence):
    return word_tokenize(sentence)

def bag_of_words(sentence, words):
    bag = np.zeros(len(words), dtype=np.float32)
    for word in sentence:
        if word in words:
            bag[words.index(word)] = 1
    return bag


def evaluate_model(model, test_data, intents):
    num_correct = 0
    total = 0

    for example in test_data:
        sentence = example['sentence']
        expected_tag = example['tag']

        sentence = tokenize(sentence)
        X = bag_of_words(sentence, all_words)
        X = X.reshape(1, X.shape[0])
        X = torch.from_numpy(X).to(device)

        # Convert input tensor to torch.long
        X = torch.tensor(X, dtype=torch.long, device=device).clone().detach()

# Call the model for inference
        output = model(X)

        # Extract logits from the output
        logits = output.logits

# Apply torch.max() to find the index of the maximum value along the specified dimension
        _, predicted = torch.max(logits, dim=1)
        # _, predicted = torch.max(output, dim=1)

        tag = tags[predicted.item()]

        # print(tag, expected_tag)
        if tag == expected_tag:
            num_correct += 1

        total += 1

    accuracy = num_correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")

# Load the test dataset
test_data = [
    {"sentence": "Hi there!", "tag": "greeting"},
    {"sentence": "What kinds of items do you have?", "tag": "items"},
    {"sentence": "You're welcome! Happy to assist.", "tag": "goodbye"},
    {"sentence": "Hi there! How's everything been going with you lately?", "tag": "greeting"},
    {"sentence": "Bye! Come back again soon.", "tag": "goodbye"},
    {"sentence": "No problem! Glad I could help.", "tag": "goodbye"},
    {"sentence": "Bubye. Let's meet again soon", "tag": "goodbye"},
    {"sentence": "Delivery typically takes 2-4 business days.", "tag": "delivery"},
    {"sentence": "You're welcome! Happy to assist. Goodbye", "tag": "goodbye"},
    {"sentence": "Bye. Have a good day", "tag":"goodbye"}

]

with open('/content/drive/MyDrive/chatbot_models/ankita/intents.json', 'r') as json_data:
    intents = json.load(json_data)

# Assuming your training data is stored in a variable called intents
all_words = []
for intent in intents['intents']:
    for pattern in intent['patterns']:
        words = tokenize(pattern)
        all_words.extend(words)

# Extracting unique tags from the intents data
tags = []
for intent in intents['intents']:
    tags.append(intent['tag'])


# Remove duplicates
all_words = list(set(all_words))


# Call the evaluate_model function
evaluate_model(model, test_data, intents)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  X = torch.tensor(X, dtype=torch.long, device=device).clone().detach()


Accuracy: 60.00%


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
