In [1]:
# Sample dialogue data
dialogue = {
    "dialogue_id": 13,
    "turns": [
        {
            "speaker": "User",
            "text": "Where is my package? I need to track it.",
            "intent": "track_order",
            "entities": []
        },
        {
            "speaker": "Bot",
            "text": "Can you please share the order number so I can check?"
        },
        {
            "speaker": "User",
            "text": "It's 98765.",
            "intent": "give_order_id",
            "entities": [
                {
                    "entity": "98765",
                    "type": "order_number",
                    "start": 5,
                    "end": 10
                }
            ]
        }
    ]
}

entity_labels = {
    "order_number": "ORDER_NUMBER"
}

def create_bert_input(dialogue):
    # Initialize the previous intent (for the first turn, it's empty)
    previous_intent = None
    bot_response = None
    bert_input = []
    
    # Loop through each turn to format the state
    for turn in dialogue["turns"]:
        speaker = turn["speaker"]
        text = turn["text"]
        
        if speaker == "User":
            # Concatenate the dialogue for User's query and Bot's response
            if previous_intent is not None and bot_response is not None:
                # Create the formatted string for the previous intent, bot response, and user query
                state_input = f"[INT] {previous_intent} [BOT] {bot_response} [USR] {text}"
                bert_input.append(state_input)
            # Update the previous intent with the current turn's intent
            previous_intent = turn["intent"]
        
        elif speaker == "Bot":
            # Save the bot response to include it in the next user query
            bot_response = text
    
    return bert_input

# Get the formatted BERT input
bert_input = create_bert_input(dialogue)

# Print the resulting list of inputs for BERT
for input_str in bert_input:
    print(input_str)


[INT] track_order [BOT] Can you please share the order number so I can check? [USR] It's 98765.


In [12]:
# Sample dialogue data
dialogue = {
    "dialogue_id": 13,
    "turns": [
        {
            "speaker": "User",
            "text": "Where is my package? I need to track it.",
            "intent": "track_order",
            "entities": []
        },
        {
            "speaker": "Bot",
            "text": "Can you please share the order number so I can check?"
        },
        {
            "speaker": "User",
            "text": "It's 98765.",
            "intent": "give_order_id",
            "entities": [
                {
                    "entity": "98765",
                    "type": "order_number",
                    "start": 5,
                    "end": 10
                }
            ]
        }
    ]
}

def create_bert_input_split(dialogue):
    previous_intent = None
    bot_response = None
    bert_input = []
    labels = []

    # First, create the first data point (without previous intent)
    first_turn = dialogue["turns"][0]  # The first user query
    first_user_query = first_turn["text"]
    
    # Create the first data point with the user query and the bot response
    first_data_point = f"[INT] [BOT] {bot_response or ''} [USR] {first_user_query}"
    bert_input.append(first_data_point)
    
    # Now, process the second data point
    for i, turn in enumerate(dialogue["turns"]):
        if turn["speaker"] == "User":
            user_query = turn["text"]
            if previous_intent is not None and bot_response is not None:
                # Create the second data point with the previous intent, bot response, and user query
                second_data_point = f"[INT] {previous_intent} [BOT] {bot_response} [USR] {user_query}"
                bert_input.append(second_data_point)
                labels.append(turn["intent"])
        elif turn["speaker"] == "Bot":
            # Save the bot response for the second data point
            bot_response = turn["text"]
        
        # Update the previous intent
        if turn["speaker"] == "User":
            previous_intent = turn["intent"]

    return bert_input, labels

# Get the formatted BERT input (split into data points)
bert_input, labels = create_bert_input_split(dialogue)

# Print the resulting list of inputs for BERT
for i, input_str in enumerate(bert_input):
    print(f"Data point {i + 1}: {input_str}")


Data point 1: [INT] [BOT]  [USR] Where is my package? I need to track it.
Data point 2: [INT] track_order [BOT] Can you please share the order number so I can check? [USR] It's 98765.


In [10]:
print(bert_input)

['[INT] [BOT]  [USR] Where is my package? I need to track it.', "[INT] track_order [BOT] Can you please share the order number so I can check? [USR] It's 98765."]


In [9]:
import json

# Sample dataset file (replace with your actual file path)
data_file = '../data/dialogues_fixed.json'

def create_bert_input_split(dialogue):
    previous_intent = None
    bot_response = None
    bert_input = []
    labels = []
    entities = []
    
    # Process the first turn to create the first data point
    first_turn = dialogue["turns"][0]  # First user query
    first_user_query = first_turn["text"]
    
    # Create the first data point (without previous intent)
    first_data_point = f"[INT] [BOT] {bot_response or ''} [USR] {first_user_query}"
    labels.append(first_turn["intent"])
    entities.append(first_turn["entities"])
    bert_input.append(first_data_point)
    
    # Process the rest of the turns
    for i, turn in enumerate(dialogue["turns"]):
        if turn["speaker"] == "User":
            user_query = turn["text"]
            if previous_intent is not None and bot_response is not None:
                # Create the subsequent data point with the previous intent and bot response
                second_data_point = f"[INT] {previous_intent} [BOT] {bot_response} [USR] {user_query}"
                bert_input.append(second_data_point)
                labels.append(turn["intent"])
        elif turn["speaker"] == "Bot":
            # Capture the bot response
            bot_response = turn["text"]
        
        # Update the previous intent
        if turn["speaker"] == "User":
            previous_intent = turn["intent"]

    return bert_input, labels

with open(data_file, 'r') as f:
    dialogues = json.load(f)

def process_all_dialogues(data_file):
    # Load the dataset
    
    all_bert_inputs = []
    all_labels = []
    
    # Process each dialogue in the dataset
    for dialogue in dialogues:
        bert_input, label = create_bert_input_split(dialogue)
        all_bert_inputs.extend(bert_input)
        all_labels.extend(label)
    
    return all_bert_inputs, all_labels


# Get all data points for BERT
all_bert_inputs, all_labels = process_all_dialogues(data_file)

# Print the resulting data points (for example, print the first few data points)
# for i, input_str in enumerate(all_bert_inputs):  # Limiting to the first 10 data points for viewing
#     print(f"Data point {i + 1}: {input_str}")


In [10]:
print(all_bert_inputs[:10])
print(all_labels[:10])

['[INT] [BOT]  [USR] Hey can i get the status of my latest order?', '[INT] track_order [BOT] Sure! Could you provide me with the order ID? [USR] Yeah, the order ID is 121212.', '[INT] [BOT]  [USR] I want to get the status of order 12121.', '[INT] [BOT]  [USR] Can you share the details of an order?', '[INT] track_order [BOT] Yeah can you provide me with the order ID? [USR] Yeah, I will share the order ID. It is 212131413.', '[INT] [BOT]  [USR] Could you tell me the status of my recent order?', '[INT] track_order [BOT] Sure! Could you provide me with the order ID? [USR] The order ID is 54321', '[INT] [BOT]  [USR] I want to know where my order is. Can you help me?', '[INT] track_order [BOT] Absolutely! Could you provide me with the order number? [USR] Yeah the order number is 987654321', '[INT] [BOT]  [USR] Can you provide me with an update on my order?']
['track_order', 'give_order_id', 'track_order', 'track_order', 'give_order_id', 'track_order', 'give_order_id', 'track_order', 'give_or

In [11]:
# dump this to a csv, first column is instructions and second is intent
import csv

# Output file path
output_file = '../data/bert_input.csv'

# Write the BERT input data to a CSV file
with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["instruction", "intent"])
    for instruction, intent in zip(all_bert_inputs, all_labels):
        writer.writerow([instruction, intent])

In [12]:
# get the number of intents
unique_intents = set()
for dialogue in dialogues:
    for turn in dialogue["turns"]:
        if turn["speaker"] == "User":
            unique_intents.add(turn["intent"])
            
num_intents = len(unique_intents)

intent_to_label = {intent: i for i, intent in enumerate(unique_intents)}
print(intent_to_label)

{'list_orders': 0, 'track_order': 1, 'give_order_id': 2, 'give_list_order_params': 3, 'give_reason': 4, 'confirm_command': 5, 'cancel_order': 6}


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function
def preprocess_data(data_points, max_length=128):
    input_ids = []
    attention_masks = []
    labels = []

    # Assuming 'data_points' is a list of formatted strings like '[INT] [BOT] [USR] ...'
    for data in data_points:
        # Tokenize and encode the data points
        encoding = tokenizer(data, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        
        input_ids.append(encoding['input_ids'].squeeze())
        attention_masks.append(encoding['attention_mask'].squeeze())
        
        # Labels: map intent names to integers
        # Example: {"track_order": 0, "give_order_id": 1, ...}
        intent = data.split()[1]  # Assuming the intent is always at index 1
        labels.append(intent_to_label.get(intent, -1))  # Handle unknown intents with -1

    return torch.stack(input_ids), torch.stack(attention_masks), torch.tensor(labels)

# Create custom Dataset class
class IntentDataset(Dataset):
    def __init__(self, data_points, max_length=128):
        self.data_points = data_points
        self.max_length = max_length
        self.input_ids, self.attention_masks, self.labels = preprocess_data(data_points, max_length)

    def __len__(self):
        return len(self.data_points)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Split the data into training and validation sets
train_data, val_data = train_test_split(all_bert_inputs, test_size=0.2)

train_dataset = IntentDataset(train_data)
val_dataset = IntentDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)



  from .autonotebook import tqdm as notebook_tqdm
2024-11-27 21:11:39.454294: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-27 21:11:39.463302: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732722099.473462    6112 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732722099.476423    6112 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-27 21:11:39.488165: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [6]:
# Load pre-trained BERT for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_intents)  # Adjust num_labels

# Move model to GPU if available
device = torch.device("cpu")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Training function
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    return avg_loss

# Evaluation function
def evaluate(model, val_loader, device):
    model.eval()
    total_correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Prediction
            _, preds = torch.max(logits, dim=1)

            total_correct += torch.sum(preds == labels)
            total += labels.size(0)

    accuracy = total_correct / total
    return accuracy.item()

# Train and evaluate the model
num_epochs = 3

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    # Training phase
    print(intent_to_label)
    train_loss = train(model, train_loader, optimizer, criterion, device)
    print(f"Training Loss: {train_loss:.4f}")
    
    # Evaluation phase
    accuracy = evaluate(model, val_loader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")




Epoch 1/3
{'list_orders': 0, 'track_order': 1, 'give_order_id': 2}


  0%|          | 0/3 [00:01<?, ?it/s]


IndexError: Target -1 is out of bounds.