In [None]:
import json
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, Features, ClassLabel, Sequence, Value
import re

# Load the JSON file
with open("../data/dialogues_fixed.json", "r") as f:
    data = json.load(f)

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Function to extract all unique entity types
def get_entity_types_from_data(data):
    entity_types = set()  # Set to store unique entity types
    
    # Iterate over all dialogues and turns
    for dialogue in data:
        for turn in dialogue['turns']:
            if 'entities' in turn:
                for entity in turn['entities']:
                    entity_types.add(entity['type'])
    
    return entity_types

# Function to generate BIO labels for a sentence
def create_bio_labels(sentence, entities, entity_types):
    words = sentence.split()  # Tokenize the sentence into words
    labels = ['O'] * len(words)  # Initialize all labels to 'O'
    
    for entity in entities:
        start, end, entity_type = entity['start'], entity['end'], entity['type']
        
        # Set BIO labels for the entity span
        for idx in range(start, end + 1):
            if idx == start:
                labels[idx] = f'B-{entity_type}'  # Begin of entity
            else:
                labels[idx] = f'I-{entity_type}'  # Inside entity
                
    return words, labels

# Preprocessing for Intent Classification
def preprocess_intent_data(data):
    texts = []
    intents = []

    for dialogue in data:
        for turn in dialogue['turns']:
            text = turn.get('text', '')
            intent = turn.get('intent', '')

            texts.append(text)
            intents.append(intent if intent else "UNKNOWN")  # Replace empty intents with "UNKNOWN"

    # Convert intents to a set of unique labels for encoding
    intent_labels = list(set(intents))
    intent_labels.sort()  # Sort to ensure consistent label ordering
    intent_label_map = {label: idx for idx, label in enumerate(intent_labels)}

    # Encode the intents as numeric labels
    encoded_intents = [intent_label_map[intent] for intent in intents]

    intent_dataset = Dataset.from_dict({
        'text': texts,
        'intent': encoded_intents
    })

    # Add 'labels' key to dataset for the Trainer
    intent_dataset = intent_dataset.map(lambda e: {'labels': e['intent']}, batched=True)
    return intent_dataset, intent_label_map


entity_types = get_entity_types_from_data(data)  # Implement this based on your dataset structure
label_map = {f'B-{entity}': idx for idx, entity in enumerate(entity_types)}
label_map.update({f'I-{entity}': idx + len(entity_types) for idx, entity in enumerate(entity_types)})
label_map['O'] = len(entity_types) * 2  # 'O' (outside) label


# Preprocessing for NER
def preprocess_ner_data(data, tokenizer):
    from datasets import Dataset

    tokens_list = []
    labels_list = []

    for dialogue in data:
        for turn in dialogue['turns']:
            if "entities" in turn and turn['entities']:
                sentence = turn['text']

                # Tokenize with offsets for word alignment
                tokens = tokenizer(
                    sentence,
                    padding=True,
                    truncation=True,
                    return_offsets_mapping=True,
                    return_tensors='pt'
                )
                input_ids = tokens["input_ids"][0].tolist()  # Extract token IDs
                offsets = tokens["offset_mapping"][0].tolist()  # Character offsets
                labels = [label_map["O"]] * len(input_ids)  # Initialize all labels to 'O'

                for entity in turn['entities']:
                    entity_start = entity['start']
                    entity_end = entity['end']
                    entity_type = entity['type']

                    # Map entity span to tokens
                    for idx, (start, end) in enumerate(offsets):
                        if start is None or end is None:  # Skip special tokens
                            continue
                        if start >= entity_start and end <= entity_end:
                            if start == entity_start:  # Beginning of entity
                                labels[idx] = label_map[f"B-{entity_type}"]
                            else:  # Inside entity
                                labels[idx] = label_map[f"I-{entity_type}"]

                tokens_list.append(input_ids)
                labels_list.append(labels)

    # Return as a Hugging Face Dataset
    return Dataset.from_dict({
        'tokens': tokens_list,
        'labels': labels_list
    })


# Prepare the datasets
intent_dataset, intent_label_map = preprocess_intent_data(data)
ner_dataset = preprocess_ner_data(data, tokenizer)

# Check the size of ner_dataset
print(f"ner_dataset size: {len(ner_dataset)}")

# Ensure train_size is within bounds
train_size = int(0.8 * len(ner_dataset))  # 80% for training

# Split the ner_dataset for training and validation
train_ner_dataset = ner_dataset.select(range(train_size))  # Selecting the first 80% of the data
val_ner_dataset = ner_dataset.select(range(train_size, len(ner_dataset)))  # Selecting the remaining 20%

# For intent dataset, split similarly
train_size_intent = int(0.8 * len(intent_dataset))  # 80% for training
train_intent_dataset = intent_dataset.select(range(train_size_intent))  # Selecting the first 80% of the intent data
val_intent_dataset = intent_dataset.select(range(train_size_intent, len(intent_dataset)))  # Selecting the remaining 20%

# Tokenize the datasets
def tokenize_intent(examples):
    return tokenizer(examples['text'], padding=True, truncation=True, max_length=128)

def tokenize_ner(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        padding=True,
        truncation=True,
        max_length=128,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                label_ids.append(-100)
            elif word_id != previous_word_id:  # Beginning of a new word
                label_ids.append(label[word_id])
            else:  # Inside a word
                label_ids.append(label[word_id])
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



# Apply the tokenization
train_intent_dataset = train_intent_dataset.map(tokenize_intent, batched=True)
val_intent_dataset = val_intent_dataset.map(tokenize_intent, batched=True)

train_ner_dataset = train_ner_dataset.map(tokenize_ner, batched=True)
val_ner_dataset = val_ner_dataset.map(tokenize_ner, batched=True)

# Define the model for Intent Classification
intent_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(intent_label_map))

# Define the model for NER
ner_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))  # Number of unique labels for NER

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save after each epoch to match evaluation strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    load_best_model_at_end=True,  # Load the best model based on evaluation metric
)

# Define the Trainer for Intent Classification
intent_trainer = Trainer(
    model=intent_model,
    args=training_args,
    train_dataset=train_intent_dataset,
    eval_dataset=val_intent_dataset,
    tokenizer=tokenizer
)

# Define the Trainer for NER
ner_trainer = Trainer(
    model=ner_model,
    args=training_args,
    train_dataset=train_ner_dataset,
    eval_dataset=val_ner_dataset,
    tokenizer=tokenizer
)

# Training the models
intent_trainer.train()
ner_trainer.train()

# Save the trained models
intent_model.save_pretrained('./intent_model')
ner_model.save_pretrained('./ner_model')


In [16]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load dialogue data from the JSON file
dialogue_file_path = '../data/dialogues_fixed.json'

with open(dialogue_file_path, 'r') as file:
    dialogues = json.load(file)

unique_entity_labels = set()
for dialogue in dialogues:
    for turn in dialogue["turns"]:
        if "entities" in turn:
            for entity in turn["entities"]:
                unique_entity_labels.add(entity["entity"])

unique_intent_labels = set()
for dialogue in dialogues:
    for turn in dialogue["turns"]:
        if "intent" in turn:
            unique_intent_labels.add(turn["intent"])

intent_label_map = {label: idx for idx, label in enumerate(unique_intent_labels)}

# Now create the label map, assigning unique integers to each label
entity_label_map = {"O": 0}  # Add "O" for padding or non-entities first
for idx, label in enumerate(unique_entity_labels, start=1):
    entity_label_map[label] = idx

print(entity_label_map)

# Dataset for intent classification and NER
class DialogueDataset(Dataset):
    def __init__(self, dialogues, tokenizer, max_length=512, max_entities_length=6):
        self.dialogues = dialogues
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_entities_length = max_entities_length  # Set max_entities_length for padding

        # Define entity label mapping (e.g., "O" -> 0, "entity1" -> 1, etc.)
        self.entity_label_map = entity_label_map
        self.intent_label_map = intent_label_map

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        dialogue = self.dialogues[idx]
        turns = dialogue["turns"]
        
        # Concatenate previous turns with the current turn for intent classification
        dialogue_history = " ".join([turn["text"] for turn in turns[:-1]])  # All except the last turn
        current_turn = turns[-1]["text"]  # Last turn (usually the user's query)
        
        # Combine dialogue history and current turn (concatenating previous dialogue turns)
        input_text = dialogue_history + " " + current_turn
        
        # Tokenize the input text
        encoding = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        # Intent label (extract the intent of the current turn)
        intent_label = turns[-1].get("intent", None)  # Could be None if the Bot turn is considered
        if intent_label is None:
            intent_label = "default"  # Default label if no intent is present
        
        # Use an intent label map to convert string intent labels to integers
        intent_label = self.intent_label_map.get(intent_label, 0)  # 0 as default if not found
        
        # Named Entity Recognition: Extract entities from current user turn
        entities = turns[-1].get("entities", [])
        entity_labels = [entity["entity"] for entity in entities]  # List of entities (e.g., order number)
        
        # Pad entity labels to self.max_entities_length (use "O" for padding)
        padded_entity_labels = entity_labels + ["O"] * (self.max_entities_length - len(entity_labels))
        
        # If entities are fewer than max_entities_length, pad the rest with "O"
        # If entities are more than max_entities_length, truncate the list to ensure that it fits within the specified length
        padded_entity_labels = padded_entity_labels[:self.max_entities_length]
        
        # Convert entity labels to integers using the entity label map
        integer_entity_labels = [self.entity_label_map.get(label, 0) for label in padded_entity_labels]  # Default to "O" (0)
        
        # Convert entity labels to tensors
        padded_entity_labels_tensor = torch.tensor(integer_entity_labels)

        return {
            "input_ids": encoding['input_ids'].squeeze(),
            "attention_mask": encoding['attention_mask'].squeeze(),
            "intent_label": intent_label,  # Now intent_label is an integer
            "entities": padded_entity_labels_tensor
        }


# Create dataset
dataset = DialogueDataset(dialogues, tokenizer)

# Define a function to handle the batching process and avoid NoneType errors
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    intent_labels = [item['intent_label'] for item in batch]
    entity_labels = [item['entities'] for item in batch]
    
    # Handle padding for entity labels
    max_entities_length = max(len(entities) for entities in entity_labels)
    
    # Convert entity labels to a list if they are tensors
    if isinstance(entity_labels[0], torch.Tensor):
        entity_labels = [entities.tolist() for entities in entity_labels]
    
    padded_entity_labels = [entities + ["O"] * (max_entities_length - len(entities)) for entities in entity_labels]
    
    # Convert padded_entity_labels back to a tensor
    padded_entity_labels_tensor = torch.tensor(padded_entity_labels)
    
    # Convert intent labels to tensor (ensure they are integers)
    intent_labels_tensor = torch.tensor(intent_labels, dtype=torch.long)
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "intent_labels": intent_labels_tensor,
        "entity_labels": padded_entity_labels_tensor
    }



# Define DataLoader for batching
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Load the pre-trained models (intent classification and NER)
intent_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_intent_labels))  # Adjust num_labels based on your intents
ner_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=)  # Adjust num_labels based on NER classes (e.g., 'O', 'order_number')

# Define a function to predict intent and NER from the model
def predict_intent_and_entities(input_ids, attention_mask):
    # Predict intent using the intent model
    intent_outputs = intent_model(input_ids, attention_mask=attention_mask)
    intent_preds = torch.argmax(intent_outputs.logits, dim=1)
    
    # Predict NER using the NER model
    ner_outputs = ner_model(input_ids, attention_mask=attention_mask)
    ner_preds = torch.argmax(ner_outputs.logits, dim=2)
    
    return intent_preds, ner_preds

# Process the DataLoader
for batch in dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    
    # Get intent and NER predictions
    intent_preds, ner_preds = predict_intent_and_entities(input_ids, attention_mask)
    
    # Convert predictions to readable format
    intent_labels = intent_preds.numpy()
    ner_labels = ner_preds.numpy()
    
    print(f"Intent predictions: {intent_labels}")
    print(f"NER predictions: {ner_labels}")


{'O': 0, '34567': 1, '12121': 2, '12 Nov 2024': 3, '121212': 4, '21 November 2024': 5, '5643': 6, '12345': 7, '67890': 8, '10 April 2024': 9, '23456': 10, '98765': 11, '6': 12, '987654': 13, '121243': 14, '987654321': 15, '5': 16, '12912': 17, '212142': 18, '54321': 19, '24 Oct 2024': 20, '212131413': 21, '121435': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Intent predictions: [9 9]
NER predictions: [[1 1 1 ... 1 0 0]
 [1 1 1 ... 0 0 0]]
Intent predictions: [9 6]
NER predictions: [[1 1 1 ... 1 1 1]
 [1 1 0 ... 1 0 0]]
Intent predictions: [6 6]
NER predictions: [[1 1 1 ... 1 1 0]
 [1 1 0 ... 1 1 0]]
Intent predictions: [6 9]
NER predictions: [[0 1 0 ... 0 0 0]
 [1 1 0 ... 1 1 1]]
Intent predictions: [6 6]
NER predictions: [[0 1 1 ... 0 0 0]
 [1 1 1 ... 1 0 0]]
Intent predictions: [6 9]
NER predictions: [[1 0 0 ... 1 0 1]
 [0 1 0 ... 0 1 0]]
Intent predictions: [6 6]
NER predictions: [[1 1 1 ... 1 0 1]
 [1 1 1 ... 1 1 1]]
Intent predictions: [9 6]
NER predictions: [[0 1 1 ... 0 0 0]
 [1 1 1 ... 1 1 1]]
Intent predictions: [6 9]
NER predictions: [[1 0 1 ... 0 1 0]
 [1 1 1 ... 1 1 0]]
Intent predictions: [6 9]
NER predictions: [[0 1 1 ... 1 1 0]
 [1 1 1 ... 1 0 0]]
Intent predictions: [6 6]
NER predictions: [[1 1 1 ... 1 1 1]
 [0 0 1 ... 0 1 0]]
Intent predictions: [9 6]
NER predictions: [[1 1 0 ... 1 0 0]
 [1 1 1 ... 1 0 0]]
Intent predictio

In [17]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example conversation history
text = "[USER] Can you help me track my order? [SEP] Sure! Could you provide me with your order number? [SEP] It's 12345."

# Tokenize the text
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# The tokenized output will include input_ids, attention_mask, etc.
print(inputs)


{'input_ids': tensor([[  101,  1031,  5310,  1033,  2064,  2017,  2393,  2033,  2650,  2026,
          2344,  1029,   102,  2469,   999,  2071,  2017,  3073,  2033,  2007,
          2115,  2344,  2193,  1029,   102,  2009,  1005,  1055, 13138, 19961,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}


In [19]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=10)

# Define the input and tokenize it
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Get predictions
outputs = model(**inputs)
logits = outputs.logits

# Get predicted intent
predicted_class_id = torch.argmax(logits, dim=1).item()
print("Predicted intent:", predicted_class_id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted intent: 2


In [20]:
from transformers import BertForTokenClassification
import torch

# Load pre-trained BERT model for token classification
ner_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the input and tokenize it
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Get predictions for each token
outputs = ner_model(**inputs)
logits = outputs.logits

# Convert logits to predicted labels
predicted_labels = torch.argmax(logits, dim=2).squeeze().tolist()

# Map labels to entities (you would need a mapping of label IDs to entity names)
label_map = {0: "O", 1: "B-ORDER_NUMBER", 2: "I-ORDER_NUMBER"}
entities = []

# Extract entities
for token_id, label_id in zip(inputs.input_ids[0], predicted_labels):
    token = tokenizer.decode([token_id])
    label = label_map[label_id]
    if label != "O":
        entities.append({"entity": token, "type": label})

print("Extracted entities:", entities)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted entities: [{'entity': '[CLS]', 'type': 'B-ORDER_NUMBER'}, {'entity': '[', 'type': 'B-ORDER_NUMBER'}, {'entity': '[SEP]', 'type': 'B-ORDER_NUMBER'}, {'entity': 'sure', 'type': 'B-ORDER_NUMBER'}, {'entity': '!', 'type': 'B-ORDER_NUMBER'}, {'entity': 'number', 'type': 'B-ORDER_NUMBER'}, {'entity': '[SEP]', 'type': 'B-ORDER_NUMBER'}, {'entity': "'", 'type': 'B-ORDER_NUMBER'}, {'entity': 's', 'type': 'B-ORDER_NUMBER'}, {'entity': '123', 'type': 'B-ORDER_NUMBER'}, {'entity': '.', 'type': 'B-ORDER_NUMBER'}, {'entity': '[SEP]', 'type': 'B-ORDER_NUMBER'}]


In [21]:
# Example for maintaining dialogue history
history = []
history.append("[USER] Can you help me track my order?")
history.append("[BOT] Sure! Could you provide me with your order number?")
history.append("[USER] It's 12345.")
history_text = " [SEP] ".join(history)
inputs = tokenizer(history_text, padding=True, truncation=True, return_tensors="pt")

In [1]:
import json
from transformers import BertTokenizer

# Sample dialogue data
dialogue = {
    "dialogue_id": 8,
    "turns": [
        {
            "speaker": "User",
            "text": "Can you help me track my order?",
            "intent": "track_order",
            "entities": []
        },
        {
            "speaker": "Bot",
            "text": "Sure! Could you provide me with your order number?"
        },
        {
            "speaker": "User",
            "text": "It's 12345.",
            "intent": "give_order_id",
            "entities": [
                {
                    "entity": "12345",
                    "type": "order_number",
                    "start": 4,
                    "end": 9
                }
            ]
        },
        {
            "speaker": "Bot",
            "text": "Thank you! Let me check the status of order 12345."
        }
    ]
}

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocess function
def preprocess_dialogue(dialogue):
    dialogue_history = []
    intent_labels = []
    entity_labels = []

    # Iterate through turns to build dialogue history and extract labels
    primary_label = None
    for turn in dialogue['turns']:
        text = turn['text']
        speaker = turn['speaker']

        # Combine the speaker's text in the form of [SPEAKER] message
        dialogue_history.append(f"[{speaker}] {text}")

        # If it's the user turn and there are entities, process them
        if speaker == "User":
            # Tokenize the text
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

            # Entity tagging (For example, "12345" as order_number)
            token_labels = ["O"] * len(tokens)  # "O" means no entity
            
            for entity in turn.get("entities", []):
                entity_text = entity["entity"]
                entity_type = entity["type"]
                start = entity["start"]
                end = entity["end"]

                # Match the entity text with the tokenized form
                entity_tokens = tokenizer.tokenize(entity_text)  # Subword tokens of the entity
                entity_token_count = len(entity_tokens)

                # Iterate through the tokens and mark the entity span
                for idx, token in enumerate(tokens):
                    if token == entity_tokens[0]:  # Found the start token of the entity
                        token_labels[idx] = f"B-{entity_type}"  # Beginning of the entity
                        for j in range(1, entity_token_count):
                            token_labels[idx + j] = f"I-{entity_type}"  # Inside the entity

            # Append the labels for entity extraction
            entity_labels.append(token_labels)

            # Append the intent label for user turns
            if primary_label is not None:
                intent_labels.append(f"{primary_label}.{turn['intent']}")
            else:
                primary_label = turn['intent']
                intent_labels.append(turn['intent'])

    # Join dialogue history into one input sequence
    input_text = " [SEP] ".join(dialogue_history)

    return input_text, intent_labels, entity_labels

# Get the preprocessed input, labels for intent and entities
input_text, intent_labels, entity_labels = preprocess_dialogue(dialogue)

# Display the preprocessed data
print("Input Text:")
print(input_text)

print("\nIntent Labels:")
print(intent_labels)

print("\nEntity Labels:")
print(entity_labels)


  from .autonotebook import tqdm as notebook_tqdm


Input Text:
[User] Can you help me track my order? [SEP] [Bot] Sure! Could you provide me with your order number? [SEP] [User] It's 12345. [SEP] [Bot] Thank you! Let me check the status of order 12345.

Intent Labels:
['track_order', 'track_order.give_order_id']

Entity Labels:
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-order_number', 'I-order_number', 'O', 'O']]


In [1]:
import json
from transformers import BertTokenizer

# Load the dialogues data from the provided path
data_path = '../data/dialogues_fixed.json'

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to process the entire dataset and save it
def process_dataset(data_path):
    # Load the dataset from the JSON file
    with open(data_path, 'r') as f:
        dialogues_data = json.load(f)

    # Initialize lists to hold the processed data
    inputs = []
    intents = []
    entities = []

    # Process each dialogue
    for dialogue in dialogues_data:
        input_text, intent_labels, entity_labels = preprocess_dialogue(dialogue)
        inputs.append(input_text)
        intents.append(intent_labels)
        entities.append(entity_labels)

    # Save the processed data to a new JSON file or CSV
    processed_data = {
        "inputs": inputs,
        "intents": intents,
        "entities": entities
    }

    # Save as JSON
    with open('../data/processed_dialogues.json', 'w') as f:
        json.dump(processed_data, f, indent=4)

    print(f"Data preprocessing complete! Processed data saved to '../data/processed_dialogues.json'")

# Run the preprocessing function
process_dataset(data_path)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'preprocess_dialogue' is not defined

In [3]:
import json

# Load the raw dataset (dialogues_fixed.json)
with open('../data/dialogues_fixed.json', 'r') as f:
    data = json.load(f)

# Extract all entity types from the dataset
entity_types = set()

# Iterate through the dialogues and extract entity types
for dialogue in data:
    for turn in dialogue['turns']:
        if 'entities' in turn:
            for entity in turn['entities']:
                # Add the entity type to the set
                entity_types.add(entity['type'])

# Create a mapping from entity type to a unique ID
entity_map = {f"B-{entity}": idx*2 for idx, entity in enumerate(sorted(entity_types))}
entity_map.update({f"I-{entity}": idx*2+1 for idx, entity in enumerate(sorted(entity_types))})

# 'O' (Outside) will have a special index for non-entity tokens
entity_map['O'] = len(entity_map)

# Print the resulting entity map
print("Entity Map:", entity_map)


Entity Map: {'B-count': 0, 'B-end_date': 2, 'B-order_id': 4, 'B-order_number': 6, 'B-start_date': 8, 'I-count': 1, 'I-end_date': 3, 'I-order_id': 5, 'I-order_number': 7, 'I-start_date': 9, 'O': 10}


In [4]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification, AdamW
# from sklearn.metrics import accuracy_score
# from tqdm import tqdm
# import json

# # Load the tokenizer and model for BERT
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Load the preprocessed dataset
# with open('../data/processed_dialogues.json', 'r') as f:
#     data = json.load(f)

# with open('../data/dialogues_fixed.json', 'r') as f:
#     complete_data = json.load(f)

# # Define the dataset class for loading the data
# class MultiTaskDataset(Dataset):
#     def __init__(self, inputs, intents, entities, tokenizer, intent_map, entity_map, max_length=512):
#         self.inputs = inputs
#         self.intents = intents
#         self.entities = entities
#         self.tokenizer = tokenizer
#         self.intent_map = intent_map
#         self.entity_map = entity_map
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.inputs)

#     def __getitem__(self, idx):
#         input_text = self.inputs[idx]
#         intent_label = self.intents[idx]
#         entity_labels = self.entities[idx]

#         # Tokenize the input text for BERT
#         encoding = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

#         # Prepare token labels for entity extraction, mapping to integer IDs
#         entity_labels_padded = [self.entity_map.get(entity, self.entity_map['O']) for entity in entity_labels]  # Default 'O' (0) if not found
#         entity_labels_padded = entity_labels_padded + [self.entity_map['O']] * (self.max_length - len(entity_labels_padded))  # Padding

#         # Convert intent labels to numeric IDs
#         intent_label = self.intent_map.get(intent_label[-1], -1)  # Ensure intent is mapped

#         # Convert everything to tensors
#         input_ids = encoding['input_ids'].squeeze(0)
#         attention_mask = encoding['attention_mask'].squeeze(0)

#         return {
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'intent_labels': torch.tensor(intent_label, dtype=torch.long),
#             'entity_labels': torch.tensor(entity_labels_padded, dtype=torch.long)
#         }

# # Prepare the dataset
# inputs = data['inputs']
# intents = data['intents']
# entities = data['entities']

# # For simplicity, let's assume we have a simple mapping of intent labels and entity labels
# intent_map = {}
# # get all unique intents from complete_data
# unique_intents = set()
# for dialogue in complete_data:
#     current_intent = None
#     for idx, turn in enumerate(dialogue['turns']):
#         if 'intent' in turn:
#             if current_intent is not None:
#                 unique_intents.add(f"{current_intent}.{turn['intent']}")
#             else:
#                 current_intent = turn['intent']
#                 unique_intents.add(current_intent)
# unique_intents = list(unique_intents)
# intent_map = {intent: idx for idx, intent in enumerate(unique_intents)}

# # Convert intents and entities to numeric labels
# numeric_intents = [[intent_map[intent] for intent in sublist] for sublist in intents]
# # Convert entities to numeric labels
# numeric_entities = [
#     [entity_map.get(entity, 10) for entity in sublist]  # Default to 'O' (10) if entity not found
#     for sublist in [sub for sentence in entities for sub in sentence]  # Flatten the list of lists
# ]


# # Create the DataLoader for training
# dataset = MultiTaskDataset(inputs, numeric_intents, numeric_entities, tokenizer, intent_map, entity_map)
# dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# # Define the BERT models for both tasks
# class MultiTaskBERTModel(torch.nn.Module):
#     def __init__(self):
#         super(MultiTaskBERTModel, self).__init__()
#         self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(intent_map))
#         self.ner_head = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(entity_map))

#     def forward(self, input_ids, attention_mask):
#         # Intent classification
#         intent_logits = self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

#         # Entity recognition
#         ner_logits = self.ner_head(input_ids=input_ids, attention_mask=attention_mask).logits

#         return intent_logits, ner_logits

# # Initialize the model
# model = MultiTaskBERTModel()

# # Define the optimizer
# optimizer = AdamW(model.parameters(), lr=2e-5)

# # Define training loop
# def train(model, dataloader, optimizer, device):
#     model.train()
#     total_loss = 0
#     accumulation_steps = 4  # Gradients will be accumulated over 4 steps
#     optimizer.zero_grad()  # Zero gradients at the start

#     for step, batch in enumerate(tqdm(dataloader)):
#         # Move batch to device (GPU/CPU)
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         intent_labels = batch['intent_labels'].to(device)
#         entity_labels = batch['entity_labels'].to(device)

#         # Forward pass
#         intent_logits, ner_logits = model(input_ids, attention_mask)

#         # Calculate losses
#         intent_loss = torch.nn.CrossEntropyLoss()(intent_logits.view(-1, len(intent_map)), intent_labels.view(-1))
#         ner_loss = torch.nn.CrossEntropyLoss()(ner_logits.view(-1, len(entity_map)), entity_labels.view(-1))
        
#         # Combine losses
#         loss = intent_loss + ner_loss
#         loss = loss / accumulation_steps  # Normalize the loss for gradient accumulation

#         loss.backward()  # Backpropagate the loss

#         # Accumulate gradients and step only after `accumulation_steps`
#         if (step + 1) % accumulation_steps == 0:
#             optimizer.step()
#             optimizer.zero_grad()  # Reset gradients

#         # Clean up memory after each step
#         del input_ids, attention_mask, intent_labels, entity_labels
#         torch.cuda.empty_cache()

#         total_loss += loss.item()

#     return total_loss / len(dataloader)


# # Training loop
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# epochs = 3
# for epoch in range(epochs):
#     avg_loss = train(model, dataloader, optimizer, device)
#     print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# # Save the trained model
# model.save_pretrained('../models/multitask_bert')


In [5]:
import tensorflow as tf
tf.keras.backend.clear_session()
from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertForTokenClassification
from tensorflow.keras.optimizers import Adam
import json
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the preprocessed dataset
with open('../data/processed_dialogues.json', 'r') as f:
    data = json.load(f)

with open('../data/dialogues_fixed.json', 'r') as f:
    complete_data = json.load(f)

# Define the dataset class for TensorFlow
class MultiTaskDataset(tf.data.Dataset):
    def __new__(cls, inputs, intents, entities, tokenizer, intent_map, entity_map, max_length=512):
        # Tokenize the input text
        def _parse_function(input_text, intent_label, entity_labels):
            encoding = tokenizer(input_text, truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")

            # Convert entity labels to padded labels
            entity_labels_padded = [entity_map.get(entity, entity_map['O']) for entity in entity_labels]
            entity_labels_padded += [entity_map['O']] * (max_length - len(entity_labels_padded))

            intent_label = intent_map.get(intent_label[-1], -1)

            return (
                {'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask']},
                {'intent_labels': intent_label, 'entity_labels': entity_labels_padded}
            )
        
        # Create TensorFlow Dataset from the data
        dataset = tf.data.Dataset.from_tensor_slices((inputs, intents, entities))
        dataset = dataset.map(lambda x, y, z: _parse_function(x, y, z))
        dataset = dataset.batch(8)

        return dataset

# Prepare the dataset
inputs = data['inputs']
intents = data['intents']
entities = data['entities']

# For simplicity, let's assume we have a simple mapping of intent labels and entity labels
intent_map = {}
# get all unique intents from complete_data
unique_intents = set()
for dialogue in complete_data:
    current_intent = None
    for idx, turn in enumerate(dialogue['turns']):
        if 'intent' in turn:
            if current_intent is not None:
                unique_intents.add(f"{current_intent}.{turn['intent']}")
            else:
                current_intent = turn['intent']
                unique_intents.add(current_intent)
unique_intents = list(unique_intents)
intent_map = {intent: idx for idx, intent in enumerate(unique_intents)}

# Convert intents and entities to numeric labels
numeric_intents = [[intent_map[intent] for intent in sublist] for sublist in intents]
numeric_entities = [
    [entity_map.get(entity, 10) for entity in sublist]  # Default 'O' (10) if not found
    for sublist in [sub for sentence in entities for sub in sentence]  # Flatten the list of lists
]

# Create the TensorFlow Dataset
dataset = MultiTaskDataset(inputs, numeric_intents, numeric_entities, tokenizer, intent_map, entity_map)
dataset = dataset.batch(4)

# Define the TensorFlow model for multi-task learning
class MultiTaskBERTModel(tf.keras.Model):
    def __init__(self, intent_map, entity_map):
        super(MultiTaskBERTModel, self).__init__()
        # Load BERT models for sequence classification and token classification
        self.bert_sequence = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(intent_map))
        self.bert_token = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(entity_map))

    def call(self, inputs):
        # Get the BERT model outputs for both tasks
        outputs_seq = self.bert_sequence(**inputs)
        outputs_token = self.bert_token(**inputs)

        return outputs_seq.logits, outputs_token.logits

# Initialize the model
model = MultiTaskBERTModel(intent_map, entity_map)

# Define optimizer
optimizer = Adam(learning_rate=2e-5)

# Loss function for intent and entity tasks
loss_fn_intent = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn_entity = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Training step function
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        intent_logits, ner_logits = model(inputs)
        intent_loss = loss_fn_intent(labels['intent_labels'], intent_logits)
        entity_loss = loss_fn_entity(labels['entity_labels'], ner_logits)
        
        total_loss = intent_loss + entity_loss
        
    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    return total_loss

# Training loop
epochs = 3
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(dataset):
        inputs, labels = batch
        loss = train_step(inputs, labels)
        total_loss += loss
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss.numpy():.4f}")

# Save the trained model
model.save_pretrained('../models/multitask_bert_tf')


I0000 00:00:1732719697.768598    6903 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6250 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


ValueError: Can't convert non-rectangular Python sequence to Tensor.