In [4]:
from transformers import BertTokenizer, BertModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
model = BertModel.from_pretrained("bert-base-uncased")

In [7]:
!pip install -q datasets
!git clone https://github.com/sonos/nlu-benchmark.git
# !pip install -q torch

Cloning into 'nlu-benchmark'...
remote: Enumerating objects: 400, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 400 (delta 2), reused 11 (delta 2), pack-reused 389 (from 1)[K
Receiving objects: 100% (400/400), 1.19 MiB | 1.33 MiB/s, done.
Resolving deltas: 100% (248/248), done.


In [8]:
import json
from datasets import Dataset

final_data = []
intent_list = []
intentpath_list = [
    "/content/nlu-benchmark/2017-06-custom-intent-engines/PlayMusic/train_PlayMusic_full.json",
    "/content/nlu-benchmark/2017-06-custom-intent-engines/AddToPlaylist/train_AddToPlaylist_full.json",
    "/content/nlu-benchmark/2017-06-custom-intent-engines/BookRestaurant/train_BookRestaurant_full.json",
    "/content/nlu-benchmark/2017-06-custom-intent-engines/GetWeather/train_GetWeather_full.json",
    "/content/nlu-benchmark/2017-06-custom-intent-engines/RateBook/train_RateBook_full.json",
    "/content/nlu-benchmark/2017-06-custom-intent-engines/SearchCreativeWork/train_SearchCreativeWork_full.json",
    "/content/nlu-benchmark/2017-06-custom-intent-engines/SearchScreeningEvent/train_SearchScreeningEvent_full.json"
]

for path in intentpath_list:
    with open(path, "r", encoding="latin-1") as f:
        data = json.load(f)

    intent_name = (list(data.keys())[0])
    data = data[intent_name]
    for i in data:
        i['intent'] = intent_name
    final_data.extend(data)
data = Dataset.from_list(final_data)

In [9]:
data = data.shuffle()
for i in range(10):
    print(data[i]['intent'])

BookRestaurant
PlayMusic
BookRestaurant
BookRestaurant
BookRestaurant
SearchScreeningEvent
RateBook
GetWeather
RateBook
SearchCreativeWork


In [10]:
entity_set = set()
for idx in range(len(data)):
    for j in data[idx]['data']:
        slot_I = "I-" + str(j['entity'])
        entity_set.add(slot_I)
        slot_B = "B-" + str(j['entity'])
        entity_set.add(slot_B)

In [11]:
intent_set = set()
for idx in range(len(data)):
    intent_set.add(data[idx]['intent'])

In [13]:
entity_to_idx = {}
idx_to_entity = {}
for idx, ent in enumerate(entity_set):
    entity_to_idx[ent] = idx
    idx_to_entity[idx] = ent
entity_to_idx['O'] = 80
entity_to_idx[tokenizer.pad_token] = -100
idx_to_entity[-100] = tokenizer.pad_token
idx_to_entity[80] = 'O'

In [14]:
len(entity_to_idx)

82

In [15]:
intent_to_idx = {}
idx_to_intent = {}
for idx, intent in enumerate(intent_set):
    intent_to_idx[intent] = idx
    idx_to_intent[idx] = intent

In [18]:
import torch
from torch.utils.data import Dataset as torchDataset
from torch.utils.data import DataLoader

import torch
from torch.utils.data import Dataset as torchDataset
from torch.utils.data import DataLoader

class SnipsDataset(torchDataset):
    """
    Dataset for SNIPS NLU task.
    """
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __getitem__(self, idx):
        sentence_tokens, slots_list = self.sample_example(idx)

        inputs = self.tokenizer(
            sentence_tokens,
            padding="max_length",
            truncation=True,
            max_length=50,
            return_tensors="pt",
            is_split_into_words=True
        )

        word_ids = inputs.word_ids()
        aligned_slots = []
        prev_word_id = None

        for word_id in word_ids:
            if word_id is None:
                aligned_slots.append(-100)
            elif word_id != prev_word_id:
                aligned_slots.append(slots_list[word_id])
            else:
                base_slot = slots_list[word_id]
                if base_slot != entity_to_idx["O"]:
                    entity_name = list(entity_to_idx.keys())[list(entity_to_idx.values()).index(base_slot)][2:]
                    aligned_slots.append(entity_to_idx["I-" + entity_name])
                else:
                    aligned_slots.append(base_slot)
            prev_word_id = word_id

        assert len(aligned_slots) == inputs["input_ids"].shape[1], "Slots and tokens length mismatch"

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "slots": torch.tensor(aligned_slots, dtype=torch.long),
            "intent": torch.tensor(intent_to_idx[self.data[idx]["intent"]], dtype=torch.long)
        }

    def __len__(self):
        return len(self.data)

    def sample_example(self, idx):
        instance = self.data[idx]
        sentence_tokens = []
        slot_labels = []

        for word_info in instance['data']:
            word_text = word_info['text']
            word_tokens = self.tokenizer.tokenize(word_text)
            sentence_tokens.extend(word_tokens)

            if 'entity' in word_info and word_info['entity']:
                entity = word_info['entity']
                slot_list = [f"I-{entity}"] * len(word_tokens)
                slot_list[0] = f"B-{entity}"
            else:
                slot_list = ["O"] * len(word_tokens)

            slot_labels.extend(slot_list)

        sentence_tokens = ["[CLS]"] + sentence_tokens + ["[SEP]"]
        slot_labels = ["O"] + slot_labels + ["O"]

        assert len(sentence_tokens) == len(slot_labels), "Tokens and slots length mismatch!"

        return sentence_tokens, [entity_to_idx[slot] for slot in slot_labels]

In [26]:
import torch

def custom_collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    slot_labels = [item['slots'] for item in batch]
    intent_labels = [item['intent'] for item in batch]

    max_len = max(len(x) for x in input_ids)

    def pad_sequence(seq, pad_value):
        return seq + [pad_value] * (max_len - len(seq))

    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    input_ids = [pad_sequence(x.tolist(), pad_token_id) for x in input_ids]

    attention_masks = [pad_sequence(x.tolist(), 0) for x in attention_masks]

    slot_labels = [pad_sequence(x.tolist(), -100) for x in slot_labels]

    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)
    slot_labels = torch.tensor(slot_labels, dtype=torch.long)
    intent_labels = torch.tensor(intent_labels, dtype=torch.long)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'slots': slot_labels,
        'intent': intent_labels
    }

In [27]:
import torch
from transformers import BertModel

class JointBert(torch.nn.Module):
    def __init__(self, num_intent, num_slots,  bert_model=None):
        super(JointBert, self).__init__()
        if bert_model is None:
            self.model = BertModel.from_pretrained("bert-base-uncased")
        else:
            self.model = bert_model
        self.num_slots = num_slots
        self.num_intent = num_intent

        self.dropout = torch.nn.Dropout(0.2)
        self.intent_classifier = torch.nn.Linear(768, num_intent)
        self.slot_classifier = torch.nn.Linear(768, num_slots)

    def forward(self, input_ids, attention_mask, slot_labels=None, intent_labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # (batch, seq_length, 768)
        hidden_states = self.dropout(hidden_states)
        # Intent Classification
        intent_logits = self.intent_classifier(hidden_states[:, 0, :])  # (batch, num_intents)

        # Slots Classification
        slot_logits = self.slot_classifier(hidden_states)  # (batch, seq_length, num_slots)

        if slot_labels is not None and intent_labels is not None:
            loss_fn_slots = torch.nn.CrossEntropyLoss(ignore_index=-100)
            loss_slot = loss_fn_slots(slot_logits.view(-1, self.num_slots), slot_labels.view(-1))

            loss_fn_intent = torch.nn.CrossEntropyLoss()
            loss_intent = loss_fn_intent(intent_logits, intent_labels)

            total_loss = loss_slot + loss_intent

            return total_loss, intent_logits, slot_logits
        else:
            slot_preds = torch.argmax(slot_logits, dim=-1)
            intent_preds = torch.argmax(intent_logits, dim=-1)
            return intent_preds, slot_preds


In [30]:
from sklearn.model_selection import train_test_split

import random

random.seed(42)
random.shuffle(final_data)

train_data, val_data = train_test_split(final_data, test_size=0.2, random_state=42)

batch_size = 128

train_dataloader = DataLoader(SnipsDataset(train_data, tokenizer), batch_size=batch_size, collate_fn=custom_collate_fn, shuffle=True)
val_dataloader = DataLoader(SnipsDataset(val_data, tokenizer), batch_size=batch_size, collate_fn=custom_collate_fn, shuffle=True)

In [29]:
next(iter(train_dataloader))['input_ids'].shape

torch.Size([64, 50])

In [34]:
jb = JointBert(num_intent=len(intent_to_idx), num_slots=len(entity_to_idx))

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
jb.to(device)

epochs = 10
optimizer = torch.optim.Adam(lr=3e-5, params=jb.parameters())

for epoch in range(epochs):
    epoch_loss = 0
    jb.train()

    for idx, batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        slots = batch["slots"].to(device)
        intent = batch["intent"].to(device)

        optimizer.zero_grad()

        total_loss_batch, intent_logits, slot_logits = jb(
            input_ids, attention_mask, slot_labels=slots, intent_labels=intent
        )

        total_loss_batch.backward()
        optimizer.step()

        epoch_loss += total_loss_batch.item()

        # if idx % 20 == 0:
        #     print(f"Epoch: {epoch} | Step: {idx} | Loss: {total_loss_batch.item()}")

    print(f"Epoch {epoch} finished | Avg Train Loss: {epoch_loss:.4f}")

    jb.eval()
    val_loss = 0
    intent_correct, total_intent = 0, 0
    slot_correct, total_slot = 0, 0
    total_samples = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            slots = batch["slots"].to(device)
            intent = batch["intent"].to(device)

            total_loss_batch, intent_logits, slot_logits = jb(
                input_ids, attention_mask, slot_labels=slots, intent_labels=intent
            )

            val_loss += total_loss_batch.item() * input_ids.size(0)
            total_samples += input_ids.size(0)

            intent_preds = intent_logits.argmax(dim=-1)
            intent_correct += (intent_preds == intent).sum().item()
            total_intent += intent.size(0)

            valid_mask = slots != -100
            slot_preds = slot_logits.argmax(dim=-1)
            slot_correct += ((slot_preds == slots) & valid_mask).sum().item()
            total_slot += valid_mask.sum().item()

    val_loss /= total_samples
    intent_acc = intent_correct / total_intent
    slot_acc = slot_correct / total_slot

    print(f"🔹 Validation | Loss: {val_loss:.4f} | Intent Acc: {intent_acc:.4f} | Slot Acc: {slot_acc:.4f}")

Epoch 0 finished | Avg Train Loss: 1.6140
🔹 Validation | Loss: 0.2326 | Intent Acc: 0.9888 | Slot Acc: 0.9705
Epoch 1 finished | Avg Train Loss: 1.4758
🔹 Validation | Loss: 0.2425 | Intent Acc: 0.9898 | Slot Acc: 0.9707
Epoch 2 finished | Avg Train Loss: 1.5539
🔹 Validation | Loss: 0.2286 | Intent Acc: 0.9898 | Slot Acc: 0.9707
Epoch 3 finished | Avg Train Loss: 1.5637
🔹 Validation | Loss: 0.2479 | Intent Acc: 0.9822 | Slot Acc: 0.9710
Epoch 4 finished | Avg Train Loss: 1.3523
🔹 Validation | Loss: 0.2398 | Intent Acc: 0.9859 | Slot Acc: 0.9702
Epoch 5 finished | Avg Train Loss: 0.9289
🔹 Validation | Loss: 0.2776 | Intent Acc: 0.9833 | Slot Acc: 0.9712
Epoch 6 finished | Avg Train Loss: 0.9119
🔹 Validation | Loss: 0.2469 | Intent Acc: 0.9884 | Slot Acc: 0.9712
Epoch 7 finished | Avg Train Loss: 0.6840
🔹 Validation | Loss: 0.2272 | Intent Acc: 0.9884 | Slot Acc: 0.9712
Epoch 8 finished | Avg Train Loss: 0.4094
🔹 Validation | Loss: 0.2398 | Intent Acc: 0.9888 | Slot Acc: 0.9723
Epoch 9 fi

In [43]:
from sklearn.metrics import f1_score, accuracy_score
import torch

def evaluate_model(jb, val_dataloader, device):
    jb.eval()
    intent_preds_all, intent_labels_all = [], []
    slot_preds_all, slot_labels_all = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            slots = batch["slots"].to(device)
            intent = batch["intent"].to(device)

            intent_preds, slot_preds = jb(input_ids, attention_mask)

            # Intent Accuracy
            intent_preds = intent_preds.cpu().numpy()
            intent_labels = intent.cpu().numpy()
            intent_preds_all.extend(intent_preds)
            intent_labels_all.extend(intent_labels)

            # Slot Predictions
            slot_preds = slot_preds.cpu().numpy()
            slot_labels = slots.cpu().numpy()

            for pred, true in zip(slot_preds, slot_labels):
                valid_mask = true != -100
                slot_preds_all.append(pred[valid_mask].tolist())
                slot_labels_all.append(true[valid_mask].tolist())

    # Intent Accuracy
    intent_acc = accuracy_score(intent_labels_all, intent_preds_all)

    # Slot F1 (macro)
    slot_f1 = f1_score(
        [label for sublist in slot_labels_all for label in sublist],  # Flatten
        [pred for sublist in slot_preds_all for pred in sublist],  # Flatten
        average="macro"
    )

    correct_sentences = sum(
        (intent_preds_all[i] == intent_labels_all[i]) and
        (slot_preds_all[i] == slot_labels_all[i])
        for i in range(len(intent_preds_all))
    )
    sentence_acc = correct_sentences / len(intent_preds_all)

    print(f"🔹 Validation Results:")
    print(f"Intent Accuracy: {intent_acc:.4f}")
    print(f"Slot F1 Score: {slot_f1:.4f}")
    print(f"Sentence-Level Semantic Frame Accuracy: {sentence_acc:.4f}")

    return intent_acc, slot_f1, sentence_acc

In [44]:
intent_acc, slot_f1, sentence_acc = evaluate_model(jb, val_dataloader, device)

🔹 Validation Results:
Intent Accuracy: 0.9884
Slot F1 Score: 0.9304
Sentence-Level Semantic Frame Accuracy: 0.8894


In [51]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [54]:
!apt install git
!git config --global user.name "antonItachi"
!git config --global user.email "mrkrasnyuk21@gmail.com"

In [55]:
!git clone https://github.com/antonItachi/intent-slot-classification.git

In [64]:
!cp -r /content/drive/MyDrive/Colab\Notebooks/bert.ipynb /content/intent-slot-classification

In [65]:
!cd intent-slot-classification && git add .

In [66]:
!cd intent-slot-classification && git commit -m "Added NLU model code"
!cd intent-slot-classification && git branch -M main
!cd intent-slot-classification && git remote add origin https://github.com/antonItachi/intent-slot-classification.git
!cd intent-slot-classification && git push -u origin main

In [68]:
from google.colab import drive
drive.mount('/content/drive')