In [None]:
import json
def preprocess_slot_data(file_path):
    """Extract slot annotations from the opening user turn of every dialogue.

    Args:
        file_path: Path to a JSON file holding a list of dialogues. Each
            dialogue must provide ``"domains"`` (non-empty list) and
            ``"turns"``; each turn must provide ``"speaker"``,
            ``"utterance"`` and ``"dialogue_acts"``.

    Returns:
        A list of ``(text, slots, domain)`` tuples, one per dialogue whose
        first turn is spoken by the user. ``slots`` is a list of
        ``(start, end, slot_name)`` tuples. For spans located by this
        function, ``end`` is exclusive, i.e. ``text[start:end]`` is the
        value. ``start`` and ``end`` are ``None`` for binary slots and for
        values that cannot be located in the utterance.

    Raises:
        KeyError / IndexError: If a dialogue lacks one of the required fields.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    examples = []
    for dialogue in data:
        domain = dialogue["domains"][0]  # Assuming a single domain per dialogue
        turns = dialogue["turns"]
        if not turns:
            # Nothing to label in a dialogue without turns.
            continue
        first_turn = turns[0]
        if first_turn["speaker"] != "user":
            continue

        text = first_turn["utterance"]
        dialogue_acts = first_turn["dialogue_acts"]
        slots = []

        for slot in dialogue_acts.get("non-categorical", []):
            if "start" in slot and "end" in slot:
                # NOTE(review): annotated offsets are assumed to use an
                # exclusive end, matching how SlotDataset consumes them
                # (it looks up the token at ``end - 1``) - confirm against
                # the dataset.
                start, end = slot["start"], slot["end"]
            else:
                value = slot["value"]
                position = text.find(value) if value else -1
                if position < 0:
                    # The value does not occur verbatim in the utterance:
                    # keep the slot name but record that it has no span,
                    # instead of emitting a bogus negative offset.
                    start, end = None, None
                else:
                    # Exclusive end, consistent with the annotated case.
                    start, end = position, position + len(value)
            slots.append((start, end, slot["slot"]))

        # Binary slots carry no character span.
        for slot in dialogue_acts.get("binary", []):
            slots.append((None, None, slot["slot"]))

        examples.append((text, slots, domain))

    return examples


# Parse the KVRET dialogue dump into (text, slots, domain) examples.
slot_dialogues = preprocess_slot_data(
    "drive/MyDrive/KVRET/dialogues.json"
)



In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast

In [None]:
# Fast tokenizer: SlotDataset relies on char_to_token(), which is provided by
# the encodings of fast tokenizers.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class SlotDataset(Dataset):
    """Token-classification dataset producing BIO-labelled encodings.

    Each example is a ``(text, slots, domain)`` tuple where ``slots`` is a
    list of ``(start, end, slot_name)`` character spans; ``end`` is treated
    as exclusive. Spans whose ``start`` or ``end`` is ``None`` are ignored.

    Args:
        examples: Sequence of ``(text, slots, domain)`` tuples.
        tokenizer: Callable tokenizer whose result supports
            ``char_to_token`` and exposes ``.data``.
        label_encoder: Object with a ``transform`` method mapping label
            strings (``'O'``, ``'B-<slot>'``, ``'I-<slot>'``) to integer ids.
    """

    def __init__(self, examples, tokenizer, label_encoder):
        self.examples = examples
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and attach a ``labels`` tensor.

        Returns:
            The encoding's underlying dictionary, with an added ``'labels'``
            entry: a ``torch.long`` tensor holding one label id per input id.
        """
        example = self.examples[idx]
        input_text = example[0]
        slots = example[1]

        # Use the tokenizer handed to the constructor rather than whatever
        # global happens to be named `tokenizer`.
        tokenized_input = self.tokenizer(input_text, return_offsets_mapping=True)
        # Every token starts as 'O' (Outside); tokens not covered by a span keep it.
        labels = ['O'] * len(tokenized_input['input_ids'])

        for start, end, slot in slots:
            if start is None or end is None:
                continue  # slot without a character span

            start_token_idx = tokenized_input.char_to_token(max(start, 0))
            # `end` is exclusive, so the last character of the span is end - 1.
            end_token_idx = tokenized_input.char_to_token(max(end - 1, 0))
            if start_token_idx is None or end_token_idx is None:
                continue  # offsets did not map onto any token

            labels[start_token_idx] = f"B-{slot}"  # Beginning of slot
            # A dedicated loop variable avoids shadowing the `idx` parameter.
            for token_idx in range(start_token_idx + 1, end_token_idx + 1):
                labels[token_idx] = f"I-{slot}"  # Inside of slot

        encoded_labels = self.label_encoder.transform(labels)
        tokenized_input['labels'] = torch.tensor(encoded_labels, dtype=torch.long)

        return tokenized_input.data  # Convert BatchEncoding to dictionary







In [None]:
# Collect every distinct slot name that occurs in the parsed dialogues.
slot_labels = list(
    {slot_name for _, slots, _ in slot_dialogues for _, _, slot_name in slots}
)
# Expand each slot name into its BIO tags and add the shared 'O' tag.
slot_labels = [
    'O',
    *(f"B-{slot_name}" for slot_name in slot_labels),
    *(f"I-{slot_name}" for slot_name in slot_labels),
]

slot_label_encoder = LabelEncoder()
slot_label_encoder.fit(slot_labels)

slot_dataset = SlotDataset(slot_dialogues, tokenizer, slot_label_encoder)

In [None]:
# 80/20 train/test split. A seeded generator makes the split identical on
# every run, so the reported metrics are comparable across kernel restarts.
train_size = int(0.8 * len(slot_dataset))
test_size = len(slot_dataset) - train_size
train_dataset, test_dataset = random_split(
    slot_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

from torch.nn.utils.rnn import pad_sequence

from torch.nn.utils.rnn import pad_sequence

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge a list of per-example dictionaries into one batch dictionary.

    Tensor-valued fields are right-padded to the longest sequence in the
    batch; all other fields are gathered into plain lists, one entry per
    example.

    Padding values:
        * ``labels`` -> -100, the default ``ignore_index`` of
          ``torch.nn.CrossEntropyLoss``, so padded positions do not
          contribute to the loss. Padding labels with the tokenizer's pad id
          would instead inject a real class id.
        * ``attention_mask`` / ``token_type_ids`` -> 0.
        * any other tensor field -> ``tokenizer.pad_token_id``.

    Args:
        batch: Non-empty list of dictionaries that share the same keys.

    Returns:
        Dictionary with the same keys as the items of ``batch``.
    """
    collated = {}

    for key in batch[0].keys():
        values = [item[key] for item in batch]
        if isinstance(values[0], torch.Tensor):
            if key == "labels":
                padding_value = -100
            elif key in ("attention_mask", "token_type_ids"):
                padding_value = 0
            else:
                padding_value = tokenizer.pad_token_id
            collated[key] = pad_sequence(values, batch_first=True, padding_value=padding_value)
        else:
            collated[key] = values

    return collated


# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
)

# Pre-trained BERT encoder with a token-classification head that has one
# output per BIO label known to the label encoder.
slot_model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(slot_label_encoder.classes_),
)

slot_model.to(device)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
num_epochs = 3

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the default of the transformers implementation
# (PyTorch's own default is 0.01), so optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(slot_model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

# Linear decay from the initial learning rate to zero, with no warm-up,
# over every optimisation step of the run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs,
)



In [None]:
from torch.nn.utils.rnn import pad_sequence


In [None]:
def _prepare_batch(batch):
    """Turn a collated batch into padded tensors on `device`, ready for the model.

    List-valued fields are padded into a single tensor, `offset_mapping` is
    dropped because the model's forward pass does not accept it, and label
    positions that fall on padding are set to -100 so the loss ignores them.
    """
    prepared = {}
    for key, value in batch.items():
        if key == "offset_mapping":
            continue
        if isinstance(value, list):
            if key == "input_ids" and tokenizer.pad_token_id is not None:
                padding_value = tokenizer.pad_token_id
            else:
                padding_value = 0
            value = pad_sequence(
                [torch.tensor(sequence, dtype=torch.long) for sequence in value],
                batch_first=True,
                padding_value=padding_value,
            )
        prepared[key] = value.to(device)

    if "labels" in prepared and "attention_mask" in prepared:
        # -100 is the default ignore_index of the cross-entropy loss.
        prepared["labels"] = prepared["labels"].masked_fill(
            prepared["attention_mask"] == 0, -100
        )
    return prepared


for epoch in range(num_epochs):
    # Training loop
    slot_model.train()
    for batch in train_dataloader:
        batch = _prepare_batch(batch)

        optimizer.zero_grad()
        outputs = slot_model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Evaluation loop
    slot_model.eval()
    total_eval_loss = 0.0
    correct_tokens = 0
    evaluated_tokens = 0
    with torch.no_grad():
        for batch in test_dataloader:
            batch = _prepare_batch(batch)

            outputs = slot_model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Score real tokens only. Counts are accumulated over the whole
            # test set so a short final batch is not over-weighted, as it
            # would be when averaging per-batch accuracies.
            real_tokens = batch['attention_mask'] == 1
            correct_tokens += (predictions[real_tokens] == batch['labels'][real_tokens]).sum().item()
            evaluated_tokens += real_tokens.sum().item()
            total_eval_loss += outputs.loss.item()

    # Print evaluation metrics (guarded against an empty test set)
    avg_eval_loss = total_eval_loss / max(len(test_dataloader), 1)
    avg_eval_accuracy = correct_tokens / max(evaluated_tokens, 1)
    print(f"Epoch: {epoch + 1}, Loss: {avg_eval_loss}, Accuracy: {avg_eval_accuracy}")


Epoch: 1, Loss: 0.12754385241944538, Accuracy: 0.9309156612235265
Epoch: 2, Loss: 0.12148526899124447, Accuracy: 0.930284743490878
Epoch: 3, Loss: 0.11964777808048223, Accuracy: 0.9320291266659456


In [None]:
import pickle

save_directory = "drive/MyDrive/KVRET/trained_kvret_slot_model"

# Write the tokenizer files and the fine-tuned weights into the same folder.
tokenizer.save_pretrained(save_directory)
slot_model.save_pretrained(save_directory)

# The label encoder holds the id <-> BIO-tag mapping; store it next to the model.
with open(save_directory + "/label_encoder.pkl", "wb") as file:
    pickle.dump(slot_label_encoder, file)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import pickle

load_directory = "drive/MyDrive/KVRET/trained_kvret_slot_model"

# Load the model with the same head it was trained and saved with
# (BertForTokenClassification, imported in the notebook's import cell).
# A sequence-classification head does not match the saved token-level
# classifier.
# NOTE(review): the recorded OSError output is truncated; if loading still
# fails, check that the Drive path is mounted and that the save cell ran.
loaded_model = BertForTokenClassification.from_pretrained(load_directory)

# Load the fast tokenizer, matching the class that was saved; the slot
# labelling code relies on char_to_token().
loaded_tokenizer = BertTokenizerFast.from_pretrained(load_directory)

# Load the label encoder.
# NOTE(security): pickle.load can execute arbitrary code - only load files
# that this notebook wrote itself.
with open(f"{load_directory}/label_encoder.pkl", "rb") as file:
    loaded_label_encoder = pickle.load(file)

OSError: ignored