In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# change the path to where your notebook is located
%cd "gdrive/My Drive/Colab Notebooks"

/content/gdrive/My Drive/Colab Notebooks


In [None]:
!pip install datasets -q
!pip install transformers -q
!pip install transformers[torch] -q
!pip install accelerate -U -q
!pip install seqeval -q
!pip install evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m931.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Run it only once
from datasets import load_dataset
dataset = load_dataset("multi_woz_v22")

In [None]:
# Run it only once
dataset.save_to_disk("dataset.hf")

Saving the dataset (0/1 shards):   0%|          | 0/8437 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from datasets import DatasetDict
dataset = DatasetDict.load_from_disk("dataset.hf")

In [None]:
#we are only going to care about this information, for each dialogue  in the train dataset we go to the turns and inside of it, in dialogue acts we will
#find the span info which covers 'act type' which we have already classified, and then the act_slot_names and values, and the start and end of those slots.
dataset['train'][0]['turns']['dialogue_acts'][0]['span_info']

{'act_type': ['Restaurant-Inform', 'Restaurant-Inform'],
 'act_slot_name': ['area', 'pricerange'],
 'act_slot_value': ['centre', 'expensive'],
 'span_start': [30, 43],
 'span_end': [36, 52]}

In [None]:
dataset['train'][0]['turns']['dialogue_acts'][0]['dialog_act']

{'act_type': ['Restaurant-Inform'],
 'act_slots': [{'slot_name': ['area', 'pricerange'],
   'slot_value': ['centre', 'expensive']}]}

In [None]:
def make_df(split):

  train_domain = []
  extracted_information_not_mapped_ground_truth = []
  extracted_information_ground_truth = []
  count = 0

  for d,dial in enumerate(dataset[split]):
      speaker_str = {0: 'User', 1: 'Agent'}
      turns = dial['turns']
      if not any(set(dial['turns']['frames'][turn_id]['service']).intersection(['hotel', 'restaurant']) for turn_id,utt in enumerate(dial['turns']['utterance'])):
        continue
      for turn_id,utt in enumerate(turns['utterance']):
          extracted_information_ground_truth = []
          start = []
          end = []
          speaker = speaker_str[turns['speaker'][turn_id]]
          if speaker == 'Agent':
            continue
          dialogue_acts = turns['dialogue_acts'][turn_id]['span_info']['act_type']
          slot_names_per_act = turns['dialogue_acts'][turn_id]['span_info']['act_slot_name']
          slot_values_per_act = turns['dialogue_acts'][turn_id]['span_info']['act_slot_value']
          span_values_start = turns['dialogue_acts'][turn_id]['span_info']['span_start']
          span_values_end = turns['dialogue_acts'][turn_id]['span_info']['span_end']

          for act_i in range(len(slot_names_per_act)):

              if dialogue_acts[act_i].startswith("Hotel") or dialogue_acts[act_i].startswith("Restaurant"):
                if len(slot_names_per_act) > 0:
                  span_start = span_values_start[act_i]
                  span_end = span_values_end[act_i]
                  extracted_information_ground_truth.append(tuple([dialogue_acts[act_i].split("-")[0].lower()+"-"+slot_names_per_act[act_i], slot_values_per_act[act_i]]))
                  start.append(span_start)
                  end.append(span_end)


          if len(extracted_information_ground_truth) > 0:
            train_domain.append([dial['dialogue_id'], utt.lower(), turn_id, speaker, extracted_information_ground_truth, start, end])

  frame=pd.DataFrame(train_domain, columns=['conv_id', 'utterance', 'turn_id', 'speaker', 'slots_dict','slots_begin','slots_end'])
  columns_to_remove=['conv_id','turn_id']

  frame.drop(columns=columns_to_remove,inplace=True)
  return frame

In [None]:
#let's now generate the dataframes for train, test and validation
data_train = make_df('train')
data_test = make_df('test')
data_val = make_df('validation')

In [None]:
data_train

Unnamed: 0,utterance,speaker,slots_dict,slots_begin,slots_end
0,i need a place to dine in the center thats exp...,User,"[(restaurant-area, centre), (restaurant-pricer...","[30, 43]","[36, 52]"
1,"sounds good, could i get that phone number? al...",User,"[(hotel-pricerange, expensive), (hotel-type, h...","[76, 86]","[85, 91]"
2,i want to book it for 2 people and 2 nights st...,User,"[(hotel-bookstay, 2), (hotel-bookpeople, 2), (...","[22, 35, 58]","[23, 36, 66]"
3,"no, but i'd really like to be on the south end...",User,"[(hotel-area, south)]",[37],[42]
4,no i don't care about the price. which one do ...,User,"[(hotel-pricerange, dontcare), (restaurant-pri...","[5, 5]","[15, 15]"
...,...,...,...,...,...
19496,i would like indian food please.,User,"[(restaurant-food, indian)]",[13],[19]
19497,"hello, i am looking for a cheap restaurant tha...",User,"[(restaurant-pricerange, cheap), (restaurant-f...","[26, 55]","[31, 61]"
19498,"yes, how about portuguese food?",User,"[(restaurant-food, portuguese)]",[15],[25]
19499,it doesn't matter.,User,"[(restaurant-area, dontcare)]",[3],[17]


In [1]:
def id_to_word_id(sentence, char_index):
    words = sentence.split()
    char_count = 0
    return next((i for i, word in enumerate(words) if (char_count := char_count + len(word) + 1) > char_index), len(words) - 1)

def slot_filling(sentence, slot_types, starts, ends):
    labels = ['O'] * len(sentence.split())

    for slot_type, start, end in zip(slot_types, starts, ends):
        start_word_index = id_to_word_id(sentence, start)
        end_word_index = id_to_word_id(sentence, end - 1)

        labels[start_word_index] = 'B-' + slot_type[0]
        labels[start_word_index + 1:end_word_index + 1] = ['I-' + slot_type[0]] * (end_word_index - start_word_index)

    return labels


In [None]:
def set_labels_frame(row):
  labels_utt = slot_filling(row['utterance'], row['slots_dict'], row['slots_begin'], row['slots_end'])
  return labels_utt

In [None]:
data_train['labels'] = data_train.apply(set_labels_frame, axis=1)
data_test['labels'] = data_test.apply(set_labels_frame, axis=1)
data_val['labels'] = data_val.apply(set_labels_frame, axis=1)

Now, we will save in a set all the labels that appear in the utterances in the training set

In [None]:
labels_conjunto=set()
for label in data_train['labels']:
  labels_conjunto.update(label)

In [None]:
len_labels=len(labels_conjunto)

In [None]:
labels_list=list(labels_conjunto)

In [None]:
id2label = {idx:label for idx, label in enumerate(labels_list)}
label2id = {label:idx for idx, label in enumerate(labels_list)}

In [None]:
def set_labels_ids_frame(row):
  labels_id=[]
  for label_name in row['labels']:
    labels_id.append(label2id[label_name])
  return labels_id

In [None]:
data_train['labels_id'] = data_train.apply(set_labels_ids_frame, axis=1)
data_test['labels_id'] = data_test.apply(set_labels_ids_frame, axis=1)
data_val['labels_id'] = data_val.apply(set_labels_ids_frame, axis=1)

In [None]:
data_train['utterance_split'] = data_train['utterance'].apply(lambda x: x.split())
data_test['utterance_split'] = data_test['utterance'].apply(lambda x: x.split())
data_val['utterance_split'] = data_val['utterance'].apply(lambda x: x.split())

In [None]:
columnas_mantener=['utterance_split','labels_id']

In [None]:
data_train=data_train[columnas_mantener]
data_test=data_test[columnas_mantener]
data_val=data_val[columnas_mantener]

In [None]:
from datasets import Dataset

train_dict = {"text": [utt for utt in data_train['utterance_split']], "label": [ids for ids in data_train['labels_id']]}
val_dict = {"text": [utt for utt in data_val['utterance_split']], "label": [ids for ids in data_val['labels_id']]}
test_dict = {"text": [utt for utt in data_test['utterance_split']], "label": [ids for ids in data_test['labels_id']]}

train_dataset = Dataset.from_dict(train_dict)
val_dataset = Dataset.from_dict(val_dict)
test_dataset = Dataset.from_dict(test_dict)

slot_filling_dataset = DatasetDict({"train": train_dataset, "val": val_dataset, "test": test_dataset})

In [None]:
slot_filling_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 19501
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2316
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2294
    })
})

In [None]:
slot_filling_dataset['train']['label'][0]

[13, 13, 13, 13, 13, 13, 13, 13, 5, 13, 18]

In [None]:
#we get this code online and what it does is tokenizing input text using the Hugging Face tokenizer, aligning labels with individual tokens based on word
#boundaries, and returning a dictionary (tokenized_inputs) containing the tokenized information along with aligned labels.

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=True)



    labels = []
    for i, label in enumerate(examples['label']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
from transformers import BertTokenizer, AutoModelForTokenClassification
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenized_data = slot_filling_dataset.map(tokenize_and_align_labels, batched=True)



In [None]:
import evaluate

seqeval = evaluate.load("seqeval")

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
#import the classification report to get individual scores
from seqeval.metrics import classification_report
import numpy as np

In [None]:
def calculate_performance_metrics(prediction_data):
    predictions, labels = prediction_data
    predictions = np.argmax(predictions, axis=2)

    extract_labels = lambda prediction, label: [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]

    true_predictions = [extract_labels(prediction, label) for prediction, label in zip(predictions, labels)]
    true_labels = [extract_labels(prediction, label) for prediction, label in zip(predictions, labels)]

    evaluation_results = seqeval.compute(predictions=true_predictions, references=true_labels)
    print(classification_report(true_predictions, true_labels))

    return {
        "precision": evaluation_results["overall_precision"],
        "recall": evaluation_results["overall_recall"],
        "f1": evaluation_results["overall_f1"],
        "accuracy": evaluation_results["overall_accuracy"],
    }


In [None]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len_labels, id2label=id2label, label2id=label2id)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.to('cuda')

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="model-bert-test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model='f1',
    load_best_model_at_end=True,
)

In [None]:
#if we do not remove the column names we get an error when training
tokenized_data = tokenized_data.remove_columns(slot_filling_dataset["train"].column_names)

In [None]:
tokenized_data['train']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 19501
})

In [None]:
#generate tensors for BERT
tokenized_data.set_format('torch')

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_performance_metrics,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0958,0.070794,0.875087,0.923775,0.898772,0.97783
2,0.0683,0.062283,0.890102,0.938971,0.913884,0.978884
3,0.0536,0.060614,0.901205,0.934559,0.917579,0.980257
4,0.047,0.061984,0.896189,0.939461,0.917315,0.980194
5,0.038,0.064219,0.899503,0.932353,0.915634,0.980034


                       precision    recall  f1-score   support

           hotel-area       0.74      0.83      0.78       212
        hotel-bookday       0.96      0.94      0.95       272
     hotel-bookpeople       0.90      0.90      0.90       253
       hotel-bookstay       0.95      0.97      0.96       313
           hotel-name       0.90      0.87      0.88       163
     hotel-pricerange       0.78      0.84      0.81       246
          hotel-stars       1.00      0.94      0.97       219
           hotel-type       0.82      0.76      0.79       287
      restaurant-area       0.95      0.79      0.87       390
   restaurant-bookday       0.99      0.91      0.95       299
restaurant-bookpeople       0.99      0.86      0.92       302
  restaurant-booktime       0.98      0.95      0.97       355
      restaurant-food       0.97      0.91      0.94       447
      restaurant-name       0.86      0.83      0.85       162
restaurant-pricerange       0.97      0.81      0.88  

TrainOutput(global_step=6095, training_loss=0.07199889895757561, metrics={'train_runtime': 872.2889, 'train_samples_per_second': 111.781, 'train_steps_per_second': 6.987, 'total_flos': 1602812086688160.0, 'train_loss': 0.07199889895757561, 'epoch': 5.0})

In [None]:
model.save_pretrained('bert-base-uncased-slot-filling')

In [None]:
from transformers import AutoModelForTokenClassification

# Replace 'path_to_your_model_directory' with the actual path to your saved model directory
model_directory = 'bert-base-uncased-slot-filling'

# Load the pre-trained BERT model
loaded_model = AutoModelForTokenClassification.from_pretrained(model_directory)

In [None]:
model.to('cpu')

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el