In [None]:
# Mount Google Drive at /content/gdrive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# change the path to where your notebook is located
%cd "gdrive/My Drive/Colab Notebooks"

[Errno 2] No such file or directory: 'gdrive/My Drive/Colab Notebooks'
/content/gdrive/My Drive/Colab Notebooks


In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U




In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [None]:
# Run it only once: loads the "multi_woz_v22" dataset through `datasets.load_dataset`.
# The result is saved to disk and reloaded by the following cells.
from datasets import load_dataset
dataset = load_dataset("multi_woz_v22")

In [None]:
# Run it only once: writes every split of `dataset` to the local "dataset.hf"
# directory (relative to the current working directory).
dataset.save_to_disk("dataset.hf")

Saving the dataset (0/1 shards):   0%|          | 0/8437 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Reload the splits previously written to "dataset.hf"; `dataset` is read as a
# global by `make_df`.
from datasets import DatasetDict
dataset = DatasetDict.load_from_disk("dataset.hf")

In [None]:
import pandas as pd

def _user_act_labels(act_types):
    """Return the dialogue-act labels of a user turn that go into the model input.

    Acts starting with Hotel/Restaurant/Booking/general are kept verbatim; any
    other act is reduced to the part before the first '-' and added only once.
    """
    act_labels = []
    for act_type in act_types:
        if act_type.startswith(('Hotel', 'Restaurant', 'Booking', 'general')):
            act_labels.append(act_type)
        else:
            prefix = act_type.split('-')[0]
            if prefix not in act_labels:
                act_labels.append(prefix)
    return act_labels


def _user_slots_by_act(turn_acts):
    """Map each act type of a user turn to its list of (slot name, slot value) pairs.

    Pairs come first from the turn's `span_info`, then from Hotel/Restaurant acts
    whose slot value is "?" (slots named "none" are ignored there).
    """
    slots_by_act = {}

    span_info = turn_acts['span_info']
    for act_type, slot_name, slot_value in zip(span_info['act_type'],
                                               span_info['act_slot_name'],
                                               span_info['act_slot_value']):
        slots_by_act.setdefault(act_type, []).append((slot_name, slot_value))

    dialog_act = turn_acts['dialog_act']
    for act_type, slots in zip(dialog_act['act_type'], dialog_act['act_slots']):
        if not act_type.startswith(('Hotel', 'Restaurant')):
            continue
        for slot_name, slot_value in zip(slots['slot_name'], slots['slot_value']):
            if slot_name != 'none' and slot_value == '?':
                slots_by_act.setdefault(act_type, []).append((slot_name, slot_value))
    return slots_by_act


def _format_user_context(act_labels, slots_by_act):
    """Serialise the user's acts and slots into the suffix appended to the utterance."""
    context = ""
    for act_label in act_labels:
        context += f' | {act_label} '
        if act_label in slots_by_act:
            context += '|'
            for name, value in slots_by_act[act_label]:
                context += f' ({name}, {value}) '
    return context


def _agent_slot_targets(act_types, act_slots, booking_services):
    """Return (requested, retrieved) slot labels for one agent turn.

    Both are sorted, de-duplicated lists; an empty result becomes ['none'].
    A "booking" domain is replaced by the hotel/restaurant service of the last
    user turn when exactly one such service was active.
    """
    requested, retrieved = set(), set()
    for act_type, slots in zip(act_types, act_slots):
        domain = act_type.split('-')[0].lower()
        if domain == 'booking' and len(booking_services) == 1:
            domain = booking_services[0]
        pairs = list(zip(slots['slot_name'], slots['slot_value']))

        # Slots the agent informs about (a concrete value, not a question).
        informed = [f'{domain}-{name}' for name, value in pairs
                    if value != '?' and name != 'none']
        if informed:
            retrieved.add(f'{domain}-availability')
            retrieved.update(slot for slot in informed if slot != f'{domain}-choice')

        # Slots the agent asks the user for. Slot names are not filtered here,
        # so a slot literally named "none" with value "?" is kept as a label.
        if domain in ('hotel', 'restaurant', 'booking', 'general'):
            requested.update(f'{domain}-{name}' for name, value in pairs if value == '?')

    return sorted(requested) or ['none'], sorted(retrieved) or ['none']


def make_df(split, target='requested'):
    """Build the training frame for one split of the global `dataset`.

    Only dialogues with at least one turn whose frame services include "hotel"
    or "restaurant" are used. One row is produced per agent turn that has a
    Hotel/Restaurant/Booking dialogue act:

    * ``utterance`` - the preceding user utterance followed by that user turn's
      dialogue acts and slots, e.g. ``"... | Hotel-Inform | (type, hotel) "``.
    * ``acts`` - a sorted list of slot labels for the agent turn, ``['none']``
      when there are none.

    Args:
        split: key into `dataset`, e.g. 'train', 'validation' or 'test'.
        target: 'requested' (default) labels the slots the agent asks for
            (slot value "?"); 'retrieved' labels the slots the agent informs
            about, plus a '<domain>-availability' marker.

    Returns:
        pandas.DataFrame with columns ['utterance', 'acts'].

    Raises:
        ValueError: if `target` is neither 'requested' nor 'retrieved'.
    """
    if target not in ('requested', 'retrieved'):
        raise ValueError(f"target must be 'requested' or 'retrieved', got {target!r}")

    speaker_str = {0: 'User', 1: 'Agent'}
    rows = []
    for dial in dataset[split]:
        turns = dial['turns']
        # Skip dialogues that are not in the hotel or restaurant domain.
        if not any(service in ('hotel', 'restaurant')
                   for frame in turns['frames'] for service in frame['service']):
            continue

        # State carried from the most recent user turn to the next agent turn.
        # Reset per dialogue so nothing leaks in from the previous one.
        prev_user_utt = None
        user_act_labels = []
        user_slots = {}
        current_booking_service = []

        for turn_id, utt in enumerate(turns['utterance']):
            speaker = speaker_str[turns['speaker'][turn_id]]
            turn_acts = turns['dialogue_acts'][turn_id]
            act_types = turn_acts['dialog_act']['act_type']

            if speaker == 'User':
                prev_user_utt = utt
                user_act_labels = _user_act_labels(act_types)
                user_slots = _user_slots_by_act(turn_acts)
                current_booking_service = [
                    service for service in turns['frames'][turn_id]['service']
                    if service in ('hotel', 'restaurant')
                ]
                continue

            # Agent turn: only in-domain turns that follow a user turn yield a row.
            if not any(act.startswith(('Hotel', 'Restaurant', 'Booking')) for act in act_types):
                continue
            if prev_user_utt is None:
                continue

            requested, retrieved = _agent_slot_targets(
                act_types, turn_acts['dialog_act']['act_slots'], current_booking_service)
            # The original concatenated an undefined name here; the serialised
            # user context is what belongs after the utterance.
            train_ut = prev_user_utt + _format_user_context(user_act_labels, user_slots)
            rows.append([train_ut, requested if target == 'requested' else retrieved])

    return pd.DataFrame(rows, columns=['utterance', 'acts'])




In [None]:
# Build one frame per dataset split, in train / validation / test order.
train_df, val_df, test_df = (
    make_df(split_name) for split_name in ('train', 'validation', 'test')
)

In [None]:
def encoding(df):
  """Replace the list-valued 'acts' column with one indicator column per label.

  Args:
    df: DataFrame with an 'acts' column holding a list of label strings per row.

  Returns:
    A new DataFrame with every other column of `df` unchanged, followed by one
    integer column per distinct label (sorted by label name) that counts how
    often the label occurs in that row's list. `df` itself is not modified.
  """
  # One row per (original row, label); the original index is repeated.
  exploded = df['acts'].explode()
  # Summing per original index collapses the rows back to one per utterance.
  # `groupby(level=0).sum()` replaces `.sum(level=0)`, which pandas deprecated
  # and then removed in 2.0.
  dummies = pd.get_dummies(exploded, dtype=int).groupby(level=0).sum()

  # Concatenate the indicator columns with the original frame, minus 'acts'.
  return pd.concat([df.drop('acts', axis=1), dummies], axis=1)

In [None]:
# One-hot encode the label lists of every split.
train_df_enc, val_df_enc, test_df_enc = map(encoding, (train_df, val_df, test_df))

  dummies = pd.get_dummies(df['acts'].apply(pd.Series).stack()).sum(level=0)
  dummies = pd.get_dummies(df['acts'].apply(pd.Series).stack()).sum(level=0)
  dummies = pd.get_dummies(df['acts'].apply(pd.Series).stack()).sum(level=0)


In [None]:
# Every column of the encoded training frame except the text column is a label.
labels_requested = [column for column in train_df_enc.columns if column != 'utterance']
# Index <-> label lookups in the column order above.
id2label = dict(enumerate(labels_requested))
label2id = {name: position for position, name in id2label.items()}
labels_requested

['booking-bookday',
 'booking-bookpeople',
 'booking-bookstay',
 'booking-booktime',
 'hotel-area',
 'hotel-bookday',
 'hotel-bookpeople',
 'hotel-bookstay',
 'hotel-booktime',
 'hotel-internet',
 'hotel-name',
 'hotel-parking',
 'hotel-pricerange',
 'hotel-stars',
 'hotel-type',
 'none',
 'restaurant-area',
 'restaurant-bookday',
 'restaurant-bookpeople',
 'restaurant-bookstay',
 'restaurant-booktime',
 'restaurant-food',
 'restaurant-name',
 'restaurant-none',
 'restaurant-pricerange']

In [None]:
# Alias read as a global by `preprocessing`, the model setup and the final report.
labels=labels_requested

In [None]:
from transformers import AutoTokenizer
# import numpy as np

# Tokenizer for the "roberta-base" checkpoint; read as a global by `preprocessing` and `predict`.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def preprocessing(examples):
    """Tokenize a batch of utterances and attach one label row per example.

    Args:
        examples: batch mapping with an "utterance" list and, for each label
            present in this split, a list of per-example indicator values.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        "labels" entry: a list of `len(labels)` floats per example, in the
        order of the global `labels`. Labels missing from the batch stay 0.
    """
    utterances = examples["utterance"]
    batch = tokenizer(utterances, padding="max_length", truncation=True, max_length=128)

    # Start from an all-zero (batch_size, num_labels) matrix and copy in the
    # columns that this batch actually provides.
    available = set(examples.keys())
    target = np.zeros((len(utterances), len(labels)))
    for column, name in enumerate(labels):
        if name in available:
            target[:, column] = examples[name]

    batch["labels"] = target.tolist()
    return batch


In [None]:
from datasets import Dataset


In [None]:
# Wrap each encoded frame in a `datasets.Dataset` (train, test, validation order).
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df_enc, test_df_enc, val_df_enc)
)

In [None]:
# Tokenize every split in batches; all original columns are dropped so only the
# tokenizer output and the "labels" matrix remain.
encoded_train, encoded_test, encoded_val = (
    split.map(preprocessing, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, test_dataset, val_dataset)
)

Map:   0%|          | 0/25603 [00:00<?, ? examples/s]

Map:   0%|          | 0/2979 [00:00<?, ? examples/s]

Map:   0%|          | 0/2988 [00:00<?, ? examples/s]

In [None]:
# Have every encoded split hand out torch tensors.
for encoded_split in (encoded_train, encoded_test, encoded_val):
    encoded_split.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of assuming a CUDA runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# RoBERTa with a sequence-classification head that has one output per label.
# `problem_type="multi_label_classification"` lets several labels be active for
# the same utterance; the id/label maps make the outputs readable.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
batch_size = 16  # per-device batch size used for both training and evaluation
metric_name = "f1"  # key of the metrics dict used to pick the best checkpoint

In [None]:
!pip install accelerate -U
!pip install transformers[torch]



In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Training configuration. The first positional argument ("confs") is the
# directory name handed to TrainingArguments.
args = TrainingArguments(
    "confs",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01,
    # After training, restore the checkpoint with the best `metric_name` score
    # as reported by the metrics callback passed to the Trainer.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged multi-label metrics from raw logits.

    Args:
        predictions: logits, array-like of shape (n_examples, n_labels).
        labels: 0/1 ground-truth indicators with the same shape.
        threshold: a label counts as predicted when its sigmoid probability is
            greater than or equal to this value.

    Returns:
        dict with keys 'f1' (micro F1 of the thresholded predictions),
        'roc_auc' (micro ROC AUC of the probabilities) and 'accuracy'
        (exact-match accuracy of the thresholded predictions).
    """
    # Logits -> independent per-label probabilities.
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
    y_true = np.asarray(labels)
    y_pred = (probs >= threshold).astype(float)

    return {
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
        # ROC AUC is a ranking metric, so it is given the probabilities rather
        # than the already-thresholded 0/1 predictions.
        'roc_auc': roc_auc_score(y_true, probs, average='micro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

def calculate_performance_metrics(predictions: "EvalPrediction"):
    """Metrics callback for the Trainer.

    Args:
        predictions: object with a `predictions` attribute (the logits, or a
            tuple whose first element is the logits) and a `label_ids`
            attribute holding the ground-truth indicators.

    Returns:
        The metrics dict produced by `multi_label_metrics`.
    """
    logits = predictions.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    # Delegate to the helper defined in this cell, using its parameter names.
    return multi_label_metrics(predictions=logits, labels=predictions.label_ids)

In [None]:
# Wire the model, training arguments, datasets and metrics callback together.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train,
    eval_dataset=encoded_val,  # validation split, evaluated per the strategy set in `args`
    tokenizer=tokenizer,
    compute_metrics=calculate_performance_metrics
)

In [None]:
# Fine-tune the model with the configuration held by `trainer`.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
def predict(utt):
  """Return the highest-scoring label name for a single utterance.

  Uses the notebook's `model`, `tokenizer` and `id2label` globals.

  Args:
    utt: the input text, in the same format as the 'utterance' column.

  Returns:
    The label string whose logit is largest.

  NOTE(review): the model is trained as a multi-label classifier, but only the
  single top label is returned here; utterances with several gold labels can
  never be predicted completely. Confirm whether top-1 is intended.
  """
  # Disable dropout so repeated calls give the same answer.
  model.eval()
  # Inputs must live on the same device as the model's weights.
  model_device = next(model.parameters()).device
  # Same truncation length as used when encoding the training data.
  inputs = tokenizer(utt, return_tensors="pt", truncation=True, max_length=128)
  inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

  with torch.no_grad():
      logits = model(**inputs).logits

  best_index = int(torch.argmax(logits, dim=1).item())
  return id2label[best_index]

In [None]:
# Gold label lists, one per row of `train_df`.
# NOTE(review): this scores the model on the training split rather than on
# `test_df` — confirm that is intended.
true_preds = list(train_df['acts'][:])

In [None]:
# Predict one label per utterance of `train_df`, wrapped in a list so each entry
# has the same shape as the gold label lists. A progress bar replaces printing
# one line per row.
pred_preds = [[predict(utt)] for utt in tqdm(train_df['utterance'], desc="predicting")]

In [None]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Fix the column order to `labels`, fit once, then binarize all rows in a single
# call instead of refitting the binarizer for every row.
mlb = MultiLabelBinarizer(classes=list(labels))
y_true = mlb.fit_transform(true_preds)
y_pred = mlb.transform(pred_preds)

# Per-label precision / recall / F1 plus the averaged rows.
report = classification_report(y_true, y_pred, target_names=list(labels))

# Print the report
print(report)