In [None]:
!pip install -U datasets > /dev/null
!pip install contractions > /dev/null
!pip install -U tqdm > /dev/null
!pip install fasttext > /dev/null

In [None]:
# Root folder where every dataset artefact produced by this notebook is written.
# NOTE(review): the relative "drive/MyDrive" prefix presumably requires a mounted Google Drive — confirm.
data_path = "drive/MyDrive/Datasets/MultiWOZ_2.2"
# Create the folder (and any missing parents); `-p` makes this a no-op when it already exists.
!mkdir -p {data_path}

# Folder holding the database JSON files (`<name>_db.json`) read by build_db_regexes().
db_path = f"{data_path}/db"

# Prepare Data

In [None]:
import os
import json
import pandas as pd
import glob
import random
import re
from collections import defaultdict
import contractions
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [None]:
from datasets import load_dataset

# Load the MultiWOZ 2.2 dataset; the result is indexed by split name further down
# ("train", "validation", "test").
# NOTE(review): `ignore_verifications` is deprecated in newer `datasets` releases in favour of
# `verification_mode` — confirm against the installed version before upgrading.
dataset = load_dataset("multi_woz_v22", ignore_verifications=True)

Downloading:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

No config specified, defaulting to: multi_woz_v22/v2.2_active_only


Downloading and preparing dataset multi_woz_v22/v2.2_active_only (download: 263.97 MiB, generated: 49.24 MiB, post-processed: Unknown size, total: 313.21 MiB) to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset multi_woz_v22 downloaded and prepared to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a. Subsequent calls will reuse this data.


In [None]:
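# One-time setup, kept commented out: fetch the database JSON files from the MultiWOZ
# repository into {data_path} (they are read from {db_path} by build_db_regexes below).
# The `tail -n +7` step drops the first 6 lines of hospital_db.json — presumably a
# non-JSON header; confirm before re-running.
# !git clone https://github.com/budzianowski/multiwoz.git > /dev/null
# !cp -r multiwoz/db {data_path}
# !tail -n +7 {db_path}/hospital_db.json > tmp.json && mv tmp.json {db_path}/hospital_db.json && rm -rf tmp.json
# !rm -rf multiwoz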

In [None]:
# Fetch the NLTK stop-word corpus; it is read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def build_db_regexes(db_dir=None):
    """Build entity sets and alternation regexes from the database JSON files.

    Args:
        db_dir: Folder containing ``attraction_db.json``, ``hotel_db.json``,
            ``restaurant_db.json``, ``hospital_db.json`` and ``train_db.json``.
            Defaults to the module-level ``db_path``.

    Returns:
        A pair ``(items, regexes)``:

        * ``items`` maps ``"phone"``, ``"postcode"`` and ``"trainID"`` to the set of
          values found in the databases.
        * ``regexes`` maps each of ``"attraction"``, ``"hotel"``, ``"restaurant"``,
          ``"hospital"`` (entity / department names) and every key of ``items`` to a
          regex string of the form ``(v1)|(v2)|...``.

    All values are lower-cased because ``clean_text`` lower-cases the text before the
    patterns are applied; values that keep upper-case characters could never match.
    """
    if db_dir is None:
        db_dir = db_path

    def load_db(db_name):
        with open(f"{db_dir}/{db_name}_db.json", "r") as json_file:
            return json.load(json_file)

    def values_of(db, key):
        # Skip missing/empty values: an empty alternative `()` would match everywhere.
        return {db_obj[key].lower() for db_obj in db if db_obj.get(key)}

    def alternation(values):
        # Escape regex metacharacters so the values are matched literally, and order
        # longest-first (then alphabetically) so that a short value cannot shadow a
        # longer one sharing its prefix and the result is deterministic across runs.
        ordered = sorted(values, key=lambda value: (-len(value), value))
        return "|".join(f"({re.escape(value)})" for value in ordered)

    items = defaultdict(set)
    regexes = defaultdict(str)

    for db_name in ["attraction", "hotel", "restaurant"]:
        db = load_db(db_name)
        regexes[db_name] = alternation(values_of(db, "name"))
        items["phone"].update(values_of(db, "phone"))
        items["postcode"].update(values_of(db, "postcode"))

    hospital_db = load_db("hospital")
    regexes["hospital"] = alternation(values_of(hospital_db, "department"))
    items["phone"].update(values_of(hospital_db, "phone"))

    items["trainID"] = values_of(load_db("train"), "trainID")

    for key in items.keys():
        regexes[key] = alternation(items[key])

    return items, regexes

In [None]:
# Maps a replacement string to the characters it replaces in clean_text():
# symbols under "" are deleted, sentence punctuation under " " becomes a space.
replace_invalid_chars = {
    "": list('"#$%&\'()*+-/:<=>@\\`~’'),
    " ": list(",.!?"),
}


# One or more consecutive ASCII digits.
num_pattern = re.compile(r"[0-9]+")
def replace_num(text):
    """Return `text` with every run of digits collapsed into the placeholder NUM."""
    return num_pattern.sub("NUM", text)

# 12-hour clock such as "9:05am" or "10:15 PM". The case-insensitive flag is passed to
# re.compile() instead of an inline "(?i)" in the middle of the pattern: a global inline
# flag that is not at the start of the expression is deprecated and is rejected with
# re.error on Python 3.11+.
clock12_pattern = re.compile(r"(1[012]|[1-9]):[0-5][0-9](\s)?(am|pm)", re.IGNORECASE)
# 24-hour clock such as "7:30" or "23:59".
clock24_pattern = re.compile(r"([01]?[0-9]|2[0-3]):[0-5][0-9]")
def replace_clock(text):
    """Replace every clock time in `text` with the placeholder CLOCK.

    The 12-hour pattern runs first so that the am/pm suffix is consumed together with
    the digits; whatever remains is then matched by the 24-hour pattern.
    """
    text = clock12_pattern.sub("CLOCK", text)
    text = clock24_pattern.sub("CLOCK", text)
    return text

# Entity sets and regex strings are built once from the database files, then compiled.
db_items, db_regexes = build_db_regexes()
db_patterns = {name: re.compile(regex) for name, regex in db_regexes.items()}
def replace_dbs(text):
    """Replace database entities found in `text` with the upper-cased pattern name.

    Patterns that also have an entry in `db_items` are only applied when at least one
    whitespace-separated token of the incoming text is a member of that entry; the
    remaining patterns are always applied.
    """
    tokens = text.split()
    for name, pattern in db_patterns.items():
        if name in db_items.keys() and not db_items[name].intersection(tokens):
            continue
        text = pattern.sub(name.upper(), text)
    return text

def clean_text(text):
    """Normalise an utterance into a placeholder-annotated, punctuation-free string.

    Steps, in order: lower-case, expand contractions, replace database entities,
    replace clock times, apply the `replace_invalid_chars` substitutions, and finally
    replace digit runs.
    """
    cleaned = contractions.fix(text.lower())
    cleaned = replace_clock(replace_dbs(cleaned))
    for substitute, unwanted_chars in replace_invalid_chars.items():
        for unwanted in unwanted_chars:
            cleaned = cleaned.replace(unwanted, substitute)
    return replace_num(cleaned)

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
def tokenize(text):
    """Split `text` on whitespace and stem every token that is not all upper-case.

    Upper-case tokens are returned untouched; `stop_words` is currently not applied.
    """
    tokens = []
    for token in text.split():
        tokens.append(token if token.isupper() else stemmer.stem(token))
    # Stop-word removal is intentionally disabled:
    # tokens = [token for token in tokens if token not in stop_words]
    return tokens

  if sys.path[0] == '':


In [None]:
def _align_slot_labels(words, slot_values, slot_names0, slot_names1):
    """Greedily align slot tokens with the utterance tokens, left to right.

    Returns ``(slots0, slots1, matched)`` where the two label lists have one entry per
    word ("O" for words outside a slot) and ``matched`` is the number of slot tokens
    that were found in order.
    """
    slots0, slots1 = [], []
    matched = 0
    for word in words:
        if matched < len(slot_values) and word == slot_values[matched]:
            slots0.append(slot_names0[matched])
            slots1.append(slot_names1[matched])
            matched += 1
        else:
            slots0.append("O")
            slots1.append("O")
    return slots0, slots1, matched


def parse_data(tvt):
    """Convert one split of the global `dataset` into cleaned, slot-labelled turns.

    Args:
        tvt: Split name used to index `dataset` (e.g. "train", "validation", "test").

    Returns:
        A dict mapping each dialogue id to a list of turn dicts with the keys
        "speaker", "words", "slots0" (slot name per word), "slots1"
        (``<act-type prefix>_<slot name>`` per word), "domains" and "intents".
        Turns whose slot tokens cannot all be aligned with the cleaned utterance are
        dropped.
    """
    # filtered_domains = set(["bus", "police", "hospital"])
    filtered_domains = set()

    cleaned_dialogues = {}
    for dialogue in dataset[tvt]:
        dialogue_id = dialogue["dialogue_id"]
        turns = dialogue["turns"]
        cleaned_turns = []
        for speaker, utterance, frames, dialogue_acts in zip(turns["speaker"], turns["utterance"], turns["frames"], turns["dialogue_acts"]):
            # Only services with an active intent count as domains of the turn.
            domains, intents = [], []
            for service, state in zip(frames["service"], frames["state"]):
                if state["active_intent"] != "NONE":
                    domains.append(service)
                    intents.append(state["active_intent"])

            if filtered_domains.intersection(domains):
                continue

            # Sort each span TOGETHER with its slot name and act type. Sorting only the
            # positions and then indexing the names by the sorted index attaches the
            # wrong label whenever the spans are not already in utterance order.
            span_info = dialogue_acts["span_info"]
            spans = sorted(
                zip(span_info["span_start"], span_info["span_end"], span_info["act_slot_name"], span_info["act_type"]),
                key=lambda span: (span[0], span[1]),
            )

            slot_names0, slot_names1, slot_values = [], [], []
            for si, ei, slot_name0, act_type in spans:
                slot_type = act_type.split('-')[0].lower()
                # A multi-word slot value contributes one label per token.
                for word_slot_value in tokenize(clean_text(utterance[si: ei])):
                    slot_values.append(word_slot_value)
                    # slot_names0.append(f"{'I' if idx2 else 'B'}-{slot_name0}")
                    slot_names0.append(slot_name0)
                    # slot_names1.append(f"{'I' if idx2 else 'B'}-{slot_type}_{slot_name0}")
                    slot_names1.append(f"{slot_type}_{slot_name0}")

            text = clean_text(utterance)
            words = tokenize(text)

            slots0, slots1, matched = _align_slot_labels(words, slot_values, slot_names0, slot_names1)

            # Drop the turn when some slot token could not be located in the utterance.
            if matched != len(slot_values):
                continue

            cleaned_turns.append({
                "speaker": speaker,
                "words": words,
                "slots0": slots0,
                "slots1": slots1,  # TODO: check and revise
                "domains": domains,
                "intents": intents
            })

        cleaned_dialogues[dialogue_id] = cleaned_turns

    print(f"{tvt} done!")
    return cleaned_dialogues


# Parse the three splits in order: train, validation, test.
train_data, val_data, test_data = (
    parse_data(split_name) for split_name in ("train", "validation", "test")
)

train done!
validation done!
test done!


In [None]:
# Persist each cleaned split as <data_path>/<split>.json.
for split_name, split_data in (("train", train_data), ("validation", val_data), ("test", test_data)):
    with open(f"{data_path}/{split_name}.json", "w") as json_file:
        json.dump(split_data, json_file)

In [None]:
import gc

# Drop the reference to the raw dataset and force a collection to release its memory.
# NOTE: parse_data() reads the global `dataset`, so it cannot be re-run after this cell
# without loading the dataset again.
del dataset
gc.collect()

196

# Label Refinement

In [None]:
import math
import os
import json
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import fasttext
from collections import defaultdict

In [None]:
def _read_split(split_name):
    """Load <data_path>/<split_name>.json and return the parsed content."""
    with open(f"{data_path}/{split_name}.json", "r") as json_file:
        return json.load(json_file)

train_data = _read_split("train")
val_data = _read_split("validation")
test_data = _read_split("test")

In [None]:
def read_data(data):
    """Flatten dialogues into parallel lists of turn texts and binary labels.

    Args:
        data: Dict mapping a dialogue id to a list of turn dicts, each with the keys
            "words" (list of tokens) and "domains" (list).

    Returns:
        ``(X, y)`` where ``X[i]`` is the space-joined tokens of the i-th turn and
        ``y[i]`` is 1 when that turn has at least one domain, else 0.
    """
    all_turns = [trn for dlg in data.values() for trn in dlg]
    X = [" ".join(trn["words"]) for trn in all_turns]
    y = [1 if len(trn["domains"]) else 0 for trn in all_turns]
    return X, y

# Texts and binary labels for each split, in the order train / validation / test.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    read_data(split_data) for split_data in (train_data, val_data, test_data)
)

In [None]:
# Number of turns in the train / validation / test splits.
print(*map(len, (y_train, y_val, y_test)))

108139 14012 13993


In [None]:
def prepare_fasttext_file(filename, X, Y):
    """Write `<filename>.txt` in fastText supervised format.

    Each output line is ``__label__<y> <x>`` for the paired elements of `X` and `Y`
    (pairing stops at the shorter of the two).

    Args:
        filename: Output path without the ``.txt`` extension.
        X: Iterable of texts, one example per element.
        Y: Iterable of labels, parallel to `X`.
    """
    # Explicit UTF-8: fastText expects UTF-8 input, and the platform default encoding
    # may differ or be unable to encode non-ASCII tokens.
    with open(f"{filename}.txt", "w", encoding="utf-8") as txt_file:
        txt_file.writelines(f"__label__{y} {x}\n" for x, y in zip(X, Y))

# One fastText file per split: train.txt, val.txt, test.txt.
for split_file, split_texts, split_labels in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    prepare_fasttext_file(split_file, split_texts, split_labels)

In [None]:
import gc

run_flag = True
model_path = "drive/MyDrive/Development/ID_in_CRS/label_model.bin"

MAX_ROUNDS = 6           # upper bound on train -> relabel iterations
BASE_MARGIN = 0.15       # round r relabels when confidence > 1 - BASE_MARGIN / log2(r + 2)
LOADED_THRESHOLD = 0.95  # confidence required when relabelling with a previously saved model


def _predict_labels(model, texts):
    """Return ``(labels, confidences)`` of the top prediction for every text."""
    raw_labels, raw_probs = model.predict(texts)
    # Labels are written as "__label__0" / "__label__1" by prepare_fasttext_file,
    # so the last character of the top label is the class id.
    labels = [int(top[0][-1]) for top in raw_labels]
    confidences = [top[0] for top in raw_probs]
    return labels, confidences


def _print_test_report(model):
    """Print the classification report and confusion matrix on the test split."""
    preds, _ = _predict_labels(model, X_test)
    print(classification_report(y_test, preds, digits=4))
    print(confusion_matrix(y_test, preds))


def _relabel(model, texts, labels, threshold):
    """Overwrite, in place, labels the model contradicts with confidence > threshold.

    Returns the number of labels changed.
    """
    preds, confidences = _predict_labels(model, texts)
    num_changes = 0
    for idx, (pred, confidence) in enumerate(zip(preds, confidences)):
        if labels[idx] != pred and confidence > threshold:
            labels[idx] = pred
            num_changes += 1
    return num_changes


# Only the training labels are refined; validation and test labels stay untouched.
# NOTE: this cell mutates y_train and rewrites train.txt, so re-running it continues
# from the already-refined labels rather than from the original ones.
if run_flag or not os.path.exists(model_path):
    # save_model() needs the target folder to exist.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    for rnd in range(MAX_ROUNDS):
        print(f"ROUND {rnd + 1}")

        model = fasttext.train_supervised(
            input="train.txt",
            autotuneValidationFile="val.txt",
            epoch=5
        )

        _print_test_report(model)
        print("#" * 100)

        # The confidence threshold rises with the round: 0.85 in round 1, then higher.
        threshold = 1 - (BASE_MARGIN / math.log2(rnd + 2))
        num_changes = _relabel(model, X_train, y_train, threshold)

        prepare_fasttext_file("train", X_train, y_train)
        model.save_model(model_path)

        del model
        gc.collect()

        # Converged: the model no longer confidently contradicts any training label.
        if num_changes == 0:
            break
else:
    model = fasttext.load_model(model_path)

    _print_test_report(model)
    _relabel(model, X_train, y_train, LOADED_THRESHOLD)

    prepare_fasttext_file("train", X_train, y_train)

ROUND 1
              precision    recall  f1-score   support

           0     0.9778    0.9864    0.9821      7875
           1     0.9823    0.9712    0.9767      6118

    accuracy                         0.9798     13993
   macro avg     0.9801    0.9788    0.9794     13993
weighted avg     0.9798    0.9798    0.9798     13993

[[7768  107]
 [ 176 5942]]
####################################################################################################
ROUND 2
              precision    recall  f1-score   support

           0     0.9778    0.9858    0.9818      7875
           1     0.9815    0.9712    0.9763      6118

    accuracy                         0.9794     13993
   macro avg     0.9797    0.9785    0.9791     13993
weighted avg     0.9794    0.9794    0.9794     13993

[[7763  112]
 [ 176 5942]]
####################################################################################################
ROUND 3
              precision    recall  f1-score   support

           

In [None]:
# Train the classifier once more on the (possibly relabelled) train.txt.  TODO: refine
model = fasttext.train_supervised(input="train.txt", autotuneValidationFile="val.txt", epoch=5)

# Evaluate on the test split; only the predicted labels are needed here.
predicted_labels, _ = model.predict(X_test)
preds = [int(top_labels[0][-1]) for top_labels in predicted_labels]
print(classification_report(y_test, preds, digits=4))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

           0     0.9769    0.9862    0.9815      7875
           1     0.9820    0.9699    0.9759      6118

    accuracy                         0.9791     13993
   macro avg     0.9794    0.9780    0.9787     13993
weighted avg     0.9791    0.9791    0.9790     13993

[[7766  109]
 [ 184 5934]]


In [None]:
refined_train_data, refined_val_data, refined_test_data = defaultdict(list), defaultdict(list), defaultdict(list)

# Keep a turn only when its (possibly refined) label agrees with whether it has intents:
# label 1 <-> at least one intent, label 0 <-> no intent.
for data, y_list, refined_data in (
    (train_data, y_train, refined_train_data),
    (val_data, y_val, refined_val_data),
    (test_data, y_test, refined_test_data),
):
    idx = 0  # position of the current turn in the flattened label list

    for dlg_id, dlg in data.items():
        for trn in dlg:
            expected_label = 1 if len(trn["intents"]) else 0
            if y_list[idx] == expected_label:
                refined_data[dlg_id].append(trn)
            idx += 1

# Only the training split is replaced by its refined version.
train_data = refined_train_data
# val_data = refined_val_data
# test_data = refined_test_data

del refined_train_data, refined_val_data, refined_test_data

In [None]:
import gc
# Force a garbage-collection pass now that the temporary refined_* names were deleted.
gc.collect()

50

# Save Data

In [None]:
# Vocabularies are collected from the training split only.
all_words, all_domains, all_intents, all_slots0, all_slots1 = set(), set(), set(), set(), set()
field_vocabularies = (
    ("words", all_words),
    ("domains", all_domains),
    ("intents", all_intents),
    ("slots0", all_slots0),
    ("slots1", all_slots1),
)
for dialogue_turns in train_data.values():
    for turn in dialogue_turns:
        for field, vocabulary in field_vocabularies:
            vocabulary.update(turn[field])

# Each vocabulary is stored as a sorted JSON list in <data_path>/<name>.json.
for vocab_name, vocabulary in (
    ("words", all_words),
    ("domains", all_domains),
    ("intents", all_intents),
    ("slots0", all_slots0),
    ("slots1", all_slots1),
):
    with open(f"{data_path}/{vocab_name}.json", "w") as json_file:
        json.dump(sorted(vocabulary), json_file)

# Overwrite the split files written earlier with the current in-memory splits.
for split_name, split_data in (("train", train_data), ("validation", val_data), ("test", test_data)):
    with open(f"{data_path}/{split_name}.json", "w") as json_file:
        json.dump(split_data, json_file)

# Format to txt

In [None]:
import json
import random

In [None]:
# Turn fields used as the intent label and as the per-token slot label in the export.
intent_label = "intents"
slot_label = "slots1"

# Export each split as a text file with one "<word> <slot>" line per token, followed by a
# line with the turn's intent and a blank separator line. The validation split is
# written as dev.txt.
for src_name, dst_name in zip(["train", "validation", "test"], ["train", "dev", "test"]):
    with open(f"{data_path}/{src_name}.json", "r") as json_file:
        data = json.load(json_file)

    all_lines = []
    for dialogue in data.values():
        for turn in dialogue:
            # Only turns with exactly one intent are exported; check this first so no
            # token lines are built for turns that would be discarded anyway.
            # (A disabled alternative labelled intent-less turns as "general".)
            if len(turn[intent_label]) != 1:
                continue
            all_lines.extend(f"{word} {slot}\n" for word, slot in zip(turn["words"], turn[slot_label]))
            all_lines.append(f"{turn[intent_label][0]}\n")
            all_lines.append("\n")

    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(f"{data_path}/{dst_name}.txt", "w", encoding="utf-8") as txt_file:
        txt_file.writelines(all_lines)