In [None]:
# Mount Google Drive at /content/drive so the project code and the dataset
# output directory are reachable from this runtime.
# NOTE(review): later cells use relative "drive/MyDrive/..." paths, so they
# presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install ipython-autotime (pip's stdout is discarded) and load the extension;
# it is what prints the "time: ..." line shown under every cell below.
!pip install ipython-autotime > /dev/null
%load_ext autotime

time: 208 µs (started: 2022-02-21 18:41:17 +00:00)


In [None]:
# Install / upgrade the third-party packages used by this notebook; pip's
# stdout is redirected to /dev/null to keep the cell output short.
# NOTE(review): no versions are pinned, so a rerun may pull different
# releases than the ones this notebook was executed with.
# NOTE(review): fasttext is only referenced by the commented-out
# "Label Refinement" cells.
!pip install -U datasets > /dev/null
!pip install contractions > /dev/null
!pip install -U tqdm > /dev/null
!pip install fasttext > /dev/null

time: 1min 8s (started: 2022-02-21 18:41:17 +00:00)


In [None]:
# Make the project directory on Google Drive importable (it is expected to
# provide the `cleaners` package imported further down).
# NOTE(review): the path is relative, so it resolves against the current
# working directory — presumably /content, where Drive is mounted.
import sys
sys.path.append("drive/MyDrive/Dev/ID_in_CRS")

time: 1.98 ms (started: 2022-02-21 18:42:26 +00:00)


In [None]:
# Output directory for every artefact written by this notebook (split JSON
# files, label inventories and the final txt files).
data_path = "drive/MyDrive/Datasets/MultiWOZ_2.2_v2"
# `mkdir -p` creates the directory (and parents) only if it does not exist yet.
!mkdir -p {data_path}

time: 115 ms (started: 2022-02-21 18:42:26 +00:00)


In [None]:
import os

# Fetch the original MultiWOZ repository for its database files and the
# normalisation mapping consumed by the Cleaner.
# The clone and the header stripping are guarded so that re-running this cell
# is harmless: previously a second run failed the clone but still executed
# `tail`, chopping six more lines off hospital_db.json and corrupting it.
if not os.path.isdir("multiwoz"):
    !git clone https://github.com/budzianowski/multiwoz.git > /dev/null
    # Keep hospital_db.json from line 7 onwards (presumably the first six
    # lines are a non-JSON header — TODO confirm against the repository).
    # The former trailing `rm -rf tmp.json` was dropped: after a successful
    # `mv` the temporary file no longer exists.
    !tail -n +7 multiwoz/db/hospital_db.json > tmp.json && mv tmp.json multiwoz/db/hospital_db.json

db_path = "multiwoz/db"
mapping_path = "multiwoz/utils/mapping.pair"

Cloning into 'multiwoz'...
remote: Enumerating objects: 612, done.[K
remote: Counting objects: 100% (248/248), done.[K
remote: Compressing objects: 100% (203/203), done.[K
remote: Total 612 (delta 143), reused 76 (delta 44), pack-reused 364[K
Receiving objects: 100% (612/612), 126.37 MiB | 15.87 MiB/s, done.
Resolving deltas: 100% (357/357), done.
Checking out files: 100% (61/61), done.
time: 21.6 s (started: 2022-02-21 18:42:26 +00:00)


# Prepare Data

In [None]:
import os
import json
import pandas as pd
import glob
import random
import re
from tqdm import tqdm
from collections import defaultdict
import contractions
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

time: 1.18 s (started: 2022-02-21 18:42:48 +00:00)


In [None]:
# Download the NLTK 'stopwords' and 'wordnet' corpora into the runtime's
# nltk_data directory. The value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

time: 732 ms (started: 2022-02-21 18:42:49 +00:00)


In [None]:
# Project-local text cleaner (presumably resolved through the Drive project
# directory appended to sys.path above — confirm).
from cleaners.multiwoz import Cleaner

# Built from the mapping file and database directory of the cloned MultiWOZ
# repository; parse_data() uses its clean() and tokenize() methods.
cleaner = Cleaner(mapping_path, db_path)

time: 790 ms (started: 2022-02-21 18:42:49 +00:00)


In [None]:
from datasets import load_dataset

# Load MultiWOZ 2.2 from the Hugging Face hub. No config is passed, so the
# library falls back to its default one ("v2.2_active_only" in the log below).
# NOTE(review): `ignore_verifications=True` presumably skips checksum/size
# verification of the download; newer `datasets` releases may no longer accept
# this argument — TODO confirm before re-running with an unpinned install.
dataset = load_dataset("multi_woz_v22", ignore_verifications=True)

Downloading:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

No config specified, defaulting to: multi_woz_v22/v2.2_active_only


Downloading and preparing dataset multi_woz_v22/v2.2_active_only (download: 263.97 MiB, generated: 49.24 MiB, post-processed: Unknown size, total: 313.21 MiB) to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a...


  0%|          | 0/22 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset multi_woz_v22 downloaded and prepared to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

time: 52.1 s (started: 2022-02-21 18:42:50 +00:00)


In [None]:
def parse_data(tvt):
    """Turn one split of the loaded MultiWOZ dataset into cleaned, slot-tagged turns.

    Reads the module-level ``dataset`` and ``cleaner`` objects.

    For every dialogue, turns whose ``speaker`` equals 1 are skipped
    (presumably the system side — confirm against the dataset card). For each
    remaining turn the function:

    * collects the services and intents of all frames whose ``active_intent``
      is not ``"NONE"``;
    * cleans and tokenizes every annotated span and labels its tokens with
      BIO tags (``B-`` for the first token of a span, ``I-`` for the others);
    * cleans and tokenizes the whole utterance and aligns the span tokens to
      the utterance tokens greedily from left to right;
    * drops the turn when not every span token could be aligned.

    Args:
        tvt: Name of the split used to index ``dataset``
            (e.g. ``"train"``, ``"validation"``, ``"test"``).

    Returns:
        dict mapping each ``dialogue_id`` to a list of turn dicts with the
        keys ``speaker``, ``words``, ``slots0`` (tags built from the slot
        name), ``slots1`` (tags built from ``<act type prefix>_<slot name>``),
        ``domains`` and ``intents``.
    """
    # Turns touching any of these domains are dropped; empty keeps everything.
    # filtered_domains = set(["bus", "police", "hospital"])
    filtered_domains = set()

    cleaned_dialogues = {}
    for dialogue in tqdm(dataset[tvt]):
        dialogue_id = dialogue["dialogue_id"]
        turns = dialogue["turns"]
        cleaned_turns = []
        for speaker, utterance, frames, dialogue_acts in zip(turns["speaker"], turns["utterance"], turns["frames"], turns["dialogue_acts"]):
            if speaker == 1:
                continue

            domains, intents = [], []
            for service, state in zip(frames["service"], frames["state"]):
                if state["active_intent"] != "NONE":
                    domains.append(service)
                    intents.append(state["active_intent"])

            if filtered_domains.intersection(domains):
                continue

            # Sort the spans by position TOGETHER with their slot name and act
            # type. Sorting only the positions and then indexing the name/type
            # lists with the sorted index (as done before) attached labels to
            # the wrong span whenever the annotations were not already ordered.
            # The key is limited to the position so ties keep annotation order.
            span_info = dialogue_acts["span_info"]
            spans = sorted(
                zip(
                    span_info["span_start"],
                    span_info["span_end"],
                    span_info["act_slot_name"],
                    span_info["act_type"],
                ),
                key=lambda span: (span[0], span[1]),
            )

            slot_names0, slot_names1, slot_values = [], [], []
            for si, ei, slot_name0, act_type in spans:
                slot_value = cleaner.clean(utterance[si: ei])
                slot_value = cleaner.tokenize(slot_value)
                # Part of the act type before the first '-', lower-cased.
                slot_type = act_type.split('-')[0].lower()
                # slot_type = slot_type if slot_type != "booking" else domains[0]

                for idx2, word_slot_value in enumerate(slot_value):
                    bio_prefix = 'I' if idx2 else 'B'
                    slot_values.append(word_slot_value)
                    slot_names0.append(f"{bio_prefix}-{slot_name0}")
                    slot_names1.append(f"{bio_prefix}-{slot_type}_{slot_name0}")

            text = cleaner.clean(utterance)
            words = cleaner.tokenize(text)

            # Greedy left-to-right alignment: an utterance token receives the
            # next pending slot tag when it equals the next pending slot
            # token, otherwise it is tagged "O".
            slots0 = []
            slots1 = []
            idx = 0
            for word in words:
                if idx < len(slot_values) and word == slot_values[idx]:
                    slots0.append(slot_names0[idx])
                    slots1.append(slot_names1[idx])
                    idx += 1
                else:
                    slots0.append("O")
                    slots1.append("O")

            # Some slot tokens were never matched: the tags would be
            # incomplete, so the whole turn is discarded.
            if idx != len(slot_values):
                continue

            cleaned_turns.append({
                "speaker": speaker,
                "words": words,
                "slots0": slots0,
                "slots1": slots1,  # TODO: check and revise
                "domains": domains,
                "intents": intents
            })

        cleaned_dialogues[dialogue_id] = cleaned_turns

    return cleaned_dialogues


# Parse the three splits one after the other. This is the slow step of the
# notebook (see the progress bars and the cell timing below).
train_data = parse_data("train")
val_data = parse_data("validation")
test_data = parse_data("test")

100%|██████████| 8437/8437 [13:10<00:00, 10.67it/s]
100%|██████████| 1000/1000 [01:43<00:00,  9.66it/s]
100%|██████████| 1000/1000 [01:43<00:00,  9.62it/s]

time: 16min 38s (started: 2022-02-21 18:43:42 +00:00)





In [None]:
# Persist every parsed split as "<split>.json" inside data_path.
for split_name, split_dialogues in (
    ("train", train_data),
    ("validation", val_data),
    ("test", test_data),
):
    with open(f"{data_path}/{split_name}.json", "w") as json_file:
        json.dump(split_dialogues, json_file)

time: 4.22 s (started: 2022-02-21 19:00:21 +00:00)


In [None]:
import gc

# Drop the reference to the loaded dataset and trigger a garbage collection.
# parse_data() reads the global `dataset`, so it cannot be called again after
# this cell unless the dataset is reloaded first.
del dataset
gc.collect()

196

time: 356 ms (started: 2022-02-21 19:00:25 +00:00)


# UNK Words

In [None]:
from collections import defaultdict

with open(f"{data_path}/train.json", "r") as json_file:
    data = json.load(json_file)

# Token frequencies are counted on the training split only.
# Iterating over .values() avoids binding the builtin name `id` as a loop
# variable, which used to shadow it for the rest of the kernel session.
words_count = defaultdict(int)
for dialogue in data.values():
    for turn in dialogue:
        for word in turn["words"]:
            words_count[word] += 1

# A token belongs to the vocabulary when it occurs at least min_count times
# in the training split.
min_count = 3
vocab = set(word for word, freq in words_count.items() if freq >= min_count)

# Rewrite every split in place, replacing out-of-vocabulary tokens by "UNK".
# The slot tags of a turn are left untouched.
for tvt in ["train", "validation", "test"]:
    with open(f"{data_path}/{tvt}.json", "r") as json_file:
        data = json.load(json_file)

    for dialogue in data.values():
        for turn in dialogue:
            turn["words"] = [word if word in vocab else "UNK" for word in turn["words"]]

    with open(f"{data_path}/{tvt}.json", "w") as json_file:
        json.dump(data, json_file)

time: 5.5 s (started: 2022-02-21 19:00:25 +00:00)


# Label Refinement

In [None]:
# import math
# import os
# import json
# from sklearn.pipeline import make_pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import classification_report, confusion_matrix
# import fasttext
# from collections import defaultdict

time: 1.68 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# with open(f"{data_path}/train.json", "r") as json_file:
#     train_data = json.load(json_file)
# with open(f"{data_path}/validation.json", "r") as json_file:
#     val_data = json.load(json_file)
# with open(f"{data_path}/test.json", "r") as json_file:
#     test_data = json.load(json_file)

time: 2.32 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# def read_data(data):
#     X, y = [], []

#     for id, dlg in data.items():
#         for trn in dlg:
#             X.append(" ".join(trn["words"]))
            
#             if len(trn["domains"]):
#                 y.append(1)
#             else:
#                 y.append(0)

#     return X, y

# X_train, y_train = read_data(train_data)
# X_val, y_val = read_data(val_data)
# X_test, y_test = read_data(test_data)

time: 3.33 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# print(len(y_train), len(y_val), len(y_test))

time: 1.54 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# def prepare_fasttext_file(filename, X, Y):
#     with open(f"{filename}.txt", "w") as txt_file:
#         for x, y in zip(X, Y):
#             txt_file.write(f"__label__{y} {x}\n")

# prepare_fasttext_file("train", X_train, y_train)
# prepare_fasttext_file("val", X_val, y_val)
# prepare_fasttext_file("test", X_test, y_test)

time: 2.62 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# import gc

# run_flag = False
# model_path = "drive/MyDrive/Development/ID_in_CRS/label_model.bin"

# if run_flag or not os.path.exists(model_path):
#     for rnd in range(6):
#         print(f"ROUND {rnd + 1}")

#         model = fasttext.train_supervised(
#             input="train.txt",
#             autotuneValidationFile="val.txt",
#             epoch=5
#         )

#         preds, _ = model.predict(X_test)
#         preds = [int(pred_label[0][-1]) for pred_label in preds]
#         print(classification_report(y_test, preds, digits=4))
#         print(confusion_matrix(y_test, preds))
#         print("#" * 100)

#         num_changes = 0
#         # for X_list, y_list in [(X_train, y_train), (X_val, y_val), (X_test, y_test)]:
#         for X_list, y_list in [(X_train, y_train)]:
#             preds, probs = model.predict(X_list)
#             preds = [int(pred_label[0][-1]) for pred_label in preds]
#             probs = [prob[0] for prob in probs]

#             for idx in range(len(y_list)):
#                 if y_list[idx] != preds[idx] and probs[idx] > (1 - (0.15 / math.log2(rnd + 2))):
#                     y_list[idx] = preds[idx]
#                     num_changes += 1  

#         prepare_fasttext_file("train", X_train, y_train)
#         # prepare_fasttext_file("val", X_val, y_val)
#         # prepare_fasttext_file("test", X_test, y_test)
#         model.save_model(model_path)

#         del model
#         gc.collect()

#         if num_changes == 0:
#             break
# else:
#     model = fasttext.load_model(model_path)

#     preds, _ = model.predict(X_test)
#     preds = [int(pred_label[0][-1]) for pred_label in preds]
#     print(classification_report(y_test, preds, digits=4))
#     print(confusion_matrix(y_test, preds))

#     # for X_list, y_list in [(X_train, y_train), (X_val, y_val), (X_test, y_test)]:
#     for X_list, y_list in [(X_train, y_train)]:
#         preds, probs = model.predict(X_list)
#         preds = [int(pred_label[0][-1]) for pred_label in preds]
#         probs = [prob[0] for prob in probs]

#         for idx in range(len(y_list)):
#             if y_list[idx] == 1 and preds[idx] == 0 and probs[idx] > 0.95:
#                 y_list[idx] = 0
#             if y_list[idx] == 0 and preds[idx] == 1 and probs[idx] > 0.95:
#                 y_list[idx] = 1

#     prepare_fasttext_file("train", X_train, y_train)
#     # prepare_fasttext_file("val", X_val, y_val)
#     # prepare_fasttext_file("test", X_test, y_test)

time: 18.6 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# refined_train_data, refined_val_data, refined_test_data = defaultdict(list), defaultdict(list), defaultdict(list)
# for data, y_list, refined_data in zip([train_data, val_data, test_data], [y_train, y_val, y_test], [refined_train_data, refined_val_data, refined_test_data]):
#     idx = 0
#     num_turns = 0
#     for id, dlg in data.items():
#         for trn in dlg:
#             if (len(trn["intents"]) and y_list[idx] == 1) or (len(trn["intents"]) == 0 and y_list[idx] == 0):
#                 refined_data[id].append(trn)
#                 num_turns += 1
#             idx += 1
#     print(num_turns)    

# train_data = refined_train_data
# # val_data = refined_val_data
# # test_data = refined_test_data

# del refined_train_data
# del refined_val_data
# del refined_test_data

time: 3.63 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
# import gc
# gc.collect()

time: 913 µs (started: 2022-02-21 19:00:31 +00:00)


# Save Data

In [None]:
# import json

# with open(f"{data_path}/train.json", "r") as json_file:
#     train_data = json.load(json_file)
# with open(f"{data_path}/validation.json", "r") as json_file:
#     val_data = json.load(json_file)
# with open(f"{data_path}/test.json", "r") as json_file:
#     test_data = json.load(json_file)

time: 1.75 ms (started: 2022-02-21 19:00:31 +00:00)


In [None]:
with open(f"{data_path}/train.json", "r") as json_file:
    data = json.load(json_file)

# Collect the inventories of words, domains, intents and both slot tag sets
# from the training split only.
# Iterating over .values() avoids binding the builtin name `id`.
all_words, all_domains, all_intents, all_slots0, all_slots1 = set(), set(), set(), set(), set()
for dlg in data.values():
    for trn in dlg:
        all_words.update(trn["words"])
        all_domains.update(trn["domains"])
        all_intents.update(trn["intents"])
        all_slots0.update(trn["slots0"])
        all_slots1.update(trn["slots1"])

print(len(all_words))

# Each inventory is written as a sorted JSON list named "<inventory>.json".
for inventory_name, inventory in [
    ("words", all_words),
    ("domains", all_domains),
    ("intents", all_intents),
    ("slots0", all_slots0),
    ("slots1", all_slots1),
]:
    with open(f"{data_path}/{inventory_name}.json", "w") as json_file:
        json.dump(sorted(inventory), json_file)

1455
time: 1.85 s (started: 2022-02-21 19:00:31 +00:00)


# Format to txt

In [None]:
import json
import random

time: 1.23 ms (started: 2022-02-21 19:00:33 +00:00)


In [None]:
# Keys of a turn dict used for the intent line and for the per-token tags.
intent_label = "intents"
slot_label = "slots1"

# Each split is converted to a txt file ("validation" is written as "dev").
# A turn is written as one "<word> <slot>" line per token, followed by one
# line holding the intent and one blank separator line.
for src_name, dst_name in zip(["train", "validation", "test"], ["train", "dev", "test"]):
    with open(f"{data_path}/{src_name}.json", "r") as json_file:
        data = json.load(json_file)

    all_lines = []
    for dialogue in data.values():
        for turn in dialogue:
            if turn["speaker"] == 1:
                continue

            turn_intents = turn[intent_label]
            # Turns carrying more than one intent are left out of the txt
            # files (this was already the case, but only implicitly).
            if len(turn_intents) > 1:
                continue
            # Turns without any intent get the fallback label "general".
            intent = turn_intents[0] if turn_intents else "general"

            all_lines.extend(f"{word} {slot}\n" for word, slot in zip(turn["words"], turn[slot_label]))
            all_lines.append(f"{intent}\n")
            all_lines.append("\n")

    with open(f"{data_path}/{dst_name}.txt", "w") as txt_file:
        txt_file.writelines(all_lines)

time: 1.65 s (started: 2022-02-21 19:00:33 +00:00)
