In [None]:
__author__ = "Adrian Sarno, Jennifer Arnold"
__version__ = "CS224u, Stanford, Spring 2020"

In [1]:
import os
import json
from pprint import pprint
from itertools import repeat
from nltk.tokenize import WordPunctTokenizer

In [2]:
def outside(sentence):
    labels = ""
    for token in WordPunctTokenizer().tokenize(sentence):
        labels += "O "
    return labels

def inside(sentence, slot):  
    prefix = "B-"
    labels = ""
    for token in WordPunctTokenizer().tokenize(sentence):
        labels += prefix + slot + " "
        prefix = "I-"
    return labels

In [3]:
inside("Red Hot Chili Peppers", "author")

'B-author I-author I-author I-author '

In [19]:
def slots_to_string(sentence, slots):
    end_prev_slot = 0
    slots_str = ""
    for slot in sorted(slots, key = lambda x: x["start"]):
    #for slot in slots:
        left = sentence[end_prev_slot:slot["start"]]
        slots_str += outside(left)
        middle = sentence[slot["start"]:slot["exclusive_end"]]
        slots_str += inside(middle, slot["slot"])
        end_prev_slot = slot["exclusive_end"]
    right = sentence[end_prev_slot:]
    slots_str += outside(right)

    return slots_str.strip()

In [20]:
ex_sentence = "Okay, how about the song Dark Necessities by the Red Hot Chili Peppers on their album The Getaway?"
ex_slots = {"exclusive_end": 41,
            "slot": "song_name",
            "start": 25}, {
            "exclusive_end": 70,
            "slot": "artist",
            "start": 49
            }, {
            "exclusive_end": 97,
            "slot": "album",
            "start": 86}
        
slots_to_string(ex_sentence, ex_slots)

'O O O O O O B-song_name I-song_name O O B-artist I-artist I-artist I-artist O O O B-album I-album O'

In [21]:
def dstc8_reader(src_filename, class_func=None):
    """Iterator for the Schema-Guided Dialogue State Tracking (DSTC 8) dataset
    The iterator yields (sentence, label) pairs.

    The labels are tuples consisting of a list of IOB-tagged slot names,
    followed by the domain and intent names

    Parameters
    ----------
    src_filename : str

    Yields
    ------
    (domain, intent, sentence, label)
    
    """
    with open(src_filename) as json_file:
        dataset = json.load(json_file)
    
    turns_output = []
    sentence_output = []

    for dialog in dataset:
        for turn in dialog["turns"]:
            frames_output = []
            for frame in turn["frames"]:
                frames_output.append({
                    "service": frame["service"],
                    "intent": frame["state"]["active_intent"] if turn["speaker"] == "USER" else "",
                    "slots": frame["slots"]
                })
            turns_output.append({
                "utterance": turn["utterance"],
                "speaker": turn["speaker"],
                "frames": frames_output
            })
    
    for turn in turns_output:
        for frame in turn["frames"]: 
            sentence_output.append({
                "sentence": turn["utterance"],
                "intent": frame["intent"],
                "domain": frame["service"],
                "IOB_tags": slots_to_string(turn["utterance"], frame["slots"]),
                "slots": frame["slots"]
            })
    return sentence_output

In [34]:
import glob

SIMDIALOG_TRAIN = os.path.join("data", "dstc8-schema-guided-dialogue/train/dialogues_*.json")
SIMDIALOG_DEV = os.path.join("data", "dstc8-schema-guided-dialogue/dev/dialogues_*.json")
SIMDIALOG_TEST = os.path.join("data", "dstc8-schema-guided-dialogue/test/dialogues_*.json")

def find_files(path):
    txtfiles = []
    for file in glob.glob(path):
        txtfiles.append(file)
    return txtfiles

def read_sim_dialog(path):
    output = []
    for file in find_files(path):
        output.extend(dstc8_reader(file))
    return output


In [35]:
train_data = read_sim_dialog(SIMDIALOG_TRAIN)
print(len(train_data))

dev_data = read_sim_dialog(SIMDIALOG_DEV)
print(len(dev_data))

test_data = read_sim_dialog(SIMDIALOG_TEST)
print(len(test_data))

340762
50440
88413


In [31]:
print(train_data[17])

{'sentence': 'Just to make sure, you want a 2 room reservation at 11 Howard in New York with a check in for March 5th and checkout on March 8th.', 'intent': '', 'domain': 'Hotels_3', 'IOB_tags': 'O O O O O O O O O O O O B-hotel_name I-hotel_name O B-location I-location O O O O O B-check_in_date I-check_in_date O O O B-check_out_date I-check_out_date O', 'slots': [{'exclusive_end': 61, 'slot': 'hotel_name', 'start': 52}, {'exclusive_end': 73, 'slot': 'location', 'start': 65}, {'exclusive_end': 103, 'slot': 'check_in_date', 'start': 94}, {'exclusive_end': 129, 'slot': 'check_out_date', 'start': 120}]}


In [23]:
for i, row in enumerate(sentence_output):
    #if len(row["slots"]) > 0 and len(WordPunctTokenizer().tokenize(row["sentence"])) != len(row["IOB_tags"].split()):
    if i == 17:
        print(i, row)
        print(len(WordPunctTokenizer().tokenize(row["sentence"])))
        print(len(row["IOB_tags"].split()))
        print(row["slots"])
        for slot in row["slots"]:
            print (slot["slot"], slot["start"], slot["exclusive_end"])
        break
    

17 {'sentence': "I found 5 songs for you. How about Sabaton 's Night Witches, from the album Heroes", 'intent': '', 'domain': 'Music_2', 'IOB_tags': 'O O O O O O O O O B-artist O O B-song_name I-song_name O O O O B-album', 'slots': [{'exclusive_end': 59, 'slot': 'song_name', 'start': 46}, {'exclusive_end': 42, 'slot': 'artist', 'start': 35}, {'exclusive_end': 82, 'slot': 'album', 'start': 76}]}
19
19
[{'exclusive_end': 59, 'slot': 'song_name', 'start': 46}, {'exclusive_end': 42, 'slot': 'artist', 'start': 35}, {'exclusive_end': 82, 'slot': 'album', 'start': 76}]
song_name 46 59
artist 35 42
album 76 82
