In [None]:
# Notebook metadata: authorship and the version/course tag string.
__author__ = "Adrian Sarno, Jennifer Arnold"
__version__ = "CS224u, Stanford, Spring 2020"

In [69]:
import os
import json
from pprint import pprint
from itertools import repeat

In [93]:
def simulated_dialogue_reader(src_filename, class_func=None):
    """Read a Google Simulated Dialogue JSON file into IOB-tagged utterances.

    Each turn of each dialogue contributes one entry to both outputs: the
    token list of the turn's user utterance, and a parallel list of IOB
    tags derived from the utterance's slot annotations. Tokens outside any
    slot are tagged "O"; the first token of a slot span is tagged
    "B-<slot>" and the remaining tokens of that span "I-<slot>".

    Parameters
    ----------
    src_filename : str
        Path to the JSON file to read,
        e.g. data/simulated-dialogue/sim-M/dev.json. The file must contain
        a list of dialogues, each with a "turns" list whose items hold a
        "user_utterance" object with "tokens" and "slots" entries. Every
        slot entry needs "start", "exclusive_end" and "slot" keys.
    class_func : function, optional
        Currently ignored. The parameter is kept so existing callers that
        pass it keep working.

    Returns
    -------
    (IOB_output, tokens_output)
        Two lists of equal length with one item per user turn, in file
        order. `IOB_output[i]` is the list of IOB tag strings for the
        tokens in `tokens_output[i]`.

    Raises
    ------
    KeyError
        If a dialogue, turn or slot entry lacks one of the expected keys.
    IndexError
        If a slot span reaches past the end of the token list.

    """
    # JSON is read as UTF-8 explicitly so decoding does not depend on the
    # platform's default locale encoding.
    with open(src_filename, encoding="utf-8") as json_file:
        dataset = json.load(json_file)

    tokens_output = []
    IOB_output = []
    for dialog in dataset:
        for turn in dialog["turns"]:
            utterance = turn["user_utterance"]
            token_list = utterance["tokens"]

            # Start with every token outside any slot, then overwrite the
            # positions covered by each annotated span.
            IOBtag_list = ["O"] * len(token_list)
            for slot_entry in utterance["slots"]:
                slot = slot_entry["slot"]
                start = slot_entry["start"]
                # "exclusive_end" is used as a half-open upper bound.
                for index in range(start, slot_entry["exclusive_end"]):
                    prefix = "B-" if index == start else "I-"
                    IOBtag_list[index] = prefix + slot

            IOB_output.append(IOBtag_list)
            tokens_output.append(token_list)
    return IOB_output, tokens_output
        

In [96]:
# NOTE: despite its name, SIMDIALOG_HOME points at a single JSON file, not a
# directory. Path components are passed separately so os.path.join picks the
# separator instead of relying on hard-coded "/" characters.
SIMDIALOG_HOME = os.path.join("data", "simulated-dialogue", "sim-M", "dev.json")

IOB_output, tokens_output = simulated_dialogue_reader(SIMDIALOG_HOME)
# Spot-check one arbitrary utterance: its IOB tags next to its tokens.
print(IOB_output[119], tokens_output[119])

['B-date', 'O', 'B-time', 'I-time'] ['sunday', 'at', '1', 'pm']
