In [1]:
import sys
sys.path.append('../src')

import decouple
import json
import numpy as np
import os
import pickle
import utils

# Root directory for this notebook's data files, looked up as the
# `NFS_PARENT_DIR` setting through `decouple.config`. It prefixes the input
# pickle path and is passed as `nfs_dir` to the output path patterns.
NFS_DIR = decouple.config('NFS_PARENT_DIR')

In [5]:
# Dataset configuration; the conversion cell further down reads the
# per-split output path patterns from it.
data_config = utils.read_yaml('../config/data/craigslist.yaml')

# strategy_vector_data_full.pkl is the file data_w_strategies_outcomes.pkl from
# the preprocessing jupyter notebook `PreProcessNegotiationData.ipynb`.
path = f'{NFS_DIR}/data/craigslist/raw_data/strategy_vector_data_full.pkl'

# SECURITY: unpickling can execute arbitrary code. Only load this file when it
# was produced by our own preprocessing notebook, never from an untrusted
# source.
with open(path, 'rb') as f:
    data = pickle.load(f)

# Fail fast, with a readable message, if the pickle does not contain the
# top-level entries that the conversion step indexes into.
required_keys = {'dialacts2id', 'strategies2colid', 'train', 'valid', 'test'}
missing_keys = required_keys - set(data)
assert not missing_keys, f'{path} is missing keys: {sorted(missing_keys)}'

In [6]:
# Two functions that convert vectors of strategy / dialog act ids into
# their corresponding strings, so that we use the same labels as the
# Dialograph paper.

def get_dialog_act_stringify_fn(dialog_act_to_idx):
    """Build a decoder that maps dialog act ids back to their labels.

    Args:
        dialog_act_to_idx: Mapping from dialog act label to its id.

    Returns:
        A function that takes an iterable of dialog act ids and returns the
        list of matching labels in the same order. An id that is absent from
        the mapping raises ``KeyError``.
    """
    # Invert label -> id into id -> label once, outside the returned closure.
    label_by_idx = dict(
        (idx, label) for label, idx in dialog_act_to_idx.items())

    def convert_dialog_act_idxs_to_strings(dialog_act_idxs):
        return [label_by_idx[idx] for idx in dialog_act_idxs]

    return convert_dialog_act_idxs_to_strings


def get_strategy_stringify_fn(strategy_to_idx):
    """Build a decoder that turns a strategy indicator array into names.

    Args:
        strategy_to_idx: Mapping from strategy name to its column id.

    Returns:
        A function that takes a 2-D array-like (one row per turn, one column
        per strategy id) and returns one list per row holding the names of
        the strategies whose entry in that row is non-zero, in column order.
    """
    name_by_col = dict(
        (col, name) for name, col in strategy_to_idx.items())

    def convert_strategy_idxs_to_strings(strategy_arr):
        # One (initially empty) bucket per turn, so turns without any
        # strategy still appear in the result.
        per_turn_names = [[] for _ in range(len(strategy_arr))]
        # np.nonzero yields the (row, column) coordinates of every
        # non-zero cell in row-major order.
        for row, col in zip(*np.nonzero(strategy_arr)):
            per_turn_names[row].append(name_by_col[col])
        return per_turn_names

    return convert_strategy_idxs_to_strings

# Convert strategy and dialog act ids into strings

This ensures that we use the same data as the Dialograph paper.

In [7]:
# Lookup tables and decoders shared by every split.
idx_to_party = {0: 'buyer', 1: 'seller', -1: '<start>'}
dialog_act_to_idx = data['dialacts2id']
strategy_to_idx = data['strategies2colid']

convert_dialog_act_idxs_to_strings = get_dialog_act_stringify_fn(
    dialog_act_to_idx)
convert_strategy_idxs_to_strings = get_strategy_stringify_fn(
    strategy_to_idx)


def _convert_example(dialog_idx, raw_example):
    """Turn one raw example into the JSON-serializable record we write out.

    Args:
        dialog_idx: Position of the example within its split.
        raw_example: Mapping with the keys 'strategies_vec', 'dial_acts_vec',
            'agent_list', 'utterance', 'uuid', 'ratio_bucket' and 'ratio'.

    Returns:
        Dict with the keys 'dialogue' (list of per-turn dicts), 'dialog_idx',
        'example_id', 'ratio_bucket' (cast to int) and 'ratio'.

    Raises:
        KeyError: If an agent id, dialog act id or strategy id has no label.
        AssertionError: If the per-turn sequences differ in length.
    """
    # NOTE(review): the decoded strategies are only used for the length check
    # below; they are not part of the written record. Confirm this is intended.
    strategies = convert_strategy_idxs_to_strings(
        raw_example['strategies_vec'])
    dialog_acts = convert_dialog_act_idxs_to_strings(
        raw_example['dial_acts_vec'])
    agent_ids = raw_example['agent_list']
    utterances = raw_example['utterance']

    # Index directly (rather than .get) so that an unknown agent id fails
    # here instead of silently producing a null party in the output.
    parties = [idx_to_party[agent_id] for agent_id in agent_ids]

    # Validate every per-turn sequence *before* zipping them: zip would
    # otherwise silently truncate to the shortest one.
    assert (len(agent_ids)
            == len(utterances)
            == len(strategies)
            == len(dialog_acts)), (
        f'per-turn length mismatch in dialog {dialog_idx}')

    dialogue = []
    for party, utt, da in zip(parties, utterances, dialog_acts):
        # Skip the artificial start-of-dialogue marker.
        if utt == '<start>' and da == '<start>':
            continue
        dialogue.append({
            # Turns are numbered consecutively after dropping the marker.
            'turn_idx': len(dialogue),
            'party': party,
            'turn': utt,
            'dialogue_act': da,
        })

    return {
        'dialogue': dialogue,
        'dialog_idx': dialog_idx,
        'example_id': raw_example['uuid'],
        'ratio_bucket': int(raw_example['ratio_bucket']),
        'ratio': raw_example['ratio'],
    }


for split in ['train', 'valid', 'test']:
    converted_examples = [
        _convert_example(dialog_idx, raw_example)
        for dialog_idx, raw_example in enumerate(data[split])
    ]

    # The pickle calls the validation split 'valid'; the path config is
    # looked up under 'dev'.
    output_split = {'valid': 'dev'}.get(split, split)
    path = data_config['path']['input_pattern_map'][output_split].format(
        nfs_dir=NFS_DIR)
    dir_name = os.path.dirname(path)
    if dir_name:  # os.makedirs('') raises; a bare filename needs no directory
        os.makedirs(dir_name, exist_ok=True)

    # One JSON object per line (JSON Lines).
    with open(path, 'w', encoding='utf-8') as f:
        for converted_example in converted_examples:
            print(json.dumps(converted_example), file=f)