In [2]:
import json
import csv
import os
import random
from tqdm import tqdm

In [26]:
# Directory containing the input CSV files that load_ed reads (train.csv, valid.csv, test.csv).
ed_path = '/u/scr/abisee/empatheticdialogues'
# JSON file parsed by load_pc.
pc_path = '/u/scr/abisee/transfer-learning-conv-ai/personachat/personachat_self_original.json'
# Destination of the converted train/valid/test data (written via save_ed_data).
ed_path_out = '/u/scr/abisee/transfer-learning-conv-ai/empatheticdialogues/ed.json'
# Destination of the truncated "mini" copy of the converted data.
mini_ed_path_out = '/u/scr/abisee/transfer-learning-conv-ai/empatheticdialogues/ed_mini.json'

In [8]:
def load_pc():
    """Parse the JSON file at ``pc_path`` and return the decoded object."""
    with open(pc_path, 'r') as pc_file:
        return json.load(pc_file)
    
    
# Load the JSON at pc_path once and keep it around for inspection.
pc_data = load_pc()

In [23]:
# Display the first entry of the 'train' split.
pc_data['train'][0]

{'personality': ['i like to remodel homes .',
  'i like to go hunting .',
  'i like to shoot a bow .',
  'my favorite holiday is halloween .'],
 'utterances': [{'candidates': ['my mom was single with 3 boys , so we never left the projects .',
    'i try to wear all black every day . it makes me feel comfortable .',
    'well nursing stresses you out so i wish luck with sister',
    'yeah just want to pick up nba nfl getting old',
    'i really like celine dion . what about you ?',
    'no . i live near farms .',
    "i wish i had a daughter , i'm a boy mom . they're beautiful boys though still lucky",
    'yeah when i get bored i play gone with the wind my favorite movie .',
    "hi how are you ? i'm eating dinner with my hubby and 2 kids .",
    'were you married to your high school sweetheart ? i was .',
    'that is great to hear ! are you a competitive rider ?',
    "hi , i'm doing ok . i'm a banker . how about you ?",
    "i'm 5 years old",
    'hi there . how are you today ?',
  

In [11]:
def load_ed(filename):
    """Read one CSV file under ``ed_path`` into a list of row dicts.

    The first row supplies the column names; every later row becomes a dict
    mapping column name to value (values beyond the shorter of the two are
    dropped by ``zip``). Quoting is disabled, so fields are split on every
    comma. Prints how many data rows were read.

    Args:
        filename: name of the file inside ``ed_path``.

    Returns:
        List of dicts, one per data row, in file order.
    """
    csv_path = os.path.join(ed_path, filename)
    with open(csv_path, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quoting=csv.QUOTE_NONE)
        # Consume the header row; None only when the file has no rows at all,
        # in which case the comprehension below has nothing to iterate over.
        header = next(reader, None)
        records = [dict(zip(header, fields)) for fields in reader]
    print('got {} lines from {}'.format(len(records), filename))
    return records


# Load each split's CSV into a list of row dicts.
ed_rows_train = load_ed('train.csv')
ed_rows_valid = load_ed('valid.csv')
ed_rows_test = load_ed('test.csv')

got 84169 lines from train.csv
got 12078 lines from valid.csv
got 10973 lines from test.csv


In [33]:
def merge_ed_rows(ed_rows):
    """Group consecutive rows that share a ``conv_id`` into dialogues.

    Rows are compared against the first row of the dialogue being collected,
    so a ``conv_id`` that reappears after a different one starts a new
    dialogue rather than being merged with the earlier one.

    Args:
        ed_rows: iterable of row dicts, each with a 'conv_id' key.

    Returns:
        List of dialogues, each a non-empty list of rows in their original
        order. Empty input yields an empty list (no empty dialogue is emitted).
    """
    dialogues = []
    curr_dialogue = []
    for row in ed_rows:
        conv_id = row['conv_id']
        # A change of conv_id closes the dialogue collected so far.
        if curr_dialogue and conv_id != curr_dialogue[0]['conv_id']:
            dialogues.append(curr_dialogue)
            curr_dialogue = []
        curr_dialogue.append(row)
    # Flush the final dialogue; skipped for empty input so callers never
    # receive a dialogue with no rows.
    if curr_dialogue:
        dialogues.append(curr_dialogue)
    return dialogues


def process_utterance(utterance):
    """Lowercase the text, then turn every '_comma_' placeholder into ','."""
    return utterance.lower().replace('_comma_', ',')

    
def make_dialogue(dialogue_in, all_utterances, num_candidates=20):
    """Convert one merged dialogue into a dict of emotion, situation and turns.

    One turn is produced for every odd-indexed row (the 2nd, 4th, ...
    utterance). Each turn holds ``num_candidates`` candidate replies --
    randomly sampled distractors followed by the gold reply as the last
    entry -- and the history of all utterances before the gold reply.
    Candidates, history and situation are passed through process_utterance;
    the emotion is copied unchanged.

    Args:
        dialogue_in: non-empty list of row dicts with 'utterance' keys; the
            first row must also have 'context' and 'prompt'.
        all_utterances: collection of raw utterance strings to draw
            distractors from. Lists and tuples are used as-is; any other
            iterable (e.g. a set) is copied to a list once per call.
        num_candidates: total candidates per turn, including the gold reply.

    Returns:
        Dict with keys 'emotion', 'situation' and 'turns'; each turn is a dict
        with keys 'candidates' and 'history'.

    Raises:
        ValueError: if the pool is too small to sample from, or if every
            possible sample would contain the gold reply.
    """
    num_distractors = num_candidates - 1

    # random.sample requires a sequence: sampling from a set was deprecated
    # in Python 3.9 and raises TypeError from 3.11 on. Convert once here
    # rather than once per turn.
    if isinstance(all_utterances, (list, tuple)):
        pool = all_utterances
    else:
        pool = list(all_utterances)

    turns = []

    # make turns
    for i in range(1, len(dialogue_in), 2):

        # get candidates
        gold_utterance = dialogue_in[i]['utterance']

        # When the pool is exactly one sample in size and contains the gold
        # reply, every sample contains it and the rejection loop below would
        # never terminate; fail loudly instead.
        if len(pool) == num_distractors and gold_utterance in pool:
            raise ValueError(
                'cannot draw {} distractors that exclude the gold utterance '
                'from a pool of {} utterances'.format(num_distractors, len(pool)))

        # Rejection sampling: redraw until the gold reply is not among the
        # distractors, so it appears exactly once (as the last candidate).
        while True:
            candidates = random.sample(pool, num_distractors)
            if gold_utterance not in candidates:
                break
        candidates.append(gold_utterance)  # length num_candidates
        candidates = [process_utterance(c) for c in candidates]

        # get history: everything said before the gold reply
        history = [process_utterance(row['utterance']) for row in dialogue_in[:i]]

        # append
        turns.append({'candidates': candidates, 'history': history})

    dialogue_out = {
        'emotion': dialogue_in[0]['context'],
        'situation': process_utterance(dialogue_in[0]['prompt']),
        'turns': turns,
       }

    return dialogue_out


def make_ed_data(ed_rows):
    """Convert a split's raw rows into a list of dialogues in the new format.

    Rows are first grouped into dialogues with merge_ed_rows; each dialogue is
    then converted with make_dialogue, drawing distractor candidates from the
    unique utterances of this split.

    Args:
        ed_rows: list of row dicts as returned by load_ed.

    Returns:
        List of converted dialogues, one per merged dialogue, in input order.
    """
    # merge the rows into dialogues
    ed_merged_rows = merge_ed_rows(ed_rows)

    # Unique utterances as a sorted list. random.sample needs a sequence
    # (sets raise TypeError on Python >= 3.11), and sorting gives a stable
    # order, whereas the iteration order of a set of strings can differ
    # between interpreter runs.
    all_utterances = sorted(
        {row['utterance'] for dialogue in ed_merged_rows for row in dialogue})

    # convert to new format
    print('converting {} dialogues to new format...'.format(len(ed_merged_rows)))
    # Iterate the list directly so tqdm can read its length and show a total;
    # wrapping it in enumerate() hid the length.
    ed_data = [make_dialogue(dialogue, all_utterances)
               for dialogue in tqdm(ed_merged_rows)]

    return ed_data


def save_ed_data(ed_data_train, ed_data_valid, ed_data_test, ed_path_out):
    """Write the three splits to ``ed_path_out`` as a single JSON object.

    The object has the keys 'train', 'valid' and 'test', in that order. A
    progress message is printed before and after writing.
    """
    print('saving to {}...'.format(ed_path_out))
    splits = {
        'train': ed_data_train,
        'valid': ed_data_valid,
        'test': ed_data_test,
    }
    with open(ed_path_out, 'w') as out_file:
        json.dump(splits, out_file)
    print('saved')
    

# Build each split before saving. save_ed_data below reads ed_data_train,
# ed_data_valid and ed_data_test, and no other line shown in this notebook
# assigns them, so with these calls commented out the cell fails with
# NameError on a fresh kernel.
ed_data_train = make_ed_data(ed_rows_train)
ed_data_valid = make_ed_data(ed_rows_valid)
ed_data_test = make_ed_data(ed_rows_test)

save_ed_data(ed_data_train, ed_data_valid, ed_data_test, ed_path_out)

In [27]:
# Make mini version

# Read back the JSON file at ed_path_out.
with open(ed_path_out, 'r') as f:
    new_ed_data = json.load(f)
    
def make_mini_ed_data(new_ed_data, num_exs=100):
    """Return a truncated copy of the dataset for quick experiments.

    Args:
        new_ed_data: dict mapping split name to a list of examples.
        num_exs: maximum number of examples to keep per split, taken from the
            front of each list.

    Returns:
        New dict with the same split names, each mapped to the first
        ``num_exs`` examples of that split (fewer if the split is shorter).
    """
    # Slice by the parameter; the limit was previously hard-coded to 100,
    # which silently ignored any num_exs the caller passed.
    return {split: examples[:num_exs] for split, examples in new_ed_data.items()}

# Truncate every split with the default num_exs.
mini_ed_data = make_mini_ed_data(new_ed_data)

# Write the truncated data to mini_ed_path_out as JSON.
with open(mini_ed_path_out, 'w') as f:
    json.dump(mini_ed_data, f)

In [25]:
# Number of examples in the 'valid' split of the mini data.
len(mini_ed_data['valid'])

100