In [1]:
from torch.utils.data import Dataset
import random
import itertools

In [2]:
def load_txt(in_fname):
    id2txt = {} # idx에 해당하는 발화
    dup_dict = {}   # 중복되는 인덱스-발화 딕셔너리
    dup_list = []   # 중복되는 발화의 인덱스 리스트
    with open(in_fname) as in_file:
        for idx, line in enumerate(in_file):    # idx (인덱스), line (대화 텍스트)
            if line not in dup_dict.values():
                # "__eou__ "로 분리 -> 각 발화 리스트
                id2txt[idx] = [utterance.replace(" __eou__","") for utterance in line.strip().split(" __eou__ ")]
                dup_dict[idx] = line    # "__eou__" 포함 대화 텍스트
            else:   # 대화 텍스트가 존재하면, 중복
                dup_list.append(idx)    # 중복 리스트에 인덱스 추가
    return id2txt, dup_list

def load_act(in_fname, dup_list):
    id2act = {}
    with open(in_fname) as in_file:
        for idx, line in enumerate(in_file):
            # 중복된 대화가 아니면, 인덱스-행동 딕셔너리에 추가
            if idx not in dup_list:
                id2act[idx] = line.strip().split(" ")
    return id2act

def load_topic(in_fname, dup_list):
    id2topic = {}
    with open(in_fname) as in_file:
        for idx, line in enumerate(in_file):
            # 중복된 대화가 아니면, 인덱스-토픽 딕셔너리에 추가
            if idx not in dup_list:
                id2topic[idx] = int(line.strip())
    return id2topic

In [4]:
text_path = './data/train/dailydialog/dialogues_text.txt'
act_path = './data/train/dailydialog/dialogues_act.txt'
topic_path = './data/train/dailydialog/dialogues_topic.txt'

id2txt, dup_list = load_txt(text_path)

id2act = load_act(act_path, dup_list)

id2topic = load_topic(topic_path, dup_list)

In [9]:
def load_meta(text_path, act_path, topic_path):
    txt_dict, dup_list = load_txt(text_path)
    topic_dict = load_topic(topic_path, dup_list)
    act_dict = load_act(act_path, dup_list)

    return txt_dict, topic_dict, act_dict

def remove_duplicates(txt_dict, topic_dict, act_dict):
    # Remove duplicated dialogues from all three dictionaries.
    unique_utterances = {}
    cleaned_topic_dict = {}
    cleaned_act_dict = {}

    for key, utterances in txt_dict.items():
        if utterances not in unique_utterances.values():
            unique_utterances[key] = utterances
            cleaned_topic_dict[key] = topic_dict[key]
            cleaned_act_dict[key] = act_dict[key]

    return unique_utterances, cleaned_topic_dict, cleaned_act_dict

In [10]:
txt_dict, topic_dict, act_dict = load_meta(text_path, act_path, topic_path)

In [30]:
################# FOR POS/NEG SAMPLES SELECTION ################
def pesudo_generation_for_one_sample(utterances, acts, topic, txt_dict, act_dict, topic_dict):
    sample_triple_for_this_dial = []
    for a_idx in range(len(acts)-1):
        # extract utterance triples (anchor, pos, neg_1, neg_2) for pattern Questions - Inform (2 - 1)
        if acts[a_idx] == '2':
            if acts[a_idx+1] == '1':
                anchor = utterances[a_idx]
                postive = utterances[a_idx+1]

                # find the first kind of negative samples (within the same dialogue (same dial act as postive utterance) but not adjacent)
                negtive_minor_list = [utterances[i] for i in range(len(utterances)) if acts[i] != '1' and i != a_idx+1 and i != a_idx-1]
                # find the second kind of negative samples (from dialogue with different topic)
                dial_id = random.choice([key for key, value in topic_dict.items() if value != topic]) # randomly choose one dialogue with different topic
                sampled_utterances = txt_dict[dial_id] 
                sampled_acts = act_dict[dial_id]
                negative_major_list = [sampled_utterances[i] for i in range(len(sampled_utterances)) if sampled_acts[i] != '1']
                for u_n1, u_n2 in itertools.product(negtive_minor_list, negative_major_list):
                    sample_triple_for_this_dial.append((anchor, postive, u_n1, u_n2))
                    
        # extract utterance triples (anchor, pos, neg_1, neg_2) for pattern Directives - Commissives (3 - 4)
        if acts[a_idx] == '3':
            if acts[a_idx+1] == '4':
                anchor = utterances[a_idx]
                postive = utterances[a_idx+1]

                # find the first kind of negative samples (within the same dialogue (same dial act as postive utterance) but not adjacent)
                negtive_minor_list = [utterances[i] for i in range(len(utterances)) if acts[i] != '4' and i != a_idx+1 and i != a_idx-1]
                # find the second kind of negative samples (from dialogue with different topic)
                dial_id = random.choice([key for key, value in topic_dict.items() if value != topic]) # randomly choose one dialogue with different topic
                sampled_utterances = txt_dict[dial_id] 
                sampled_acts = act_dict[dial_id]
                negative_major_list = [sampled_utterances[i] for i in range(len(sampled_utterances)) if sampled_acts[i] != '4']
                for u_n1, u_n2 in itertools.product(negtive_minor_list, negative_major_list):
                    sample_triple_for_this_dial.append((anchor, postive, u_n1, u_n2))
        
    return sample_triple_for_this_dial

In [31]:
sample_triple = []
idx = 1
utterances = txt_dict[idx]
acts = act_dict[idx]
topic = topic_dict[idx]

print('utterances : {}'.format(utterances))
print('acts : {}'.format(acts))
print('topic: {}'.format(topic))
print('\n')

sample_triple_for_this_dial = pesudo_generation_for_one_sample(utterances, acts, topic, txt_dict, act_dict, topic_dict)

utterances : ['So Dick , how about getting some coffee for tonight ?', 'Coffee ? I don ’ t honestly like that kind of stuff .', 'Come on , you can at least try a little , besides your cigarette .', 'What ’ s wrong with that ? Cigarette is the thing I go crazy for .', 'Not for me , Dick .']
acts : ['3', '4', '3', '1', '1']
topic: 1




In [32]:
len(sample_triple_for_this_dial)

28

In [23]:
sample_triple_for_this_dial

[('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'Have you every belonged to a political party ?'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'No , I haven ’ t , but I thought about joining the green party .'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'Really ? I know you are very concerned about the environment . You were a member of the pressure group Greenpeace , weren ’ t you ?'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'Yes . I was . But I didn ’ 

In [40]:
def pesudo_generation(txt_dict, act_dict, topic_dict):
    sample_triple = []

    for idx, v in txt_dict.items():
        utterances = txt_dict[idx]
        acts = act_dict[idx]
        topic = topic_dict[idx]
        try:
            sample_triple += pesudo_generation_for_one_sample(utterances, acts, topic, txt_dict, act_dict, topic_dict)
        except:
            print('idx: {}'.format(idx))
            print('[Error] Problematic datapoint/dialogue, dropped it...')
            continue

    sample_triple = remove_exact_duplicates(sample_triple)
    return sample_triple

def remove_exact_duplicates(entries):
    # Remove entries from the list that have exactly the same elements and return the cleaned list.
    unique_entries = []
    seen = set()
    for entry in entries:
        entry_set = frozenset(entry)  # Convert tuple to a frozenset for immutable set operations
        if entry_set not in seen:
            unique_entries.append(entry)
            seen.add(entry_set)
    
    #print(len(unique_entries), ' unique entries out of ', len(entries))
    return unique_entries

In [35]:
unique_entries = remove_exact_duplicates(sample_triple_for_this_dial)

[('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'Devi , do you have any preference for where you would like to go for dinner for your birthday ?'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  "I don't really know where I want to go . I am having trouble thinking of a particular restaurant ."),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'There is a great restaurant directory here in the weekend section of the newspaper .'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonigh

In [36]:
unique_entries = remove_exact_duplicates(sample_triple_for_this_dial)

In [38]:
unique_entries

[('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'Devi , do you have any preference for where you would like to go for dinner for your birthday ?'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  "I don't really know where I want to go . I am having trouble thinking of a particular restaurant ."),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonight ?',
  'There is a great restaurant directory here in the weekend section of the newspaper .'),
 ('So Dick , how about getting some coffee for tonight ?',
  'Coffee ? I don ’ t honestly like that kind of stuff .',
  'So Dick , how about getting some coffee for tonigh

In [42]:
trainin_samples = pesudo_generation(txt_dict, act_dict, topic_dict)

idx: 672
[Error] Problematic datapoint/dialogue, dropped it...
idx: 8247
[Error] Problematic datapoint/dialogue, dropped it...
idx: 9190
[Error] Problematic datapoint/dialogue, dropped it...


In [None]:
"Hey , Ann . You don't have a pen , do you ? __eou__ Sure , here you go . __eou__ Thanks . I don't suppose you have some paper , too . __eou__ Of course . There you are . __eou__ Thanks so much . I owe you one . __eou__"
"2 1 1 1 1 "

In [43]:
len(trainin_samples)

885742

In [46]:
error_example = 672

utterances = txt_dict[error_example]
acts = act_dict[error_example]
topic = topic_dict[error_example]

print('utterances : {}'.format(utterances))
print('acts : {}'.format(acts))
print('topic: {}'.format(topic))
print('\n')

sample_triple_for_this_dial = pesudo_generation_for_one_sample(utterances, acts, topic, txt_dict, act_dict, topic_dict)


utterances : ['Sam , can we stop at this bicycle shop ?', 'Do you want to buy a new bicycle ?', 'Yes , and they have a sale on now .', 'What happened to your old one ?', "I left it at my parent's house , but I need one here as well .", "I've been using Jim's old bike but he needs it back .", "Let's go then .", 'Look at this mountain bike . It is only £ 330 . Do you like it ?', 'I prefer something like this one - a touring bike , but it is more expensive .', 'How much is it ?', 'The price on the tag says £ 565 but maybe you can get a discount .', "OK , let's go and ask ."]
acts : ['3', '2', '1', '2', '1', '4', '2', '1', '2', '1', '3']
topic: 1




IndexError: list index out of range