In [118]:
import json
import os
import sys
from nltk.tokenize import word_tokenize 
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ajm353/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_json('data/preprocessed/UBAR/multi-woz-processed/data_for_ubar.json')

In [40]:
len(df.columns)

10433

In [41]:
# Get the distribution of the number of goals in each dialogue
lens = []
for i in range(len(df.columns)):
    lens.append(len(df.iloc[0][i]))

counts = Counter(lens)
counts

Counter({1: 3409, 2: 5405, 3: 1619})

In [42]:
# Indexes of dialogues with 3 domains in multiwoz
indices = [i for i, x in enumerate(lens) if x == 3]

In [4]:
file_path = "data/raw/UBAR/multi-woz/data.json"

df_raw_mwoz = pd.read_json(file_path)

In [44]:
first_ten_goals = []
for i in range(20):
    parsed_goal = {}
    goal = df_raw_mwoz.iloc[:,i].goal
    for key in goal.keys():
        relevant_goals = {k: v for k, v in goal.items() if v != {} and k != 'topic' and k != 'message'}
        services = [key for key in relevant_goals.keys()]
        for service in services:
            parsed_goal[service] = relevant_goals[service]
    first_ten_goals.append(parsed_goal)

first_ten_goals[0]

{'hotel': {'info': {'type': 'hotel',
   'parking': 'yes',
   'pricerange': 'cheap',
   'internet': 'yes'},
  'fail_info': {},
  'book': {'pre_invalid': True,
   'stay': '2',
   'day': 'tuesday',
   'invalid': False,
   'people': '6'},
  'fail_book': {'stay': '3'}}}

In [187]:
""" 
Save the system dialogue from all turns in all dialogues
along with the turn number and the dialogue ID
"""

system_response_file = "data/preprocessed/UBAR/system_responses.txt"
val_list_file = "data/raw/UBAR/multi-woz/valListFile.json"
test_list_file = "data/raw/UBAR/multi-woz/testListFile.json"

with open(val_list_file, 'r') as f:
    val_list = f.readlines()
    val_list = [x.strip() for x in val_list]

with open(test_list_file, 'r') as f:
    test_list = f.readlines()
    test_list = [x.strip() for x in test_list]

with open(system_response_file, 'w') as f:
    f.write("Dialogue ID\tTurn #\tSystem Response\n")

    for dialogue_idx, dialogue_filename in enumerate((df_raw_mwoz.columns)):
        if dialogue_filename not in val_list and dialogue_filename not in test_list:
            
            for turn_idx in range(len(df_raw_mwoz.iloc[:,dialogue_idx].log)):
                if turn_idx % 2 != 0:
                    system_responses = df_raw_mwoz.iloc[:,dialogue_idx].log[turn_idx]['text']
                    system_responses = system_responses.replace("\n", " ")
                    f.write(dialogue_filename + "\t" + str(turn_idx) + "\t" + system_responses + '\n')

In [46]:
for idx, dialogue_name in enumerate(df_raw_mwoz.columns):
    if dialogue_name == "PMUL3719.json":
        print("PMUL3719.json is index " + str(idx))
    if dialogue_name == "PMUL4899.json":
        print("PMUL4899.json is index " + str(idx))
    if dialogue_name == "MUL2395.json":
        print("MUL2395.json is index " + str(idx))

MUL2395.json is index 8
PMUL4899.json is index 12
PMUL3719.json is index 39


In [47]:
for idx, dialogue_name in enumerate(df_raw_mwoz.columns):
    message = df_raw_mwoz.iloc[:,dialogue_name].goal["message"]

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [None]:
# Reorder the goals in the dialogue (goal_idx) to be aligned
# with the order they are covered by the user utterances in the data

import re

goal_idx = 19

current_dialogue_goals = []
for idx, goal_name in enumerate(first_ten_goals[goal_idx]):
    current_dialogue_goals.append(goal_name)

print(current_dialogue_goals)


message = df_raw_mwoz.iloc[:,goal_idx].goal["message"]

ordered_current_dialogue_goals = []

for instruction in message:
    instruction_split = re.split(" |<|>", instruction)
    for word in instruction_split:
        if word in current_dialogue_goals:
            ordered_current_dialogue_goals.append(word)
            current_dialogue_goals.remove(word)

ordered_current_dialogue_goals
# reorder first_ten_goals[goal_idx] to be aligned with ordered_current_dialogue_goals


['train', 'restaurant']


['restaurant', 'train']

## EXPLAIN BELOW CODE HERE

In [197]:
"""
Read the user utterances from all turns in all dialogues in gen_usr_utterances_file and save them in 
the correct slots in new_data_file.

# NOTE: this needs to be done for BOTH new_data_file AND new_data_file_with_spans as the first is used
# by data_analysis.py and the second is used by preprocess.py to create the data_for_ubar.json file
"""

raw_data_file = "data/raw/UBAR/multi-woz/data.json"
raw_data_file_with_spans = "data/raw/UBAR/multi-woz/annotated_user_da_with_span_full.json"
new_data_file = "data/preprocessed/UBAR/gen_usr_utt_experiment_data.json"
new_data_file_with_spans = "data/preprocessed/UBAR/gen_usr_utt_experiment_data_with_span_full.json"
# Using file with duplicated data here to make sure that the data is not lost
gen_usr_utterances_file = "data/preprocessed/UBAR/duplicated_user_utterances_from_simulator.txt"

# Load the raw data to a df where the user utterances will be updated
df_mwoz = pd.read_json(raw_data_file_with_spans)
df_undedited_data = df_mwoz.copy(deep=True)

In [189]:
# NOTE: ONLY RUN THIS FOR PROCESSING RAW_DATA_FILE AND NEW_DATA_FILE (no spans)

# Read in gen_usr_utterances_file to a dictionary where the Dialogue ID is the key and the System Response is the value
gen_usr_utterances_dict = {}
with open(gen_usr_utterances_file, 'r') as f:
    next(f)
    for line in f:
        line_split = line.split("\t")
        dialogue_id = line_split[1]
        system_response = line_split[3]
        # Add the dialogue ID as the key to the dictionary with the System Response as the value
        if line_split[1] in gen_usr_utterances_dict:
            gen_usr_utterances_dict[dialogue_id].append(" ".join(nltk.word_tokenize(system_response)))
        else:
            gen_usr_utterances_dict[dialogue_id] = [" ".join(nltk.word_tokenize(system_response))]

In [198]:
# NOTE: ONLY RUN THIS FOR PROCESSING RAW_DATA_FILE_WITH_SPANS AND NEW_DATA_FILE_WITH_SPANS

# Read in gen_usr_utterances_file to a dictionary where the Dialogue ID is the key and the System Response is the value
gen_usr_utterances_dict = {}
with open(gen_usr_utterances_file, 'r') as f:
    next(f)
    for line in f:
        line_split = line.split("\t")
        # Remove ".json" from the end of the dialogue ID
        dialogue_id = line_split[1].replace(".json", "")
        system_response = line_split[3]
        # Add the dialogue ID as the key to the dictionary with the System Response as the value
        if dialogue_id in gen_usr_utterances_dict:
            gen_usr_utterances_dict[dialogue_id].append(" ".join(nltk.word_tokenize(system_response)))
        else:
            gen_usr_utterances_dict[dialogue_id] = [" ".join(nltk.word_tokenize(system_response))]


In [159]:
# TEST SCRIPT #

# Test the dialogues have been read into the dict correctly

# Calculate the number of different Dialogue IDs in gen_usr_utterances_file
dialogue_ids = {}
with open(gen_usr_utterances_file, 'r') as f:
    f.readline()
    for line in f:
        line_split = line.split("\t")
        if line_split[1] not in dialogue_ids:
            dialogue_ids[line_split[1]] = ""

print(len(dialogue_ids))

# Calculate the number of different Dialogue IDs in gen_usr_utterances_dict
dialogue_ids = []
for dialogue_id, user_utterances in gen_usr_utterances_dict.items():
    dialogue_ids.append(dialogue_id)

print(len(dialogue_ids))

6537
6537


In [199]:
# Now need to update df_mwoz to *only* contain data from the dialogue IDs in gen_usr_utterances_dict
# AND need to do some manipulation to handle the fact that the number of turns in the generated data
# can be different to the raw data

# If the dialogue ID is not in the gen_usr_utterances_dict, then remove the dialogue from df_mwoz
for dialogue_id in df_mwoz.columns:
    if dialogue_id not in gen_usr_utterances_dict.keys():
        df_mwoz.drop(dialogue_id, axis=1, inplace=True)

In [200]:
# Deal with the cases where the number of generated turns is different to ground truth
for dialogue_name, user_utterances in gen_usr_utterances_dict.items():
    # This has to be done because there are dialogues not in "annotated_user_da_with_span_full.json" which
    # must be due to an error when it was created
    if dialogue_name not in df_mwoz.columns:
        print("Dialogue ID not in df_mwoz: " + dialogue_name)
        continue
    
    dialogue_idx = df_mwoz.columns.get_loc(dialogue_name)
    # Both of the below treat turns as total messages i.e. 6 turns is 3 user utterances and 3 sys responses
    n_raw_turns = len(df_mwoz.iloc[:,dialogue_idx].log)
    n_gen_sys_utt_turns = len(gen_usr_utterances_dict[dialogue_name]) * 2

    # Case where the number of generated turns is less than the number of ground truth turns
    if n_raw_turns > n_gen_sys_utt_turns:
        turns_to_cut = n_raw_turns - n_gen_sys_utt_turns
        # Remove the extra raw turns from the end of the dialogue, EXCEPT the last turn.
        # This way, the final turn from the system is the actual final one in the raw data
        # e.g. 'Thanks goobye' not some intermediary state where they might be asking the 
        # user a clarifying question or something like that
        del df_mwoz.iloc[:,dialogue_idx].log[n_gen_sys_utt_turns - 1 : turns_to_cut + n_gen_sys_utt_turns - 1]

    # Change the user utterances to the generated utterances up until all utterances are filled
    # This handles the case where the number of generated utterances is greater than in the raw data
    # which happens because of the user simulator always ends the conversation in my implementation
    n_raw_turns = len(df_mwoz.iloc[:,dialogue_idx].log)
    for usr_turn_idx, user_utterance in enumerate(user_utterances):  
        turn_idx = usr_turn_idx * 2
        if turn_idx < n_raw_turns: 
            df_mwoz.iloc[:,dialogue_idx].log[turn_idx]['text'] = user_utterance.strip('\n')

    # print(n_gen_sys_utt_turns, len(df_mwoz.iloc[:,dialogue_idx].log))

Dialogue ID not in df_mwoz: PMUL2245
Dialogue ID not in df_mwoz: PMUL4859


In [34]:
# TEST SCRIPT #

# Calculate the new number of different Dialogue IDs in df_mwoz
dialogue_ids = {}
for dialogue_id in df_mwoz.columns:
    if dialogue_id not in dialogue_ids:
        dialogue_ids[dialogue_id] = ""

len(dialogue_ids)

6535

In [154]:
# TEST SCRIPT #

# Inspect modified user utterances and system responses to sense check the updates

for i in range(8, 10):
    print("Dialogue ID: " + df_mwoz.columns[i])
    print("*" * 30)
    for j in range(len(df_mwoz.iloc[:,i].log)):
        if j % 2 == 0:
            print("User: " + df_mwoz.iloc[:,i].log[j]['text'])
        else:  
            print("System: " + df_mwoz.iloc[:,i].log[j]['text'])

Dialogue ID: PMUL4899.json
******************************
User: I am looking for entertainment in Cambridge .
System: i got 5 options. which side is okay for you?
User: I do n't care .
System: How about funky fun house, they are located at 8 mercers row, mercers row industrial estate.
User: That sounds great . Can I get the phone number and postcode ?
System: Sure. The phone number is 01223304705, and the postcode is cb58hy. Is there anything else I can assist you with?
User: Yes , I am also looking for an expensive Indian restaurant in the south .
System: Trer are many. What kind of food would you like ?
User: I would like Indian food .
System: Unfortunately, there aren't any Indian restaurants in the south side of town, would you like me to look in other areas?
User: No , I just need the address of one of the restaurants .
System: I'm sorry there doesn't seem to be any expensive Indian restaurants to the south.
User: Can you check again ? I just need the address .
System: Please stop

In [201]:
# Change the names of the new dialogues to avoid keys colliding when joining data

for dialogue_id in df_mwoz.columns:
    df_mwoz.rename(columns={dialogue_id: dialogue_id + "_modified"}, inplace=True)

In [202]:
# Final step is to add the origal data into the df_mwoz which now contains only the new, generated data

# Add df_undedited_data to df_mwoz
joined_df_mwoz = df_mwoz.join(df_undedited_data, how='outer')

In [203]:
# Save df_raw_mwoz to new_data_file
with open(new_data_file_with_spans, 'w') as f:
    json.dump(joined_df_mwoz.to_dict(), f, indent=4)

### FIGURING OUT LENGTH DISPARITIES IN NUM TURNS

In [62]:
# Count number of turns for each dialogue ID in df_raw_mwoz
raw_data_dialogue_turn_counts = {}
for idx, dialogue_name in enumerate(df_raw_mwoz.columns):
    raw_data_dialogue_turn_counts[dialogue_name] = len(df_raw_mwoz.iloc[:,idx].log)

# Count number of turns in each dialogue ID in data
gen_dialogue_turn_counts = {}
for dialogue_id in data.dialogue_id.unique():
    gen_dialogue_turn_counts[dialogue_id] = len(data[data.dialogue_id == dialogue_id]) * 2

In [67]:
# Go through gen_dialogue_turn_counts and compare whether the value for each key is the same
# as the corresponding value in gen_dialogue_turn_counts
count_fine = 0
count_not_fine = 0
for dialogue_id in gen_dialogue_turn_counts:
    if gen_dialogue_turn_counts[dialogue_id] != raw_data_dialogue_turn_counts[dialogue_id]:
        if count_not_fine < 30:
            print("generated: " + dialogue_id + " has " + str(gen_dialogue_turn_counts[dialogue_id]) + " turns, but " + str(raw_data_dialogue_turn_counts[dialogue_id]) + " turns in raw data")
        count_not_fine += 1
    else:
        count_fine += 1
print(count_fine)
print(count_not_fine)
        
# Note that if the generated user utterances index is longer than the raw data then
# it means the user answered a final time after the last system response 
# (beyond the data) because in the user model the user terminates it
# But in UBAR the system terminates it
# So in this case we just need to cut the final user utterance
# from the generated user utterances i.e. not add it into the json file

# In the case that the generated is shorter (often is)
# It is usually just two turns shorter because 
# The dialogue system asks if they can help the user with anything else
# And the user says no, but the user terminates the conversation beyond this point
# So we can just throw these away. Same for bigger differences than 2, it
# seems its just fluff we can throw away

generated: SNG0129.json has 6 turns, but 10 turns in raw data
generated: MUL2168.json has 18 turns, but 16 turns in raw data
generated: SNG01445.json has 6 turns, but 8 turns in raw data
generated: PMUL4899.json has 20 turns, but 22 turns in raw data
generated: MUL0784.json has 16 turns, but 24 turns in raw data
generated: SNG0548.json has 6 turns, but 8 turns in raw data
generated: PMUL4372.json has 10 turns, but 16 turns in raw data
generated: PMUL4047.json has 14 turns, but 16 turns in raw data
generated: PMUL3552.json has 18 turns, but 26 turns in raw data
generated: PMUL1539.json has 16 turns, but 18 turns in raw data
generated: PMUL3296.json has 10 turns, but 12 turns in raw data
generated: MUL1434.json has 14 turns, but 16 turns in raw data
generated: SNG0297.json has 10 turns, but 8 turns in raw data
generated: PMUL2049.json has 16 turns, but 18 turns in raw data
generated: PMUL2749.json has 16 turns, but 18 turns in raw data
generated: MUL1628.json has 14 turns, but 20 turns i

In [24]:
df_mwoz.iloc[:,0].log[0]

{'text': 'I need a hotel with free wifi and free parking.\n', 'metadata': {}}