In [1]:
import os
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
# import plotly.express as px
# import plotly.graph_objects as go
from IPython.display import display, HTML

In [2]:
# CSV files holding the per-dialogue turn counts (dialogue_id -> total_turns),
# one for the KAGE runs and one for the PPTOD runs.
kage_mwz20_dialogue2len_path, pptod_mwz20_dialogue2len_path = (
    './data/mwz20/train_dialogue2len.csv',
    './data/mwz20/pptod_train_dialogue2len.csv',
)

In [3]:
# Dialogue-length table used for the KAGE runs. The saved output shows the
# columns `dialogue_id` and `total_turns`, plus an `Unnamed: 0` column that
# looks like an index written out when the CSV was created.
# Both frames are notebook-level globals read by read_all_by_folder_name().
kage_mwz20_dialogue2len_df = pd.read_csv(kage_mwz20_dialogue2len_path)
display(kage_mwz20_dialogue2len_df)

# Same layout for the PPTOD runs; the saved output lists the dialogues in a
# different order and with a slightly different row count than the KAGE table.
pptod_mwz20_dialogue2len_df = pd.read_csv(pptod_mwz20_dialogue2len_path)
display(pptod_mwz20_dialogue2len_df)

Unnamed: 0,dialogue_id,total_turns
0,MUL0001,10
1,MUL0002,7
2,MUL0005,9
3,MUL0006,8
4,MUL0007,7
...,...,...
7883,WOZ20671,3
7884,WOZ20672,5
7885,WOZ20673,5
7886,WOZ20674,4


Unnamed: 0,dialogue_id,total_turns
0,SNG01856,5
1,MUL2168,8
2,MUL2105,9
3,PMUL1690,11
4,MUL2395,7
...,...,...
7896,PMUL4251,9
7897,MUL1383,10
7898,SNG0827,5
7899,PMUL2395,7


In [4]:
def cal_turn_percentage(x):
    """Return ``turn_idx / total_turns`` for one row.

    Args:
        x: A mapping-like row (e.g. a DataFrame row) exposing ``'turn_idx'``
            and ``'total_turns'``; both are coerced with ``int()``.

    Returns:
        The ratio as a float, or ``1`` when ``total_turns`` is 0 so that a
        zero-length dialogue never triggers a division by zero.
    """
    total = int(x['total_turns'])
    if total == 0:
        return 1
    return int(x['turn_idx']) / total

In [5]:
def merge_df(dialogue2len_df, selected_turn_df):
    """Attach dialogue lengths to selected turns and report how deep they sit.

    Every ``selected_turn_id`` is expected to look like
    ``<dialogue_id>-<turn_idx>``. The id is split, the dialogue length is
    looked up in ``dialogue2len_df`` and, per row,
    ``turn_percentage = (turn_idx + 1) / total_turns`` is computed via
    ``cal_turn_percentage``. Overall, "round != -1" and per-round statistics
    of that fraction are printed.

    Args:
        dialogue2len_df: DataFrame with ``dialogue_id`` and ``total_turns``.
        selected_turn_df: DataFrame with ``selected_turn_id`` and ``round``.
            It is copied first, so the caller's frame is left untouched.

    Returns:
        The left-merged DataFrame with the extra columns ``dialogue_id``,
        ``turn_idx`` (shifted to start at 1), ``dialogue`` (row position in
        the merged frame), ``turn_percentage`` and
        ``turn_percentage_by_round``.

    Raises:
        ValueError: if a selected turn has no ``total_turns`` after the merge
            (its dialogue is missing from ``dialogue2len_df``).
    """
    # Copy so the helper columns below are not added to the caller's DataFrame.
    selected_turn_df = selected_turn_df.copy()

    # Split on the LAST hyphen only: the turn index is the trailing component,
    # so a dialogue id that itself contains '-' is still parsed correctly.
    turn_ids = selected_turn_df['selected_turn_id']
    selected_turn_df['dialogue_id'] = turn_ids.apply(lambda x: x.rsplit('-', 1)[0])
    selected_turn_df['turn_idx'] = turn_ids.apply(lambda x: int(x.rsplit('-', 1)[1]))

    # Left merge keeps every selected turn, even if its dialogue is unknown.
    merged_df = pd.merge(selected_turn_df, dialogue2len_df, on='dialogue_id', how='left')

    # Fail with a readable message instead of int(nan) blowing up further down.
    unmatched = merged_df.loc[merged_df['total_turns'].isna(), 'dialogue_id'].unique()
    if len(unmatched) > 0:
        raise ValueError(
            f'no total_turns found for {len(unmatched)} dialogue id(s), '
            f'e.g. {list(unmatched[:5])}'
        )

    merged_df['dialogue'] = merged_df.index
    # turn_idx+1 because the index starts from 0
    merged_df['turn_idx'] = merged_df['turn_idx'] + 1

    ### metric1: calculate turn percentage
    # Shows which part of a dialogue the model tends to select in each round.
    # After the +1 shift the first turn maps to 1/total_turns (not 0).
    # NOTE(review): a value of 1 means "last turn" only if total_turns counts
    # every turn of the dialogue - confirm against how the CSVs were built.
    merged_df['turn_percentage'] = merged_df.apply(cal_turn_percentage, axis=1)

    ### metric2: fraction of each dialogue that annotators have to read
    # if total_turns is 10 and the selected turn_idx is 3, the annotator reads
    # 3/10 of the dialogue to label that turn
    annotate_turns_percent = round(merged_df['turn_percentage'].mean(), 4)
    std_annotate_turns_percent = round(merged_df['turn_percentage'].std(ddof=0), 4)
    print(f'# of turns read by annotators: mean - {annotate_turns_percent} std - {std_annotate_turns_percent}')

    # Rows with round == -1 are left out of the "without budget" figure.
    merged_df_wo_budget = merged_df[merged_df['round'] != -1].reset_index(drop=True)
    annotate_turns_percent_wo_budget = round(merged_df_wo_budget['turn_percentage'].mean(), 4)
    print(f'# of turns read by annotators without budget: {annotate_turns_percent_wo_budget}')
    print('--------------------------------------------------')

    # Per-round mean is stored on every row and also printed with its
    # population standard deviation (ddof=0).
    percentage_by_round = merged_df.groupby('round')['turn_percentage']
    merged_df['turn_percentage_by_round'] = percentage_by_round.transform('mean')
    annotate_turns_percent_by_round = percentage_by_round.mean()
    std_annotate_turns_percent_by_round = percentage_by_round.std(ddof=0)
    for idx in annotate_turns_percent_by_round.index:
        print(f'# of turns read by annotators by round {idx}: mean - '
              f'{round(annotate_turns_percent_by_round[idx], 4)} '
              f'std - {round(std_annotate_turns_percent_by_round[idx], 4)}'
             )

    return merged_df

In [6]:
def read_all_by_folder_name(folder_name, max_round=4):
    """Load every selection log in a run folder and compute its turn statistics.

    All files in ``folder_name`` whose name ends with
    ``selected_turn_id.csv`` are read, filtered by round, concatenated and
    passed to ``merge_df`` together with the matching dialogue-length table.

    Args:
        folder_name: Directory of one run. If the path contains ``'PPTOD'``
            the PPTOD dialogue-length table is used, otherwise the KAGE one.
        max_round: Keep only rows with ``round <= max_round`` (default 4, the
            previously hard-coded cutoff). ``None`` keeps every round.

    Returns:
        The DataFrame returned by ``merge_df``.

    Raises:
        ValueError: if the folder contains no matching CSV file.
    """
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order (and the position-based `dialogue` column) identical across runs.
    selected_turn_path_list = [
        os.path.join(folder_name, filename)
        for filename in sorted(os.listdir(folder_name))
        if filename.endswith('selected_turn_id.csv')
    ]
    if not selected_turn_path_list:
        raise ValueError(f"no '*selected_turn_id.csv' file found in {folder_name!r}")

    df_list = []
    for path in selected_turn_path_list:
        df = pd.read_csv(path)
        if max_round is not None:
            df = df[df['round'] <= max_round]
        df_list.append(df)

    merged = pd.concat(df_list)

    # The two models use separate dialogue-length tables (notebook globals).
    if 'PPTOD' in folder_name:
        dialogue2len_df = pptod_mwz20_dialogue2len_df
    else:
        dialogue2len_df = kage_mwz20_dialogue2len_df

    return merge_df(dialogue2len_df, merged)

In [7]:
# Run folders for every model / selection-strategy combination, ordered
# KAGE first, then PPTOD; within each model: max entropy, least confidence,
# random.
selected_turn_folder_list = [
    f'./data/mwz20/{model}/{strategy}/k100'
    for model in ('KAGE', 'PPTOD')
    for strategy in ('max_entropy', 'least_confidence', 'random')
]

In [8]:
# for selected_turn_folder in selected_turn_folder_list:
#     print(f'============= {selected_turn_folder} =============')
#     statis = read_all_by_folder_name(selected_turn_folder)
#     statis = statis.sort_values('dialogue_id').reset_index(drop=True)
    
#     display(statis)
#     print()

## Comparison across selection strategies

### PPTOD 

In [9]:
# PPTOD

# Run folders of the three selection strategies for PPTOD, in the order
# max entropy, least confidence, random.
pptod_list = [
    f'./data/mwz20/PPTOD/{strategy}/k100'
    for strategy in ('max_entropy', 'least_confidence', 'random')
]

# pptod_statis_list = []
# for selected_turn_folder in pptod_list:
#     print(f'============= {selected_turn_folder} =============')
#     statis = read_all_by_folder_name(selected_turn_folder)
#     statis = statis.sort_values('dialogue_id').reset_index(drop=True)
    
#     pptod_statis_list.append(statis)
#     display(statis)
#     print()

In [10]:
def merge_folder_df(folder_name, max_round=4):
    """Concatenate the selection logs found in one run folder.

    Args:
        folder_name: Directory to scan. Only files whose name ends with
            ``selected_turn_id.csv`` are read.
        max_round: Keep only rows with ``round <= max_round`` (default 4, the
            previously hard-coded cutoff). ``None`` keeps every round.

    Returns:
        One DataFrame with the filtered rows of all matching files, in sorted
        file-name order. Row labels are kept as read from each file, so they
        repeat when more than one file matches.

    Raises:
        ValueError: if the folder contains no matching CSV file.
    """
    # os.listdir() returns names in arbitrary order; sort so the concatenated
    # row order is reproducible across machines and runs.
    selected_turn_path_list = [
        os.path.join(folder_name, filename)
        for filename in sorted(os.listdir(folder_name))
        if filename.endswith('selected_turn_id.csv')
    ]
    if not selected_turn_path_list:
        raise ValueError(f"no '*selected_turn_id.csv' file found in {folder_name!r}")

    df_list = []
    for path in selected_turn_path_list:
        df = pd.read_csv(path)
        if max_round is not None:
            df = df[df['round'] <= max_round]
        df_list.append(df)

    return pd.concat(df_list)

In [11]:
# Inline inspection of one run folder; switch the two lines below to look at
# the PPTOD run instead.
# NOTE(review): this repeats the steps of merge_folder_df(folder_name);
# consider calling it instead if the intermediate lists are not needed later.
# folder_name = './data/mwz20/PPTOD/max_entropy/k100'
folder_name = './data/mwz20/KAGE/max_entropy/k100'

# Collect the selection logs of this run. os.listdir() gives no ordering
# guarantee, so the names are sorted to keep the row order reproducible.
selected_turn_path_list = []
for filename in sorted(os.listdir(folder_name)):
    if filename.endswith('selected_turn_id.csv'):
        selected_turn_path_list.append(f'{folder_name}/{filename}')

# Read each log and keep only rounds 0..4.
df_list = []
for path in selected_turn_path_list:
    df = pd.read_csv(path)
    df = df[df['round'] <= 4]
    df_list.append(df)

merged = pd.concat(df_list)
merged

Unnamed: 0,round,selected_turn_id,max_entropy,min_confidence,select_turn_per_round_time,min_slot_acc
0,0,PMUL0274-4,2352.146484,0.0,9053.854584,
1,0,PMUL0620-0,2470.353027,0.0,9053.854584,
2,0,WOZ20501-2,2502.310791,0.0,9053.854584,
3,0,MUL1277-0,2464.903320,0.0,9053.854584,
4,0,PMUL1240-0,2391.855957,0.0,9053.854584,
...,...,...,...,...,...,...
495,4,SNG1357-0,1.088714,,27636.466459,0.0
496,4,MUL1704-2,2.665967,,27636.466459,0.0
497,4,MUL1992-1,2.623942,,27636.466459,0.0
498,4,MUL2631-11,17.841730,,27636.466459,0.0


In [12]:
# KAGE + max-entropy selection: load the selection log of the run.
kage_me = merge_folder_df('./data/mwz20/KAGE/max_entropy/k100')
display(kage_me)

# Mean of the `max_entropy` column per round.
# NOTE(review): in the saved output round 0 is on a far larger scale than the
# later rounds - confirm whether round 0 is scored differently.
kage_me.groupby('round')['max_entropy'].mean()

Unnamed: 0,round,selected_turn_id,max_entropy,min_confidence,select_turn_per_round_time,min_slot_acc
0,0,PMUL0274-4,2352.146484,0.0,9053.854584,
1,0,PMUL0620-0,2470.353027,0.0,9053.854584,
2,0,WOZ20501-2,2502.310791,0.0,9053.854584,
3,0,MUL1277-0,2464.903320,0.0,9053.854584,
4,0,PMUL1240-0,2391.855957,0.0,9053.854584,
...,...,...,...,...,...,...
495,4,SNG1357-0,1.088714,,27636.466459,0.0
496,4,MUL1704-2,2.665967,,27636.466459,0.0
497,4,MUL1992-1,2.623942,,27636.466459,0.0
498,4,MUL2631-11,17.841730,,27636.466459,0.0


round
0    2373.819516
1      19.953207
2      16.865694
3       8.404960
4       5.682351
Name: max_entropy, dtype: float64

In [13]:
# KAGE + least-confidence selection: load the selection log of the run.
kage_lc = merge_folder_df('./data/mwz20/KAGE/least_confidence/k100')
display(kage_lc)

# Mean of the `min_confidence` column per round.
kage_lc.groupby('round')['min_confidence'].mean()

Unnamed: 0,round,selected_turn_id,max_entropy,min_confidence,select_turn_per_round_time
0,0,WOZ20203-2,0,3226.492432,12937.667790
1,0,PMUL2157-3,0,2980.762939,12937.667790
2,0,PMUL2843-8,0,3124.443848,12937.667790
3,0,PMUL2545-4,0,3188.487061,12937.667790
4,0,SSNG0203-4,0,2842.015625,12937.667790
...,...,...,...,...,...
495,4,PMUL1469-4,0,2762.980469,36282.477574
496,4,PMUL4135-7,0,2703.090820,36282.477574
497,4,PMUL3653-11,0,2644.262451,36282.477574
498,4,SSNG0141-3,0,2765.253906,36282.477574


round
0    2721.945278
1    3017.955684
2    2996.289056
3    3455.204601
4    3225.511880
Name: min_confidence, dtype: float64

In [14]:
# PPTOD + max-entropy selection: load the selection log of the run.
pptod_me = merge_folder_df('./data/mwz20/PPTOD/max_entropy/k100')
display(pptod_me)

# Mean of the `max_entropy` column per round.
pptod_me.groupby('round')['max_entropy'].mean()

Unnamed: 0,round,selected_turn_id,max_entropy,min_confidence
0,0,WOZ20000-0,2.059196,0
1,0,PMUL3877-0,9.868336,0
2,0,MUL0015-0,26.539364,0
3,0,PMUL4105-3,8.808507,0
4,0,MUL1524-0,13.296951,0
...,...,...,...,...
495,4,WOZ20153-0,0.085689,0
496,4,PMUL1989-7,13.563635,0
497,4,MUL1962-3,5.145829,0
498,4,SNG1317-0,3.275151,0


round
0     9.471201
1    31.129605
2    23.026060
3     5.556783
4     8.319178
Name: max_entropy, dtype: float64

In [15]:
# PPTOD + least-confidence selection: load the selection log of the run.
pptod_lc = merge_folder_df('./data/mwz20/PPTOD/least_confidence/k100')
display(pptod_lc)

# Mean of the `min_confidence` column per round.
pptod_lc.groupby('round')['min_confidence'].mean()

Unnamed: 0,round,selected_turn_id,max_entropy,min_confidence
0,0,PMUL2369-2,0,208.692902
1,0,PMUL3009-7,0,403.974640
2,0,MUL1236-8,0,914.898254
3,0,MUL1907-5,0,231.558228
4,0,MUL1990-4,0,370.201965
...,...,...,...,...
495,4,SNG02298-3,0,322.118591
496,4,SSNG0374-3,0,343.250366
497,4,MUL0758-4,0,629.664062
498,4,MUL1170-3,0,286.257538


round
0    418.603738
1    413.207868
2    500.726613
3    518.156325
4    622.318366
Name: min_confidence, dtype: float64

In [None]:
# pptod_statis_list