In [1]:
import os
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
# import plotly.express as px
# import plotly.graph_objects as go
from IPython.display import display, HTML

In [2]:
kage_mwz20_dialogue2len_path = './data/mwz20/train_dialogue2len.csv'
pptod_mwz20_dialogue2len_path = './data/mwz20/pptod_train_dialogue2len.csv'

In [3]:
kage_mwz20_dialogue2len_df = pd.read_csv(kage_mwz20_dialogue2len_path)
display(kage_mwz20_dialogue2len_df)

pptod_mwz20_dialogue2len_df = pd.read_csv(pptod_mwz20_dialogue2len_path)
display(pptod_mwz20_dialogue2len_df)

Unnamed: 0,dialogue_id,total_turns
0,MUL0001,10
1,MUL0002,7
2,MUL0005,9
3,MUL0006,8
4,MUL0007,7
...,...,...
7883,WOZ20671,3
7884,WOZ20672,5
7885,WOZ20673,5
7886,WOZ20674,4


Unnamed: 0,dialogue_id,total_turns
0,SNG01856,5
1,MUL2168,8
2,MUL2105,9
3,PMUL1690,11
4,MUL2395,7
...,...,...
7896,PMUL4251,9
7897,MUL1383,10
7898,SNG0827,5
7899,PMUL2395,7


In [4]:
def cal_turn_percentage(x):
    
    if int(x['total_turns']) == 0:
        return 1
    
    return int(x['turn_idx'])/int(x['total_turns'])

In [5]:
def merge_df(dialogue2len_df, selected_turn_df):
    
    selected_turn_df['dialogue_id'] = selected_turn_df['selected_turn_id'].apply(lambda x: x.split('-')[0])
    selected_turn_df['turn_idx'] = selected_turn_df['selected_turn_id'].apply(lambda x: int(x.split('-')[1]))
#     display(selected_turn_df)
    
    # merge 
    merged_df = pd.merge(selected_turn_df, dialogue2len_df, on='dialogue_id', how='left')
    merged_df['dialogue'] = merged_df.index
#     merged_df['total_turns'] = merged_df['total_turns'] - 1
    # turn_idx+1 because the index starts from 0
    merged_df['turn_idx'] = merged_df['turn_idx'] + 1
    
    ### metric1: calculate turn percentage
    # to see the model tends to select which turn of each dialogue in each round
    # should be in [0, 1], 0 means select the first turn, 1 means select the last turn
    merged_df['turn_percentage'] = merged_df.apply(lambda x: cal_turn_percentage(x), axis=1)
    
    ### metric2: # of turns that are read by annotators
    # if total_turns is 10, select turn_idx is 3, then annotator needs to read 3/10 turns to 
    # label the turn_idx=3 turn
    annotate_turns_percent = round(merged_df['turn_percentage'].mean(), 4)
    std_annotate_turns_percent = round(merged_df['turn_percentage'].std(ddof=0), 4)
    print(f'# of turns read by annotators: mean - {annotate_turns_percent} std - {std_annotate_turns_percent}')
    
    merged_df_wo_budget = merged_df[merged_df['round'] != -1].reset_index(drop=True)
    annotate_turns_percent_wo_budget = round(merged_df_wo_budget['turn_percentage'].mean(), 4)
    print(f'# of turns read by annotators without budget: {annotate_turns_percent_wo_budget}')
    print('--------------------------------------------------')
    
    merged_df['turn_percentage_by_round'] = merged_df.groupby('round')['turn_percentage'].transform('mean')
    annotate_turns_percent_by_round = merged_df.groupby('round')['turn_percentage'].mean()
    std_annotate_turns_percent_by_round = merged_df.groupby('round')['turn_percentage'].std(ddof=0)
    for idx in annotate_turns_percent_by_round.index:
        print(f'# of turns read by annotators by round {idx}: mean - '
              f'{round(annotate_turns_percent_by_round[idx], 4)} '
              f'std - {round(std_annotate_turns_percent_by_round[idx], 4)}'
             
             )
    
    
    merged_df = merged_df.sort_values('dialogue_id').reset_index(drop=True)
    
    return merged_df

In [6]:
def read_all_by_folder_name(folder_name):
    print(folder_name)
    
    selected_turn_path_list = []
    for filename in os.listdir(folder_name):
        if not filename.endswith('selected_turn_id.csv'):
            continue
#         print(filename)
        selected_turn_path_list.append(f'{folder_name}/{filename}')
#     print(selected_turn_path_list)
    
    df_list = []
    for path in selected_turn_path_list:
#         df = pd.read_csv(path)
        df = pd.read_csv(path, usecols=[0,1])
        
        df = df[df['round'] <= 4]
        
        df_list.append(df)
        
    merged = pd.concat(df_list)
    
    if 'PPTOD' in folder_name:
        merged_statis = merge_df(pptod_mwz20_dialogue2len_df, merged)
    else:
    
        merged_statis = merge_df(kage_mwz20_dialogue2len_df, merged)
        
    return merged_statis

In [7]:
selected_turn_folder_list = [
    # Max Entropy
    './data/mwz20/KAGE/max_entropy/k100',
    
    # Least Confidence
    './data/mwz20/KAGE/least_confidence/k100',
    
    # Random
    './data/mwz20/KAGE/random/k100',
    
    #####################################
    # Max Entropy
    './data/mwz20/PPTOD/max_entropy/k100',
    
    # Least Confidence
    './data/mwz20/PPTOD/least_confidence/k100',
    
    # Random
    './data/mwz20/PPTOD/random/k100',

]

In [8]:
# for selected_turn_folder in selected_turn_folder_list:
#     print(f'============= {selected_turn_folder} =============')
#     statis = read_all_by_folder_name(selected_turn_folder)
#     statis = statis.sort_values('dialogue_id').reset_index(drop=True)
    
#     display(statis)
#     print()

## Compare by different strategies

In [9]:
# PPTOD

pptod_list = [
    # Max Entropy
    './data/mwz20/PPTOD/max_entropy/k2000',
    
    # Least Confidence
    './data/mwz20/PPTOD/least_confidence/k2000',
    
    # Random
    './data/mwz20/PPTOD/random/k2000',
]

# pptod_statis_list = []
# for selected_turn_folder in pptod_list:
#     print(f'============= {selected_turn_folder} =============')
#     statis = read_all_by_folder_name(selected_turn_folder)
# #     statis = statis.sort_values('dialogue_id').reset_index(drop=True)
    
#     pptod_statis_list.append(statis)
#     display(statis)
#     print()

In [10]:
def plot_turn_percentage_by_round(round_idx, df):
    
    r = df[df['round'] == round_idx].reset_index(drop=True)

#     fig = px.scatter(r, x="dialogue", y="turn_percentage")
    fig = px.strip(r, x="dialogue", y="turn_percentage")
#     fig.show()
    return fig

In [11]:
def plot_turn_percentage(df):
    
#     r = df[df['round'] == round_idx].reset_index(drop=True)

    fig = px.strip(df, x="dialogue", y="turn_percentage")
#     fig.show()
    return fig

In [12]:
def visualize_turn_percentage(df):
    
#     r = df[df['round'] == round_idx].reset_index(drop=True)

#     fig = px.strip(df, x="method", y="turn_percentage")
    fig = px.strip(df, x="turn_percentage", y="method")
#     fig.show()
    return fig

### KAGE

In [55]:
# kage_me = read_all_by_folder_name('./data/mwz20/KAGE/max_entropy/k100_only_one')
# kage_lc = read_all_by_folder_name('./data/mwz20/KAGE/least_confidence/k100_only_one')
# kage_random = read_all_by_folder_name('./data/mwz20/KAGE/random/k100_only_one')

kage_me = read_all_by_folder_name('./data/mwz20/KAGE/max_entropy/k2000')
kage_lc = read_all_by_folder_name('./data/mwz20/KAGE/least_confidence/k2000')
kage_random = read_all_by_folder_name('./data/mwz20/KAGE/random/k2000')

# kage_me['method'] = 'ME'
# kage_lc['method'] = 'LC'
# kage_random['method'] = 'RS'

# display(kage_me)
# display(kage_lc)
# display(kage_random)


# kage_me = kage_me[kage_me['round'] == 4]
# kage_lc = kage_lc[kage_lc['round'] == 4]
# kage_random = kage_random[kage_random['round'] == 4]


display(kage_me)
display(kage_lc)
display(kage_random)

# kage_me = kage_me[['round', 'dialogue_id', 'turn_idx', 'total_turns']]
# kage_me.groupby(['dialogue_id','round', 'turn_idx']).count()


# kage_me[kage_me['dialogue_id'] == 'MUL1219']

./data/mwz20/KAGE/max_entropy/k2000
# of turns read by annotators: mean - 0.6198 std - 0.2882
# of turns read by annotators without budget: 0.6198
--------------------------------------------------
# of turns read by annotators by round 0: mean - 0.4774 std - 0.2776
# of turns read by annotators by round 1: mean - 0.7104 std - 0.2887
# of turns read by annotators by round 2: mean - 0.6614 std - 0.2612
# of turns read by annotators by round 3: mean - 0.6306 std - 0.2696
./data/mwz20/KAGE/least_confidence/k2000
# of turns read by annotators: mean - 0.7051 std - 0.3033
# of turns read by annotators without budget: 0.7051
--------------------------------------------------
# of turns read by annotators by round 0: mean - 0.4879 std - 0.3225
# of turns read by annotators by round 1: mean - 0.846 std - 0.2374
# of turns read by annotators by round 2: mean - 0.7092 std - 0.2768
# of turns read by annotators by round 3: mean - 0.7817 std - 0.2375
./data/mwz20/KAGE/random/k2000
# of turns read b

Unnamed: 0,round,selected_turn_id,dialogue_id,turn_idx,total_turns,dialogue,turn_percentage,turn_percentage_by_round
0,1,MUL0001-8,MUL0001,9,10,2371,0.9,0.710398
1,3,MUL0001-6,MUL0001,7,10,13888,0.7,0.630607
2,1,MUL0002-6,MUL0002,7,7,2673,1.0,0.710398
3,1,MUL0002-6,MUL0002,7,7,11004,1.0,0.710398
4,3,MUL0005-8,MUL0005,9,9,13889,1.0,0.630607
...,...,...,...,...,...,...,...,...
15771,2,WOZ20673-4,WOZ20673,5,5,5983,1.0,0.661356
15772,0,WOZ20674-1,WOZ20674,2,4,8627,0.5,0.477416
15773,0,WOZ20674-3,WOZ20674,4,4,397,1.0,0.477416
15774,1,WOZ20675-3,WOZ20675,4,5,11426,0.8,0.710398


Unnamed: 0,round,selected_turn_id,dialogue_id,turn_idx,total_turns,dialogue,turn_percentage,turn_percentage_by_round
0,2,MUL0001-6,MUL0001,7,10,4141,0.700000,0.709218
1,2,MUL0001-6,MUL0001,7,10,13546,0.700000,0.709218
2,1,MUL0002-6,MUL0002,7,7,2880,1.000000,0.846028
3,0,MUL0002-0,MUL0002,1,7,8441,0.142857,0.487851
4,0,MUL0005-1,MUL0005,2,9,1385,0.222222,0.487851
...,...,...,...,...,...,...,...,...
15771,2,WOZ20673-0,WOZ20673,1,5,5237,0.200000,0.709218
15772,0,WOZ20674-1,WOZ20674,2,4,9185,0.500000,0.487851
15773,1,WOZ20674-0,WOZ20674,1,4,3302,0.250000,0.846028
15774,2,WOZ20675-2,WOZ20675,3,5,5761,0.600000,0.709218


Unnamed: 0,round,selected_turn_id,dialogue_id,turn_idx,total_turns,dialogue,turn_percentage,turn_percentage_by_round
0,3,MUL0001-1,MUL0001,2,10,6000,0.200000,0.585768
1,1,MUL0001-0,MUL0001,1,10,11458,0.100000,0.580981
2,0,MUL0002-5,MUL0002,6,7,1827,0.857143,0.583759
3,2,MUL0002-3,MUL0002,4,7,12772,0.571429,0.592863
4,3,MUL0005-7,MUL0005,8,9,13888,0.888889,0.585768
...,...,...,...,...,...,...,...,...
15771,1,WOZ20673-1,WOZ20673,2,5,10548,0.400000,0.580981
15772,1,WOZ20674-3,WOZ20674,4,4,2446,1.000000,0.580981
15773,1,WOZ20674-0,WOZ20674,1,4,9895,0.250000,0.580981
15774,2,WOZ20675-1,WOZ20675,2,5,4498,0.400000,0.592863


In [58]:
# kage_me_lc_rs = pd.concat([kage_me, kage_lc, kage_random])
# kage_me_lc_rs

In [38]:
fig = go.Figure()

# y_me = 

fig.add_trace(go.Box(
    y=kage_random['turn_percentage'],
    name='RS',
    boxpoints='all',
    jitter=0.5,
    whiskerwidth=0.2,
    marker_size=4,
#     line_width=1
    )
)

fig.add_trace(go.Box(
    y=kage_lc['turn_percentage'],
    name='LC',
    boxpoints='all',
    jitter=0.5,
    whiskerwidth=0.2,
    marker_size=4,
#     line_width=1
    )
)


fig.add_trace(go.Box(
    y=kage_me['turn_percentage'],
    name='ME',
    boxpoints='all',
    jitter=0.5,
    whiskerwidth=0.2,
    marker_size=4,
#     line_width=1
    )
)

fig.update_yaxes(title_text="Reading Cost (RC)")

fig.update_layout(
#     height=600,
#     width=400,
    margin_l=5, margin_t=5, margin_b=5, margin_r=5,
    legend=dict(
        title=None, orientation = 'h', y=1.03, yanchor="bottom", x=0.5, xanchor="center",
        font=dict(size=11)
    )
)


# fig = px.box(pptod_me_lc_rs, x="method", y="turn_percentage", boxpoints='all')
# fig.show()

pio.write_image(fig, "./data/plot/visualization_kage.pdf", width=600, height=500)

In [16]:
# visualize_turn_percentage(kage_me_lc_rs)

In [17]:
# kage_lc = read_all_by_folder_name('./data/mwz20/KAGE/least_confidence/k2000')
# display(kage_lc)
# # kage_lc = kage_lc[['round', 'dialogue_id', 'turn_idx', 'total_turns']]
# # kage_lc

# kage_me[kage_me['dialogue_id'] == 'MUL1219']

In [59]:
kage_merged = pd.merge(kage_me, kage_lc, how='inner', on=['dialogue_id'])
display(kage_merged)

# kage_merged = kage_merged[(kage_merged['round_x'] == 4) & (kage_merged['round_y'] == 4)]
# kage_merged

Unnamed: 0,round_x,selected_turn_id_x,dialogue_id,turn_idx_x,total_turns_x,dialogue_x,turn_percentage_x,turn_percentage_by_round_x,round_y,selected_turn_id_y,turn_idx_y,total_turns_y,dialogue_y,turn_percentage_y,turn_percentage_by_round_y
0,1,MUL0001-8,MUL0001,9,10,2371,0.9,0.710398,2,MUL0001-6,7,10,4141,0.70,0.709218
1,1,MUL0001-8,MUL0001,9,10,2371,0.9,0.710398,2,MUL0001-6,7,10,13546,0.70,0.709218
2,3,MUL0001-6,MUL0001,7,10,13888,0.7,0.630607,2,MUL0001-6,7,10,4141,0.70,0.709218
3,3,MUL0001-6,MUL0001,7,10,13888,0.7,0.630607,2,MUL0001-6,7,10,13546,0.70,0.709218
4,1,MUL0002-6,MUL0002,7,7,2673,1.0,0.710398,1,MUL0002-6,7,7,2880,1.00,0.846028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31547,0,WOZ20674-3,WOZ20674,4,4,397,1.0,0.477416,1,WOZ20674-0,1,4,3302,0.25,0.846028
31548,1,WOZ20675-3,WOZ20675,4,5,11426,0.8,0.710398,2,WOZ20675-2,3,5,5761,0.60,0.709218
31549,1,WOZ20675-3,WOZ20675,4,5,11426,0.8,0.710398,3,WOZ20675-0,1,5,15775,0.20,0.781710
31550,1,WOZ20675-2,WOZ20675,3,5,2845,0.6,0.710398,2,WOZ20675-2,3,5,5761,0.60,0.709218


### PPTOD

In [51]:
pptod_me = read_all_by_folder_name('./data/mwz20/PPTOD/max_entropy/k100')
pptod_lc = read_all_by_folder_name('./data/mwz20/PPTOD/least_confidence/k100')
pptod_random = read_all_by_folder_name('./data/mwz20/PPTOD/random/k100')

pptod_me['method'] = 'ME'
pptod_lc['method'] = 'LC'
pptod_random['method'] = 'RS'

# display(pptod_me)
# display(pptod_lc)
# display(pptod_random)

# pptod_me = pptod_me[pptod_me['round'] == 4]
# pptod_lc = pptod_lc[pptod_lc['round'] == 4]
# pptod_random = pptod_random[pptod_random['round'] == 4]

display(pptod_me)
display(pptod_lc)
display(pptod_random)
# pptod_me = pptod_me[['round', 'dialogue_id', 'turn_idx', 'total_turns']]
# pptod_me

./data/mwz20/PPTOD/max_entropy/k100
# of turns read by annotators: mean - 0.5868 std - 0.3153
# of turns read by annotators without budget: 0.5868
--------------------------------------------------
# of turns read by annotators by round 0: mean - 0.5522 std - 0.3231
# of turns read by annotators by round 1: mean - 0.5913 std - 0.2887
# of turns read by annotators by round 2: mean - 0.5708 std - 0.3414
# of turns read by annotators by round 3: mean - 0.74 std - 0.2551
# of turns read by annotators by round 4: mean - 0.4795 std - 0.3022
./data/mwz20/PPTOD/least_confidence/k100
# of turns read by annotators: mean - 0.8113 std - 0.2228
# of turns read by annotators without budget: 0.8113
--------------------------------------------------
# of turns read by annotators by round 0: mean - 0.7952 std - 0.2219
# of turns read by annotators by round 1: mean - 0.8343 std - 0.2282
# of turns read by annotators by round 2: mean - 0.8177 std - 0.2138
# of turns read by annotators by round 3: mean - 

Unnamed: 0,round,selected_turn_id,dialogue_id,turn_idx,total_turns,dialogue,turn_percentage,turn_percentage_by_round,method
0,0,MUL0015-0,MUL0015,1,9,2,0.111111,0.552174,ME
1,1,MUL0020-10,MUL0020,11,11,165,1.000000,0.591323,ME
2,4,MUL0028-7,MUL0028,8,8,426,1.000000,0.479531,ME
3,0,MUL0054-5,MUL0054,6,6,62,1.000000,0.552174,ME
4,3,MUL0066-8,MUL0066,9,9,372,1.000000,0.740002,ME
...,...,...,...,...,...,...,...,...,...
495,4,WOZ20587-0,WOZ20587,1,3,468,0.333333,0.479531,ME
496,2,WOZ20638-2,WOZ20638,3,4,286,0.750000,0.570771,ME
497,1,WOZ20645-1,WOZ20645,2,2,115,1.000000,0.591323,ME
498,2,WOZ20648-2,WOZ20648,3,3,235,1.000000,0.570771,ME


Unnamed: 0,round,selected_turn_id,dialogue_id,turn_idx,total_turns,dialogue,turn_percentage,turn_percentage_by_round,method
0,0,MUL0015-8,MUL0015,9,9,99,1.000000,0.795186,LC
1,3,MUL0020-8,MUL0020,9,11,356,0.818182,0.727864,LC
2,4,MUL0028-4,MUL0028,5,8,403,0.625000,0.881302,LC
3,1,MUL0054-5,MUL0054,6,6,168,1.000000,0.834286,LC
4,2,MUL0066-6,MUL0066,7,9,207,0.777778,0.817656,LC
...,...,...,...,...,...,...,...,...,...
495,2,WOZ20587-2,WOZ20587,3,3,209,1.000000,0.817656,LC
496,3,WOZ20638-0,WOZ20638,1,4,326,0.250000,0.727864,LC
497,1,WOZ20645-1,WOZ20645,2,2,157,1.000000,0.834286,LC
498,3,WOZ20648-0,WOZ20648,1,3,365,0.333333,0.727864,LC


Unnamed: 0,round,selected_turn_id,dialogue_id,turn_idx,total_turns,dialogue,turn_percentage,turn_percentage_by_round,method
0,2,MUL0015-5,MUL0015,6,9,200,0.666667,0.538223,RS
1,1,MUL0020-9,MUL0020,10,11,101,0.909091,0.642623,RS
2,3,MUL0028-0,MUL0028,1,8,364,0.125000,0.491131,RS
3,0,MUL0054-1,MUL0054,2,6,68,0.333333,0.592026,RS
4,3,MUL0066-3,MUL0066,4,9,303,0.444444,0.491131,RS
...,...,...,...,...,...,...,...,...,...
495,0,WOZ20587-0,WOZ20587,1,3,22,0.333333,0.592026,RS
496,0,WOZ20638-2,WOZ20638,3,4,82,0.750000,0.592026,RS
497,4,WOZ20645-1,WOZ20645,2,2,453,1.000000,0.631876,RS
498,4,WOZ20648-1,WOZ20648,2,3,479,0.666667,0.631876,RS


In [20]:
pptod_me_lc_rs = pd.concat([pptod_me, pptod_lc, pptod_random])
pptod_me_lc_rs

# Plot KAGE & PPTOD turn selection visualization

Only plot k=500 setting and only plot the last round

In [35]:
fig = go.Figure()

# y_me = 

fig.add_trace(go.Box(
    y=pptod_random['turn_percentage'],
    name='RS',
    boxpoints='all',
    jitter=0.5,
    whiskerwidth=0.2,
    marker_size=4,
#     line_width=1
    )
)

fig.add_trace(go.Box(
    y=pptod_lc['turn_percentage'],
    name='LC',
    boxpoints='all',
    jitter=0.5,
    whiskerwidth=0.2,
    marker_size=4,
#     line_width=1
    )
)


fig.add_trace(go.Box(
    y=pptod_me['turn_percentage'],
    name='ME',
    boxpoints='all',
    jitter=0.5,
    whiskerwidth=0.2,
    marker_size=4,
#     line_width=1
    )
)

fig.update_yaxes(title_text="Reading Cost (RC)")

fig.update_layout(
#     height=600,
#     width=400,
    margin_l=5, margin_t=5, margin_b=5, margin_r=5,
    legend=dict(
        title=None, orientation = 'h', y=1.03, yanchor="bottom", x=0.5, xanchor="center",
        font=dict(size=11)
    )
)


fig.show()

# pio.write_image(fig, "./data/plot/visualization_pptod.pdf", width=600, height=500)

In [43]:
# visualize_turn_percentage(pptod_me_lc_rs)

In [42]:
# plot_turn_percentage(pptod_me)

In [41]:
# plot_turn_percentage_by_round(0, pptod_me)

In [45]:
# pptod_lc = read_all_by_folder_name('./data/mwz20/PPTOD/least_confidence/k100')
# display(pptod_lc)
# # pptod_lc = pptod_lc[['round', 'dialogue_id', 'turn_idx', 'total_turns']]
# pptod_lc

In [39]:
# plot_turn_percentage(pptod_lc)

In [52]:
pptod_merged = pd.merge(pptod_me, pptod_lc, how='inner', on=['dialogue_id'])
# display(pptod_merged)

pptod_merged = pptod_merged[['dialogue_id','round_x','round_y','turn_idx_x','turn_idx_y','total_turns_y']]

# pptod_merged = pptod_merged[(pptod_merged['round_x'] == 4) & (pptod_merged['round_y'] == 4)]
pptod_merged

Unnamed: 0,dialogue_id,round_x,round_y,turn_idx_x,turn_idx_y,total_turns_y
0,MUL0015,0,0,1,9,9
1,MUL0020,1,3,11,9,11
2,MUL0028,4,4,8,5,8
3,MUL0054,0,1,6,6,6
4,MUL0066,3,2,9,7,9
...,...,...,...,...,...,...
495,WOZ20587,4,2,1,3,3
496,WOZ20638,2,3,3,1,4
497,WOZ20645,1,1,2,2,2
498,WOZ20648,2,3,3,1,3


In [66]:
print(pptod_merged.to_string())

    dialogue_id  round_x  round_y  turn_idx_x  turn_idx_y  total_turns_y
0       MUL0015        0        0           1           9              9
1       MUL0020        1        3          11           9             11
2       MUL0028        4        4           8           5              8
3       MUL0054        0        1           6           6              6
4       MUL0066        3        2           9           7              9
5       MUL0087        2        0           6           7              8
6       MUL0101        0        4           3           9              9
7       MUL0108        2        3           7           6              7
8       MUL0122        2        4           7           8              8
9       MUL0137        1        0           3           7             13
10      MUL0146        4        4           6           8              8
11      MUL0156        0        1           4          10             11
12      MUL0184        4        2           6      

In [53]:
pptod_merged.to_csv('pptod.csv')

In [44]:
pptod_merged[pptod_merged['dialogue_id'] == 'SNG0025']

Unnamed: 0,round_x,dialogue_id,turn_idx_x,total_turns_x,round_y,turn_idx_y,total_turns_y
361,4,SNG0025,1,4,4,4,4


In [65]:
kage_merged = kage_merged[['dialogue_id','round_x','round_y','turn_idx_x','turn_idx_y','total_turns_y']]

kage_merged[kage_merged['dialogue_id'] == 'MUL1170']

Unnamed: 0,dialogue_id,round_x,round_y,turn_idx_x,turn_idx_y,total_turns_y
3384,MUL1170,1,1,6,5,6
3385,MUL1170,1,2,6,5,6
3386,MUL1170,2,1,3,5,6
3387,MUL1170,2,2,3,5,6


In [47]:
pd.merge(pptod_merged, kage_merged, how='inner', on=['dialogue_id']) 

Unnamed: 0,round_x_x,dialogue_id,turn_idx_x_x,total_turns_x_x,round_y_x,turn_idx_y_x,total_turns_y_x,round_x_y,turn_idx_x_y,total_turns_x_y,round_y_y,turn_idx_y_y,total_turns_y_y
