In [8]:
import pandas as pd
import plotly.express as px
import os
import re

In [2]:
# Root folder holding one sub-directory per dataset; every dataset is expected
# to provide its metrics at "<root_dir>/<dataset>/metrics/100.csv".
root_dir = "./results/"
metrics_relpath = "metrics/100.csv"

# os.listdir returns entries in arbitrary order and also lists stray entries
# (hidden files, checkpoint folders, ...). Keep only entries that actually
# contain the metrics file and sort them so that re-runs are deterministic.
dataset_names = sorted(
    entry
    for entry in os.listdir(root_dir)
    if os.path.isfile(os.path.join(root_dir, entry, metrics_relpath))
)
# Fail here with a clear message instead of later inside pd.concat.
if not dataset_names:
    raise FileNotFoundError(f"No '{metrics_relpath}' found under any sub-directory of {root_dir!r}")

metrics_fpaths = [os.path.join(root_dir, dataset_dir, metrics_relpath) for dataset_dir in dataset_names]


In [21]:
# Load the recall column of every dataset, keyed by dataset name.
dfs = {}
for dataset_name, fpath in zip(dataset_names, metrics_fpaths):
    df = pd.read_csv(fpath, index_col=0)
    # Replace the part of the run name between "valentwin-" and "-n-" with "x"
    # so that the same configuration shares one index label across datasets.
    df.index = df.index.map(lambda x: re.sub(r'valentwin-.*?-n-', 'valentwin-x-n-', x))
    df = df[["recall_at_sizeof_ground_truth"]]
    # Two-level columns: (dataset name, metric name).
    df.columns = pd.MultiIndex.from_product([[dataset_name], df.columns])
    dfs[dataset_name] = df

concatenated_df = pd.concat(list(dfs.values()), axis=1)

# Average each metric over datasets. `groupby(..., axis=1)` is deprecated, so
# group the transposed frame on the metric-name level and transpose back.
average_df = concatenated_df.T.groupby(level=1).mean().T

# Parse the experiment settings out of the run name (the index).
run_names = average_df.index.to_series()
# "cnw-NN" holds the column-name weight in tenths (e.g. cnw-03 -> 0.3).
average_df["column_name_weight"] = run_names.str.extract(r'cnw-([^-]*)', expand=False).astype(int) / 10
# Token following "bs-<batch size>-".
average_df["sim_measure"] = run_names.str.extract(r'bs-[^-]*-([^-]*)-', expand=False)
# Token following "cnw-NN-"; when absent, fall back to `sim_measure`.
# Assign the result back instead of a chained `fillna(..., inplace=True)`,
# which does not update the frame under pandas Copy-on-Write.
average_df["column_name_sim_measure"] = (
    run_names.str.extract(r'cnw-\d{2}-(\w+)(?:-|$)', expand=False)
    .fillna(average_df["sim_measure"])
)
average_df["combined_sim_measure"] = average_df["sim_measure"] + "-" + average_df["column_name_sim_measure"]

average_df


DataFrame.groupby with axis=1 is deprecated. Do `frame.T.groupby(...)` without axis instead.



Unnamed: 0,recall_at_sizeof_ground_truth,column_name_weight,sim_measure,column_name_sim_measure,combined_sim_measure
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-cos-cnw-00-cos,0.788180,0.0,cos,cos,cos-cos
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-cos-cnw-00-euc,0.788180,0.0,cos,euc,cos-euc
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-cos-cnw-01-cos,0.795839,0.1,cos,cos,cos-cos
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-cos-cnw-01-euc,0.803042,0.1,cos,euc,cos-euc
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-cos-cnw-02-cos,0.798301,0.2,cos,cos,cos-cos
...,...,...,...,...,...
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-euc-cnw-08-euc,0.655007,0.8,euc,euc,euc-euc
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-euc-cnw-09-cos,0.605260,0.9,euc,cos,euc-cos
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-euc-cnw-09-euc,0.595042,0.9,euc,euc,euc-euc
valentwin-x-n-100-hn-10-selective-neg-lr-3e5-bs-512-euc-cnw-10-cos,0.536235,1.0,euc,cos,euc-cos


In [23]:
# Recall vs. column-name weight, one line per (values, name) similarity pair.
# Only weights up to 0.6 are plotted.
fig = px.line(
    average_df[average_df["column_name_weight"] <= 0.6],
    x="column_name_weight",
    y="recall_at_sizeof_ground_truth",
    color="combined_sim_measure",
    symbol="combined_sim_measure",
    symbol_sequence=['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up'],
    markers=True,
    labels={
        "column_name_weight": "Column Name Weight",
        "recall_at_sizeof_ground_truth": "Recall@k",
        "combined_sim_measure": "Values - Name Sim Measure",
    },
    width=800,
    height=600,
)

fig.update_layout(
    # "Times Roman" is not a valid font family name; use the real family
    # name with serif fallbacks so the intended typeface is actually applied.
    font_family="Times New Roman, Times, serif",
    font_color="black",
    font_size=16,
    plot_bgcolor='white',
    # Horizontal legend placed just above the plotting area.
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=0.95),
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
)

# Frame styling shared by both axes: mirrored black axis lines, outside ticks,
# light grey grid.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(dtick=0.1, **axis_style)  # one tick per 0.1 weight step
fig.update_yaxes(tickformat=".2f", **axis_style)

fig.update_traces(marker={'size': 20})
fig.show()