In [1]:
from server.scripts import config

import altair as alt
import pandas as pd
import json
import os

Find the hardness of each query according to the original nvBench JSON file.

In [2]:
# Build a lookup from each natural-language query to the hardness label of the
# nvBench entry that lists it, then persist that lookup as JSON.
query_to_hardness_map = {}

# Read explicitly as UTF-8 (the same encoding used for the output file below)
# so decoding does not depend on the platform's default encoding.
with open(
    os.path.join(config.BENCHMARK_DIR_PATH, "NVBench.json"),
    encoding="utf-8",
) as f:
    nvBench = json.load(f)

for benchmark in nvBench.values():
    for query in benchmark["nl_queries"]:
        # Trim the query and collapse every run of whitespace to one space so
        # lookups can use the same normalised key. If two entries share a
        # normalised query, the later entry's hardness overwrites the earlier.
        query_to_hardness_map[' '.join(query.strip().split())] = benchmark["hardness"]

with open(
    os.path.join(config.BENCHMARK_DIR_PATH, "query_to_hardness_map.json"),
    "w",
    encoding="utf-8",
) as f:
    json.dump(query_to_hardness_map, f, indent=4)


In [3]:
# Flatten every evaluation JSON file into one row per evaluated query and
# write the combined table to the clean-evaluation CSV.
data = []

# Sort the directory listing so the CSV row order is the same on every run
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(config.BENCHMARK_EVAL_DIR_PATH)):
    if not file.endswith(".json"):
        continue
    filepath = os.path.join(config.BENCHMARK_EVAL_DIR_PATH, file)
    with open(filepath, encoding="utf-8") as f:
        eval_data = json.load(f)

    # Delete the evaluation files with zero results
    # NOTE: this permanently removes the file from disk.
    results = eval_data["results"]
    if len(results) == 0:
        print(f"{file} has no results, deleting...")
        os.remove(filepath)
        continue

    for result in results:
        # Normalise whitespace the same way the hardness map keys were built.
        nl_query = ' '.join(result["query"].strip().split())
        # Queries that are missing from the map are labelled 'unknown'.
        hardness = query_to_hardness_map.get(nl_query, 'unknown')

        data.append({
            'model_name': eval_data['model_name'],
            'dataset_name': eval_data['dataset_name'],
            'nl_query': nl_query,
            'hardness': hardness,
            'produced_viz': 1 if result['produced_spec'] else 0,
            'ssim': result['metrics']['ssim'],
        })

df = pd.DataFrame(data)
df.to_csv(config.BENCHMARK_CLEAN_EVAL_CSV, index=False)

In [4]:
# Reload the flattened results from the CSV and show how many rows each
# model contributed.
eval_df = pd.read_csv(filepath_or_buffer=config.BENCHMARK_CLEAN_EVAL_CSV)
eval_df.groupby(by="model_name").size()

model_name
ncNet    8857
nl4dv    9632
dtype: int64

Group the results by model and query hardness, and determine the success
rate—that is, the fraction of results that produced a visualization—for each group.

In [5]:
# Number of evaluated rows at each hardness level (including 'unknown').
hardness_counts = eval_df.groupby(by="hardness").size()
hardness_counts

hardness
Easy          7871
Extra Hard     622
Hard          1312
Medium        8628
unknown         56
dtype: int64

In [6]:
# For every (model, hardness) pair compute the number of rows, the number of
# rows that produced a visualization, and the ratio of the two. Rows whose
# hardness is 'unknown' are filtered out after aggregation, so the remaining
# rows keep their original index labels.
grouped = (
    eval_df.groupby(["model_name", "hardness"])
    .agg(
        # `count` counts the non-null dataset_name values in each group
        total=("dataset_name", "count"),
        total_successes=("produced_viz", "sum"),
    )
    .reset_index()
    .assign(success_rate=lambda agg: agg["total_successes"] / agg["total"])
    .loc[lambda agg: agg["hardness"] != "unknown"]
)
grouped


Unnamed: 0,model_name,hardness,total,total_successes,success_rate
0,ncNet,Easy,3810,3079,0.808136
1,ncNet,Extra Hard,308,284,0.922078
2,ncNet,Hard,613,392,0.639478
3,ncNet,Medium,4100,3201,0.780732
5,nl4dv,Easy,4061,1693,0.416892
6,nl4dv,Extra Hard,314,210,0.66879
7,nl4dv,Hard,699,457,0.653791
8,nl4dv,Medium,4528,2521,0.556758


Visualize the success rate in a simple bar chart.

In [7]:
# Success rate per hardness level, one facet column per model.
# Each encoding channel is built separately and then assembled into the chart.
success_hardness_axis = alt.X(
    "hardness:N",
    axis=alt.Axis(title=""),
    # explicit order; otherwise the nominal values are sorted alphabetically
    sort=["Easy", "Medium", "Hard", "Extra Hard"],
)
success_rate_axis = alt.Y(
    "success_rate",
    scale=alt.Scale(domain=[0, 1]),
    axis=alt.Axis(title="Success Rate", format="%"),
)
success_hardness_color = alt.Color(
    "hardness:N",
    legend=alt.Legend(title="Query Hardness"),
)
success_model_facet = alt.Column("model_name:N", title="Model")

bars = (
    alt.Chart(grouped)
    .mark_bar()
    .encode(
        x=success_hardness_axis,
        y=success_rate_axis,
        color=success_hardness_color,
        column=success_model_facet,
    )
    .properties(width=200, height=300, title="Success Rate by Model")
)

bars


Drop all of the rows that did not produce visualizations (or that have a negative SSIM) so we can compare SSIM metrics.

In [8]:
# Keep only the rows where a visualization was produced and the recorded SSIM
# is non-negative.
produced_viz_df = eval_df.loc[
    lambda rows: (rows['produced_viz'] == 1) & (rows['ssim'] >= 0)
]

In [9]:
# Sanity check: list any remaining rows whose SSIM is exactly 0
produced_viz_df.loc[produced_viz_df['ssim'].eq(0)]

Unnamed: 0,model_name,dataset_name,nl_query,hardness,produced_viz,ssim


Calculate and visualize the average SSIM aggregated by dataset and model.

In [10]:
# Mean SSIM for every (dataset, model) pair, computed over the rows that
# produced a visualization; preview the first few pairs.
ssim_aggregate = (
    produced_viz_df
    .groupby(['dataset_name', 'model_name'])
    .agg(avg_ssim=('ssim', 'mean'))
    .reset_index()
)
ssim_aggregate.head()

Unnamed: 0,dataset_name,model_name,avg_ssim
0,Catalog_Contents,nl4dv,0.718819
1,College,nl4dv,0.68403
2,Companies,nl4dv,0.764052
3,Course_Authors_and_Tutors,nl4dv,0.673186
4,Dogs,nl4dv,0.748116


In [11]:
# Drop the datasets where at least one model did not produce visualizations.
# `ssim_aggregate` has one row per (dataset, model) pair, so a dataset with
# fewer than 2 rows is missing a model.
grouped_by_dataset = ssim_aggregate.groupby('dataset_name').agg(models_that_produced=('avg_ssim', 'count')).reset_index()
datasets_to_drop = grouped_by_dataset[grouped_by_dataset['models_that_produced'] < 2]['dataset_name'].unique()

# Flip the sign of ncNet's averages so its bars extend below zero in the chart
# while the other model's bars extend above it. This is done column-wise on a
# fresh frame rather than row-by-row with `apply`, which also behaves sensibly
# when the filtered frame is empty.
ssim_aggregate_clean = ssim_aggregate[
    ~ssim_aggregate['dataset_name'].isin(datasets_to_drop)
].assign(
    avg_ssim=lambda agg: agg['avg_ssim'].where(
        agg['model_name'] != 'ncNet', -agg['avg_ssim']
    )
)
    

In [12]:
# Per-dataset average SSIM, coloured by model. The y domain spans [-1, 1]
# because the ncNet averages in `ssim_aggregate_clean` carry a negative sign.
bars = (
    alt.Chart(ssim_aggregate_clean)
    .mark_bar()
    .encode(
        x=alt.X(
            'dataset_name:N',
            axis=alt.Axis(title=''),
            sort='-y',
        ),
        y=alt.Y(
            'avg_ssim',
            scale=alt.Scale(domain=[-1, 1]),
            axis=alt.Axis(title='SSIM'),
        ),
        color=alt.Color(
            'model_name:N',
            legend=alt.Legend(title='Model'),
        ),
    )
    .properties(
        title='Average SSIM comparison with benchmark visualizations [0, 1]',
        padding=10,
    )
)
bars

Calculate the average SSIM across all datasets, grouped by model and query hardness.

In [13]:
# Mean SSIM for every (model, hardness) pair; pairs whose hardness is
# 'unknown' are removed after aggregation.
# NOTE(review): this averages over `eval_df`, which still contains the rows
# with produced_viz == 0, whereas the per-dataset aggregate is computed from
# `produced_viz_df`. Confirm that including the failed rows here is intended.
all_ssim_aggregate_with_hardness = (
    eval_df.groupby(["model_name", "hardness"])
    .agg(avg_ssim=("ssim", "mean"))
    .reset_index()
    .loc[lambda agg: agg["hardness"] != "unknown"]
)
all_ssim_aggregate_with_hardness


Unnamed: 0,model_name,hardness,avg_ssim
0,ncNet,Easy,0.702823
1,ncNet,Extra Hard,0.666741
2,ncNet,Hard,0.411115
3,ncNet,Medium,0.649225
5,nl4dv,Easy,0.298881
6,nl4dv,Extra Hard,0.47504
7,nl4dv,Hard,0.46041
8,nl4dv,Medium,0.402197


In [16]:
# Average SSIM per hardness level, one facet column per model.
# Each encoding channel is built separately and then assembled into the chart.
ssim_hardness_axis = alt.X(
    "hardness:N",
    axis=alt.Axis(title=""),
    # explicit order; otherwise the nominal values are sorted alphabetically
    sort=["Easy", "Medium", "Hard", "Extra Hard"],
)
ssim_value_axis = alt.Y(
    "avg_ssim",
    scale=alt.Scale(domain=[0, 1]),
    axis=alt.Axis(title="SSIM"),
)
ssim_hardness_color = alt.Color(
    "hardness:N",
    legend=alt.Legend(title="Query Hardness"),
)
ssim_model_facet = alt.Column("model_name:N", title="Model")

bars = (
    alt.Chart(all_ssim_aggregate_with_hardness)
    .mark_bar()
    .encode(
        x=ssim_hardness_axis,
        y=ssim_value_axis,
        color=ssim_hardness_color,
        column=ssim_model_facet,
    )
    .properties(width=200, height=300, title="Average SSIM per NLViz model", padding=10)
)

bars


Show the distribution of the results that produced a visualization, broken down by model, query hardness, and dataset, in a sunburst chart.

In [15]:
# Create a sunburst chart from the rows that produced a visualization
import plotly.express as px

# Filter into a separate frame instead of overwriting `produced_viz_df`, so
# the cells that read `produced_viz_df` see the same data when re-run.
sunburst_df = produced_viz_df[produced_viz_df['hardness'] != 'unknown']

# Hierarchy from the centre outwards: model, then query hardness, then dataset.
# No `values` column is passed, so wedge sizes come from plotly's default for
# a `path`-only call (presumably the number of rows per wedge; verify against
# the plotly documentation).
fig = px.sunburst(
    sunburst_df,
    path=['model_name', 'hardness', 'dataset_name'],
    width=1000,
    height=1000,
)

fig.show()

  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
