This notebook analyzes various properties of USearch indexes as a function of the metric used, the connectivity, and the dataset size.

In [None]:
!pip install kaleido plotly nbformat

In [None]:
from benchmark import *
from shared import *
import pickle

In [None]:
def read_stats(name):
    """Load the Numba and SimSIMD benchmark statistics saved for one index.

    Reads ``stats/numba/<name>.pickle`` and ``stats/simsimd/<name>.pickle``
    and merges the fields used by the plots into a single dictionary.

    Args:
        name: Base file name of the pickles, without the ``.pickle`` suffix.

    Returns:
        A dictionary with the add/search speed series of both pickles
        (``numba_*`` and ``simsimd_*`` keys), plus ``memory_usage``,
        ``recall_vector``, ``number_of_levels`` and ``number_of_edges`` taken
        from the Numba pickle only, and ``size``: ``i * batch_size`` for
        ``i`` in ``0..99``.

    Raises:
        OSError: If either pickle file cannot be opened.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing;
    only point this at statistics files you produced yourself.
    """

    def load(backend):
        # The context manager closes the file as soon as it has been read,
        # instead of leaving the handle open until garbage collection.
        with open("stats/" + backend + "/" + name + ".pickle", "rb") as stream:
            return pickle.load(stream)

    numba_results = load("numba")
    simsimd_results = load("simsimd")
    return {
        "numba_add_speed": numba_results.add_speed,
        "numba_search_speed": numba_results.search_speed,
        "simsimd_add_speed": simsimd_results.add_speed,
        "simsimd_search_speed": simsimd_results.search_speed,
        "memory_usage": numba_results.memory_usage,
        "recall_vector": numba_results.recall_vector,
        # `batch_size` is not defined in this cell; presumably it comes from
        # the star imports at the top of the notebook. The 100 steps are
        # hard-coded -- TODO confirm it matches the length of the series.
        "size": [i * batch_size for i in range(0, 100)],
        "number_of_levels": numba_results.number_of_levels,
        "number_of_edges": numba_results.number_of_edges,
    }

# Merged statistics for every entry of the four name lists. The `names_*`
# lists are not defined in this cell; presumably they come from the star
# imports at the top of the notebook.
maccs_results = list(map(read_stats, names_maccs))
ecfp4_results = list(map(read_stats, names_ecfp4))
mixed_results = list(map(read_stats, names_mixed))
conditional_results = list(map(read_stats, names_conditional))

In [None]:
# Display the final add-speed sample of the last MACCS entry: Numba first, SimSIMD second.
tuple(maccs_results[-1][key][-1] for key in ("numba_add_speed", "simsimd_add_speed"))

In [None]:
import plotly.offline as pyo
# Initialize Plotly's offline notebook mode before any figure is shown.
# NOTE(review): per the Plotly documentation, connected=True loads plotly.js
# from a CDN instead of embedding it in the notebook, so viewing the figures
# presumably needs network access -- confirm.
pyo.init_notebook_mode(connected=True)

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import numpy as np
from typing import List, Dict

# (key prefix in the statistics dictionaries, legend label, line colour).
_BACKEND_STYLES = (
    ("numba", "Numba", "blue"),
    ("simsimd", "SimSIMD", "red"),
)

# Number format of the "last value" label, per subplot column.
_LAST_VALUE_FORMATS = {1: ",.0f", 2: ",.1f", 3: ",.0f", 4: ",.2f"}


def _peak_value(results, keys):
    """Return the largest sample stored under any of ``keys`` across ``results``."""
    return np.max([entry[key].max() for entry in results for key in keys])


def _add_backend_traces(fig, sizes, eval_result, metric, col, show_legend):
    """Add one line per backend for ``metric`` to subplot column ``col``."""
    for prefix, label, color in _BACKEND_STYLES:
        fig.add_trace(
            go.Scatter(x=sizes, y=eval_result[f"{prefix}_{metric}"], mode='lines',
                       name=label, line=dict(color=color), showlegend=show_legend),
            row=1, col=col)


def _mark_last_value(fig, col, last_size, last_value):
    """Draw a thin horizontal guide at ``last_value`` in column ``col`` and label it."""
    fig.add_shape(
        type="line",
        x0=0,
        x1=last_size,
        y0=last_value,
        y1=last_value,
        line=dict(color="Gray", width=0.5),
        xref=f"x{col}",
        yref=f"y{col}",
    )
    fig.add_annotation(
        x=last_size,  # Anchor the label at the last x-value of the series
        y=last_value,
        xref=f"x{col}",
        yref=f"y{col}",
        text=f"{last_value:{_LAST_VALUE_FORMATS[col]}}",
        showarrow=False,
        font=dict(size=10, color="Gray"),
        xanchor="left",  # Align the left edge of the text with the x-coordinate
        yanchor="bottom",
        # NOTE(review): ax/ay describe the arrow tail; with showarrow=False
        # they appear to have no effect (the Plotly docs list `xshift` for
        # moving the text). Kept so the rendering stays exactly as before.
        ax=20,
        ay=0,
    )


def render_eval_results(names: List[str], results: List[Dict[str, np.ndarray]]):
    """Show one four-panel figure per entry of ``results``.

    The panels are construction speed, construction memory, search speed and
    search recall, each plotted against ``eval_result["size"]``. The speed
    panels share their y-axis range across all figures so that the figures
    can be compared visually. The last value of every series is marked with
    a thin horizontal guide and a numeric label.

    Args:
        names: Display names; ``names[i]`` titles the figure of ``results[i]``.
        results: Statistics dictionaries with the keys ``numba_add_speed``,
            ``simsimd_add_speed``, ``numba_search_speed``,
            ``simsimd_search_speed``, ``memory_usage``, ``recall_vector``
            and ``size``.

    Returns:
        None. Figures are displayed through ``fig.show()``.

    Raises:
        IndexError: If ``names`` has fewer entries than ``results``.
    """
    if not results:
        # Nothing to plot; taking the maximum of an empty sequence below
        # would raise a ValueError.
        return

    max_add_speed = _peak_value(results, ("numba_add_speed", "simsimd_add_speed"))
    max_search_speed = _peak_value(results, ("numba_search_speed", "simsimd_search_speed"))

    for row, eval_result in enumerate(results):
        name = names[row]
        sizes = eval_result["size"]

        fig = make_subplots(rows=1, cols=4,
                            subplot_titles=(f"{name} Construction Speed",
                                            f"{name} Construction Memory",
                                            f"{name} Search Speed",
                                            f"{name} Search Recall"))

        # The traces are added in a fixed order because the uncoloured
        # "Memory" and "Recall" lines take their default colour from their
        # position in the trace list.
        _add_backend_traces(fig, sizes, eval_result, "add_speed", col=1, show_legend=True)
        # Memory is scaled by 1e9 (presumably bytes to gigabytes -- confirm units).
        fig.add_trace(go.Scatter(x=sizes, y=eval_result["memory_usage"] / 1e9, mode='lines', name='Memory'),
                      row=1, col=2)
        # The search panel reuses the legend entries created by the first panel.
        _add_backend_traces(fig, sizes, eval_result, "search_speed", col=3, show_legend=False)
        fig.add_trace(go.Scatter(x=sizes, y=eval_result["recall_vector"], mode='lines', name='Recall'),
                      row=1, col=4)

        # Mark the final value of every plotted series.
        last_size = sizes[-1]
        last_values = (
            (1, eval_result["numba_add_speed"][-1]),
            (1, eval_result["simsimd_add_speed"][-1]),
            (2, eval_result["memory_usage"][-1] / 1e9),
            (3, eval_result["numba_search_speed"][-1]),
            (3, eval_result["simsimd_search_speed"][-1]),
            (4, eval_result["recall_vector"][-1]),
        )
        for col, last_value in last_values:
            _mark_last_value(fig, col, last_size, last_value)

        fig.update_yaxes(range=[0, max_add_speed], row=1, col=1)
        fig.update_yaxes(range=[0, max_search_speed], row=1, col=3)
        fig.update_yaxes(range=[0.85, 1], row=1, col=4)

        fig.update_layout(
            title=dict(
                text=f"{name} Evaluation",
                x=0.5,
                font=dict(size=16),
            ),
            # Large top margin leaves room for the title and the legend.
            margin=dict(l=20, r=20, t=140, b=0),
            # Horizontal legend centred above the subplots (y > 1 is above the plot area).
            legend=dict(
                x=0.5,
                y=1.35,
                xanchor="center",
                yanchor="top",
                orientation="h",
            ),
            font=dict(size=10),
            height=330,
        )

        fig.show()

In [None]:
# Figures for the MACCS result set.
render_eval_results(names=names_maccs, results=maccs_results)

In [None]:
# Figures for the ECFP4 result set.
render_eval_results(names=names_ecfp4, results=ecfp4_results)

In [None]:
# Figures for the mixed result set.
render_eval_results(names=names_mixed, results=mixed_results)

In [None]:
# Figures for the conditional result set.
render_eval_results(names=names_conditional, results=conditional_results)

In [None]:
def read_stats(name):
    """Load the Numba statistics of four indexes into one flat dictionary.

    NOTE: this definition rebinds ``read_stats``; once this cell has run, the
    earlier single-name loader of the same name is no longer reachable.

    Args:
        name: Sequence of four pickle base names (without ``.pickle``), in
            the order MACCS, ECFP4, mixed, conditional. Each is read from
            ``stats/numba/<base name>.pickle``.

    Returns:
        A dictionary with the keys ``<metric>_<suffix>`` for every metric in
        ``add_speed``, ``memory_usage``, ``search_speed``, ``recall_vector``
        and every suffix in ``maccs``, ``ecfp4``, ``mixed``, ``condi``, plus
        ``size``: ``i * batch_size`` for ``i`` in ``0..99``.

    Raises:
        OSError: If any of the pickle files cannot be opened.
        IndexError: If ``name`` has fewer than four entries.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing;
    only point this at statistics files you produced yourself.
    """

    def load(base_name):
        # The context manager closes the file as soon as it has been read,
        # instead of leaving the handle open until garbage collection.
        with open("stats/numba/" + base_name + ".pickle", "rb") as stream:
            return pickle.load(stream)

    loaded = {
        "maccs": load(name[0]),
        "ecfp4": load(name[1]),
        "mixed": load(name[2]),
        "condi": load(name[3]),
    }

    # Metric-major order keeps the keys in the same order as before:
    # all add_speed_* entries first, then memory_usage_*, and so on.
    stats = {
        f"{metric}_{suffix}": getattr(run, metric)
        for metric in ("add_speed", "memory_usage", "search_speed", "recall_vector")
        for suffix, run in loaded.items()
    }
    # `batch_size` is not defined in this cell; presumably it comes from the
    # star imports at the top of the notebook. The 100 steps are hard-coded
    # -- TODO confirm it matches the length of the series.
    stats["size"] = [i * batch_size for i in range(0, 100)]
    return stats

# One statistics dictionary per position across the four name lists: each
# call receives a (MACCS, ECFP4, mixed, conditional) tuple of names. `zip`
# stops at the shortest list.
results = list(map(read_stats, zip(names_maccs, names_ecfp4, names_mixed, names_conditional)))

In [None]:
import plotly.express as px

# Compare the four fingerprint configurations (MACCS, ECFP4, Mixed, Conditional):
# one figure per entry of `results`, each with four panels plotted against size.
rows = len(results)

# (key suffix used in the `results` dictionaries, legend label, dash pattern).
fingerprints = (
    ("maccs", "MACCS", "dot"),
    ("ecfp4", "ECFP4", "dash"),
    ("mixed", "Mixed", "dashdot"),
    ("condi", "Conditional", "solid"),
)

# (subplot column, key prefix, whether the traces feed the legend).
# Only the first panel feeds the legend, so each configuration is listed once.
panels = (
    (1, "add_speed", True),
    (2, "memory_usage", False),
    (3, "search_speed", False),
    (4, "recall_vector", False),
)

# Loop-invariant, so it is built once instead of once per figure.
colors = {
    'MACCS': px.colors.qualitative.Plotly[0],
    'ECFP4': px.colors.qualitative.Plotly[1],
    'Mixed': px.colors.qualitative.Plotly[2],
    'Conditional': px.colors.qualitative.Plotly[3],
}

# Shared y-axis limits so the speed panels are comparable across figures.
max_add_speed = np.max(
    [x[f"add_speed_{suffix}"].max() for x in results for suffix, _, _ in fingerprints])
max_search_speed = np.max(
    [x[f"search_speed_{suffix}"].max() for x in results for suffix, _, _ in fingerprints])

# Figure titles, indexed by position in `results`; more than three result
# entries would raise an IndexError below.
names = ["PubChem", "GDB13", "REAL"]

for row in range(rows):
    name = names[row]
    eval_result = results[row]
    sizes = eval_result["size"]

    fig = make_subplots(rows=1, cols=4,
                        subplot_titles=(f"{name} Construction Speed",
                                        f"{name} Construction Memory",
                                        f"{name} Search Speed",
                                        f"{name} Search Recall"))

    for col, metric, in_legend in panels:
        for suffix, label, dash in fingerprints:
            values = eval_result[f"{metric}_{suffix}"]
            if metric == "memory_usage":
                # Scaled by 1e9 (presumably bytes to gigabytes -- confirm units).
                values = values / 1e9
            fig.add_trace(go.Scatter(x=sizes, y=values, mode='lines',
                                     name=label, line=dict(color=colors[label], dash=dash),
                                     showlegend=in_legend),
                          row=1, col=col)

    fig.update_yaxes(range=[0, max_add_speed], row=1, col=1)
    fig.update_yaxes(range=[0, max_search_speed], row=1, col=3)
    fig.update_yaxes(range=[0.8, 1], row=1, col=4)

    fig.update_layout(
        title=dict(
            text=f"{name} Evaluation",
            x=0.5,
            font=dict(
                size=16,
            )
        ),
        # Large top margin leaves room for the title and the legend.
        margin=dict(l=20, r=20, t=140, b=0),
        # Horizontal legend centred above the subplots (y > 1 is above the plot area).
        legend=dict(
            x=0.5,
            y=1.35,
            xanchor="center",
            yanchor="top",
            orientation="h"
        ),
        font=dict(
            size=10,
        ),
        height=330,
    )

    fig.show()