In [None]:
import pandas as pd
import altair as alt

# Switch Altair to the "latimes" theme before any chart is built.
alt.theme.enable("latimes")

# Load the bulk-copy benchmark report; fields are comma-separated.
df = pd.read_csv("Benchmark.BulkCopyBenchmarks-report.csv", sep=",")


def clean_data(df):
    """Return a cleaned copy of a benchmark report, ready for plotting.

    The ``Median`` column arrives as text such as ``"49.51 ms"`` or
    ``"1,234.50 ms"``. The " ms" suffix and the comma separators are removed
    and the column is converted to ``float64``. A new ``index_label`` column
    holds the display label for each ``IndexCreation`` value; values without
    an entry in the mapping keep their original ``IndexCreation`` text.

    The frame passed in is not modified, so callers do not have to copy it
    defensively before calling.

    Args:
        df: DataFrame with a string ``Median`` column and an
            ``IndexCreation`` column.

    Returns:
        A new DataFrame with a numeric ``Median`` column and an added
        ``index_label`` column.

    Raises:
        ValueError: If a ``Median`` value is still not numeric once the
            " ms" suffix and commas have been removed (raised by the
            ``float64`` conversion).
    """
    # Work on a private copy: the column assignments below would otherwise
    # overwrite the caller's ``Median`` strings in place.
    cleaned = df.copy()

    # Clean numeric fields from strings like "49.51 ms"
    median_text = cleaned["Median"].str.replace(" ms", "", case=False, regex=False)
    median_text = median_text.str.replace(",", "", regex=False)
    cleaned["Median"] = median_text.astype("float64")

    # Map labels. Some labels are lists of strings -- presumably so the chart
    # legend shows them on several lines; confirm against the rendered chart.
    mapping = {
        "NoIndex": "no index",
        "Res": "resource_id",
        "Res_BillDate": "resource_id, billing_date",
        "Res_BillDate_Cst": ["resource_id, billing_date,", "cost"],
        "Res_BillDate_Inc_Cst": ["resource_id, billing_date", "include cost"],
        "After": ["resource_id, billing_date", "include cost - after insert"],
    }
    strategy = cleaned["IndexCreation"]
    # Unknown strategies fall back to their raw name instead of becoming NaN.
    cleaned["index_label"] = strategy.map(mapping).fillna(strategy)

    return cleaned


# Clean a copy of the report so the raw frame in ``df`` stays as loaded.
df_clean = clean_data(df.copy())


def _batch_size_axis():
    """Build the x channel used by both charts: BatchSize on a log scale."""
    return alt.X(
        "BatchSize",
        scale=alt.Scale(type="log"),
        title="Batch size (rows)",
    )


def _strategy_color(field):
    """Build the color channel: one tableau10 color per value of *field*."""
    return (
        alt.Color(field)
        .title("Index creation strategy")
        .scale(scheme="tableau10")
    )


# Main chart: Execution time (log-scaled median against log-scaled batch size)
main_plot = (
    alt.Chart(df_clean)
    .mark_line()
    .encode(
        x=_batch_size_axis(),
        y=alt.Y(
            "Median",
            sort="x",
            scale=alt.Scale(type="log"),
            title="Median bulk copy time (ms)",
        ),
        color=_strategy_color("index_label"),
    )
    .properties(
        title="PostgreSQL bulk copy performance by different indexing strategies",
        height=240,
        width=640,
    )
)

# Relative chart: Relative to no index (%)
# ``baseline`` holds the "no index" medians, indexed by batch size.
no_index_rows = df_clean[df_clean["index_label"] == "no index"]
baseline = no_index_rows.set_index("BatchSize")[["Median"]]


def _percent_of_baseline(row):
    """Return the row's median as a percentage of the "no index" median.

    Rows whose batch size has no "no index" measurement yield ``None``.
    """
    batch_size = row["BatchSize"]
    if batch_size not in baseline.index:
        return None
    return row["Median"] / baseline.loc[batch_size, "Median"] * 100


# The column is added to the same frame both charts were built from.
df_clean["relative"] = df_clean.apply(_percent_of_baseline, axis=1)

relative_plot = (
    alt.Chart(df_clean)
    .mark_line()
    .encode(
        x=_batch_size_axis(),
        y=alt.Y("relative", title="Relative to 'no index' (%)"),
        color=_strategy_color("index_label"),
    )
    .properties(
        height=240,
        width=640,
    )
)

# Combine plots vertically
combined = alt.vconcat(main_plot, relative_plot)

# Render and save
combined.show()
combined.save("results.png", scale_factor=2)

In [None]:
import pandas as pd
import altair as alt

# Switch Altair to the "latimes" theme before any chart is built.
alt.theme.enable("latimes")

# Load the select benchmark report; fields are comma-separated.
df = pd.read_csv("Benchmark.SelectBenchmarks-report.csv", sep=",")

def clean_data(df):
    """Return a cleaned copy of a benchmark report, ready for plotting.

    The ``Median`` column arrives as text such as ``"49.51 ms"`` or
    ``"1,234.50 ms"``. The " ms" suffix and the comma separators are removed
    and the column is converted to ``float64``. A new ``index_label`` column
    holds the display label for each ``IndexCreation`` value; values without
    an entry in the mapping keep their original ``IndexCreation`` text rather
    than turning into NaN.

    The frame passed in is not modified, so callers do not have to copy it
    defensively before calling.

    Args:
        df: DataFrame with a string ``Median`` column and an
            ``IndexCreation`` column.

    Returns:
        A new DataFrame with a numeric ``Median`` column and an added
        ``index_label`` column.

    Raises:
        ValueError: If a ``Median`` value is still not numeric once the
            " ms" suffix and commas have been removed (raised by the
            ``float64`` conversion).
    """
    # Work on a private copy: the column assignments below would otherwise
    # overwrite the caller's ``Median`` strings in place.
    cleaned = df.copy()

    # Clean numeric fields from strings like "49.51 ms"
    median_text = cleaned["Median"].str.replace(" ms", "", case=False, regex=False)
    median_text = median_text.str.replace(",", "", regex=False)
    cleaned["Median"] = median_text.astype("float64")

    # Map labels. Some labels are lists of strings -- presumably so the chart
    # legend shows them on several lines; confirm against the rendered chart.
    mapping = {
        "NoIndex": "no index",
        "Res": "resource_id",
        "Res_BillDate": "resource_id, billing_date",
        "Res_BillDate_Cst": ["resource_id, billing_date,", "cost"],
        "Res_BillDate_Inc_Cst": ["resource_id, billing_date", "include cost"],
    }
    strategy = cleaned["IndexCreation"]
    # Fall back to the raw strategy name so an unmapped value stays visible
    # as its own label instead of becoming NaN.
    cleaned["index_label"] = strategy.map(mapping).fillna(strategy)

    return cleaned


# Clean a copy of the report so the raw frame in ``df`` stays as loaded.
df_clean = clean_data(df.copy())


def _batch_size_axis():
    """Build the x channel used by both charts: BatchSize on a log scale."""
    return alt.X(
        "BatchSize",
        scale=alt.Scale(type="log"),
        title="Batch size (rows)",
    )


def _strategy_color(field):
    """Build the color channel: one tableau10 color per value of *field*."""
    return (
        alt.Color(field)
        .title("Index creation strategy")
        .scale(scheme="tableau10")
    )


# Main chart: Execution time (log-scaled median against log-scaled batch size)
# NOTE(review): this chart colors by "Method" while its legend title and the
# relative chart below refer to the index strategy ("index_label") -- confirm
# that the mismatch is intended.
main_plot = (
    alt.Chart(df_clean)
    .mark_line()
    .encode(
        x=_batch_size_axis(),
        y=alt.Y(
            "Median",
            sort="x",
            scale=alt.Scale(type="log"),
            title="Median select time (ms)",
        ),
        color=_strategy_color("Method"),
    )
    .properties(
        title="PostgreSQL select performance by different indexing strategies",
        height=240,
        width=640,
    )
)

# Relative chart: Relative to no index (%)
# ``baseline`` holds the "no index" medians, indexed by batch size.
no_index_rows = df_clean[df_clean["index_label"] == "no index"]
baseline = no_index_rows.set_index("BatchSize")[["Median"]]


def _percent_of_baseline(row):
    """Return the row's median as a percentage of the "no index" median.

    Rows whose batch size has no "no index" measurement yield ``None``.
    """
    batch_size = row["BatchSize"]
    if batch_size not in baseline.index:
        return None
    return row["Median"] / baseline.loc[batch_size, "Median"] * 100


# The column is added to the same frame both charts were built from.
df_clean["relative"] = df_clean.apply(_percent_of_baseline, axis=1)

relative_plot = (
    alt.Chart(df_clean)
    .mark_line()
    .encode(
        x=_batch_size_axis(),
        y=alt.Y("relative", title="Relative to 'no index' (%)"),
        color=_strategy_color("index_label"),
    )
    .properties(
        height=240,
        width=640,
    )
)

# Combine plots vertically
combined = alt.vconcat(main_plot, relative_plot)

# Render and save
# NOTE(review): "results.png" is also the path written by the bulk-copy cell;
# if both cells run in the same directory the later save replaces the earlier
# image -- confirm whether a distinct file name is wanted.
combined.show()
combined.save("results.png", scale_factor=2)