In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def convert_rows_to_exp(rows):
    """Format a row count as a compact exponent label, e.g. 10000 -> "1E04".

    The value is rendered in scientific notation with zero decimal digits and
    every "+" is then stripped from the result, so a negative exponent keeps
    its "-" sign (0.001 -> "1E-03").
    """
    scientific = format(rows, ".0E")
    # Splitting on "+" and re-joining removes every plus sign.
    return "".join(scientific.split("+"))

# Load the baseline benchmark results. A forward-slash relative path is used
# because it resolves on Windows, Linux and macOS alike, whereas a
# backslash-separated path only resolves on Windows.
baseline_df = pd.read_csv('data/baseline.tsv', sep='\t')
# Seconds -> milliseconds, cut to two decimal places: `sec * 100000` is the
# time in hundredths of a millisecond, `// 1` floors away the remainder, and
# `/ 100` scales back to milliseconds.
baseline_df["time_ms"] = baseline_df["time_sec"] * 100000 // 1 / 100
# Compact label for the input size, e.g. 10000 -> "1E04".
baseline_df["in_rows_exp"] = baseline_df["in_rows"].apply(convert_rows_to_exp)

In [None]:
# bar plots
#
# One figure per input size. Within a figure every question gets a group of
# bars (one bar per solution) showing the measured time in milliseconds.

solutions = ("Gandalff", "Pandas", "Polars")
in_rows = (10000, 100000, 1000000, 10000000)
questions = ("Q1", "Q2", "Q3", "Q4", "Q5")

for rows in in_rows:

    x = np.arange(len(questions))  # the label locations
    width = 0.15  # the width of the bars
    label_padding = 5  # gap between a bar's end and its value label

    fig, ax = plt.subplots(layout='constrained')

    for multiplier, solution in enumerate(solutions):
        # Build ONE boolean mask for both conditions. Chaining two boolean
        # selections (df[mask_a][mask_b]) applies a full-length mask to an
        # already-filtered frame, which pandas has to reindex and warns about.
        selected = (baseline_df.solution == solution.lower()) & (baseline_df.in_rows == rows)
        # Sort by question so the bars line up with the tick labels.
        times = tuple(baseline_df[selected].sort_values(["q"]).time_ms.to_list())
        offset = width * multiplier
        rects = ax.bar(x + offset, times, width, label=solution)
        ax.bar_label(rects, padding=label_padding)

    # Reuse the shared formatter so titles match the `in_rows_exp` labels.
    rows_exp = convert_rows_to_exp(rows)

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Time (ms)')
    ax.set_title(f'Baseline benchmarking for {rows_exp} rows')
    # Bars sit at x, x + width, x + 2 * width, so x + width is the middle bar
    # of each three-bar group.
    ax.set_xticks(x + width, questions)
    # One legend column per solution keeps the legend on a single row.
    ax.legend(loc='upper right', ncols=len(solutions))

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # plt.xkcd()
    # plt.savefig(f"baseline_{rows_exp}.png", bbox_inches='tight', dpi=300)
    plt.show()


In [None]:
# line plots
#
# Mean time per input size (averaged over all questions) for each solution,
# drawn on a logarithmic y axis.

gp = baseline_df.groupby(["in_rows", "solution"]).agg({"time_ms":"mean"}).reset_index()

gandalff = gp[gp.solution == "gandalff"][["in_rows", "time_ms"]]
pandas_ = gp[gp.solution == "pandas"][["in_rows", "time_ms"]]
polars_ = gp[gp.solution == "polars"][["in_rows", "time_ms"]]

fig, ax = plt.subplots(layout='constrained')

# plt.xkcd()

for label, averages in (("Gandalff", gandalff), ("Pandas", pandas_), ("Polars", polars_)):
    # Take the x labels from the rows actually present for this solution
    # rather than from a fixed list: a fixed list only works when every
    # solution has exactly that many input sizes, and would otherwise raise a
    # length mismatch or attach points to the wrong size.
    x_labels = averages.in_rows.apply(convert_rows_to_exp).to_list()
    ax.plot(x_labels, averages.time_ms, label=label)

ax.set_title('Average time per input size')
ax.set_xlabel('Input size')
ax.set_ylabel('Log Time (ms)')
ax.set_yscale("log")
ax.legend()

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# plt.savefig(f"baseline_avg_line.png", bbox_inches='tight', dpi=300)
plt.show()


In [None]:
# calculate speedup

def calculate_speedup(solution_name: str, path_to_tsv: str) -> None:
    """Print a markdown table comparing a new run against baseline Polars.

    The TSV at ``path_to_tsv`` is loaded, its ``time_ns`` column is converted
    to milliseconds, and the mean time per input size is joined with the mean
    baseline Polars time taken from the module-level ``baseline_df``. The
    ``Ratio`` column is new-time / polars-time, so a value above 1 means the
    new run took longer than Polars.

    Args:
        solution_name: Column header used for the new run's mean time.
        path_to_tsv: Path to a tab-separated file providing the columns
            ``time_ns``, ``in_rows`` and ``solution``.

    Returns:
        None. The table is written to stdout.
    """
    new_gandalff_df = pd.read_csv(path_to_tsv, sep='\t')
    # Nanoseconds -> milliseconds, cut to two decimals: `/ 10000` gives
    # hundredths of a millisecond, `// 1` floors, `/ 100` scales back to ms.
    new_gandalff_df["time_ms"] = new_gandalff_df["time_ns"] / 10000 // 1 / 100
    new_gandalff_df["in_rows_exp"] = new_gandalff_df["in_rows"].apply(convert_rows_to_exp)

    gandalff_1 = new_gandalff_df.groupby(["in_rows_exp", "solution"]).agg({"time_ms": "mean"}).reset_index()

    baseline_gp = baseline_df.groupby(["in_rows_exp", "solution"]).agg({"time_ms": "mean"}).reset_index()
    baseline_polars = baseline_gp[baseline_gp.solution == "polars"].rename(columns={"time_ms": "polars"}).drop(columns=["solution"])

    # New Gandalff VS Polars
    new_run = gandalff_1.rename(columns={"time_ms": solution_name}).drop(columns=["solution"])
    # Join explicitly on the input-size label instead of relying on whichever
    # column names the two frames happen to share.
    t = new_run.merge(baseline_polars, on="in_rows_exp")
    # Ratio floored to two decimal places.
    t["Ratio"] = t[solution_name] / t["polars"] * 100 // 1 / 100
    print()
    print("New Gandalff VS Polars")
    print(t.to_markdown(index=False))

# Forward slashes keep these relative paths valid on every platform; the
# backslash-separated form only resolves on Windows.
calculate_speedup("gandalff_1_0", "data/gandalff_1_0.tsv")
calculate_speedup("gandalff_3_0", "data/gandalff_3_0.tsv")

