In [None]:
import re

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shared import Benchmark, Run, Group, InputSize

from pathlib import Path
from typing import List, Tuple, Dict

# Load the benchmark stored under ../benchmarking/args_test/gpulab_args_test2.
# The path is resolved relative to the current working directory, so the notebook
# has to be run from a directory whose parent contains "benchmarking".
benchmark = Benchmark.load(Path.cwd().parent / "benchmarking" / "args_test" / "gpulab_args_test2")

In [None]:
# One-to-one benchmark: mean kernel time against input matrix rows, drawn as one
# subplot per algorithm ("Name") with one line per argument combination ("Args").
group = benchmark.groups["one_to_one"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# This cell only plots inputs with at least 256 rows.
data = data[data["Input matrix rows"] >= 256]

grouped = data.groupby(["Name", "Args", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

sns.set(rc={'figure.figsize': (10, 80)})
# squeeze=False always returns a 2-D array of Axes. Without it a single algorithm
# yields a bare Axes object and indexing it with `axs[idx]` raises a TypeError.
fig, axs_grid = plt.subplots(nrows=results["Name"].nunique(), squeeze=False)
axs = axs_grid[:, 0]
for idx, name in enumerate(sorted(results["Name"].unique())):
    filtered_result = results[results["Name"] == name]
    ax = sns.lineplot(data=filtered_result, x="Input matrix rows", y="Kernel time", hue="Args", marker='o', ax=axs[idx])

    ax.set_title(f"{name}")
fig.tight_layout()

In [None]:
# One-to-many benchmark: for every number of right matrices, one figure with a
# subplot per algorithm showing mean kernel time against input matrix rows.
group = benchmark.groups["one_to_many"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# This cell only plots inputs with fewer than 256 rows.
data = data[data["Input matrix rows"] < 256]

grouped = data.groupby(["Name", "Args", "Input right matrices", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

sns.set(rc={'figure.figsize': (10, 50)})

for num_rights in sorted(results["Input right matrices"].unique()):
    right_results = results[results["Input right matrices"] == num_rights]
    # squeeze=False always returns a 2-D array of Axes. Without it a single
    # algorithm yields a bare Axes object and `axs[idx]` raises a TypeError.
    fig, axs_grid = plt.subplots(nrows=right_results["Name"].nunique(), squeeze=False)
    axs = axs_grid[:, 0]
    for idx, name in enumerate(sorted(right_results["Name"].unique())):
        filtered_result = right_results[right_results["Name"] == name]
        ax = sns.lineplot(data=filtered_result, x="Input matrix rows", y="Kernel time", hue="Args", marker='o', ax=axs[idx])

        ax.set_title(f"{name}")
    fig.suptitle(f"{num_rights} right matrices")
    fig.tight_layout()

In [None]:
# N-to-MN benchmark: for every (left matrices, right matrices) combination, one
# figure with a subplot per algorithm showing mean kernel time against matrix rows.
group = benchmark.groups["n_to_mn"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# This cell only plots inputs with fewer than 256 rows.
data = data[data["Input matrix rows"] < 256]

grouped = data.groupby(["Name", "Args", "Input left matrices", "Input right matrices", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

sns.set(rc={'figure.figsize': (10, 50)})

for num_lefts in sorted(results["Input left matrices"].unique()):
    left_results = results[results["Input left matrices"] == num_lefts]
    for num_rights in sorted(left_results["Input right matrices"].unique()):
        right_results = left_results[left_results["Input right matrices"] == num_rights]
        # squeeze=False always returns a 2-D array of Axes. Without it a single
        # algorithm yields a bare Axes object and `axs[idx]` raises a TypeError.
        fig, axs_grid = plt.subplots(nrows=right_results["Name"].nunique(), squeeze=False)
        axs = axs_grid[:, 0]
        for idx, name in enumerate(sorted(right_results["Name"].unique())):
            filtered_result = right_results[right_results["Name"] == name]
            ax = sns.lineplot(data=filtered_result, x="Input matrix rows", y="Kernel time", hue="Args", marker='o',
                              ax=axs[idx])

            ax.set_title(f"{name}")
        fig.suptitle(f"{num_lefts}x{num_rights}")
        fig.tight_layout()

In [None]:
# N-to-M benchmark: for every (left matrices, right matrices) combination, one
# figure with a subplot per algorithm showing mean kernel time against matrix rows.
group = benchmark.groups["n_to_m"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# This cell only plots inputs with fewer than 256 rows.
data = data[data["Input matrix rows"] < 256]

grouped = data.groupby(["Name", "Args", "Input left matrices", "Input right matrices", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

sns.set(rc={'figure.figsize': (10, 50)})

for num_lefts in sorted(results["Input left matrices"].unique()):
    left_results = results[results["Input left matrices"] == num_lefts]
    for num_rights in sorted(left_results["Input right matrices"].unique()):
        right_results = left_results[left_results["Input right matrices"] == num_rights]
        # squeeze=False always returns a 2-D array of Axes. Without it a single
        # algorithm yields a bare Axes object and `axs[idx]` raises a TypeError.
        fig, axs_grid = plt.subplots(nrows=right_results["Name"].nunique(), squeeze=False)
        axs = axs_grid[:, 0]
        for idx, name in enumerate(sorted(right_results["Name"].unique())):
            filtered_result = right_results[right_results["Name"] == name]
            ax = sns.lineplot(data=filtered_result, x="Input matrix rows", y="Kernel time", hue="Args", marker='o',
                              ax=axs[idx])

            ax.set_title(f"{name}")
        fig.suptitle(f"{num_lefts}x{num_rights}")
        fig.tight_layout()

In [None]:
# One-to-one benchmark: mean kernel time normalised by the number of input
# elements, plotted against the input matrix area, one subplot per algorithm.
# All input sizes are included (no row-count filter in this cell).
group = benchmark.groups["one_to_one"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

grouped = data.groupby(["Name", "Args", "Input matrix area"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = (grouped["Kernel"].mean() / 1e9)
kernel_times.name = "Kernel time"
results = kernel_times.reset_index()

# The divisor is twice the matrix area -- presumably because two input matrices of
# that area are processed per run; TODO confirm.
results["Kernel time per input element"] = results["Kernel time"] / (2 * results["Input matrix area"])
# sort_values returns a new frame; the original statement discarded it, so the
# sort never took effect. Assign it, matching the sorted `results` of the other cells.
results = results.sort_values("Kernel time per input element")


sns.set(rc={'figure.figsize': (10, 80)})
# squeeze=False always returns a 2-D array of Axes. Without it a single algorithm
# yields a bare Axes object and indexing it with `axs[idx]` raises a TypeError.
fig, axs_grid = plt.subplots(nrows=results["Name"].nunique(), squeeze=False)
axs = axs_grid[:, 0]
for idx, name in enumerate(sorted(results["Name"].unique())):
    filtered_result = results[results["Name"] == name]
    ax = sns.lineplot(data=filtered_result, x="Input matrix area", y="Kernel time per input element", hue="Args", marker='o', ax=axs[idx])

    ax.set_title(f"{name}")
fig.tight_layout()

In [None]:
# One-to-one benchmark: kernel time of every argument combination relative to the
# fastest combination of the same algorithm and input size, plus a top-3 ranking
# of argument combinations per algorithm. All input sizes are included.
group = benchmark.groups["one_to_one"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

grouped = data.groupby(["Name", "Args", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index()

# Fastest mean time per (algorithm, input size), broadcast back onto every row.
# This replaces a row-wise apply that re-filtered a lookup frame for each row.
fastest_times = results.groupby(["Name", "Input matrix rows"])["Kernel time"].transform("min")

# The ratio is time / fastest time, so 1.0 is the best combination and larger is slower.
results["Relative speed"] = results["Kernel time"] / fastest_times


sns.set(rc={'figure.figsize': (10, 80)})
# squeeze=False always returns a 2-D array of Axes. Without it a single algorithm
# yields a bare Axes object and indexing it with `axs[idx]` raises a TypeError.
fig, axs_grid = plt.subplots(nrows=results["Name"].nunique(), squeeze=False)
axs = axs_grid[:, 0]
for idx, name in enumerate(sorted(results["Name"].unique())):
    filtered_result = results[results["Name"] == name]
    ax = sns.lineplot(data=filtered_result, x="Input matrix rows", y="Relative speed", hue="Args", marker='o', ax=axs[idx])

    ax.set_title(f"{name}")
fig.tight_layout()

# Sum the ratios over all input sizes; the smallest totals are the best argument
# combinations. Print the three best per algorithm.
best_args = results.groupby(["Name", "Args"])["Relative speed"].sum().to_frame("Total relative time").reset_index().sort_values("Total relative time")
print(best_args.groupby(["Name"]).head(3).sort_values("Name", kind="stable"))

In [None]:
# One-to-many benchmark: kernel time of every argument combination relative to the
# fastest combination of the same algorithm, right-matrix count and input size.
# One figure per algorithm, one subplot per right-matrix count, plus a top-3
# ranking of argument combinations per algorithm.
group = benchmark.groups["one_to_many"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

grouped = data.groupby(["Name", "Args", "Input right matrices", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

# Fastest mean time per (algorithm, right-matrix count, input size), broadcast back
# onto every row. This replaces a row-wise apply that re-filtered a lookup frame
# for each row.
fastest_times = results.groupby(["Name", "Input right matrices", "Input matrix rows"])["Kernel time"].transform("min")

# The ratio is time / fastest time, so 1.0 is the best combination and larger is slower.
results["Relative speed"] = results["Kernel time"] / fastest_times


sns.set(rc={'figure.figsize': (10, 40)})

for name in sorted(results["Name"].unique()):

    alg_results = results[results["Name"] == name]
    # Iterate over the right-matrix counts this algorithm was measured with. The
    # subplot count is derived from the same values, so the two cannot disagree
    # (the original looped over the counts of ALL algorithms).
    alg_num_rights = sorted(alg_results["Input right matrices"].unique())
    # squeeze=False always returns a 2-D array of Axes, even for a single subplot.
    fig, axs_grid = plt.subplots(nrows=len(alg_num_rights), squeeze=False)
    axs = axs_grid[:, 0]

    # Fixed Args -> colour mapping so a given combination has the same colour in every subplot.
    palette = dict(zip(alg_results["Args"].unique(), sns.color_palette(n_colors=alg_results["Args"].nunique())))

    for idx, num_rights in enumerate(alg_num_rights):
        filtered_result = alg_results[alg_results["Input right matrices"] == num_rights]
        ax = sns.lineplot(data=filtered_result, x="Input matrix rows", y="Relative speed", hue="Args", marker='o', ax=axs[idx], palette=palette)

        ax.set_title(f"{num_rights} right matrices")
    fig.suptitle(f"{name}")
    fig.tight_layout()

# Sum the ratios over all inputs; the smallest totals are the best argument
# combinations. Print the three best per algorithm.
best_args = results.groupby(["Name", "Args"])["Relative speed"].sum().to_frame("Total relative time").reset_index().sort_values("Total relative time")
print(best_args.groupby(["Name"]).head(3).sort_values("Name", kind="stable"))

# Warp shuffle optimizations: one-to-one diagram used in the text

In [None]:
# Figure for the text: kernel time of selected warp-shuffle implementations on the
# one-to-one benchmark, relative to the "Simple" implementation, saved as SVG.
group = benchmark.groups["one_to_one"]

# Algorithm name -> display name and the single argument combination to plot.
warp_shuffle_algs_and_args = {
    "nai_shuffle_multimat_right": {
        "name": "Simple",
        "args": "8",
    },
    "nai_shuffle_multimat_right_work_distribution": {
        "name": "Simple with work distribution",
        "args": "8_1_triangle",
    },
    "nai_shuffle_multirow_right": {
        "name": "Multirow right",
        "args": "8_4",
    },
    "nai_shuffle_multirow_both": {
        "name": "Multirow both",
        "args": "8_4_4",
    },
}

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# Keep only the measurements whose (Name, Args) pair is one of the selected configurations.
selected_pairs = {(alg, config["args"]) for alg, config in warp_shuffle_algs_and_args.items()}
is_selected = pd.Series(
    [pair in selected_pairs for pair in zip(data["Name"], data["Args"])],
    index=data.index,
    dtype=bool,
)
data = data[is_selected]

grouped = data.groupby(["Name", "Args", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index()

# Baseline: the simple implementation's mean time, looked up by input size.
simple_impl_times = results[results["Name"] == "nai_shuffle_multimat_right"]
baseline_by_rows = simple_impl_times.set_index("Input matrix rows")["Kernel time"]
baseline_times = results["Input matrix rows"].map(baseline_by_rows)
if baseline_times.isna().any():
    # Fail loudly, as the previous per-row lookup did, instead of plotting gaps.
    raise ValueError("Missing 'nai_shuffle_multimat_right' baseline measurement for some input sizes")

# NOTE(review): the plotted value is time / baseline time, so values below 1 mean
# faster than the baseline even though the axis is labelled "Speedup" -- confirm
# the intended direction.
results["Relative speed"] = results["Kernel time"] / baseline_times
results["Implementation"] = results["Name"].map(lambda alg: warp_shuffle_algs_and_args[alg]["name"])


sns.set(rc={'figure.figsize': (5, 5)})
fig, ax = plt.subplots()

ax = sns.lineplot(data=results, x="Input matrix rows", y="Relative speed", hue="Implementation", marker='o', ax=ax)
tick_positions = ax.get_xticks()
x_limits = ax.get_xlim()
# Pin the tick positions before assigning fixed labels. Otherwise the automatic
# locator may choose different ticks at draw time (e.g. after tight_layout) and
# the labels no longer match their positions.
ax.set_xticks(tick_positions)
ax.set_xticklabels([f"{int(num_rows)}x{int(num_rows)}" for num_rows in tick_positions])
# set_xticks may widen the view to show every pinned tick; restore the data-driven limits.
ax.set_xlim(x_limits)
ax.set_xlabel("Input matrix size")
ax.set_ylabel("Speedup")
fig.tight_layout()
fig.savefig("warp_shuffle_one_to_one_results.svg", format="svg")

# Warp per shift: one-to-one diagram used in the text

In [None]:
# Figure for the text: kernel time of selected warp-per-shift style implementations
# on the one-to-one benchmark, relative to plain "Warp per shift", saved as SVG.
group = benchmark.groups["one_to_one"]

# Algorithm name -> display name and the single argument combination to plot.
warp_per_shift_algs_and_args = {
    "nai_warp_per_shift": {
        "name": "Warp per shift",
        "args": "16",
    },
    "nai_warp_per_shift_work_distribution": {
        "name": "Warp per shift with work distribution",
        "args": "8_10_triangle",
    },
    "nai_warp_per_shift_shared_mem": {
        "name": "Warp per shift with shared memory",
        "args": "16_128_True_True",
    },
    "nai_block_per_shift": {
        "name": "Block per shift",
        "args": "256",
    },
}

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# Keep only the measurements whose (Name, Args) pair is one of the selected configurations.
selected_pairs = {(alg, config["args"]) for alg, config in warp_per_shift_algs_and_args.items()}
is_selected = pd.Series(
    [pair in selected_pairs for pair in zip(data["Name"], data["Args"])],
    index=data.index,
    dtype=bool,
)
data = data[is_selected]

grouped = data.groupby(["Name", "Args", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index()

# Baseline: the plain warp-per-shift implementation's mean time, looked up by input size.
warp_per_shift_impl_times = results[results["Name"] == "nai_warp_per_shift"]
baseline_by_rows = warp_per_shift_impl_times.set_index("Input matrix rows")["Kernel time"]
baseline_times = results["Input matrix rows"].map(baseline_by_rows)
if baseline_times.isna().any():
    # Fail loudly, as the previous per-row lookup did, instead of plotting gaps.
    raise ValueError("Missing 'nai_warp_per_shift' baseline measurement for some input sizes")

# NOTE(review): the plotted value is time / baseline time, so values below 1 mean
# faster than the baseline even though the axis is labelled "Speedup" -- confirm
# the intended direction.
results["Relative speed"] = results["Kernel time"] / baseline_times
results["Implementation"] = results["Name"].map(lambda alg: warp_per_shift_algs_and_args[alg]["name"])


sns.set(rc={'figure.figsize': (5, 5)})
fig, ax = plt.subplots()

ax = sns.lineplot(data=results, x="Input matrix rows", y="Relative speed", hue="Implementation", marker='o', ax=ax)
tick_positions = ax.get_xticks()
x_limits = ax.get_xlim()
# Pin the tick positions before assigning fixed labels. Otherwise the automatic
# locator may choose different ticks at draw time (e.g. after tight_layout) and
# the labels no longer match their positions.
ax.set_xticks(tick_positions)
ax.set_xticklabels([f"{int(num_rows)}x{int(num_rows)}" for num_rows in tick_positions])
# set_xticks may widen the view to show every pinned tick; restore the data-driven limits.
ax.set_xlim(x_limits)
ax.set_xlabel("Input matrix size")
ax.set_ylabel("Speedup")
fig.tight_layout()
fig.savefig("warp_per_shift_one_to_one_results.svg", format="svg")

# N TO MN

In [None]:
# N-to-MN benchmark: kernel time of every argument combination relative to the
# fastest combination of the same algorithm and input configuration. One figure
# per algorithm, one subplot per (left matrices, right matrices) combination, plus
# a top-3 ranking of argument combinations per algorithm.
group = benchmark.groups["n_to_mn"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# Used as the number of subplots per figure.
# NOTE(review): assumes each distinct "Input type" corresponds to exactly one
# (left matrices, right matrices) combination -- TODO confirm.
num_input_types = data["Input type"].nunique()

grouped = data.groupby(["Name", "Args", "Input left matrices", "Input right matrices", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

# Fastest mean time per (algorithm, left count, right count, input size), broadcast
# back onto every row. This replaces a row-wise apply that re-filtered a lookup
# frame for each row.
fastest_times = results.groupby(["Name", "Input left matrices", "Input right matrices", "Input matrix rows"])["Kernel time"].transform("min")

# The ratio is time / fastest time, so 1.0 is the best combination and larger is slower.
results["Relative speed"] = results["Kernel time"] / fastest_times


sns.set(rc={'figure.figsize': (10, 40)})


for name in sorted(results["Name"].unique()):
    alg_results = results[results["Name"] == name]
    # squeeze=False always returns a 2-D array of Axes. Without it a single input
    # type yields a bare Axes object and `axs[input_type]` raises a TypeError.
    fig, axs_grid = plt.subplots(nrows=num_input_types, squeeze=False)
    axs = axs_grid[:, 0]

    # Fixed Args -> colour mapping so a given combination has the same colour in every subplot.
    palette = dict(zip(alg_results["Args"].unique(), sns.color_palette(n_colors=alg_results["Args"].nunique())))

    # Running subplot index over the (left, right) combinations of this algorithm.
    input_type = 0
    for num_lefts in sorted(alg_results["Input left matrices"].unique()):
        left_results = alg_results[alg_results["Input left matrices"] == num_lefts]
        for num_rights in sorted(left_results["Input right matrices"].unique()):
            right_results = left_results[left_results["Input right matrices"] == num_rights]
            ax = sns.lineplot(data=right_results, x="Input matrix rows", y="Relative speed", hue="Args", marker='o',
                              ax=axs[input_type], palette=palette)
            input_type += 1

            ax.set_title(f"{num_lefts}x{num_rights}")
    fig.suptitle(f"{name}")
    fig.tight_layout()

# Sum the ratios over all inputs; the smallest totals are the best argument
# combinations. Print the three best per algorithm.
best_args = results.groupby(["Name", "Args"])["Relative speed"].sum().to_frame("Total relative time").reset_index().sort_values("Total relative time")
print(best_args.groupby(["Name"]).head(3).sort_values("Name", kind="stable"))

In [None]:
# N-to-M benchmark: kernel time of every argument combination relative to the
# fastest combination of the same algorithm and input configuration. One figure
# per algorithm, one subplot per (left matrices, right matrices) combination, plus
# a top-3 ranking of argument combinations per algorithm.
group = benchmark.groups["n_to_m"]

data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

# Used as the number of subplots per figure.
# NOTE(review): assumes each distinct "Input type" corresponds to exactly one
# (left matrices, right matrices) combination -- TODO confirm.
num_input_types = data["Input type"].nunique()

grouped = data.groupby(["Name", "Args", "Input left matrices", "Input right matrices", "Input matrix rows"])

# Mean of the "Kernel" column divided by 1e9 (presumably ns -> s; TODO confirm units).
kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index().sort_values("Kernel time")

# Fastest mean time per (algorithm, left count, right count, input size), broadcast
# back onto every row. This replaces a row-wise apply that re-filtered a lookup
# frame for each row.
fastest_times = results.groupby(["Name", "Input left matrices", "Input right matrices", "Input matrix rows"])["Kernel time"].transform("min")

# The ratio is time / fastest time, so 1.0 is the best combination and larger is slower.
results["Relative speed"] = results["Kernel time"] / fastest_times


sns.set(rc={'figure.figsize': (10, 40)})


for name in sorted(results["Name"].unique()):
    alg_results = results[results["Name"] == name]
    # squeeze=False always returns a 2-D array of Axes. Without it a single input
    # type yields a bare Axes object and `axs[input_type]` raises a TypeError.
    fig, axs_grid = plt.subplots(nrows=num_input_types, squeeze=False)
    axs = axs_grid[:, 0]

    # Fixed Args -> colour mapping so a given combination has the same colour in every subplot.
    palette = dict(zip(alg_results["Args"].unique(), sns.color_palette(n_colors=alg_results["Args"].nunique())))

    # Running subplot index over the (left, right) combinations of this algorithm.
    input_type = 0
    for num_lefts in sorted(alg_results["Input left matrices"].unique()):
        left_results = alg_results[alg_results["Input left matrices"] == num_lefts]
        for num_rights in sorted(left_results["Input right matrices"].unique()):
            right_results = left_results[left_results["Input right matrices"] == num_rights]
            ax = sns.lineplot(data=right_results, x="Input matrix rows", y="Relative speed", hue="Args", marker='o',
                              ax=axs[input_type], palette=palette)
            input_type += 1

            ax.set_title(f"{num_lefts}x{num_rights}")
    fig.suptitle(f"{name}")
    fig.tight_layout()

# Sum the ratios over all inputs; the smallest totals are the best argument
# combinations. Print the three best per algorithm.
best_args = results.groupby(["Name", "Args"])["Relative speed"].sum().to_frame("Total relative time").reset_index().sort_values("Total relative time")
print(best_args.groupby(["Name"]).head(3).sort_values("Name", kind="stable"))