Used in the implementation chapter to illustrate the Warp per Shift optimizations.

In [None]:
import re

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shared import Benchmark, Run, Group, InputSize

from pathlib import Path
from typing import List, Tuple, Dict

# Plot the best mean computation time of each Warp-per-Shift variant against
# the input matrix size and save the figure as an SVG.

# Imported here because only this cell's tick labelling needs it.
from matplotlib.ticker import FuncFormatter

# Legend labels for the benchmark implementation names.
presentable_names = {
    "per_warp_orig": "Simplified",
    "per_warp_shared_mem": "Shared memory",
    "per_warp_work_dist": "Work distribution"
}

benchmark = Benchmark.load(Path.cwd().parent / "benchmarking" / "text" / "BlockPerShift")

# Stack the measurements of every run in the "one_to_one" group into one frame.
data = pd.concat([run.data for run in benchmark.groups["one_to_one"].runs], axis=0, ignore_index=True)

# Drop the "per_block" and "orig" implementations and keep matrices of at most 32 rows.
data = data[(data["Name"] != "per_block") & (data["Name"] != "orig")]
data = data[data["Input matrix rows"] <= 32]

# Mean computation time per (implementation, argument set, matrix size).
# The division by 1e9 presumably converts nanoseconds to seconds (the axis is
# labelled in seconds) - TODO confirm the unit of the "Computation" column.
grouped = data.groupby(["Name", "Args", "Input matrix rows"])
compute_times = grouped["Computation"].mean() / 1e9
compute_times.name = "Compute time"
results = compute_times.reset_index().sort_values("Compute time")

# For each implementation and matrix size keep the fastest argument set.
alg_mins = results.groupby(["Name", "Input matrix rows"])["Compute time"].min().to_frame("Compute time")

alg_mins = alg_mins.reset_index()
alg_mins["Implementations"] = alg_mins["Name"].map(presentable_names)

sns.set(rc={'figure.figsize': (5, 5)})
fig, ax = plt.subplots()

ax = sns.lineplot(data=alg_mins, x="Input matrix rows", y="Compute time", hue="Implementations", marker='o', ax=ax)
# Render tick values as "NxN" through a formatter rather than fixed label
# texts: fixed labels stay attached to tick *indices*, so they become wrong
# if the automatic locator picks different ticks once tight_layout() has
# resized the axes. A formatter is evaluated for the ticks actually drawn.
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda num_rows, _pos: f"{int(num_rows)}x{int(num_rows)}")
)
ax.set_xlabel("Input matrix size")
ax.set_ylabel("Computation time (s)")
fig.tight_layout()
fig.savefig("warp_per_shift_work_dist_local_results.svg", format="svg")

Used in the implementation chapter to illustrate Block per Shift.

In [None]:


import re

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shared import Benchmark, Run, Group, InputSize

from pathlib import Path
from typing import List, Tuple, Dict

# Plot the best mean computation time of the implementations that remain
# after filtering against the input matrix size and save the figure as an SVG.

# Imported here because only this cell's tick labelling needs it.
from matplotlib.ticker import FuncFormatter

# Legend labels for the benchmark implementation names. Entries for
# implementations that are filtered out below are simply never used.
presentable_names = {
    "orig": "Bali original",
    "per_block": "Block per Shift",
    "per_warp_orig": "Warp per Shift simplified",
    "per_warp_shared_mem": "Warp per Shift shared memory",
    "per_warp_work_dist": "Warp per Shift work distribution"
}

benchmark = Benchmark.load(Path.cwd().parent / "benchmarking" / "text" / "BlockPerShift")

# Stack the measurements of every run in the "one_to_one" group into one frame.
data = pd.concat([run.data for run in benchmark.groups["one_to_one"].runs], axis=0, ignore_index=True)
# Drop the shared-memory and work-distribution warp variants and "orig",
# then keep matrices of at most 32 rows.
data = data[(data["Name"] != "per_warp_shared_mem") & (data["Name"] != "per_warp_work_dist") & (data["Name"] != "orig")]
data = data[data["Input matrix rows"] <= 32]

# Mean computation time per (implementation, argument set, matrix size).
# The division by 1e9 presumably converts nanoseconds to seconds (the axis is
# labelled in seconds) - TODO confirm the unit of the "Computation" column.
grouped = data.groupby(["Name", "Args", "Input matrix rows"])
compute_times = grouped["Computation"].mean() / 1e9
compute_times.name = "Compute time"
results = compute_times.reset_index().sort_values("Compute time")

# For each implementation and matrix size keep the fastest argument set.
alg_mins = results.groupby(["Name", "Input matrix rows"])["Compute time"].min().to_frame("Compute time")

alg_mins = alg_mins.reset_index()
alg_mins["Implementations"] = alg_mins["Name"].map(presentable_names)

sns.set(rc={'figure.figsize': (5, 5)})
fig, ax = plt.subplots()

ax = sns.lineplot(data=alg_mins, x="Input matrix rows", y="Compute time", hue="Implementations", marker='o', ax=ax)
# Render tick values as "NxN" through a formatter rather than fixed label
# texts: fixed labels stay attached to tick *indices*, so they become wrong
# if the automatic locator picks different ticks once tight_layout() has
# resized the axes. A formatter is evaluated for the ticks actually drawn.
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda num_rows, _pos: f"{int(num_rows)}x{int(num_rows)}")
)
ax.set_xlabel("Input matrix size")
ax.set_ylabel("Computation time (s)")
fig.tight_layout()
fig.savefig("block_per_shift_local_results.svg", format="svg")

Used in the results chapter.

Diagram of the results of the Warp shuffle optimizations for the one-to-one type, used in the text.

In [None]:
import re

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shared import Benchmark, Run, Group, InputSize

from pathlib import Path
from typing import List, Tuple, Dict

# Plot the speedup of each warp-shuffle variant over the "Simple"
# implementation ("nai_shuffle_multimat_right") against the input matrix
# size and save the figure as an SVG.

# Imported here because only this cell's tick labelling needs it.
from matplotlib.ticker import FuncFormatter

benchmark = Benchmark.load(Path.cwd().parent / "benchmarking" / "args_test" / "gpulab_args_test2")
group = benchmark.groups["one_to_one"]

# For every plotted implementation: its legend label ("name") and the single
# argument set ("args") whose measurements are used.
warp_shuffle_algs_and_args = {
    "nai_shuffle_multimat_right": {
        "name": "Simple",
        "args": "8",
    },
    "nai_shuffle_multimat_right_work_distribution": {
        "name": "Simple with work distribution",
        "args": "8_1_triangle",
    },
    "nai_shuffle_multirow_right": {
        "name": "Multirow right",
        "args": "8_4",
    },
    "nai_shuffle_multirow_both": {
        "name": "Multirow both",
        "args": "8_4_4",
    },
}

# Stack the measurements of every run in the group into one frame.
data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

def filter_data(row) -> bool:
    """Return True when the row belongs to a plotted implementation and was
    measured with exactly the argument set selected for it."""
    return row["Name"] in warp_shuffle_algs_and_args and warp_shuffle_algs_and_args[row["Name"]]["args"] == row["Args"]

data = data[data[["Name", "Args"]].apply(filter_data, axis=1)]

# Mean kernel time per (implementation, argument set, matrix size).
# The division by 1e9 presumably converts nanoseconds to seconds - TODO
# confirm the unit of the "Kernel" column.
grouped = data.groupby(["Name", "Args", "Input matrix rows"])

kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index()

# Baseline measurements every other implementation is compared against.
simple_impl_times = results[results["Name"] == "nai_shuffle_multimat_right"]

def speedup(row):
    """Return the speedup of the row over the baseline at the same matrix size.

    Computed as baseline time / row time, so values above 1 mean the row's
    implementation is faster than the baseline. The previous ratio
    (row time / baseline time) was a relative run time and therefore inverted
    with respect to the "Speedup" axis label.

    Raises ValueError (from ``Series.item``) when the baseline does not have
    exactly one measurement for the row's matrix size.
    """
    baseline_time = simple_impl_times[simple_impl_times["Input matrix rows"] == row["Input matrix rows"]]["Kernel time"].item()
    return baseline_time / row["Kernel time"]

def map_name(row):
    """Return the legend label of the row's implementation."""
    return warp_shuffle_algs_and_args[row["Name"]]["name"]

results["Relative speed"] = results.apply(speedup, axis=1)
results["Implementation"] = results.apply(map_name, axis=1)


sns.set(rc={'figure.figsize': (5, 5)})
fig, ax = plt.subplots()

ax = sns.lineplot(data=results, x="Input matrix rows", y="Relative speed", hue="Implementation", marker='o', ax=ax)
# Render tick values as "NxN" through a formatter rather than fixed label
# texts: fixed labels stay attached to tick *indices*, so they become wrong
# if the automatic locator picks different ticks once tight_layout() has
# resized the axes. A formatter is evaluated for the ticks actually drawn.
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda num_rows, _pos: f"{int(num_rows)}x{int(num_rows)}")
)
ax.set_xlabel("Input matrix size")
ax.set_ylabel("Speedup")
fig.tight_layout()
fig.savefig("warp_shuffle_one_to_one_results.svg", format="svg")

Warp per Shift diagram used in the text.

In [None]:
import re

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shared import Benchmark, Run, Group, InputSize

from pathlib import Path
from typing import List, Tuple, Dict

# Plot the speedup of each Warp-per-Shift / Block-per-Shift variant over the
# plain "nai_warp_per_shift" implementation against the input matrix size
# and save the figure as an SVG.

# Imported here because only this cell's tick labelling needs it.
from matplotlib.ticker import FuncFormatter

benchmark = Benchmark.load(Path.cwd().parent / "benchmarking" / "args_test" / "gpulab_args_test2")

group = benchmark.groups["one_to_one"]

# For every plotted implementation: its legend label ("name") and the single
# argument set ("args") whose measurements are used.
warp_per_shift_algs_and_args = {
    "nai_warp_per_shift": {
        "name": "Warp per shift",
        "args": "16",
    },
    "nai_warp_per_shift_work_distribution": {
        "name": "Warp per shift with work distribution",
        "args": "8_10_triangle",
    },
    "nai_warp_per_shift_shared_mem": {
        "name": "Warp per shift with shared memory",
        "args": "16_128_True_True",
    },
    "nai_block_per_shift": {
        "name": "Block per shift",
        "args": "256",
    },
}

# Stack the measurements of every run in the group into one frame.
data = pd.concat([run.data for run in group.runs], axis=0, ignore_index=True)

def filter_data(row) -> bool:
    """Return True when the row belongs to a plotted implementation and was
    measured with exactly the argument set selected for it."""
    return row["Name"] in warp_per_shift_algs_and_args and warp_per_shift_algs_and_args[row["Name"]]["args"] == row["Args"]

data = data[data[["Name", "Args"]].apply(filter_data, axis=1)]

# Mean kernel time per (implementation, argument set, matrix size).
# The division by 1e9 presumably converts nanoseconds to seconds - TODO
# confirm the unit of the "Kernel" column.
grouped = data.groupby(["Name", "Args", "Input matrix rows"])

kernel_times = grouped["Kernel"].mean() / 1e9
kernel_times.name = "Kernel time"
results = kernel_times.reset_index()

# Baseline measurements every other implementation is compared against.
warp_per_shift_impl_times = results[results["Name"] == "nai_warp_per_shift"]

def speedup(row):
    """Return the speedup of the row over the baseline at the same matrix size.

    Computed as baseline time / row time, so values above 1 mean the row's
    implementation is faster than the baseline. The previous ratio
    (row time / baseline time) was a relative run time and therefore inverted
    with respect to the "Speedup" axis label.

    Raises ValueError (from ``Series.item``) when the baseline does not have
    exactly one measurement for the row's matrix size.
    """
    baseline_time = warp_per_shift_impl_times[warp_per_shift_impl_times["Input matrix rows"] == row["Input matrix rows"]]["Kernel time"].item()
    return baseline_time / row["Kernel time"]

def map_name(row):
    """Return the legend label of the row's implementation."""
    return warp_per_shift_algs_and_args[row["Name"]]["name"]

results["Relative speed"] = results.apply(speedup, axis=1)
results["Implementation"] = results.apply(map_name, axis=1)


sns.set(rc={'figure.figsize': (5, 5)})
fig, ax = plt.subplots()

ax = sns.lineplot(data=results, x="Input matrix rows", y="Relative speed", hue="Implementation", marker='o', ax=ax)
# Render tick values as "NxN" through a formatter rather than fixed label
# texts: fixed labels stay attached to tick *indices*, so they become wrong
# if the automatic locator picks different ticks once tight_layout() has
# resized the axes. A formatter is evaluated for the ticks actually drawn.
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda num_rows, _pos: f"{int(num_rows)}x{int(num_rows)}")
)
ax.set_xlabel("Input matrix size")
ax.set_ylabel("Speedup")
fig.tight_layout()
fig.savefig("warp_per_shift_one_to_one_results.svg", format="svg")