In [1]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sys
import itertools
from collections import namedtuple, defaultdict
from pathlib import Path

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

%load_ext autoreload
%autoreload 1
%aimport analyze

# Apply the shared matplotlib settings stored as JSON in plots/style.json.
style_settings = json.loads(Path('plots/style.json').read_text())
mpl.rcParams.update(style_settings)

## Wildstyle failures

In [2]:
# Load the results stored under results/wildstyle; printing the Analysis
# object reports the Celeritas version and the system name.
ws = analyze.Analysis('results/wildstyle')
print(ws)

Analysis for Celeritas v0.3.2-rc.1 on wildstyle


In [3]:
# Build a table from the failures recorded for wildstyle and display it as a
# DataFrame.
ftab = analyze.make_failure_table(ws.failures())
ftab.to_frame()

Unnamed: 0,Failure


In [4]:
# Print the key of every failure-table entry whose value contains 'is_soft',
# one key per line.
soft_failure_keys = [key for (key, value) in ftab.items() if 'is_soft' in value]
print("\n".join(soft_failure_keys))




In [5]:
# Summarize the 'unconverged' column of the wildstyle results; the next cell
# reads the ('unconverged', 'mean') column of this summary.
summed = analyze.summarize_instances(ws.result[['unconverged']])

Unconverged tracks (from runs that didn't fail):

In [6]:
# Keep only the entries whose mean unconverged count is positive and pivot
# the 'arch' index level into columns.
unconv = summed[('unconverged', 'mean')]
has_unconverged = unconv > 0
unconv[has_unconverged].unstack('arch')

Unnamed: 0_level_0,arch
problem,geo


## Summit results

In [7]:
# Load the Summit results and summarize the successful runs.
# NOTE: this rebinds `summed`, which an earlier cell bound to the wildstyle
# summary.
summit = analyze.Analysis('results/summit')
print(summit)
successful_results = summit.result[summit.successful].dropna(how='all')
summed = analyze.summarize_instances(successful_results)

problems = summit.problems()
problem_to_abbr = summit.problem_to_abbr(problems)
# Position of each problem within `problems` (used as an x coordinate when
# plotting Summit against Crusher).
p_to_i = {problem: position for (position, problem) in enumerate(problems)}

Analysis for Celeritas v0.3.2 on summit


In [8]:
# Load the output at index 0 for the testem3-flat/orange/gpu run and show the
# device description stored under ['system']['device'].
deets = summit.load_results(('testem3-flat','orange','gpu'), 0)
deets['system']['device']

{'capability_major': 7,
 'capability_minor': 0,
 'clock_rate': 1530000,
 'default_block_size': 256,
 'device_id': 0,
 'eu_per_cu': 1,
 'max_blocks_per_grid': 2147483647,
 'max_blocks_per_multiprocessor': 32,
 'max_cache_size': 6291456,
 'max_threads_per_block': 1024,
 'max_threads_per_cu': 2048,
 'memory_clock_rate': 877000,
 'multiprocessor_count': 80,
 'name': 'Tesla V100-SXM2-16GB',
 'platform': 'cuda',
 'regs_per_block': 65536,
 'regs_per_multiprocessor': 65536,
 'shared_mem_per_block': 49152,
 'threads_per_warp': 32,
 'total_const_mem': 65536,
 'total_global_mem': 16911433728}

### Failures

Average number of unconverged tracks:

In [9]:
# Mean unconverged count per Summit run configuration; show only the
# positive entries, with the 'arch' index level pivoted into columns.
unconv_summary = analyze.summarize_instances(summit.result['unconverged'])
unconv = unconv_summary['mean']
has_unconverged = unconv > 0
unconv[has_unconverged].unstack('arch')

Unnamed: 0_level_0,arch
problem,geo


### Timing tables

In [10]:
# Two stacked log-scale panels: run time (3/4 of the height) above setup time.
fig, (run_ax, setup_ax) = plt.subplots(
    nrows=2,
    gridspec_kw={'height_ratios': [3, 1]},
    subplot_kw={'yscale': 'log'},
)

summit.plot_results(run_ax, summed['total_time'])
run_ax.legend()
run_ax.set_ylabel('Run [s]')
# Only the bottom panel carries x tick labels.
run_ax.tick_params(labelbottom=False)

summit.plot_results(setup_ax, summed['setup_time'])
setup_ax.set_ylabel('Setup [s]')

analyze.annotate_metadata(run_ax, summit)
fig.tight_layout()
fig.savefig('plots/timing.pdf', transparent=True)
plt.close(fig)

In [11]:
# Mean total time with the innermost index level pivoted into columns
# (cpu / gpu / gpu+sync in the rendered table), formatted through
# analyze.float_fmt_transform(2).
times = summed[('total_time', 'mean')].unstack()
times.style.format(analyze.float_fmt_transform(2))

Unnamed: 0_level_0,arch,cpu,gpu,gpu+sync
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cms2018,vecgeom,125.77,13.11,—
cms2018+field+msc,vecgeom,281.30,41.81,42.23
simple-cms+field,orange,77.35,3.12,—
simple-cms+field+msc,orange,103.73,3.57,—
simple-cms+field+msc,vecgeom,101.88,3.66,—
simple-cms+msc,orange,91.51,3.46,—
testem15,orange,61.66,2.76,—
testem15+field,orange,72.11,2.83,3.06
testem15+field,vecgeom,—,—,2.59
testem15+field+msc,orange,92.72,2.89,—


In [12]:
# Event-rate statistics computed from the Summit summary; later cells read
# its 'mean' column and plot it as throughput in events per second.
event_rate = analyze.calc_event_rate(summit, summed)

In [13]:
# Mean event rate of the testem3-flat+field+msc problem, one column per arch.
testem3 = event_rate['mean'].xs('testem3-flat+field+msc', level='problem').unstack('arch')

In [14]:
# Express every entry relative to the (vecgeom, cpu) event rate, which
# therefore reads 1.0.
testem3 / testem3.loc[('vecgeom', 'cpu')]

arch,cpu,gpu,gpu+sync
geo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
orange,1.064366,28.589785,28.558007
vecgeom,1.0,15.481465,15.326692


In [15]:
# CPU/GPU ratio computed from the total-time summary (plotted later as the
# speedup). Rows containing NaN are hidden from the displayed table only;
# `speedup` itself keeps them.
speedup = analyze.get_cpugpu_ratio(summed['total_time'])
speedup.dropna().style.format(analyze.float_fmt_transform(1))

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1
cms2018,vecgeom,9.6,0.1
cms2018+field+msc,vecgeom,6.7,0.1
simple-cms+field,orange,24.8,2.3
simple-cms+field+msc,orange,29.0,1.9
simple-cms+field+msc,vecgeom,27.8,1.6
simple-cms+msc,orange,26.5,0.2
testem15,orange,22.4,2.4
testem15+field,orange,25.5,2.6
testem15+field+msc,orange,32.1,3.2
testem15+field+msc,vecgeom,31.0,2.1


In [16]:
# Descriptive statistics of the mean speedup, one column per geometry.
speedup['mean'].unstack('geo').describe()

geo,orange,vecgeom
count,10.0,6.0
mean,26.812873,19.372603
std,3.761416,10.16251
min,22.36267,6.727459
25%,24.369168,11.067115
50%,25.983358,20.525778
75%,28.48716,27.276708
max,33.730414,31.015356


In [17]:
# Write the speedup summary as a Markdown table with one row per
# (problem, geo) pair. The problem label is printed only on the first row of
# each run of identical problems; following rows leave that cell blank.
_abbrev = summit.problem_to_abbr()
headers = ["Problem", "Geometry", "Speedup"]
speedup_out = np.full((len(speedup), len(headers)), "", dtype=object)
prev_prob = None
for (i, ((prob, geo), row)) in enumerate(speedup.iterrows()):
    if prob != prev_prob:
        speedup_out[i, 0] = f"{prob} [{_abbrev[prob]}]"
    speedup_out[i, 1] = geo
    # `row` is unpacked positionally; the displayed speedup table has the
    # columns (mean, std), in that order.
    speedup_out[i, 2] = "{:.1f}× (±{:.1f})".format(*row)
    prev_prob = prob

# Pad every column to its widest cell, header row included.
widths = np.vectorize(len)(np.concatenate([speedup_out, [headers]], axis=0))
col_widths = np.max(widths, axis=0)
fmt = f"| {{:<{col_widths[0]}}} | {{:<{col_widths[1]}}} | {{:>{col_widths[2]}}} |\n".format

# The cells contain non-ASCII characters ("×", "±"), so fix the encoding
# instead of relying on the locale default.
with open("results/summit/speedup.md", "w", encoding="utf-8") as f:
    f.write(fmt(*headers))
    f.write(fmt(*["-"*w for w in col_widths]))
    for cells in speedup_out.tolist():
        f.write(fmt(*cells))

In [18]:
# Scale the mean speedup by 7: the speedup plot labels the ratio as
# "7-CPU / 1-GPU wall time", so this presumably converts it to a
# per-CPU-core equivalence (TODO confirm).
cpu_per_gpu = 7
_desc = (speedup['mean'].dropna() * cpu_per_gpu).describe()
print("CPU:GPU equivalence: {min:.0f}× to {max:.0f}×".format(
    min=_desc['min'], max=_desc['max']))

CPU:GPU equivalence: 47× to 236×


In [19]:
# Fraction of each run's total time that is spent in geometry-related actions.
action_times_inst = analyze.unstack_subdict(summit.result['action_times'][summit.valid]).T
total_time_inst = summit.result['total_time']

# An action counts as geometry when its label starts with one of these prefixes.
geo_prefixes = ('along-step-', 'geo-')
geo_actions = [lab for lab in action_times_inst.index if lab.startswith(geo_prefixes)]

geo_time_inst = action_times_inst.loc[geo_actions].sum()
geo_frac_inst = geo_time_inst / total_time_inst
geo_frac = analyze.summarize_instances(geo_frac_inst)

In [20]:
# Mean geometry fraction per (problem, geo), restricted to the 'cpu' and
# 'gpu+sync' columns.
geo_frac['mean'].unstack('arch')[['cpu', 'gpu+sync']]

Unnamed: 0_level_0,arch,cpu,gpu+sync
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1
cms2018,vecgeom,0.456218,
cms2018+field+msc,vecgeom,0.691245,0.902085
simple-cms+field,orange,0.329535,
simple-cms+field+msc,orange,0.498907,
simple-cms+field+msc,vecgeom,0.492256,
simple-cms+msc,orange,0.42977,
testem15,orange,0.197602,
testem15+field,orange,0.313485,0.169418
testem15+field,vecgeom,,0.170993
testem15+field+msc,orange,0.463364,


### Plots

In [21]:
# Speedup per problem, saved both as a PDF for the plots directory and as an
# opaque PNG next to the Summit results.
fig, ax = plt.subplots()
summit.plot_results(ax, speedup)
ax.set_ylabel("Speedup (7-CPU / 1-GPU wall time)")
# Anchor the y axis at zero and leave the upper limit as it is.
ax.set_ylim(bottom=0)
analyze.annotate_metadata(ax, summit)
fig.tight_layout()
fig.savefig('plots/speedups.pdf', transparent=True)
fig.savefig('results/summit/speedup.png', transparent=False, dpi=150)
plt.close(fig)

In [22]:
# One log-scale panel per quantity, plotting the inverse of the average time
# per step / per primary.
fig, axes = plt.subplots(nrows=2, figsize=(4,4), subplot_kw={'yscale': 'log'})
bottom_ax = axes[-1]
for (ax, q) in zip(axes, ['step', 'primary']):
    per_second = analyze.inverse_summary(summed['avg_time_per_' + q])
    summit.plot_results(ax, per_second)
    ax.set_ylabel(q + ' per sec')
    # Only the bottom panel keeps its x tick labels.
    if ax is not bottom_ax:
        ax.tick_params(labelbottom=False)
    ax.legend()
fig.tight_layout()
fig.savefig('plots/steps-vs-primaries.pdf')
plt.close(fig)

In [23]:
# Throughput (top, log scale) above the geometry time fraction (bottom).
fig, (time_ax, geo_ax) = plt.subplots(
    nrows=2,
    gridspec_kw={'height_ratios': [3, 1]},
)

# Top panel: event rate, with the lower limit at half the smallest mean rate.
time_ax.set_yscale('log')
summit.plot_results(time_ax, event_rate)
time_ax.set_ylabel(r"Throughput [event/s]")
lowest_rate = event_rate['mean'].min()
time_ax.set_ylim(bottom=0.5 * lowest_rate)
time_ax.legend()
time_ax.tick_params(labelbottom=False)
analyze.annotate_metadata(time_ax, summit)

# Bottom panel: geometry fraction converted to percent.
summit.plot_results(geo_ax, geo_frac * 100)
geo_ax.set_ylabel("Geometry [%]")
geo_ax.set_ylim(0, 100)

fig.tight_layout()
fig.savefig('plots/throughput-geo.pdf', transparent=True)
plt.close(fig)

## Action fraction pie charts

In [24]:
# Mean total time per run.
# NOTE(review): `avg_time` is not referenced by any later visible cell.
avg_time = summed[('total_time', 'mean')].T
# Action times restricted to the 'mean' entry of column level 1, then
# transposed so the run configurations become the columns.
mean_action_times = summit.action_times().xs('mean', axis=1, level=1).T

In [25]:
# Version/system strings used in the caption of every pie chart.
md = {k: getattr(summit, k) for k in ["version", "system"]}

pie_problems = ["testem15+field", "testem3-flat+field+msc", "cms2018+field+msc"]
pie_geometries = ["vecgeom", "orange"]

for (prob, geo) in itertools.product(pie_problems, pie_geometries):
    # A combination that is absent (KeyError) or has no data is reported and skipped.
    try:
        plot_times = mean_action_times.xs(
            (prob, geo), axis=1, level=('problem', 'geo')
        ).dropna(axis=1, how='all')
    except KeyError:
        plot_times = None
    if plot_times is None or plot_times.empty:
        print("Missing problem/geo:", prob, geo)
        continue

    pieplot = analyze.PiePlotter(plot_times)

    # One figure per entry of pieplot.times (CPU/GPU variants).
    for arch in pieplot.times:
        (fig, ax) = plt.subplots(
            figsize=(3, 3),
            subplot_kw={'aspect': "equal"},
            layout="constrained",
        )
        pieplot(ax, arch)

        name = (prob, geo, arch)
        slashname = "/".join(name)
        dashname = "-".join(name)
        # Faint caption in the lower-right corner of the figure.
        fig.text(
            0.98, 0.1, f"{slashname}\n{md['version']} on {md['system']}",
            va='bottom', ha='right',
            fontstyle='italic', color=(0.75,)*4, size='xx-small',
        )

        fig.savefig(f'plots/actions-{dashname}.pdf', transparent=True)
        plt.close(fig)

Missing problem/geo: cms2018+field+msc orange


## Plot per-step timing on GPU

In [26]:
# For each problem, save two figures built from the GPU run output at index 0:
# a two-panel per-step figure and a time-per-step figure.
for p in ['cms2018', 'cms2018+field+msc']:
    data = summit.load_results((p, 'vecgeom', 'gpu'), 0)

    # Stacked panels sharing the x axis, one per plotting helper.
    (fig, axes) = plt.subplots(nrows=2, figsize=(3, 4), sharex=True)
    plotters = [analyze.plot_counts, analyze.plot_accum_time_inv]
    for (i, (ax, plot)) in enumerate(zip(axes, plotters)):
        plot(ax, data)
        analyze.annotate_metadata(ax, data['_metadata'])
        if i == 0:
            # The x axis is shared, so only the bottom panel keeps its label.
            ax.set_xlabel(None)
    # Apply the layout BEFORE saving; calling it after savefig leaves the
    # written PDF with the unadjusted layout.
    fig.tight_layout()
    fig.savefig(f'plots/per-step-{p}.pdf', transparent=True)
    plt.close(fig)

    (fig, ax) = plt.subplots(figsize=(4, 3))
    analyze.plot_time_per_step(ax, data, scale=2)
    analyze.annotate_metadata(ax, summit)
    fig.tight_layout()
    fig.savefig(f'plots/time-per-step-{p}.pdf', transparent=True)
    plt.close(fig)

## Crusher

In [27]:
# Load the results stored under results/crusher; printing the Analysis
# object reports the Celeritas version and the system name.
crusher = analyze.Analysis('results/crusher')
print(crusher)

Analysis for Celeritas v0.3.2-3+35f1ba614 on crusher


In [28]:
# VecGeom failures aren't really failures; just missing capability
#failures = crusher.failures().xs('orange', level='geo').fillna(1)
#failures.groupby(['problem', 'arch']).count().unstack()

In [29]:
# Summarize the successful Crusher runs after dropping rows that are
# entirely NaN.
csum = analyze.summarize_instances(crusher.result[crusher.successful].dropna(how='all'))

In [30]:
# Mean total time on Crusher with the innermost index level pivoted into
# columns.
csum[('total_time', 'mean')].unstack()

Unnamed: 0_level_0,arch,cpu,gpu
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1
simple-cms+field,orange,123.032814,15.769437
simple-cms+field+msc,orange,130.367693,15.692955
simple-cms+msc,orange,154.633955,11.139834
testem15,orange,116.0908,16.824183
testem15+field,orange,83.060299,18.263873
testem15+field+msc,orange,111.35808,13.281362
testem3-flat,orange,164.550103,12.91027
testem3-flat+field,orange,159.157076,23.867335
testem3-flat+field+msc,orange,262.394382,30.033116
testem3-flat+msc,orange,124.458898,15.305734


In [31]:
# Load the output at index 0 for the testem3-flat/orange/gpu run on Crusher.
# NOTE(review): `out` is not referenced by any later visible cell.
out = crusher.load_results(('testem3-flat','orange','gpu'), 0)

In [32]:
# Collect step timing for every Crusher run whose output can be loaded and
# processed. Runs that fail are skipped on purpose (best effort), but the
# reason is recorded in `skipped` so dropped runs can be inspected afterwards.
# NOTE: this rebinds `times`, which an earlier cell bound to a table of mean
# total times.
new_idx = []
ratios = []
times = []
skipped = {}  # run index -> repr of the exception that caused the skip
for i in crusher.index:
    try:
        outp = crusher.load_results(i, 0)
        step_times = analyze.StepTimeGetter(outp, stream=0)()
    except Exception as err:
        skipped[i] = repr(err)
        continue
    # Record the first step-time entry and its ratio to the second entry.
    times.append(step_times[0])
    ratios.append(step_times[0] / step_times[1])
    new_idx.append(i)

In [33]:
# Ratio step_times[0] / step_times[1] for every run collected above.
pd.Series(ratios, index=pd.MultiIndex.from_tuples(new_idx))

testem15                orange  gpu    12667.966035
testem15+field          orange  gpu     8177.139758
testem15+field+msc      orange  gpu     4990.678852
simple-cms+msc          orange  gpu     8061.694042
simple-cms+field        orange  gpu     2738.929957
simple-cms+field+msc    orange  gpu     2560.752946
testem3-flat            orange  gpu    11368.561214
testem3-flat+field      orange  gpu     6179.919774
testem3-flat+msc        orange  gpu    11721.083944
testem3-flat+field+msc  orange  gpu     6162.613339
dtype: float64

In [34]:
# step_times[0] for every run collected above.
pd.Series(times, index=pd.MultiIndex.from_tuples(new_idx))

testem15                orange  gpu    10.830756
testem15+field          orange  gpu    11.720540
testem15+field+msc      orange  gpu     7.755131
simple-cms+msc          orange  gpu     6.692286
simple-cms+field        orange  gpu     7.413352
simple-cms+field+msc    orange  gpu     7.021659
testem3-flat            orange  gpu     6.887654
testem3-flat+field      orange  gpu     7.554717
testem3-flat+msc        orange  gpu     7.561939
testem3-flat+field+msc  orange  gpu     7.837230
dtype: float64

In [35]:
# Relative spread (std / mean) of every summarized Crusher quantity; show
# only the entries above 2%, dropping rows and columns with nothing left.
csum_std = csum.xs('std', axis=1, level=1)
csum_mean = csum.xs('mean', axis=1, level=1)
rel_err = csum_std / csum_mean
high_err = rel_err > 0.02
flagged = rel_err[high_err]
flagged.dropna(how='all').dropna(how='all', axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avg_time_per_primary,avg_time_per_step,num_step_iters,pre_emptying_time,setup_time,slot_occupancy,total_time
problem,geo,arch,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
simple-cms+field,orange,cpu,0.373492,0.373406,,0.407446,,,0.373492
simple-cms+field,orange,gpu,0.026371,0.026435,,0.368877,0.074183,,0.026371
simple-cms+field+msc,orange,cpu,0.249154,0.24919,,0.338954,,,0.249154
simple-cms+field+msc,orange,gpu,,,,0.140241,0.040229,,
simple-cms+msc,orange,cpu,0.403753,0.403437,,0.570944,,,0.403753
simple-cms+msc,orange,gpu,0.064626,0.064739,0.023591,,0.058739,0.023168,0.064626
testem15,orange,cpu,0.485162,0.485225,,0.701036,0.036708,,0.485162
testem15,orange,gpu,0.029813,0.029737,,,0.403204,,0.029813
testem15+field,orange,cpu,0.339686,0.339675,,0.418265,,,0.339686
testem15+field,orange,gpu,0.024865,0.024945,,0.025612,0.063183,,0.024865


In [36]:
# CPU/GPU ratio of the Crusher total times, computed with the same helper
# used for the Summit speedup.
analyze.get_cpugpu_ratio(csum['total_time'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1
simple-cms+field,orange,7.801979,2.92123
simple-cms+field+msc,orange,8.307402,2.0756
simple-cms+msc,orange,13.881172,5.675904
testem15,orange,6.900234,3.354043
testem15+field,orange,4.547792,1.548956
testem15+field+msc,orange,8.384537,2.765334
testem3-flat,orange,12.745675,3.590251
testem3-flat+field,orange,6.668406,1.520868
testem3-flat+field+msc,orange,8.736835,1.83313
testem3-flat+msc,orange,8.131521,2.110356


In [37]:
# Total-time summary (count, mean, std) for Crusher; its index is reused
# below to select the matching rows of the Summit summary.
crusher_times = csum['total_time']
crusher_times

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std
problem,geo,arch,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
simple-cms+field,orange,cpu,8.0,123.032814,45.95175
simple-cms+field,orange,gpu,8.0,15.769437,0.415863
simple-cms+field+msc,orange,cpu,8.0,130.367693,32.48167
simple-cms+field+msc,orange,gpu,8.0,15.692955,0.292284
simple-cms+msc,orange,cpu,8.0,154.633955,62.433898
simple-cms+msc,orange,gpu,8.0,11.139834,0.719924
testem15,orange,cpu,8.0,116.0908,56.322791
testem15,orange,gpu,8.0,16.824183,0.50158
testem15+field,orange,cpu,8.0,83.060299,28.214437
testem15+field,orange,gpu,8.0,18.263873,0.454137


In [38]:
# Event rates on both machines, with Summit restricted to the run
# configurations that also exist in the Crusher summary.
crusher_rates = analyze.calc_event_rate(crusher, csum)
shared_runs = crusher_times.index
summit_rates = analyze.calc_event_rate(summit, summed.loc[shared_runs])

# Count shown in the legend for each (machine, arch) series, e.g. "7 CPU".
counts = {
    ('summit', 'cpu'): 7,
    ('summit', 'gpu'): 1,
    ('crusher', 'cpu'): 8,
    ('crusher', 'gpu'): 1,
}

In [39]:
# Crusher-to-Summit ratio of the mean event rates, one column per arch.
(crusher_rates['mean'] / summit_rates['mean']).unstack()

Unnamed: 0_level_0,arch,cpu,gpu
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1
simple-cms+field,orange,0.62873,0.197659
simple-cms+field+msc,orange,0.795669,0.227699
simple-cms+msc,orange,0.591781,0.310452
testem15,orange,0.531156,0.163894
testem15+field,orange,0.868113,0.154784
testem15+field+msc,orange,0.832618,0.217461
testem3-flat,orange,0.642133,0.337922
testem3-flat+field,orange,0.776058,0.224622
testem3-flat+field+msc,orange,0.82414,0.268062
testem3-flat+msc,orange,1.531158,0.369122


In [40]:
# Event rate per problem for both machines on one log-scale axis.
fig, ax = plt.subplots()
ax.set_yscale('log')

# (x offset, color, legend name, event rates) per machine; the small offsets
# keep the two machines' markers from overlapping at the same problem.
machine_series = [
    (-0.05, '#7A954F', 'Summit', summit_rates),
    (0.05, '#BC5544', 'Crusher', crusher_rates),
]
for (offset, color, machine, rates) in machine_series:
    for arch in ['cpu', 'gpu']:
        summary = rates.xs(arch, level='arch')
        # x position = index of the problem in `problems`, shifted per machine.
        problem_names = summary.index.get_level_values('problem')
        xpos = np.array([p_to_i[p] for p in problem_names], dtype=float)
        xpos += offset

        count = counts[(machine.lower(), arch)]
        ax.errorbar(xpos, summary['mean'], summary['std'],
                    capsize=0, fmt='none', ecolor=(0.2,)*3)
        ax.scatter(xpos, summary['mean'], c=color,
                   marker=analyze.ARCH_SHAPES[arch],
                   label=f"{machine} ({count} {arch.upper()})")

# One tick per problem, labelled with the abbreviations.
ax.xaxis.set_ticks(np.arange(len(problems)))
ax.xaxis.set_ticklabels(list(problem_to_abbr.values()), rotation=90)
ax.grid()
ax.set_axisbelow(True)
ax.legend()
ax.set_ylabel(r"Event rate [1/s]")
analyze.annotate_metadata(ax, summit)
fig.tight_layout()
fig.savefig('plots/crusher-vs-summit.pdf')
plt.close(fig)

## Occupancy

In [41]:
# Size of one register in bytes (registers are 32 bits wide).
BYTES_PER_REG = 4


class KernelMem(namedtuple('KernelMem', ['const', 'local', 'register'])):
    """Memory usage of one kernel, in bytes: (const, local, register)."""

    __slots__ = ()

    @classmethod
    def from_kernel_stats(cls, kstat):
        """Build a KernelMem from a kernel statistics mapping.

        Reads the 'const_mem', 'local_mem' and 'num_regs' entries of
        ``kstat``; the register count is converted to bytes.
        """
        register_bytes = kstat['num_regs'] * BYTES_PER_REG
        return cls(kstat['const_mem'], kstat['local_mem'], register_bytes)

def get_kmem(kernels):
    """Map each kernel's name to the KernelMem built from its statistics."""
    kmem_by_name = {}
    for kstat in kernels:
        name = kstat['name']
        kmem_by_name[name] = KernelMem.from_kernel_stats(kstat)
    return kmem_by_name

def get_occupancy(kernels):
    """Map each kernel's name to its 'occupancy' entry."""
    return dict((kstat['name'], kstat['occupancy']) for kstat in kernels)

In [42]:
def unzip_kernel_stats(all_ks):
    """Turn a sequence of per-kernel dicts into a dict of per-key arrays.

    For every key, the values are gathered in the order the kernels appear
    and converted to a NumPy array. Keys keep the order in which they were
    first seen.
    """
    columns = {}
    for ks in all_ks:
        for key, value in ks.items():
            columns.setdefault(key, []).append(value)

    return {key: np.array(values) for key, values in columns.items()}

def load_kernels(results, problem, geo):
    """Return the ['system']['kernels'] entry of a GPU run's output.

    The output is loaded through ``results.load_results`` for the run
    ``(problem, geo, 'gpu')`` at index 0.
    """
    gpu_run = (problem, geo, 'gpu')
    output = results.load_results(gpu_run, 0)
    return output['system']['kernels']

In [43]:
def plot_kernel_mem(ax, multimem, colors, labels):
    """Draw grouped horizontal bars of per-kernel register and spill memory.

    Parameters
    ----------
    ax
        Axes-like object to draw on.
    multimem
        Mapping from a configuration key to a mapping of
        ``kernel name -> KernelMem``. The kernel names on the y axis are
        taken from the first configuration.
    colors
        Mapping with one color per configuration key ``k`` (register bars)
        and one per ``k + '.spill'`` (local-memory bars).
    labels
        Mapping from configuration key to the legend text of that
        configuration.
    """
    dtype = list(zip(KernelMem._fields, (int,) * len(KernelMem._fields)))
    # Kept separate from the `labels` argument, which holds the legend text
    # per configuration.
    kernel_names = list(next(iter(multimem.values())))
    y = np.arange(len(kernel_names))
    width = .9 / len(multimem)
    # Vertical offset of each configuration's bar within a kernel's group.
    ynudge = np.linspace(-0.34, 0.34, len(multimem))

    for (i, (k, mem)) in enumerate(multimem.items()):
        values = np.array(list(mem.values()), dtype=dtype)
        ypos = y + ynudge[i]

        ax.barh(ypos, values['register'], width,
                color=colors[k], label=f'{labels[k]}')
        # The spill bar is stacked to the right of the register bar and gets
        # no legend entry of its own.
        ax.barh(ypos, values['local'], width, left=values['register'],
                color=colors[k + '.spill'])

    ax.invert_yaxis()
    ax.set_xlabel('Memory [B]')
    ax.set_yticks(y, kernel_names)
    leg = ax.legend()
    leg.set_title("Register usage (light)\nLocal spill (dark)")
    leg.get_title().set_fontsize('x-small')

In [44]:
# Kernel statistics of one problem for each platform/geometry combination.
_kernel_problem = 'testem3-flat+field+msc'
kernel_stats = {
    'cuda/vecgeom': load_kernels(summit, _kernel_problem, 'vecgeom'),
    'cuda/orange': load_kernels(summit, _kernel_problem, 'orange'),
    'hip/orange': load_kernels(crusher, _kernel_problem, 'orange'),
}

# Legend text for each combination.
pretty_labels = {
    'cuda/vecgeom': 'NVIDIA V100 (VecGeom)',
    'cuda/orange': 'NVIDIA V100 (ORANGE)',
    'hip/orange': 'AMD MI250 (ORANGE)',
}

# Colors given as 0-255 RGB triples; each combination has a second
# '<key>.spill' color. They are rescaled to the 0-1 range below.
_colors_rgb255 = {
    'cuda/vecgeom': (191, 40, 96),
    'cuda/vecgeom.spill': (107, 76, 88),
    'cuda/orange': (153, 168, 50),
    'cuda/orange.spill': (106, 112, 67),
    'hip/orange': (57, 140, 173),
    'hip/orange.spill': (78, 101, 110),
}
colors = {
    key: np.array(rgb, dtype=float) / 255
    for key, rgb in _colors_rgb255.items()
}

In [45]:
#(fig, ax) = plt.subplots()
#plot_kernel_mem(ax, {k: get_kmem(v) for k, v in kernel_stats.items()},
#                colors=colors, labels=pretty_labels)

In [46]:
# Inspect the statistics of a single kernel: entry 4 of the cuda/orange list.
kernel_stats['cuda/orange'][4]

{'const_mem': 0,
 'heap_size': 8388608,
 'local_mem': 0,
 'max_blocks_per_cu': 1,
 'max_threads_per_block': 256,
 'max_warps_per_eu': 8,
 'name': 'along-step-uniform-msc-propagate',
 'num_regs': 184,
 'occupancy': 0.125,
 'print_buffer_size': 5242880,
 'stack_size': 1024,
 'threads_per_block': 256}

In [47]:
# Convert each combination's list of per-kernel dicts into a dict of
# per-key arrays, e.g. unzipped_ks[k]['num_regs'].
unzipped_ks = {k: unzip_kernel_stats(v) for k, v in kernel_stats.items()}

In [48]:
# Legend text and marker shape for each platform/geometry combination.
labels = {
    'cuda/vecgeom': 'NVIDIA V100 (VecGeom)',
    'cuda/orange': 'NVIDIA V100 (ORANGE)',
    'hip/orange': 'AMD MI250 (ORANGE)',
}
markers = {
    'cuda/vecgeom': '.',
    'cuda/orange': '+',
    'hip/orange': 'x',
}

# Point color encodes the kernel's position in its list. All series share one
# color scale so that the single colorbar is valid for every marker type,
# not only for the series drawn last.
max_kernel_index = max(len(ks['local_mem']) for ks in unzipped_ks.values()) - 1

(fig, ax) = plt.subplots()
for k, ks in unzipped_ks.items():
    s = ax.scatter(ks['num_regs'] * BYTES_PER_REG, ks['local_mem'],
                   c=np.arange(len(ks['local_mem'])),
                   vmin=0, vmax=max_kernel_index,
                   marker=markers[k], label=labels[k])
ax.set_xlabel('Register usage [B]')
ax.set_ylabel('Memory spill [B]')
ax.legend()
cb = fig.colorbar(s)
cb.set_label('Kernel index')

fig.tight_layout()
fig.savefig('plots/reg-vs-spill.pdf', transparent=True)
plt.close(fig)

In [50]:
# Occupancy against total (register + spilled local) memory per kernel.
# Point color encodes the kernel's position in its list. All series share one
# color scale so that the single colorbar is valid for every marker type,
# not only for the series drawn last.
max_kernel_index = max(len(ks['local_mem']) for ks in unzipped_ks.values()) - 1

(fig, ax) = plt.subplots()
for k, ks in unzipped_ks.items():
    tot_mem = ks['num_regs'] * BYTES_PER_REG + ks['local_mem']
    s = ax.scatter(ks['occupancy'], tot_mem,
                   c=np.arange(len(ks['local_mem'])),
                   vmin=0, vmax=max_kernel_index,
                   marker=markers[k], label=labels[k])
ax.set_xlabel('Occupancy')
ax.set_ylabel('Register + spill [B]')
ax.legend()
cb = fig.colorbar(s)
cb.set_label('Kernel index')

fig.tight_layout()
fig.savefig('plots/occupancy-vs-mem.pdf', transparent=True)
plt.close(fig)

In [51]:
# Occupancy against spilled local memory per kernel.
# Point color encodes the kernel's position in its list. All series share one
# color scale so that the single colorbar is valid for every marker type,
# not only for the series drawn last.
max_kernel_index = max(len(ks['local_mem']) for ks in unzipped_ks.values()) - 1

(fig, ax) = plt.subplots()
for k, ks in unzipped_ks.items():
    tot_mem = ks['local_mem']
    s = ax.scatter(ks['occupancy'], tot_mem,
                   c=np.arange(len(ks['local_mem'])),
                   vmin=0, vmax=max_kernel_index,
                   marker=markers[k], label=labels[k])
ax.set_xlabel('Occupancy')
ax.set_ylabel('Local memory spill [B]')
ax.legend()
cb = fig.colorbar(s)
cb.set_label('Kernel index')

fig.tight_layout()
fig.savefig('plots/occupancy-vs-spill.pdf', transparent=True)
plt.close(fig)