In [1]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sys
import itertools
from collections import namedtuple, defaultdict
from pathlib import Path

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

%load_ext autoreload
%autoreload 1
%aimport analyze

# Read the shared plot style and apply it to matplotlib's global rcParams
with open('plots/style.json') as f:
    _style = json.load(f)
mpl.rcParams.update(_style)

## Wildstyle failures

In [2]:
# Load the analysis stored under 'results/wildstyle' and print its one-line description
ws = analyze.Analysis('results/wildstyle')
print(ws)

Analysis for Celeritas v0.3.2-rc.1 on wildstyle


In [3]:
# Build the failure table from the wildstyle failures and display it as a DataFrame
ftab = analyze.make_failure_table(ws.failures())
ftab.to_frame()

Unnamed: 0,Failure


Unconverged tracks (from runs that didn't fail):

In [4]:
# Summarize the 'unconverged' column, then show only instances whose mean is nonzero
summed = analyze.summarize_instances(ws.result[['unconverged']])
unconv = summed[('unconverged', 'mean')]
_has_unconverged = unconv > 0
unconv.loc[_has_unconverged].unstack('arch')

Unnamed: 0_level_0,arch
problem,geo


# Results
## Summit

In [5]:
summit = analyze.Analysis('results/summit')
print(summit)

# Shorthand handles reused by the cells below
summed = summit.summed
problems = summit.problems()
problem_to_abbr = summit.problem_to_abbr(problems)
# Position of each problem in `problems` (used as its x coordinate when plotting)
p_to_i = {_name: _pos for (_pos, _name) in enumerate(problems)}

Analysis for Celeritas v0.4.0-dev.138+fd9d70b9b on summit


### Failures

Average number of unconverged tracks:

In [6]:
# Mean of the 'unconverged' result per instance; display only the nonzero entries
_unconv_summary = analyze.summarize_instances(summit.result['unconverged'])
unconv = _unconv_summary['mean']
unconv.loc[unconv > 0].unstack('arch')

Unnamed: 0_level_0,arch
problem,geo


### Timing tables

In [7]:
_total_time = summed['total_time']
# Mean total time with the innermost index level moved to the columns
times = summed[('total_time', 'mean')].unstack()
# CPU/GPU ratio of the total time; rows that are entirely NaN are dropped
speedup = analyze.get_cpugpu_ratio(_total_time).dropna(axis=0, how='all')

In [8]:
# Write the Summit throughput and speedup tables as markdown files
_summit_dumps = [
    ("results/summit/throughput.md", analyze.dump_event_rate),
    ("results/summit/speedup.md", analyze.dump_speedup),
]
for (_path, _dump) in _summit_dumps:
    with open(_path, "w") as f:
        _dump(f, summit)

In [9]:
event_rate = analyze.calc_event_rate(summit)
testem3 = (event_rate['mean']
           .xs('testem3-flat+field+msc', level='problem')
           .unstack('arch'))
# Express every geo/arch entry relative to the (vecgeom, cpu) value
_baseline = testem3.loc[('vecgeom', 'cpu')]
print(testem3 / _baseline)

arch         cpu        gpu   gpu+sync
geo                                   
orange   1.01917  33.151208  32.529327
vecgeom  1.00000  16.295044  16.142589


In [10]:
_mean_speedup = speedup['mean'].dropna()
# Scale factor for the second range; 7 matches the "7-CPU / 1-GPU" speedup label used for Summit
_cpus_per_gpu = 7

_desc = _mean_speedup.describe()
print("Speedups: {min:.0f}×–{max:.0f}×".format(**_desc))
_desc = (_mean_speedup * _cpus_per_gpu).describe()
print("CPU:GPU equivalence: {min:.0f}×–{max:.0f}×".format(**_desc))

Speedups: 7×–45×
CPU:GPU equivalence: 48×–312×


### Plots

In [11]:
# Two log-scale panels: run time (tall) above setup time (short)
(fig, (run_ax, setup_ax)) = plt.subplots(
    nrows=2,
    gridspec_kw={'height_ratios': [3, 1]},
    subplot_kw={'yscale': 'log'},
)

summit.plot_results(run_ax, summed['total_time'])
run_ax.legend()
run_ax.set_ylabel('Run [s]')
# Tick labels are hidden on the upper panel
run_ax.tick_params(labelbottom=False)

summit.plot_results(setup_ax, summed['setup_time'])
setup_ax.set_ylabel('Setup [s]')

analyze.annotate_metadata(run_ax, summit)
plt.tight_layout()
fig.savefig('plots/timing.pdf', transparent=True)
plt.close()

In [12]:
(fig, ax) = plt.subplots()
summit.plot_results(ax, speedup)
ax.set_ylabel("Speedup (7-CPU / 1-GPU wall time)")
# Pin the lower limit at zero and leave the upper limit unchanged
ax.set_ylim(bottom=0)
analyze.annotate_metadata(ax, summit)
plt.tight_layout()

# Vector version for the paper plots, opaque raster version next to the results
fig.savefig('plots/speedups.pdf', transparent=True)
fig.savefig('results/summit/speedup.png', transparent=False, dpi=150)
plt.close()

In [13]:
(fig, axes) = plt.subplots(nrows=2, figsize=(4, 4), subplot_kw={'yscale': 'log'})
for (ax, q) in zip(axes, ['step', 'primary']):
    # Plot the inverse of the average time per step/primary
    _rate = analyze.inverse_summary(summed['avg_time_per_' + q])
    summit.plot_results(ax, _rate)
    ax.set_ylabel(q + ' per sec')
    if ax is not axes[-1]:
        # Only the bottom panel keeps its tick labels
        ax.tick_params(labelbottom=False)
    ax.legend()
plt.tight_layout()
fig.savefig('plots/steps-vs-primaries.pdf')
plt.close()

### Geometry fraction

In [14]:
action_times = summit.result['action_times'][summit.valid]
_arch = action_times.index.get_level_values('arch')
# Drop the plain 'gpu' rows so only CPU and GPU+sync values remain
_keep = _arch != 'gpu'
action_times = analyze.unstack_subdict(action_times[_keep])

# Categorize every action column, then flag the GEO and GP categories
_categorize = np.vectorize(analyze.get_action_priority)
_cat = _categorize(action_times.columns)
_is_geo = _cat == analyze.KernelCategory.GEO
_is_gp = _cat == analyze.KernelCategory.GP
geo_actions = _is_geo | _is_gp

In [15]:
def update_tuple(t):
    """Return ``t`` with a third entry of 'gpu+sync' replaced by 'gpu'.

    Any other tuple is returned unchanged (the same object). Entries before
    and after position 2 are preserved.
    """
    arch = t[2]
    return t[:2] + ('gpu',) + t[3:] if arch == 'gpu+sync' else t
# Keep only the geometry-flagged action columns
geo_action_times = action_times.loc[:, geo_actions]
# Rebuild the row index with 'gpu+sync' relabelled as 'gpu', keeping the level names
_relabelled = [update_tuple(_row) for _row in geo_action_times.index]
_level_names = geo_action_times.index.names
geo_action_times.index = pd.MultiIndex.from_tuples(_relabelled, names=_level_names)

In [16]:
# Summed geometry action time divided by the total time (aligned on the index)
_geo_time = geo_action_times.sum(axis=1)
_geo_ratio = _geo_time / summit.result['total_time']
# Drop entries with missing values instead of mutating the summary in place
geo_frac = analyze.summarize_instances(_geo_ratio).dropna()

In [17]:
# One row per problem, one column per geo/arch pair, each cell run through the formatter
# NOTE: DataFrame.applymap is deprecated as of pandas 2.1 in favour of DataFrame.map
_mean_frac = geo_frac['mean'].unstack(['geo', 'arch'])
_fmt = analyze.float_fmt_transform(2)
gf_table = _mean_frac.applymap(_fmt)

In [18]:
with open("results/summit/geo-frac.md", "w") as f:
    # Header: "Problem" followed by one "geo/arch" label per table column
    _header = ["Problem"] + ["/".join(c) for c in gf_table.columns]
    # Body: the problem names as a first column, followed by the formatted values
    _problem_col = np.array([gf_table.index]).T
    _body = np.concatenate([_problem_col, gf_table], axis=1)
    _alignment = "<" + ">" * gf_table.shape[1]
    analyze.dump_markdown(f, _header, _body, alignment=_alignment)

In [19]:
# Event rate (tall panel) above the geometry fraction (short panel)
(fig, (time_ax, geo_ax)) = plt.subplots(
    nrows=2,
    gridspec_kw={'height_ratios': [3, 1]},
)

analyze.plot_event_rate(time_ax, summit)
time_ax.tick_params(labelbottom=False)
time_ax.legend()

# Geometry fraction shown as a percentage on a fixed 0-100 axis
summit.plot_results(geo_ax, geo_frac * 100)
geo_ax.set_ylabel("Geometry [%]")
geo_ax.set_ylim(0, 100)

plt.tight_layout()
fig.savefig('plots/throughput-geo.pdf', transparent=True)
plt.close()

### Action fraction pie charts

In [20]:
avg_time = summed[('total_time', 'mean')].T
# Take the 'mean' entry of the second column level and transpose the result
_action_summary = summit.action_times()
mean_action_times = _action_summary.xs('mean', axis=1, level=1).T

In [21]:
_pie_problems = ["testem15+field", "testem3-flat+field+msc", "cms2018+field+msc"]
_pie_geometries = ["vecgeom", "orange"]

for prob in _pie_problems:
    for geo in _pie_geometries:
        try:
            plot_times = (mean_action_times
                          .xs((prob, geo), axis=1, level=('problem', 'geo'))
                          .dropna(axis=1, how='all'))
        except KeyError:
            # Unknown problem/geo combination: handle it like an empty selection
            plot_times = pd.DataFrame()
        if plot_times.empty:
            print("Missing problem/geo:", prob, geo)
            continue
        md = {k: getattr(summit, k) for k in ["version", "system"]}
        pieplot = analyze.PiePlotter(plot_times)

        # One figure per entry of pieplot.arch
        for arch in pieplot.arch:
            (fig, ax) = plt.subplots(figsize=(3, 3),
                                     subplot_kw={'aspect': "equal"},
                                     layout="constrained")
            pieplot(ax, arch)
            name = (prob, geo, arch)
            slashname = "/".join(name)
            dashname = "-".join(name)
            # Small grey provenance caption in the lower-right corner
            fig.text(
                0.98, 0.1, f"{slashname}\n{md['version']} on {md['system']}",
                va='bottom', ha='right',
                fontstyle='italic', color=(0.75,)*4, size='xx-small',
            )
            fig.savefig(f'plots/actions-{dashname}.pdf', transparent=True)
            plt.close()

Missing problem/geo: cms2018+field+msc orange


### Plot per-step timing on GPU

In [22]:
# For each problem, plot per-step quantities from result 0 of the vecgeom GPU run
for p in ['cms2018', 'cms2018+field+msc']:
    data = summit.load_results((p, 'vecgeom', 'gpu'), 0)

    # Figure 1: two stacked panels sharing the x axis
    (fig, axes) = plt.subplots(nrows=2, figsize=(3, 4), sharex=True)
    plotters = [analyze.plot_counts, analyze.plot_accum_time_inv]
    for (i, (ax, plot)) in enumerate(zip(axes, plotters)):
        plot(ax, data)
        analyze.annotate_metadata(ax, data['_metadata'])
        if i == 0:
            # Clear the x label on the upper panel
            ax.set_xlabel(None)
    # Lay out BEFORE saving so the written PDF gets the adjusted spacing
    plt.tight_layout()
    fig.savefig(f'plots/per-step-{p}.pdf', transparent=True)
    plt.close()

    # Figure 2: time per step
    (fig, ax) = plt.subplots(figsize=(4, 3))
    analyze.plot_time_per_step(ax, data, scale=2)
    analyze.annotate_metadata(ax, summit)
    plt.tight_layout()
    fig.savefig(f'plots/time-per-step-{p}.pdf', transparent=True)
    plt.close()

## Crusher

In [23]:
def get_step_times(results):
    """Collect the first two step-time entries for every loadable instance.

    For each key in ``results.index`` this loads result 0 through
    ``results.load_results`` and evaluates
    ``analyze.StepTimeGetter(outp, stream=0)()``, keeping entries 0 and 1 of
    the value it returns.

    Instances whose loading or extraction raises are skipped (best effort),
    so the output may cover only a subset of ``results.index``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed instance with columns 0 and 1,
        indexed by a MultiIndex built from the instance keys. If no instance
        could be processed, an empty frame with columns 0 and 1 is returned
        (``MultiIndex.from_tuples`` cannot build an index from an empty list).
    """
    new_idx = []
    times = []
    for key in results.index:
        try:
            outp = results.load_results(key, 0)
            step_times = analyze.StepTimeGetter(outp, stream=0)()
        except Exception:
            # Deliberately best effort: instances without usable output are skipped
            continue
        times.append([step_times[j] for j in range(2)])
        new_idx.append(key)
    if not new_idx:
        return pd.DataFrame(columns=range(2))
    return pd.DataFrame(times, index=pd.MultiIndex.from_tuples(new_idx))

In [24]:
# Load the analysis stored under 'results/crusher' and print its one-line description
crusher = analyze.Analysis('results/crusher')
print(crusher)

Analysis for Celeritas v0.4.0-dev.99+4507705b5 on crusher


In [25]:
# Write the Crusher event-rate table as markdown (overwrites any existing file)
with open("results/crusher/throughput.md", "w") as f:
    analyze.dump_event_rate(f, crusher)



## Frontier

In [26]:
# Load the analysis stored under 'results/frontier' and print its one-line description
frontier = analyze.Analysis('results/frontier')
print(frontier)

Analysis for Celeritas v0.4.0-dev.101+585806eb4 on frontier


In [27]:
# Write the Frontier throughput and speedup tables as markdown files
_frontier_dumps = [
    ("results/frontier/throughput.md", analyze.dump_event_rate),
    ("results/frontier/speedup.md", analyze.dump_speedup),
]
for (_path, _dump) in _frontier_dumps:
    with open(_path, "w") as f:
        _dump(f, frontier)



In [28]:
frontier_rates = analyze.calc_event_rate(frontier)
# Restrict the Summit summary to the index entries present in the Frontier rates
_summit_subset = summit.summed.loc[frontier_rates.index]
summit_rates = analyze.calc_event_rate(summit, _summit_subset)

# Count printed in the legend for each (machine, arch) pair
counts = {
    ('summit', 'cpu'): 7,
    ('summit', 'gpu'): 1,
    ('frontier', 'cpu'): 8,
    ('frontier', 'gpu'): 1,
}

In [29]:
# Frontier mean event rate relative to Summit, with the last index level as columns
_rate_ratio = frontier_rates['mean'] / summit_rates['mean']
_rate_ratio.unstack()

Unnamed: 0_level_0,arch,cpu,gpu,gpu+sync
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simple-cms+field,orange,1.455114,0.800337,
simple-cms+field+msc,orange,1.541246,0.853329,
simple-cms+msc,orange,0.930457,0.858847,
testem15,orange,0.831736,0.795938,
testem15+field,orange,0.8917,0.897284,0.901763
testem15+field+msc,orange,1.033451,0.931403,
testem3-flat,orange,1.124521,0.73503,
testem3-flat+field,orange,1.027988,0.782586,
testem3-flat+field+msc,orange,1.553406,0.753832,0.757914
testem3-flat+msc,orange,1.316004,0.818958,


In [30]:
(fig, ax) = plt.subplots()
ax.set_yscale('log')

# (horizontal nudge, marker color, display name, event rates) per machine
_machines = [
    (-0.05, '#7A954F', 'Summit', summit_rates),
    (0.05, '#BC5544', 'Frontier', frontier_rates),
]
for (offset, color, machine, rates) in _machines:
    for arch in ['cpu', 'gpu']:
        summary = rates.xs(arch, level='arch')
        # x coordinate: the problem's position, shifted sideways per machine
        _slots = [p_to_i[p] for p in summary.index.get_level_values('problem')]
        index = np.array(_slots, dtype=float)
        index += offset

        mark = analyze.ARCH_SHAPES[arch]
        count = counts[(machine.lower(), arch)]
        arch = arch.upper()
        ax.errorbar(index, summary['mean'], summary['std'],
                    capsize=0, fmt='none', ecolor=(0.2,)*3)
        scat = ax.scatter(index, summary['mean'], c=color, marker=mark,
                          label=f"{machine} ({count} {arch})")

# One tick per problem, labelled with its abbreviation
xax = ax.get_xaxis()
xax.set_ticks(np.arange(len(problems)))
xax.set_ticklabels(list(problem_to_abbr.values()), rotation=90)
grid = ax.grid()
ax.set_axisbelow(True)
ax.legend()
ax.set_ylabel(r"Event rate [1/s]")
analyze.annotate_metadata(ax, summit)
plt.tight_layout()
fig.savefig('plots/frontier-vs-summit.pdf')
fig.savefig('results/frontier/event-rate.png')
plt.close()

# Occupancy

In [31]:
BYTES_PER_REG = 4 # 32-bit registers

def load_kernels(results, problem, geo):
    """Return the ``['system']['kernels']`` entry of a GPU run's output.

    Loads result 0 of the ``(problem, geo, 'gpu')`` instance through
    ``results.load_results``.
    """
    return results.load_results((problem, geo, 'gpu'), 0)['system']['kernels']

def kernel_stats_dataframe(kernel_stats):
    """Flatten per-build kernel statistics into a single DataFrame.

    Parameters
    ----------
    kernel_stats : dict
        Maps an ``"arch/geo"`` string to a sequence of per-kernel dicts. Each
        dict must provide ``'name'`` and ``'num_regs'``. The input dicts are
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per kernel, indexed by ``(arch, geo, name)``. Columns are the
        kernel statistics without ``'name'``, ``'stack_size'`` (unavailable
        with HIP) and ``'print_buffer_size'``, plus ``'kernel_index'`` (the
        kernel's position in its list) and ``'register_mem'``
        (``num_regs * BYTES_PER_REG``). Values are matched to columns by key,
        so the dicts do not need to share the same key order.

    Raises
    ------
    ValueError
        If ``kernel_stats`` contains no kernels at all.
    """
    # Keys that never become columns of the result
    dropped = ('name', 'stack_size', 'print_buffer_size')

    records = []
    index = []
    for (instance, kernels) in kernel_stats.items():
        arch, _, geo = instance.partition('/')
        for (ki, stats) in enumerate(kernels):
            # Copy instead of popping so the caller's dicts stay intact
            record = {k: v for (k, v) in stats.items() if k not in dropped}
            record['kernel_index'] = ki
            records.append(record)
            name = stats['name']
            if name == "extend-from-secondaries":
                # Fixup duplicate name
                name = f"{name}-{ki}"
            index.append((arch, geo, name))
    if not records:
        raise ValueError("kernel_stats contains no kernels")

    index = pd.MultiIndex.from_tuples(index, names=('arch', 'geo', 'name'))
    # Building from dicts aligns every value with its own key
    result = pd.DataFrame(records, index=index)
    result['register_mem'] = result['num_regs'] * BYTES_PER_REG
    return result

In [32]:
def plot_kernel_mem(ax, ksdf, colors, labels):
    """Draw grouped horizontal bars of per-kernel register and local memory.

    Each kernel name gets one slot on the y axis. Within a slot there is one
    bar per ``(arch, geo)`` build: the register memory, with the local
    memory stacked to its right. Kernels missing from a build are drawn with
    zero length.

    Parameters
    ----------
    ax : matplotlib axes
        Target axes.
    ksdf : pandas.DataFrame
        Indexed by ``(arch, geo, name)`` with ``'register_mem'`` and
        ``'local_mem'`` columns. Kernel names must be unique within a build.
    colors : dict
        Bar colors keyed by ``"arch/geo"`` (register part) and
        ``"arch/geo.spill"`` (local part).
    labels : dict
        Legend label for each ``"arch/geo"`` key.

    Raises
    ------
    ValueError
        If ``ksdf`` has no rows.
    """
    arch_level = ksdf.index.get_level_values('arch')
    geo_level = ksdf.index.get_level_values('geo')
    # Builds and kernel names in first-seen order
    builds = list(dict.fromkeys(zip(arch_level, geo_level)))
    names = list(dict.fromkeys(ksdf.index.get_level_values('name')))
    if not builds:
        raise ValueError("ksdf contains no kernels to plot")

    y = np.arange(len(names))
    width = .9 / len(builds)
    ynudge = np.linspace(-0.34, 0.34, len(builds))

    for (i, (arch, geo)) in enumerate(builds):
        k = f"{arch}/{geo}"
        in_build = (arch_level == arch) & (geo_level == geo)
        # Rows of this build, aligned to the common list of kernel names
        mem = ksdf.loc[in_build].droplevel(['arch', 'geo']).reindex(names)
        register = mem['register_mem'].fillna(0).to_numpy()
        local = mem['local_mem'].fillna(0).to_numpy()

        ax.barh(y + ynudge[i], register, width,
                color=colors[k], label=f'{labels[k]}')
        # Local memory is stacked after the register bar and left out of the legend
        ax.barh(y + ynudge[i], local, width, left=register,
                color=colors[k + '.spill'])

    ax.invert_yaxis()
    ax.set_xlabel('Memory [B]')
    ax.set_yticks(y, names)
    leg = ax.legend()
    leg.set_title("Register usage (light)\nLocal spill (dark)")
    leg.get_title().set_fontsize('x-small')

In [33]:
# Kernel statistics of the same problem for each backend/geometry build
_occupancy_problem = 'testem3-flat+field+msc'
kernel_stats = {
    'cuda/vecgeom': load_kernels(summit, _occupancy_problem, 'vecgeom'),
    'cuda/orange': load_kernels(summit, _occupancy_problem, 'orange'),
    'hip/orange': load_kernels(frontier, _occupancy_problem, 'orange'),
}

# Legend text per build
pretty_labels = {
    'cuda/vecgeom': 'NVIDIA V100 (VecGeom)',
    'cuda/orange': 'NVIDIA V100 (ORANGE)',
    'hip/orange': 'AMD MI250 (ORANGE)',
}

# 8-bit RGB triplets; the '.spill' entries are the darker companion colors
_rgb255 = {
    'cuda/vecgeom': (191, 40, 96),
    'cuda/vecgeom.spill': (107, 76, 88),
    'cuda/orange': (153, 168, 50),
    'cuda/orange.spill': (106, 112, 67),
    'hip/orange': (57, 140, 173),
    'hip/orange.spill': (78, 101, 110),
}
# Rescale to the 0-1 floats matplotlib expects
colors = {_key: np.asarray(_rgb, dtype=float) / 255
          for (_key, _rgb) in _rgb255.items()}

In [34]:
# Flatten the per-build kernel statistics into one DataFrame indexed by (arch, geo, name)
ksdf = kernel_stats_dataframe(kernel_stats)

In [35]:
with open("kernel-occupancy.md", "w") as f:
    _header = list(ksdf.index.names) + ['local', 'register', 'occupancy']
    # Index levels as rows: one row each for arch, geo and name
    _key_rows = np.array([list(v) for v in ksdf.index]).T
    # Formatted value rows, in the same order as the last three header entries
    _value_rows = [
        ksdf['local_mem'].apply("{:d}".format),
        ksdf['register_mem'].apply("{:d}".format),
        ksdf['occupancy'].apply("{:.03f}".format),
    ]
    # Stack the six rows, then transpose to one table row per kernel
    _table = np.concatenate([_key_rows, _value_rows], axis=0).T
    analyze.dump_markdown(f, _header, _table, alignment="<<<>>>")

In [36]:
# Legend text and marker shape per build ("arch/geo")
labels = {
    'cuda/vecgeom': 'NVIDIA V100 (VecGeom)',
    'cuda/orange': 'NVIDIA V100 (ORANGE)',
    'hip/orange': 'AMD MI250 (ORANGE)',
}
markers = {
    'cuda/vecgeom': '.',
    'cuda/orange': '+',
    'hip/orange': 'x',
}

(fig, ax) = plt.subplots()
_per_build = ksdf.unstack('name')
for (key, ks) in _per_build.iterrows():
    k = '/'.join(key)
    # Back to one row per kernel name, one column per statistic
    ks = ks.unstack(level=0)
    s = ax.scatter(ks['register_mem'], ks['local_mem'],
                   c=ks['kernel_index'],
                   marker=markers[k], label=labels[k])
ax.set_xlabel('Register usage [B]')
ax.set_ylabel('Memory spill [B]')
ax.legend()
# The colorbar is attached to the last scatter drawn
cb = fig.colorbar(s)

plt.tight_layout()
fig.savefig('plots/reg-vs-spill.pdf', transparent=True)
plt.close()

In [37]:
(fig, ax) = plt.subplots()
_per_build = ksdf.unstack('name')
for (key, ks) in _per_build.iterrows():
    k = '/'.join(key)
    # Back to one row per kernel name, one column per statistic
    ks = ks.unstack(level=0)
    # Registers plus local memory, in bytes
    tot_mem = ks['register_mem'] + ks['local_mem']
    s = ax.scatter(ks['occupancy'], tot_mem,
                   c=ks['kernel_index'],
                   marker=markers[k], label=labels[k])
ax.set_xlabel('Occupancy')
ax.set_ylabel('Register + spill [B]')
ax.legend()
# The colorbar is attached to the last scatter drawn
cb = fig.colorbar(s)

plt.tight_layout()
fig.savefig('plots/occupancy-vs-mem.pdf', transparent=True)
plt.close()

In [38]:
(fig, ax) = plt.subplots()
_per_build = ksdf.unstack('name')
for (key, ks) in _per_build.iterrows():
    k = '/'.join(key)
    # Back to one row per kernel name, one column per statistic
    ks = ks.unstack(level=0)
    s = ax.scatter(ks['occupancy'], ks['local_mem'],
                   c=ks['kernel_index'],
                   marker=markers[k], label=labels[k])
ax.set_xlabel('Occupancy')
ax.set_ylabel('Local memory spill [B]')
ax.legend()
# The colorbar is attached to the last scatter drawn
cb = fig.colorbar(s)

plt.tight_layout()
fig.savefig('plots/occupancy-vs-spill.pdf', transparent=True)
plt.close()

In [39]:
# Show the kernels whose local memory exceeds 64 bytes
_spills_heavily = ksdf['local_mem'] > 64
ksdf.loc[_spills_heavily]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,const_mem,heap_size,local_mem,max_blocks_per_cu,max_threads_per_block,max_warps_per_eu,num_regs,occupancy,threads_per_block,kernel_index,register_mem
arch,geo,name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
cuda,vecgeom,scat-rayleigh,20776,8388608,104,4,256,32,64,0.5,256,17,256
cuda,orange,initialize-tracks,0,8388608,112,4,256,32,63,0.5,256,1,252
cuda,orange,along-step-uniform-msc-propagate,0,8388608,224,2,256,16,128,0.25,256,4,512
cuda,orange,scat-rayleigh,0,8388608,104,4,256,32,63,0.5,256,17,252
cuda,orange,geo-boundary,0,8388608,176,4,256,32,64,0.5,256,19,256
hip,orange,initialize-tracks,0,68702699520,120,5,256,5,90,0.625,256,1,360
hip,orange,geo-boundary,0,68702699520,304,5,256,5,92,0.625,256,19,368
