In [1]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sys
import itertools
from collections import namedtuple, defaultdict
from pathlib import Path

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

%load_ext autoreload
%autoreload 1
%aimport analyze

# Apply the project-wide plot style: the JSON file is parsed and its contents
# are merged into matplotlib's global rcParams.
with open('plots/style.json') as f:
    mpl.rcParams.update(json.load(f))

# Results

## Wildstyle failures

In [2]:
# Build the analysis from the results stored under results/wildstyle and
# print its one-line summary.
ws = analyze.Analysis('results/wildstyle')
print(ws)

Analysis for Celeritas v0.4.0-rc.2 on wildstyle


In [3]:
# Build the failure table from the failures reported by the wildstyle
# analysis, then display it as a frame.
ftab = analyze.make_failure_table(ws.failures())
ftab.to_frame()

Unnamed: 0,Failure
cms2018+field+msc/vecgeom+cpu (0),internal assertion failed: `track.make_geo_vie...


In [4]:
# Show the same table as a plain dict so the failure messages are printed
# in full instead of being truncated by the frame display.
ftab.to_dict()

{'cms2018+field+msc/vecgeom+cpu (0)': 'internal assertion failed: `track.make_geo_view().pos() != orig_pos` at `PropagationApplier.hh:116`'}

Unconverged tracks (from runs that didn't fail):

In [5]:
# Summarize the 'unconverged' column over instances, keep the mean, and show
# only the entries with a nonzero mean, pivoting the 'arch' index level into
# columns.
summed = analyze.summarize_instances(ws.result[['unconverged']])
unconv = summed[('unconverged', 'mean')]
unconv[unconv > 0].unstack('arch') 

Unnamed: 0_level_0,arch
problem,geo


## Summit

In [6]:
# Build the analysis from the results stored under results/summit and print
# its one-line summary.
summit = analyze.Analysis('results/summit')
print(summit)

Analysis for Celeritas v0.4.0 on summit


Average number of unconverged tracks:

In [7]:
# Mean of the 'unconverged' column over instances; only nonzero entries are
# displayed, with the 'arch' index level pivoted into columns.
# NOTE: this rebinds `unconv`, which an earlier cell assigned for wildstyle.
unconv = analyze.summarize_instances(summit.result['unconverged'])['mean']
unconv[unconv > 0].unstack('arch') 

Unnamed: 0_level_0,arch
problem,geo


## Crusher

In [8]:
def get_step_times(results):
    """Collect the first two step-time entries of instance 0 for each result.

    For every entry of ``results.index`` the output of instance 0 is loaded
    and handed to ``analyze.StepTimeGetter`` (stream 0). Entries whose output
    cannot be loaded or processed are skipped.

    Parameters
    ----------
    results
        Object exposing ``index`` (an iterable of index tuples) and
        ``load_results(index, instance)``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed entry, indexed by a MultiIndex
        built from the kept index tuples, with columns 0 and 1 holding
        ``step_times[0]`` and ``step_times[1]``. If no entry could be
        processed, an empty frame with those two columns is returned.
    """
    num_entries = 2
    kept_idx = []
    times = []
    for idx in results.index:
        try:
            outp = results.load_results(idx, 0)
            step_times = analyze.StepTimeGetter(outp, stream=0)()
        except Exception:
            # Deliberate best effort: a run without usable output is simply
            # left out of the table rather than aborting the whole scan.
            continue
        times.append([step_times[entry] for entry in range(num_entries)])
        kept_idx.append(idx)
    if not kept_idx:
        # MultiIndex.from_tuples cannot infer the number of levels from an
        # empty list and raises TypeError, so build the empty frame directly.
        return pd.DataFrame(columns=range(num_entries))
    return pd.DataFrame(times, index=pd.MultiIndex.from_tuples(kept_idx))

In [9]:
# Build the analysis from the results stored under results/crusher and print
# its one-line summary.
crusher = analyze.Analysis('results/crusher')
print(crusher)

Analysis for Celeritas v0.4.0-10+1782ddbdb on crusher


In [10]:
# Display the device properties associated with the Crusher analysis.
analyze.get_device_properties(crusher)

{'can_map_host_memory': True,
 'capability_major': 9,
 'capability_minor': 0,
 'clock_rate': 1700000,
 'device_id': 0,
 'eu_per_cu': 4,
 'max_blocks_per_grid': 2147483647,
 'max_cache_size': 8388608,
 'max_threads_per_block': 1024,
 'max_threads_per_cu': 2048,
 'memory_clock_rate': 1600000,
 'multiprocessor_count': 110,
 'name': 'gfx90a',
 'platform': 'hip',
 'regs_per_block': 65536,
 'shared_mem_per_block': 65536,
 'threads_per_warp': 64,
 'total_const_mem': 2147483647,
 'total_global_mem': 68702699520}

## Frontier

In [11]:
# Build the analysis from the results stored under results/frontier and
# print its one-line summary.
frontier = analyze.Analysis('results/frontier')
print(frontier)

Analysis for Celeritas v0.4.0 on frontier


In [12]:
# Failure table for Frontier, displayed as a frame.
# NOTE: this rebinds `ftab`, which an earlier cell assigned for wildstyle.
ftab = analyze.make_failure_table(frontier.failures())
ftab.to_frame()

Unnamed: 0,Failure


## Perlmutter

In [13]:
# Build the analysis from the results stored under results/perlmutter and
# print its one-line summary.
perlmutter = analyze.Analysis('results/perlmutter')
print(perlmutter)

Analysis for Celeritas v0.4.1 on perlmutter


In [14]:
# Failure table for Perlmutter, displayed as a frame.
# NOTE: this rebinds `ftab`, which earlier cells assigned for other systems.
ftab = analyze.make_failure_table(perlmutter.failures())
ftab.to_frame()

Unnamed: 0,Failure
cms2018+field+msc/vecgeom+gpu+g4 (0),Geant4 error: `trivial_device_copy D->H failed...
cms2018+field+msc/vecgeom+gpu+g4 (1),Geant4 error: `trivial_device_copy D->H failed...
cms2018+field+msc/vecgeom+gpu+g4 (2),Geant4 error: `an illegal memory access was en...
cms2018+field+msc/vecgeom+gpu+g4 (3),Geant4 error: `trivial_device_copy D->H failed...
cms2018/vecgeom+cpu (0),runtime error: `insufficient capacity (262144)...
cms2018/vecgeom+cpu (2),runtime error: `insufficient capacity (262144)...
cms2018/vecgeom+cpu (3),runtime error: `insufficient capacity (262144)...
cms2018/vecgeom+gpu+g4 (0),Geant4 error: `trivial_device_copy D->H failed...
cms2018/vecgeom+gpu+g4 (1),Geant4 error: `trivial_device_copy D->H failed...
cms2018/vecgeom+gpu+g4 (2),Geant4 error: `trivial_device_copy D->H failed...


# Occupancy

In [23]:
# Kernel statistics for a single problem, keyed by a "platform/geometry"
# label; each row names the analysis and geometry the kernels are loaded from.
problem = "testem3-flat"
kernel_sources = [
    ("cuda/vecgeom", perlmutter, "vecgeom"),
    ("cuda/orange", perlmutter, "orange"),
    ("hip/orange", frontier, "orange"),
]
kernel_stats = {
    label: analyze.load_kernels(analysis, problem, geo)
    for (label, analysis, geo) in kernel_sources
}
ksdf = analyze.kernel_stats_dataframe(kernel_stats)

In [24]:
# Show only the kernels whose 'local_mem' value exceeds 64.
ksdf.loc[ksdf['local_mem'] > 64]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,const_mem,heap_size,local_mem,max_blocks_per_cu,max_threads_per_block,max_warps_per_eu,num_regs,occupancy,threads_per_block,kernel_index,register_mem
arch,geo,name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
cuda,orange,initialize-tracks,0,8388608,144,5,256,40,48,0.625,256,1,192
cuda,orange,photoel-livermore,0,8388608,80,3,256,24,67,0.375,256,12,268
cuda,orange,geo-boundary,0,8388608,176,4,256,32,64,0.5,256,17,256
hip,orange,initialize-tracks,0,68702699520,120,5,256,5,86,0.625,256,1,344
hip,orange,geo-boundary,0,68702699520,128,5,256,5,90,0.625,256,17,360


# Performance per core

In [25]:
def get_sm_count(analysis):
    """Return the ``multiprocessor_count`` device property of an analysis."""
    device_props = analyze.get_device_properties(analysis)
    return device_props["multiprocessor_count"]

# The Perlmutter analysis is the reference passed to every rate calculation.
plot_like = perlmutter

# Event rate table per system.
# NOTE(review): calc_events_per_task_sec is not defined in the visible cells;
# confirm it is defined before this cell on a fresh kernel.
rates = {
    name: calc_events_per_task_sec(analysis, plot_like)
    for (name, analysis) in [
        ("frontier", frontier),
        ("summit", summit),
        ("perlmutter", perlmutter),
    ]
}

# Systems in the order they are iterated by the plotting cells.
analyses = dict(
    summit=summit,
    perlmutter=perlmutter,
    frontier=frontier,
)

# Multiprocessor count of each system's GPU, in the same key order.
sm_per_gpu = dict(zip(analyses, map(get_sm_count, analyses.values())))

sm_per_gpu

{'summit': 80, 'perlmutter': 108, 'frontier': 110}

In [26]:
# Hex color assigned to each system, looked up by system name.
system_color = dict(
    summit="#7A954F",
    frontier="#BC5544",
    perlmutter="#3E92C7",
)

In [27]:
# Print, per system, the mean rate of testem3-flat with the orange geometry
# on CPU followed by the same on GPU.
cpu_key = ('testem3-flat', 'orange', 'cpu')
gpu_key = ('testem3-flat', 'orange', 'gpu')
for k in rates:
    rate = rates[k]
    print(k, rate.loc[cpu_key, 'mean'], rate.loc[gpu_key, 'mean'])

frontier 0.1499143392240646 1.5860126730382798
summit 0.08835991612886411 2.1261953110268186
perlmutter 0.399988593226848 3.6968187008418067


In [28]:
# Throughput normalized per CPU core (cpu, g4) or per GPU multiprocessor
# (gpu), drawn for every system on one shared axis and saved to disk.
fig, ax = plt.subplots(layout="constrained")
for k in analyses:
    print(k)
    r = rates[k]
    for arch in ('cpu', 'gpu', 'g4'):
        # Rows of this system's rate table for the current architecture
        # (events per task-sec); copied so the division below stays local.
        arch_mask = r.index.get_level_values("arch") == arch
        v = r[arch_mask].copy()
        # GPU rows are divided by the multiprocessor count, all others by
        # the CPUs per task (-> events/cpu-sec).
        divisor = sm_per_gpu[k] if arch == "gpu" else analyze.CPU_PER_TASK[k]
        v /= divisor
        scat = plot_like.plot_results(ax, v)
        label = f"{k.title()} ({arch.upper()})"
        for s in scat:
            s.set_color(system_color[k])
            s.set_label(label)
ax.legend()
ax.set_xlabel("Problem")
ax.set_ylabel("Throughput per core/SM [event/s]")
grid = ax.grid(which='both')
fig.savefig('plots/event-per-core.png')
plt.close()

summit
perlmutter
frontier


In [30]:
# Energy efficiency: event rate divided by power draw, for every system and
# architecture, drawn on one shared axis and saved as PDF and PNG.
# NOTE(review): get_where_arch is not defined in the visible cells; confirm
# it is defined before this cell on a fresh kernel, and that it returns a
# copy (the in-place divisions below would otherwise modify `rates`).
JOULE_PER_WH = 3600

(fig, ax) = plt.subplots(layout="constrained")
for k in analyses:
    r = rates[k]
    for arch in ['cpu', 'gpu', 'g4']:
        v = get_where_arch(r, arch) # events/(task * s)
        power = get_where_arch(analyses[k].power, arch) / JOULE_PER_WH # W-h/sec
        # Convert the rate to events per W-h. The uncertainty must be scaled
        # by the same factor as the mean; assigning `power` to 'std' would
        # plot the power value as the error bar.
        v.loc[:, 'mean'] /= power
        v.loc[:, 'std'] /= power
        scat = plot_like.plot_results(ax, v)
        for s in scat:
            s.set_color(system_color[k])
            s.set_label(f"{k.title()} ({arch.upper()})")

ax.legend()
ax.set_xlabel("Problem")
ax.set_ylabel("Efficiency [event/W-h]")
grid = ax.grid(which='both')
fig.savefig('plots/event-per-energy.pdf', transparent=True)
fig.savefig('plots/event-per-energy.png', transparent=False, dpi=150)
plt.close()

