In [1]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sys
import itertools
from collections import namedtuple, defaultdict
from pathlib import Path

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

%load_ext autoreload
%autoreload 1
%aimport analyze

with open('plots/style.json') as f:
    mpl.rcParams.update(json.load(f))

# Results

## Wildstyle failures

In [2]:
ws = analyze.Analysis('results/wildstyle')
print(ws)

Analysis for Celeritas v0.4.0-rc.2 on wildstyle


In [3]:
ftab = analyze.make_failure_table(ws.failures())
ftab.to_frame()

Unnamed: 0,Failure
cms2018+field+msc/vecgeom+cpu (0),internal assertion failed: `track.make_geo_vie...


In [5]:
ftab.to_dict()

{'cms2018+field+msc/vecgeom+cpu (0)': 'internal assertion failed: `track.make_geo_view().pos() != orig_pos` at `PropagationApplier.hh:116`'}

Unconverged tracks (from runs that didn't fail):

In [4]:
# Summarize the 'unconverged' column of the wildstyle results, take the
# ('unconverged', 'mean') statistic, and show only entries with a nonzero
# mean, pivoting the 'arch' index level into columns.
summed = analyze.summarize_instances(ws.result[['unconverged']])
unconv = summed[('unconverged', 'mean')]
unconv.loc[unconv.gt(0)].unstack('arch')

Unnamed: 0_level_0,arch
problem,geo


## Summit

In [2]:
summit = analyze.Analysis('results/summit')
print(summit)

Analysis for Celeritas v0.4.0-rc.2.17+fa123c7fe on summit


Average number of unconverged tracks:

In [8]:
# 'mean' statistic of the summarized 'unconverged' column for Summit;
# only nonzero entries are displayed, with one column per 'arch' value.
unconv = analyze.summarize_instances(
    summit.result['unconverged']
)['mean']
unconv.loc[unconv.gt(0)].unstack('arch')

Unnamed: 0_level_0,arch
problem,geo


## Crusher

In [9]:
def get_step_times(results):
    """Collect step-time entries 0 and 1 of instance 0 for each result entry.

    For every key in ``results.index`` this loads instance 0 through
    ``results.load_results`` and evaluates ``analyze.StepTimeGetter`` on
    stream 0. Entries whose output cannot be loaded or processed are
    skipped (best effort), so the returned frame may have fewer rows than
    ``results.index``.

    Returns:
        A ``DataFrame`` with one row per successfully processed entry,
        indexed by a ``MultiIndex`` built from the kept keys, with columns
        0 and 1 holding ``step_times[0]`` and ``step_times[1]``. If no
        entry could be processed, an empty frame with those two columns is
        returned.
    """
    num_steps = 2
    kept_idx = []
    times = []
    for idx in results.index:
        try:
            outp = results.load_results(idx, 0)
            step_times = analyze.StepTimeGetter(outp, stream=0)()
        except Exception:
            # Failed or missing runs are expected; leave them out of the table
            continue
        times.append([step_times[step] for step in range(num_steps)])
        kept_idx.append(idx)
    if not kept_idx:
        # MultiIndex.from_tuples cannot infer the number of levels from an
        # empty list and would raise TypeError
        return pd.DataFrame(columns=range(num_steps))
    return pd.DataFrame(times, index=pd.MultiIndex.from_tuples(kept_idx))

In [20]:
crusher = analyze.Analysis('results/crusher')
print(crusher)

Analysis for Celeritas v0.4.0-rc.2.4+affc7bb17 on crusher


In [21]:
analyze.get_device_properties(crusher)

{'can_map_host_memory': True,
 'capability_major': 9,
 'capability_minor': 0,
 'clock_rate': 1700000,
 'device_id': 0,
 'eu_per_cu': 4,
 'max_blocks_per_grid': 2147483647,
 'max_cache_size': 8388608,
 'max_threads_per_block': 1024,
 'max_threads_per_cu': 2048,
 'memory_clock_rate': 1600000,
 'multiprocessor_count': 110,
 'name': 'gfx90a',
 'platform': 'hip',
 'regs_per_block': 65536,
 'shared_mem_per_block': 65536,
 'threads_per_warp': 64,
 'total_const_mem': 2147483647,
 'total_global_mem': 68702699520}

## Frontier

In [3]:
frontier = analyze.Analysis('results/frontier')
print(frontier)

Analysis for Celeritas v0.4.0-rc.2.4+affc7bb17 on frontier


In [12]:
# Event rates on Frontier, and on Summit restricted to the entries that are
# also present in the Frontier table so the two can be compared row by row.
frontier_rates = analyze.calc_event_rate(frontier)
summit_rates = analyze.calc_event_rate(
    summit,
    summit.summed.loc[frontier_rates.index],
)

# (system, arch) -> count; both systems use the same per-arch values.
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
counts = {
    (system, arch): count
    for system in ('summit', 'frontier')
    for (arch, count) in (('cpu', 7), ('gpu', 1))
}

In [13]:
(frontier_rates['mean'] / summit_rates['mean']).unstack()

Unnamed: 0_level_0,arch,cpu,gpu,gpu+sync
problem,geo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simple-cms+field,orange,1.85909,0.76907,
simple-cms+field+msc,orange,1.94467,0.820616,
simple-cms+msc,orange,1.870205,0.853647,
testem15,orange,1.755824,0.799741,
testem15+field,orange,1.866668,0.875827,0.878627
testem15+field+msc,orange,1.903381,0.911272,
testem3-flat,orange,1.767941,0.725712,
testem3-flat+field,orange,1.860801,0.739295,
testem3-flat+field+msc,orange,1.91923,0.719573,0.723578
testem3-flat+msc,orange,1.842259,0.809204,


## Perlmutter

In [4]:
perlmutter = analyze.Analysis('results/perlmutter')
print(perlmutter)

Analysis for Celeritas v0.4.0 on perlmutter


In [3]:
# Perlmutter results for the 'testem3-flat+field+msc' problem with the
# 'orange' geometry
testem3 = perlmutter.result.xs('testem3-flat+field+msc', level='problem').xs('orange', level='geo')

# Mean of 'avg_event_per_time' for each architecture. The series keeps the
# name 'mean' (as describe().loc['mean'] produced), and the 'gpu+sync'
# variant is dropped without failing when it is not present in the results.
event_per_sec = (
    testem3['avg_event_per_time']
    .unstack('arch')
    .mean()
    .rename('mean')
    .drop('gpu+sync', errors='ignore')
)

In [4]:
event_per_sec

arch
cpu    0.090867
gpu    1.619334
Name: mean, dtype: float64

In [10]:
# Hardware assumptions per architecture. They are keyed by label and then
# reindexed onto event_per_sec, so each value stays attached to the right
# architecture regardless of the order of event_per_sec's index.
cores_per_job = pd.Series({'cpu': 16, 'gpu': 1}).reindex(event_per_sec.index)
watt_per_card = pd.Series({'cpu': 280, 'gpu': 250}).reindex(event_per_sec.index)
cores_per_card = pd.Series({'cpu': 64, 'gpu': 1}).reindex(event_per_sec.index) # AMD hardware cores, *NOT* threads

# Power attributed to one job: the card's power split evenly over its cores,
# times the cores a job occupies
watt_per_core = watt_per_card / cores_per_card
watt_per_job = watt_per_core * cores_per_job

# (events / s) / (J / s) -> events / J
event_per_joule = event_per_sec / watt_per_job

In [11]:
event_per_joule

arch
cpu    0.001298
gpu    0.006477
dtype: float64

In [12]:
event_per_joule['gpu'] / event_per_joule['cpu']

4.989850269771324

In [13]:
event_per_wh = event_per_joule * 60 * 60
event_per_wh

arch
cpu     4.673169
gpu    23.318415
dtype: float64

# Occupancy

In [26]:
# Kernel statistics for one problem, loaded per platform/geometry combination
# and merged into a single table.
problem = "testem3-flat"
kernel_stats = {
    label: analyze.load_kernels(analysis, problem, geo)
    for (label, analysis, geo) in [
        ("cuda/vecgeom", summit, "vecgeom"),
        ("cuda/orange", summit, "orange"),
        ("hip/orange", frontier, "orange"),
    ]
}
ksdf = analyze.kernel_stats_dataframe(kernel_stats)

In [27]:
ksdf[ksdf['local_mem'] > 64]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,const_mem,heap_size,local_mem,max_blocks_per_cu,max_threads_per_block,max_warps_per_eu,num_regs,occupancy,threads_per_block,kernel_index,register_mem
arch,geo,name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
cuda,vecgeom,scat-rayleigh,20776,8388608,104,4,256,32,64,0.5,256,15,256
cuda,orange,initialize-tracks,0,8388608,160,5,256,40,48,0.625,256,1,192
cuda,orange,photoel-livermore,0,8388608,80,3,256,24,68,0.375,256,12,272
cuda,orange,scat-rayleigh,0,8388608,104,4,256,32,63,0.5,256,15,252
cuda,orange,geo-boundary,0,8388608,176,4,256,32,64,0.5,256,17,256
hip,orange,initialize-tracks,0,68702699520,120,5,256,5,86,0.625,256,1,344
hip,orange,geo-boundary,0,68702699520,128,5,256,5,90,0.625,256,17,360


# Performance per core

In [None]:
# Per-task CPU power, written as a hardware power figure divided by a
# sharing factor.
# NOTE(review): presumably watts (TDP) divided by the number of tasks sharing
# the CPU(s) — confirm against the machine documentation.
# NOTE(review): 64 cores at 280 W (the values used for Perlmutter in the
# energy cell above) matches the EPYC 7763 rather than the 7453 — confirm the
# model named in the comment below.
cpu_power_per_task = dict(
    summit=2 * 190 / 6,
    frontier=225 / 8,  # 64-core AMD “Optimized 3rd Gen EPYC”
    perlmutter=280 / 4,  # AMD EPYC 7453
)

# Per-task GPU power
gpu_power_per_task = dict(
    summit=250,  # V100
    frontier=500 / 2,  # MI250x
    perlmutter=250,  # A100
)

# CPU cores used by each task
cpu_per_task = dict(
    summit=7,  # 44 total, 2 reserved for system
    frontier=7,  # 64 total, 8 reserved
    perlmutter=16,
)

In [110]:
frontier_rates = analyze.calc_event_rate(frontier)

def calc_events_per_task_sec(analysis, idx):
    """Return ``analyze.inverse_summary`` of the 'avg_event_per_time' summary.

    Only the rows ``idx`` of ``analysis.summed`` are used.
    NOTE(review): not called by any visible cell — confirm it is still needed.
    """
    summary = analysis.summed.loc[idx]
    return analyze.inverse_summary(summary['avg_event_per_time'])

def calc_event_rate_with_index(analysis, idx):
    """Event rates for ``analysis`` restricted to the summary rows ``idx``.

    Defined here so the cell runs on a fresh kernel; it wraps the same call
    used for the Summit/Frontier comparison earlier in the notebook.
    """
    return analyze.calc_event_rate(analysis, analysis.summed.loc[idx])

def get_sm_count(analysis):
    """Return the 'multiprocessor_count' entry of the device properties."""
    return analyze.get_device_properties(analysis)["multiprocessor_count"]

# Every system is evaluated on the entries available on Frontier so the
# per-system tables share one index.
rates = {
    "frontier": frontier_rates,
    "summit": calc_event_rate_with_index(summit, frontier_rates.index),
    "perlmutter": calc_event_rate_with_index(perlmutter, frontier_rates.index),
}

analyses = {
    "summit": summit,
    "perlmutter": perlmutter,
    "frontier": frontier,
}

sm_per_gpu = {k: get_sm_count(v) for k, v in analyses.items()}

sm_per_gpu

In [113]:
# Hex color used for each system's markers in the comparison plots
system_color = dict(
    summit="#7A954F",
    frontier="#BC5544",
    perlmutter="#3E92C7",
)

In [114]:
for k in rates:
    print(k, rates[k].loc[('testem3-flat', 'orange', 'cpu'), 'mean'])

frontier 0.11578683131384723
summit 0.06562016814217445
perlmutter 0.16765151573524067


In [115]:

# Event rate normalized by the compute units a task uses: CPU cores per task
# for the 'cpu' rows, multiprocessors (SMs/CUs) per GPU for the 'gpu' rows.
(fig, ax) = plt.subplots(layout="constrained")
for k in analyses:
    r = rates[k]
    for arch in ['cpu', 'gpu']:
        # events per task-sec -> events per (core or SM)-sec
        v = r[r.index.get_level_values("arch") == arch] / (
            cpu_per_task if arch == "cpu" else sm_per_gpu)[k]
        print(k, arch, ":",
              "per core =", v.loc[('testem3-flat', 'orange', arch), 'mean'])
        scat = analyses[k].plot_results(ax, v)
        for s in scat:
            s.set_color(system_color[k])
            s.set_label(f"{k.title()} ({arch.upper()})")
ax.legend()
ax.set_xlabel("Problem")
ax.set_ylabel("Event rate per core/SM")
grid = ax.grid(which='both')
fig.savefig('plots/event-per-core.pdf', transparent=True)
# Close this figure explicitly instead of whichever one is current
plt.close(fig)

summit cpu : per core = 0.00937430973459635
summit gpu : per core = 0.026576441084817742
perlmutter cpu : per core = 0.010478219733452542
perlmutter gpu : per core = 0.03428823887517954
frontier cpu : per core = 0.016540975901978176
frontier gpu : per core = 0.01533342854184768


In [118]:
# Energy efficiency: convert each system's per-task event rate into events
# per watt-hour using the per-task power tables, plotted on a log scale.
joule_per_wh = 3600  # 1 W·h = 3600 J

(fig, ax) = plt.subplots(subplot_kw=dict(yscale="log"),
        layout="constrained")
for k in analyses:
    r = rates[k]
    for arch in ['cpu', 'gpu']:
        v = r[r.index.get_level_values("arch") == arch] # events/(task * s)
        power = (cpu_power_per_task if arch == 'cpu' else gpu_power_per_task)[k] # J / s
        per_wh = v / (power / joule_per_wh) # (events/s) / (W·h/s) ==> events / W·h
        print(k, arch, ":",
              "throughput =", v.loc[('testem3-flat', 'orange', arch), 'mean'],
              "power usage = ", power,
              ", per W-h =", per_wh.loc[('testem3-flat', 'orange', arch), 'mean'])
        scat = analyses[k].plot_results(ax, per_wh)
        for s in scat:
            s.set_color(system_color[k])
            s.set_label(f"{k.title()} ({arch.upper()})")

ax.legend()
ax.set_xlabel("Problem")
ax.set_ylabel("Events per W·h")
grid = ax.grid(which='both')
fig.savefig('plots/event-per-energy.pdf', transparent=True)
#fig.savefig('plots/event-per-energy.png', transparent=False, dpi=150)
# Close this figure explicitly instead of whichever one is current
plt.close(fig)

summit cpu : throughput = 0.06562016814217445 power usage =  63.333333333333336 , per W-h = 3.7299885049236
summit gpu : throughput = 2.1261152867854194 power usage =  250 , per W-h = 30.61606012971004
perlmutter cpu : throughput = 0.16765151573524067 power usage =  70.0 , per W-h = 8.622077952098092
perlmutter gpu : throughput = 3.70312979851939 power usage =  250 , per W-h = 53.32506909867921
frontier cpu : throughput = 0.11578683131384723 power usage =  28.125 , per W-h = 14.820714408172446
frontier gpu : throughput = 1.6866771396032447 power usage =  250.0 , per W-h = 24.288150810286723
