# Bmad-X Numpy, Numba, Pytorch tests


Note: on Cori, the CUDA toolkit has to be installed with `conda install cudatoolkit=11.0`.

In [1]:
from bmadx.track import make_track_a_drift, Particle, Drift

import numpy as np

import math

import matplotlib.pyplot as plt

In [2]:
# Physical constants
c_light = 299_792_458.0  # speed of light [m/s]
m_e = 510_998.950  # electron mass (rest energy) [eV]

In [3]:
# Scalar reference quantities passed to every Particle built below
s = 0.0  # initial s
p0c = 4.0e7  # reference particle momentum [eV]
mc2 = 1 * m_e  # particle mass [eV]: one electron mass
#pvec1 = [2e-3,3e-3,-3e-3,-1e-3,2e-3,-2e-3] 

## Numpy

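Create 10 million test particles whose six phase-space coordinates are drawn from a normal distribution with standard deviation 0.001.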

In [4]:
# Bunch size, plus a fixed seed so that every backend below is fed the same input
N_PARTICLE = 10_000_000
np.random.seed(999)

# One row per particle, six coordinates per row
pvec0 = np.random.normal(loc=0.0, scale=1e-3, size=(N_PARTICLE, 6))

# Sanity check: the spread of the first coordinate should be close to 1e-3
pvec0[:, 0].std()

0.0010000745670385076

In [5]:
# Bundle the six coordinate columns of pvec0 (in column order) together with
# the scalar reference quantities s, p0c and mc2 into one Particle.
P0 = Particle(*(pvec0[:, k] for k in range(6)), s, p0c, mc2)

# Peek at the first coordinate array
P0.x

array([ 0.00012716,  0.00156627, -0.00111006, ..., -0.00063516,
       -0.00116169,  0.00162879])

In [6]:
# The element to track through: a drift with L = 1
D1 = Drift(L=1)
# Tracking routine built with NumPy supplied as the numerical library
track_a_drift = make_track_a_drift(np)

In [7]:
# Track the whole bunch through the drift with the NumPy tracker
P1 = track_a_drift(P0, D1)

# Spread of x after the drift; the other backends below report the same statistic
P1.x.std()

0.0014142797576549417

In [8]:
%%timeit
# Benchmark: NumPy tracking of the full bunch P0 through the drift D1
P1 = track_a_drift(P0, D1)

297 ms ± 1.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Numba CPU

In [9]:
import numba
from numba import guvectorize, float64, jit
# Show how many threads Numba is configured with (context for the parallel timings below)
numba.config.NUMBA_NUM_THREADS

128

In [10]:
# Rebuild the NumPy tracker (same call as in the NumPy section above)
track_a_drift = make_track_a_drift(np)
# Disabled alternative: compile the tracker itself with numba.njit
#track_a_drift = numba.njit(make_track_a_drift(np))

In [11]:
#%%timeit
#track_a_drift(P0, D1)

In [12]:
# Element handed to the compiled tracker; the kernel reads it as a global.
params = D1
# nopython-compiled tracker that the gufunc kernel calls once per particle.
g = numba.njit(make_track_a_drift(np))


def _track_row(a_in, a_out):
    """Track a single particle through ``params``.

    Parameters
    ----------
    a_in : float64[:]
        Input coordinates; elements 0-5 are read as x, px, y, py, z, pz.
    a_out : float64[:]
        Output buffer; elements 0-5 receive the tracked x, px, y, py, z, pz.

    Notes
    -----
    ``s``, ``p0c``, ``mc2``, ``params`` and ``g`` are taken from the
    notebook globals.
    """
    p_in = Particle(x  = a_in[0],
                    px = a_in[1],
                    y  = a_in[2],
                    py = a_in[3],
                    z  = a_in[4],
                    pz = a_in[5],
                    s=s, p0c=p0c, mc2=mc2)
    p_out = g(p_in, params)
    a_out[0] = p_out.x
    a_out[1] = p_out.px
    a_out[2] = p_out.y
    a_out[3] = p_out.py
    a_out[4] = p_out.z
    a_out[5] = p_out.pz


# Signature and layout shared by both CPU gufuncs: one length-n input row
# mapped onto one length-n output row.
_GU_SIGNATURES = [(float64[:], float64[:])]
_GU_LAYOUT = '(n)->(n)'

# The same kernel compiled for two targets, so the serial and the parallel
# benchmark are guaranteed to run identical code.
vg_numba = guvectorize(_GU_SIGNATURES, _GU_LAYOUT)(_track_row)
vg_numba_parallel = guvectorize(_GU_SIGNATURES, _GU_LAYOUT, target='parallel')(_track_row)


# Output buffer with the same shape/dtype as the input, filled in place.
pvec1 = np.zeros_like(pvec0)
vg_numba_parallel(pvec0, pvec1)

# Check against the NumPy result instead of only eyeballing the spread.
np.testing.assert_allclose(pvec1[:, 0], P1.x)
np.std(pvec1[:,0])

0.0014142797576549417

In [13]:
%%timeit
# Benchmark: gufunc with the default target; results are written into pvec1
vg_numba(pvec0, pvec1)

144 ms ± 212 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%%timeit
# Benchmark: gufunc compiled with target='parallel'; results are written into pvec1
vg_numba_parallel(pvec0, pvec1)

33.8 ms ± 940 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Spread of the first output column after the timed Numba runs
pvec1[:, 0].std()

0.0014142797576549417

## Numba CUDA

In [16]:
from numba import cuda

In [17]:
from numba import cuda, guvectorize, float64

# NOTE(review): minimal no-op gufunc for target='cuda'. The name suggests it
# was kept as a crash reproducer -- confirm whether it is still needed.
@guvectorize([(float64[:], float64[:])], '(n)->(n)', target='cuda')
def crashes(a_in, a_out):
    pass

In [18]:
# Particle method
# Element handed to the device tracker; the kernel reads it as a global.
params = D1
# Device function built on the scalar `math` module instead of NumPy
# (presumably because the device code works on one particle at a time).
g = cuda.jit(make_track_a_drift(math), device=True)

@guvectorize([(float64[:], float64[:])], '(n)->(n)', target='cuda')
def vg_parallel_cuda(a_in, a_out):
    """Track a single particle through ``params`` on the GPU.

    Parameters
    ----------
    a_in : float64[:]
        Input coordinates; elements 0-5 are read as x, px, y, py, z, pz.
    a_out : float64[:]
        Output buffer; elements 0-5 receive the tracked x, px, y, py, z, pz.

    Notes
    -----
    ``s``, ``p0c``, ``mc2``, ``params`` and ``g`` are taken from the
    notebook globals.
    """
    p_in = Particle(x  = a_in[0],
                    px = a_in[1],
                    y  = a_in[2],
                    py = a_in[3],
                    z  = a_in[4],
                    pz = a_in[5],
                    s=s, p0c=p0c, mc2=mc2)
    p_out = g(p_in, params)
    a_out[0] = p_out.x
    a_out[1] = p_out.px
    a_out[2] = p_out.y
    a_out[3] = p_out.py
    a_out[4] = p_out.z
    a_out[5] = p_out.pz

# Fresh, zeroed output buffer. Without it pvec1 still holds the CPU results,
# so the spread check after the CUDA run would pass even if the CUDA kernel
# wrote nothing at all.
pvec1 = np.zeros_like(pvec0)

In [19]:
%%timeit
# Benchmark: CUDA gufunc called with the host arrays pvec0 (input) and pvec1 (output)
vg_parallel_cuda(pvec0, pvec1)

98.5 ms ± 4.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Spread of the first output column after the CUDA runs; compare with the NumPy value
pvec1[:, 0].std()

0.0014142797576549417

## PyTorch

In [21]:
import torch

In [22]:
# Keyword arguments shared by every tensor constructed below (double precision)
tkwargs = dict(dtype=torch.double)

In [23]:
# Build the tracker with torch supplied as the numerical library.
# NOTE: this rebinds `track_a_drift`, which held the NumPy tracker until now.
track_a_drift = make_track_a_drift(torch)

In [24]:
# Particle coordinates as a double-precision tensor that records gradients
tvec0 = torch.tensor(pvec0, requires_grad=True, **tkwargs)

# Scalar reference quantities converted to tensors of the same dtype
ts, tp0c, tmc2 = (torch.tensor(value, **tkwargs) for value in (s, p0c, mc2))

# Six coordinate columns (in column order) followed by the reference quantities
tparticles0 = Particle(*(tvec0[:, k] for k in range(6)), ts, tp0c, tmc2)

In [25]:
# Track the tensor bunch; `params` is the D1 drift assigned in the Numba cells
tparticles1 = track_a_drift(tparticles0, params)

# Spread of x after the drift, for comparison with the NumPy value
torch.std(tparticles1.x)

tensor(0.0014, dtype=torch.float64, grad_fn=<StdBackward0>)

In [26]:
%%timeit
# Benchmark: torch tracking; the inputs were created with requires_grad=True
tparticles1 = track_a_drift(tparticles0, params)

215 ms ± 20.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Compiler time: 0.16 s


# System Information

In [27]:
# Print the NERSC_HOST environment variable to record where the timings were taken
!echo $NERSC_HOST

perlmutter


In [28]:
# Thread count Numba was configured with for the timings above
numba.config.NUMBA_NUM_THREADS

128

In [29]:
# Record the GPUs and driver/CUDA versions visible to this session
!nvidia-smi

Mon Oct 17 15:31:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:03:00.0 Off |                    0 |
| N/A   28C    P0    60W / 400W |   4080MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   26C    P0    50W / 400W |      3MiB / 40960MiB |      0%      Default |
|       