In [3]:
import os
import sys
from time import perf_counter
import cupy as cp
# A notebook kernel does not define __file__, so fake one that points at the
# current working directory; the path logic below then runs unchanged.
__file__ = "."
# With __file__ == ".", this is the parent of the absolute working directory.
test_dir = os.path.dirname(os.path.abspath(__file__))
# Prepend "<test_dir>/.." so it is searched first by the imports that follow.
# NOTE(review): this ends up two levels above the working directory, one level
# higher than if __file__ named a file inside it -- confirm that is intended.
sys.path[:0] = [os.path.join(test_dir, "..")]
from pyquda import init, core, quda, mpi
import pyquda.pyqcu as pyquda_pyqcu
from pyquda.enum_quda import QudaParity
from pyquda.field import LatticeFermion
from pyquda.utils import gauge_utils

In [4]:
# Directory QUDA is pointed at for its resources (the tuning cache
# tunecache.tsv is read from here). Create it up front so a fresh checkout
# behaves the same as a machine where the directory already exists.
os.environ["QUDA_RESOURCE_PATH"] = ".cache"
os.makedirs(os.environ["QUDA_RESOURCE_PATH"], exist_ok=True)

# Global lattice extents and the process grid they are split over, both in
# (x, y, z, t) order.
latt_size = [32, 32, 32, 64]
grid_size = [1, 1, 1, 1]
# Presumably the number of space-time dimensions, spin components and colours.
Nd, Ns, Nc = 4, 4, 3
Gx, Gy, Gz, Gt = grid_size

# The floor division below would silently drop sites if an extent were not a
# multiple of the matching grid extent, so reject that configuration early.
if any(procs <= 0 or extent % procs for extent, procs in zip(latt_size, grid_size)):
    raise ValueError(
        f"lattice size {latt_size} is not evenly divisible by grid size {grid_size}"
    )

# From here on latt_size and Lx..Lt hold the per-process (local) extents.
latt_size = [extent // procs for extent, procs in zip(latt_size, grid_size)]
Lx, Ly, Lz, Lt = latt_size
Vol = Lx * Ly * Lz * Lt

# Hand the process grid to PyQUDA's MPI layer.
mpi.init(grid_size)

Disabling GPU-Direct RDMA access
Enabling peer-to-peer copy engine and direct load/store access
QUDA 1.1.0 (git 1.1.0--sm_80)
CUDA Driver version = 12040
CUDA Runtime version = 12030
Found device 0: NVIDIA GeForce RTX 4060 Laptop GPU
 -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.

Using device 0: NVIDIA GeForce RTX 4060 Laptop GPU
Loaded 20 sets of cached parameters from .cache/tunecache.tsv
Loaded 20 sets of cached parameters from .cache/tunecache.tsv
cublasCreated successfully


In [7]:
def compare(round):
    """Apply the dslash with QUDA and with QCU to one random vector and compare.

    A random fermion field is generated, a Gaussian gauge field is created and
    loaded, and the dslash is applied for both parities first through
    ``quda.dslashQuda`` and then through ``pyquda_pyqcu.mpiDslashQcu``. The
    wall-clock time of each pair of calls and the relative difference of the
    two results are printed.

    Args:
        round: Label printed in the banner; also forwarded as the second
            argument of ``gauge_utils.gaussGauge`` (presumably a seed, so each
            round uses a different gauge field -- confirm against pyquda).
            The name shadows the builtin ``round`` inside this function and is
            kept only so existing callers keep working.

    Returns:
        The relative difference ``norm(Mp1 - Mp) / norm(Mp)`` between the QCU
        and QUDA results, as returned by ``cp.linalg.norm`` arithmetic.
    """
    # Random source vector: pairs of real Gaussians reinterpreted as complex128.
    p = LatticeFermion(latt_size, cp.random.randn(Lt, Lz, Ly, Lx, Ns, Nc * 2).view(cp.complex128))
    Mp = LatticeFermion(latt_size)   # receives the QUDA result
    Mp1 = LatticeFermion(latt_size)  # receives the QCU result

    print('===============round ', round, '======================')

    # Mass m = -3.5 gives kappa = 1 / (2 * (m + 4)) = 1.
    dslash = core.getDslash(latt_size, -3.5, 0, 0, anti_periodic_t=False)
    # Generate the gauge field for this round and load it.
    U = gauge_utils.gaussGauge(latt_size, round)
    dslash.loadGauge(U)

    # Reference: QUDA. Synchronise before and after so only the two dslash
    # calls fall inside the timed window.
    cp.cuda.runtime.deviceSynchronize()
    quda_start = perf_counter()
    quda.dslashQuda(Mp.even_ptr, p.odd_ptr, dslash.invert_param, QudaParity.QUDA_EVEN_PARITY)
    quda.dslashQuda(Mp.odd_ptr, p.even_ptr, dslash.invert_param, QudaParity.QUDA_ODD_PARITY)
    cp.cuda.runtime.deviceSynchronize()
    quda_elapsed = perf_counter() - quda_start
    print(f'Quda dslash: {quda_elapsed} sec')

    # Implementation under test: QCU. Lattice and grid extents are both passed
    # through QcuParam objects via their lattice_size attribute.
    param = pyquda_pyqcu.QcuParam()
    param.lattice_size = latt_size
    grid = pyquda_pyqcu.QcuParam()
    grid.lattice_size = grid_size

    cp.cuda.runtime.deviceSynchronize()
    qcu_start = perf_counter()
    # The integer argument mirrors the even (0) / odd (1) calls made above.
    pyquda_pyqcu.mpiDslashQcu(Mp1.even_ptr, p.odd_ptr, U.data_ptr, param, 0, grid)
    pyquda_pyqcu.mpiDslashQcu(Mp1.odd_ptr, p.even_ptr, U.data_ptr, param, 1, grid)
    cp.cuda.runtime.deviceSynchronize()
    qcu_elapsed = perf_counter() - qcu_start
    print(f'QCU dslash: {qcu_elapsed} sec')

    # Relative error of QCU against the QUDA reference.
    rel_diff = cp.linalg.norm(Mp1.data - Mp.data) / cp.linalg.norm(Mp.data)
    print('quda difference: ', rel_diff)
    return rel_diff

In [8]:
# Repeat the QUDA/QCU comparison ten times, passing the iteration index 0..9
# to compare() as its round argument.
for i in range(10):
    compare(i)

Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.13606588699985878 sec
mpi wilson dslash total time: (without malloc free memcpy) :0.186112081 sec
QCU dslash: 0.5368518139998741 sec
mpi wilson dslash total time: (without malloc free memcpy) :0.186776113 sec
quda difference:  2.7602268669363147e-16
Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.16045559800022602 sec
mpi wilson dslash total time: (without malloc free memcpy) :0.291091948 sec
QCU dslash: 0.664620062999802 sec
mpi wilson dslash total time: (without malloc free memcpy) :0.180657179 sec
quda difference:  2.759734436873648e-16
Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.06312690799995835 sec
mpi wilson dslash total time: (without malloc free memcpy) :0.299412654 sec
mpi wilson dslash total time: (without malloc free memcpy) :0.188500137 sec
QCU dslash: 0.6373879459997625 sec
quda difference:  2.759869741263248