In [6]:
import os
import sys
from time import perf_counter

import cupy as cp

# Locate the directory this code lives in so the parent directory (one level
# up) can be put first on the import path ahead of the project imports below.
# `__file__` exists when this runs as a script/module, but an interactive
# notebook kernel normally does not define it, so fall back to the current
# working directory instead of failing with NameError on a fresh kernel.
try:
    test_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    test_dir = os.getcwd()
sys.path.insert(0, os.path.join(test_dir, ".."))

from pyquda import init, core, quda, pyqcu, mpi
from pyquda.enum_quda import QudaParity
from pyquda.field import LatticeFermion
from pyquda.utils import gauge_utils

# Point QUDA at a local directory for its resource files (relative to the
# current working directory).
os.environ["QUDA_RESOURCE_PATH"] = ".cache"
# NOTE(review): QUDA's environment-variable documentation indicates the path
# must be an existing directory or tune caching is disabled, so create it
# up front. `exist_ok=True` keeps re-running this cell harmless.
os.makedirs(os.environ["QUDA_RESOURCE_PATH"], exist_ok=True)

In [7]:
# Full lattice extent and the process grid it is divided over, both listed
# in (x, y, z, t) order.
latt_size = [16, 16, 16, 32]
grid_size = [1, 1, 1, 1]
Gx, Gy, Gz, Gt = grid_size

# Presumably the number of space-time dimensions, spin components and colours;
# Ns and Nc size the trailing axes of the random fermion built in `compare`.
Nd, Ns, Nc = 4, 4, 3

# From here on `latt_size` (and Lx..Lt) is the per-process sub-lattice:
# every global extent floor-divided by the matching grid extent.
latt_size = [extent // procs for extent, procs in zip(latt_size, grid_size)]
Lx, Ly, Lz, Lt = latt_size
Vol = Lx * Ly * Lz * Lt

mpi.init(grid_size)


In [8]:
def compare(round):
    """Apply the QUDA and QCU dslash to one random fermion and report the results.

    A random source fermion is generated, a Gaussian gauge field is created
    and loaded into a QUDA dslash object, and then both implementations are
    applied parity by parity to the same source. The wall-clock time of each
    implementation and the relative norm of the difference between the two
    outputs are printed; nothing is returned.

    Args:
        round: Label printed in the banner and forwarded as the second
            argument of ``gauge_utils.gaussGauge`` (presumably the seed of
            the gauge field -- confirm against pyquda).
    """
    # Random source: draw real Gaussian noise with a trailing axis of
    # 2 * Nc reals and reinterpret each adjacent pair as one complex128.
    noise = cp.random.randn(Lt, Lz, Ly, Lx, Ns, Nc * 2)
    source = LatticeFermion(latt_size, noise.view(cp.complex128))
    quda_result = LatticeFermion(latt_size)
    qcu_result = LatticeFermion(latt_size)

    print('===============round ', round, '======================')

    # Dslash parameters; the mass -3.5 is chosen so that kappa = 1.
    dslash = core.getDslash(latt_size, -3.5, 0, 0, anti_periodic_t=False)
    # Build the gauge field and hand it to QUDA.
    U = gauge_utils.gaussGauge(latt_size, round)
    dslash.loadGauge(U)

    # Synchronise the device before and after each timed region so the
    # measured interval covers the work launched inside it.
    device_sync = cp.cuda.runtime.deviceSynchronize

    # --- reference: QUDA -------------------------------------------------
    device_sync()
    started = perf_counter()
    quda.dslashQuda(quda_result.even_ptr, source.odd_ptr, dslash.invert_param, QudaParity.QUDA_EVEN_PARITY)
    quda.dslashQuda(quda_result.odd_ptr, source.even_ptr, dslash.invert_param, QudaParity.QUDA_ODD_PARITY)
    device_sync()
    finished = perf_counter()
    print(f'Quda dslash: {finished - started} sec')

    # --- implementation under test: QCU ----------------------------------
    qcu_param = pyqcu.QcuParam()
    qcu_param.lattice_size = latt_size

    device_sync()
    started = perf_counter()
    # The trailing 0 / 1 is used for the even / odd output respectively,
    # mirroring the two QUDA calls above.
    pyqcu.dslashQcu(qcu_result.even_ptr, source.odd_ptr, U.data_ptr, qcu_param, 0)
    pyqcu.dslashQcu(qcu_result.odd_ptr, source.even_ptr, U.data_ptr, qcu_param, 1)
    device_sync()
    finished = perf_counter()
    print(f'QCU dslash: {finished - started} sec')

    # Relative error of QCU with respect to the QUDA reference.
    deviation_norm = cp.linalg.norm(qcu_result.data - quda_result.data)
    reference_norm = cp.linalg.norm(quda_result.data)
    print('difference: ', deviation_norm / reference_norm)


In [9]:
# Run five comparison rounds, numbered 0 through 4; the round number is
# handed to `compare` unchanged.
for round_index in range(5):
    compare(round_index)


Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.02937575400028436 sec
QCU dslash: 2.2172904800008837 sec
difference:  2.801809750105575e-16
Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.029179369001212763 sec
QCU dslash: 1.206342333000066 sec
difference:  2.8018663302238585e-16
Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.029170587000407977 sec
QCU dslash: 1.0685533739997481 sec
difference:  2.8025862643791075e-16
Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.029195077000622405 sec
QCU dslash: 1.0352881620001426 sec
difference:  2.802950385580066e-16
Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.029166947000703658 sec
QCU dslash: 1.1110246959997312 sec
difference:  2.8023235633997327e-16
