In [7]:
from pyquda.utils import gauge_utils
from pyquda.field import LatticeFermion
from pyquda.enum_quda import QudaParity
import pyquda.pyqcu as pyqcu_pyquda
from pyquda import init, core, quda, mpi
import os
import sys
from time import perf_counter
import cupy as cp
# Stand-in for the script path so the script-style path logic below can run
# here (presumably because the notebook kernel provides no __file__ - confirm).
__file__ = "."
# abspath(".") is the current working directory, so test_dir resolves to the
# *parent* of the working directory.
test_dir = os.path.split(os.path.abspath(__file__))[0]
# Put the directory one level above test_dir at the front of the import path.
sys.path[:0] = [os.path.join(test_dir, "..")]

In [8]:
# Point QUDA_RESOURCE_PATH at a local ".cache" directory.
os.environ["QUDA_RESOURCE_PATH"] = ".cache"

# Fixed sizes used when building the fermion array in later cells.
Nd, Ns, Nc = 4, 4, 3

# Process grid (one entry per lattice direction) handed to mpi.init below.
grid_size = [1, 1, 1, 1]
Gx, Gy, Gz, Gt = grid_size

# Start from the global lattice extents, then reduce each one to the
# per-process extent by integer-dividing by the matching grid entry.
latt_size = [32, 32, 32, 64]
latt_size = [extent // ranks for extent, ranks in zip(latt_size, grid_size)]
Lx, Ly, Lz, Lt = latt_size
Vol = Lx * Ly * Lz * Lt

mpi.init(grid_size)

In [9]:
# NOTE(review): `round` hides Python's built-in round() for the rest of the
# session; the name is left as-is because it is a notebook-level variable.
round = 1

# Random source vector p: real Gaussian samples whose last axis (2 * Nc) is
# reinterpreted pairwise as complex128, leaving Nc complex entries per site/spin.
p = LatticeFermion(
    latt_size,
    cp.random.randn(Lt, Lz, Ly, Lx, Ns, Nc * 2).view(cp.complex128),
)
# Destination fields: Mp receives the QUDA result; Mp1 is only referenced by
# the disabled comparison at the bottom of this cell.
Mp, Mp1 = LatticeFermion(latt_size), LatticeFermion(latt_size)

print(f'===============round  {round} ======================')

# Dslash parameters: with m = -3.5, kappa = 1 / (2 * (m + 4)) = 1.
dslash = core.getDslash(latt_size, -3.5, 0, 0, anti_periodic_t=False)

# Build the gauge field (second argument is the round number) and load it.
U = gauge_utils.gaussGauge(latt_size, round)
dslash.loadGauge(U)

# Time the two half-lattice applications; the device is synchronized before
# starting and before stopping the clock so the interval brackets the GPU work.
cp.cuda.runtime.deviceSynchronize()
t1 = perf_counter()
quda.dslashQuda(
    Mp.even_ptr,
    p.odd_ptr,
    dslash.invert_param,
    QudaParity.QUDA_EVEN_PARITY,
)
quda.dslashQuda(
    Mp.odd_ptr,
    p.even_ptr,
    dslash.invert_param,
    QudaParity.QUDA_ODD_PARITY,
)
cp.cuda.runtime.deviceSynchronize()
t2 = perf_counter()
print('Quda dslash: {} sec'.format(t2 - t1))

# Disabled: earlier QCU comparison through the pyqcu_pyquda binding,
# writing into Mp1 and comparing against Mp.
# param = pyqcu_pyquda.QcuParam()
# param.lattice_size = latt_size
# grid = pyqcu_pyquda.QcuParam()
# grid.lattice_size = grid_size
# cp.cuda.runtime.deviceSynchronize()
# t1 = perf_counter()
# pyqcu_pyquda.mpiDslashQcu(Mp1.even_ptr, p.odd_ptr, U.data_ptr, param, 0, grid)
# pyqcu_pyquda.mpiDslashQcu(Mp1.odd_ptr, p.even_ptr, U.data_ptr, param, 1, grid)
# cp.cuda.runtime.deviceSynchronize()
# t2 = perf_counter()
# print(f'QCU dslash: {t2 - t1} sec')
# print('quda difference: ', cp.linalg.norm(Mp1.data - Mp.data) / cp.linalg.norm(Mp.data))

Creating Gaussian distrbuted Lie group field with sigma = 1.000000e-01
Quda dslash: 0.029352483000366192 sec


In [10]:
# Scratch inspection of the QUDA result array plus a small CuPy sanity check.
print(type(Mp.data))
print(Lx*Ly*Lz*Lt)
print(Mp.data.shape)
# Second half along the leading axis. Floor division keeps the slice bound an
# int; `/` is true division in Python 3 and would hand the slice a float.
print(Mp.data[len(Mp.data) // 2:].shape)
print(Mp.data[0].shape[2])
# Tiny 3x3 integer array to confirm that `*` is element-wise on CuPy arrays.
x_gpu = cp.array(range(9)).reshape([3,3])
print(x_gpu)
print(x_gpu*x_gpu*2)
print(Mp.data.shape)
# Shape with the first and last axes dropped, then the same tuple reversed.
print(Mp.data.shape[1:-1])
print(Mp.data.shape[1:-1][::-1])
# Trailing two axes of the first entry (a 4 x 3 complex block in the recorded run).
print(Mp.data[0,0,0,0,0])

<class 'cupy.ndarray'>
2097152
(2, 64, 32, 32, 16, 4, 3)
(1, 64, 32, 32, 16, 4, 3)
32
[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[  0   2   8]
 [ 18  32  50]
 [ 72  98 128]]
(2, 64, 32, 32, 16, 4, 3)
(64, 32, 32, 16, 4)
(4, 16, 32, 32, 64)
[[-7.79463167+3.82041185j -2.67918233-8.81205315j -0.96079799+2.7464239j ]
 [-2.5789167 -4.62066228j -8.90062615-0.31839921j -0.5981887 +2.69039185j]
 [ 0.03278843+0.04172689j  1.34531541+5.50177627j  1.61409576+3.78606659j]
 [ 0.58109637-1.75806598j -0.65134949-1.95508613j  8.15004829-6.18272868j]]


In [11]:
# Imported at point of use.
# NOTE(review): presumably this import relies on the sys.path entry added in
# the first cell - confirm before moving it up to the main import block.
import pyqcu.wilson_dslash

# Fresh destination field for the pyqcu result.
Mp_pyqcu = LatticeFermion(latt_size)

# Synchronize the device before starting and before stopping the clock, the
# same way the QUDA timing does, so pending GPU work is neither charged to
# this interval nor left out of it.
cp.cuda.runtime.deviceSynchronize()
t1 = perf_counter()
pyqcu.wilson_dslash.run(src=p.data, dest=Mp_pyqcu.data, U=U.data)
cp.cuda.runtime.deviceSynchronize()
t2 = perf_counter()
print(f'QCU dslash: {t2 - t1} sec')

# Relative difference to the QUDA result Mp (norm of the difference over the
# norm of the reference).
# NOTE(review): the recorded run printed exactly 1.0, which is what this
# expression yields when Mp_pyqcu.data is all zeros - check whether run()
# really writes its result into `dest`.
print('quda difference: ', cp.linalg.norm(Mp_pyqcu.data - Mp.data) / cp.linalg.norm(Mp.data))

QCU dslash: 0.28550472799997806 sec
quda difference:  1.0


In [12]:
# M_TEST_LIST = [LatticeFermion(latt_size) for i in range(20)]