# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define, io, qcu, eigen, cg, bistabcg, amg, linalg, gauge, demo
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv
params[define._LAT_X_] = 8
params[define._LAT_Y_] = 8
params[define._LAT_Z_] = 8
params[define._LAT_T_] = 8
params[define._LAT_XYZT_] = params[define._LAT_X_] * \
    params[define._LAT_Y_] * params[define._LAT_Z_] * params[define._LAT_T_]
params[define._DATA_TYPE_] = define._LAT_C128_
sigma = 1.0
seed = 12138
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
argv[define._MASS_] = -3.5
argv[define._TOL_] = 1e-12
kappa = 1 / (2 * argv[define._MASS_] + 8)
U, src, dest, set_ptrs, wilson_cg_params, wilson_dslash_eo_params, wilson_dslash_oe_params, wilson_dslash_eo_dag_params, wilson_dslash_oe_dag_params = demo.give(
    params=params, sigma=sigma, seed=seed)


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (wilson:x=4,y=4,z=4,t=4;clover:x=8,y=8,z=8,t=8) that QCU support (when '#define _BLOCK_SIZE_ 

# Clover

In [2]:
clover_even = cp.zeros((define._LAT_S_, define._LAT_C_, define._LAT_S_, define._LAT_C_,
                       params[define._LAT_T_], params[define._LAT_Z_], params[define._LAT_Y_], int(params[define._LAT_X_]/define._LAT_P_),), dtype=src.dtype)
clover_odd = cp.zeros((define._LAT_S_, define._LAT_C_, define._LAT_S_, define._LAT_C_,
                       params[define._LAT_T_], params[define._LAT_Z_], params[define._LAT_Y_], int(params[define._LAT_X_]/define._LAT_P_),), dtype=src.dtype)
clover_dslash_eo_params = params.copy()
clover_dslash_eo_params[define._SET_INDEX_] = 5
clover_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN2_
clover_dslash_eo_params[define._PARITY_] = define._EVEN_
clover_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, clover_dslash_eo_params, argv)
clover_dslash_oe_params = params.copy()
clover_dslash_oe_params[define._SET_INDEX_] = 6
clover_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN2_
clover_dslash_oe_params[define._PARITY_] = define._ODD_
clover_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, clover_dslash_oe_params, argv)
clover_src = cp.zeros_like(src[define._EVEN_])
clover_src = (linalg.initialize_random_vector(clover_src.flatten())).reshape(clover_src.shape)
clover_dest = cp.zeros_like(clover_src)
_clover_dest = cp.zeros_like(clover_src)
qcu.applyCloverQcu(clover_even, U, set_ptrs, clover_dslash_eo_params)
qcu.applyCloverDslashQcu(_clover_dest, clover_src, U, set_ptrs, clover_dslash_eo_params)
qcu.applyDslashQcu(clover_dest, clover_src, clover_even,
                   U, set_ptrs, clover_dslash_eo_params)
print(cp.linalg.norm(_clover_dest - clover_dest))
qcu.applyCloverQcu(clover_odd, U, set_ptrs, clover_dslash_oe_params)
qcu.applyCloverDslashQcu(_clover_dest, clover_src, U, set_ptrs, clover_dslash_oe_params)
qcu.applyDslashQcu(clover_dest, clover_src, clover_odd,
                   U, set_ptrs, clover_dslash_oe_params)
print(cp.linalg.norm(_clover_dest - clover_dest))

gridDim.x               :64
blockDim.x              :32
host_params[_LAT_X_]    :4
host_params[_LAT_Y_]    :8
host_params[_LAT_Z_]    :8
host_params[_LAT_T_]    :8
host_params[_LAT_XYZT_] :2048
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_DATA_TYPE_]:4
host_params[_SET_INDEX_]:5
host_params[_SET_PLAN_] :2
host_params[_MG_X_]     :4
host_params[_MG_Y_]     :4
host_params[_MG_Z_]     :4
host_params[_MG_T_]     :8
host_params[_LAT_E_]    :24
host_argv[_MASS_]       :6.569320e-99
host_argv[_TOL_]        :6.924549e-310
lat_2dim[_XY_]          :32
lat_2dim[_XZ_]          :32
lat_2dim[_XT_]          :32
lat_2dim[_YZ_]          :64
lat_2dim[_YT_]          :64
lat_2dim[_ZT_]          :64
lat_3dim[_YZT_]         :512
lat_3dim[_XZT_]         :256
lat_3dim[_XYT_]         :256
lat_3dim[_

# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [3]:
def pdslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, U, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, U, set_ptrs, wilson_dslash_oe_params)
    # qcu.applyDslashQcu(tmp0, src, clover_even,
    #                    U, set_ptrs, clover_dslash_eo_params)
    # qcu.applyDslashQcu(tmp1, tmp0, clover_odd,
    #                    U, set_ptrs, clover_dslash_oe_params)
    return src-kappa**2*tmp1


def pdslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, U, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, U, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return pdslash_dag(pdslash_no_dag(src))


def dslash_no_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, U, set_ptrs, wilson_dslash_eo_params)
    return dest


def dslash_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, U, set_ptrs, wilson_dslash_eo_dag_params)
    return dest


def dslash(src):
    return dslash_no_dag(src)


def bistabcg_dslash(src):
    return pdslash_no_dag(src)


print(cp.linalg.norm((dest[define._EVEN_]-kappa *
                      dslash(dest[define._ODD_]))-src[define._ODD_]))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000379938 sec
nan


# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [4]:
gamma5 = cp.array([[1, 0, 0, 0], [0, 1, 0, 0], [
                   0, 0, -1, 0], [0, 0, 0, -1]]).astype(src.dtype)
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    return contract("ss,sctzyx->sctzyx", gamma5, io.fermion2sctzyx(src, params))


def vec_gamma5(src):
    return contract("sctzyx,ss->sctzyx", io.fermion2sctzyx(src, params), gamma5)


_src = dest[define._EVEN_]
print(_src.shape)
print(cp.linalg.norm(gamma5_vec(dslash_no_dag(_src))-dslash_dag(gamma5_vec(_src))))
print(cp.linalg.norm(gamma5_vec(pdslash_no_dag(_src))-pdslash_dag(gamma5_vec(_src))))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 8, 8, 8, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000164894 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000181528 sec
nan
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000115537 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000105596 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000107122 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000104819 sec
nan


In [5]:
qcu_U = U.copy()

# QUDA

In [6]:
import numpy as np
import cupy as cp

# from check_pyquda import weak_field

from pyquda import init, pyquda as quda
from pyquda.field import Ns, Nc
from pyquda.enum_quda import QudaParity
from pyquda_utils import core
init([1, 1, 1, 1], [params[define._LAT_X_], params[define._LAT_Y_],
     params[define._LAT_Z_], params[define._LAT_T_]], 1, 1.0)
latt_info = core.getDefaultLattice()
Lx, Ly, Lz, Lt = latt_info.size
xi_0, nu = 1.0, 1.0
kappa = 1.0
mass = 1 / (2 * kappa) - 4
coeff = 1.0
coeff_r, coeff_t = 1.0, 1.0

core.init([1, 1, 1, 1], [4, 4, 4, 8], -1, xi_0 / nu, resource_path=".cache")

dslash = core.getDefaultDirac(mass, 1e-12, 1000, xi_0, coeff_t, coeff_r)
# dslash = core.getDslash(latt_size=latt_info.size, mass=argv[define._MASS_], maxiter=params[define._MAX_ITER_], tol=argv[define._TOL_], xi_0=1,
#                         clover_coeff_r=1, clover_coeff_t=1, multigrid=False, anti_periodic_t=False)


PyQUDA INFO: Using CUDA backend cupy
PyQUDA INFO: Using the grid size [1, 1, 1, 1]
PyQUDA INFO: Using the default lattice LatticeInfo([np.int32(8), np.int32(8), np.int32(8), np.int32(8)], 1, 1.0)


Stack (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/root/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/root/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/root/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
    handle._run()
  File "/usr/lib/python3.10/asyncio/events.py", line 80, in 

Disabling GPU-Direct RDMA access
Enabling peer-to-peer copy engine and direct load/store access
QUDA 1.1.0 (git 1.1.0-b58f1ecb5-sm_80)
CUDA Driver version = 12080
CUDA Runtime version = 12040
Graphic driver version = 572.42
Found device 0: NVIDIA GeForce RTX 4060 Laptop GPU
 -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.

Using device 0: NVIDIA GeForce RTX 4060 Laptop GPU
Initializing monitoring on device 0: NVIDIA GeForce RTX 4060 Laptop GPU
cublasCreated successfully


In [7]:
argv

array([-3.5e+00,  1.0e-12], dtype=float32)

In [8]:
quda_p = cp.zeros((Lt, Lz, Ly, Lx, Ns, Nc), "<c16")
quda_p = (linalg.initialize_random_vector(
    quda_p.flatten())).reshape(quda_p.shape)
print(quda_p.shape)
qcu_p = io.tzyxsc2sctzyx(quda_p)
qcu_p = io.xxxtzyx2pxxxtzyx(qcu_p)
qcu_Mp = cp.zeros_like(qcu_p)
print(qcu_p.shape)
quda_Mp = cp.zeros((Lt, Lz, Ly, Lx, Ns, Nc), "<c16")
print(quda_Mp.shape)


(8, 8, 8, 8, 4, 3)
Splited Array Shape: (2, 4, 3, 8, 8, 8, 4)
(2, 4, 3, 8, 8, 8, 4)
(8, 8, 8, 8, 4, 3)


In [9]:

U = core.LatticeGauge(latt_info)


In [None]:

U.data[:] = io.ccdptzyx2dptzyxcc(qcu_U).copy()

: 

In [None]:

print(type(U.data))
print(U.data.shape)
print(U.data.dtype)
dslash.loadGauge(U)


In [None]:

a = core.LatticeFermion(latt_info, cp.asarray(
    core.cb2(quda_p.get(), [0, 1, 2, 3])))
b = core.LatticeFermion(latt_info)
quda.dslashQuda(b.even_ptr, a.odd_ptr, dslash.invert_param,
                QudaParity.QUDA_EVEN_PARITY)
quda.dslashQuda(b.odd_ptr, a.even_ptr, dslash.invert_param,
                QudaParity.QUDA_ODD_PARITY)
quda_Mp[:] = cp.array(b.lexico())
qcu.applyDslashQcu(qcu_Mp[define._EVEN_], qcu_p[define._ODD_], clover_even,
                   qcu_U, set_ptrs, clover_dslash_eo_params)
qcu.applyDslashQcu(qcu_Mp[define._ODD_], qcu_p[define._EVEN_], clover_odd,
                   qcu_U, set_ptrs, clover_dslash_oe_params)
_qcu_Mp=io.sctzyx2tzyxsc(io.pxxxtzyx2xxxtzyx(qcu_Mp))
print(linalg.norm(cp.array(quda_Mp)))
print(linalg.norm(_qcu_Mp))
print(linalg.norm(cp.array(quda_Mp)-_qcu_Mp))

In [None]:
print(quda_Mp.shape)

In [None]:
print(_qcu_Mp[0, 0, 0, 1])
print(quda_Mp[0, 0, 0, 1])

In [None]:
qcu_U.dtype

# Origin CG. (pass, don't run this)

In [None]:
# b_e = src[define._EVEN_].flatten()
# b_o = src[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, U, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = pdslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, U, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give _dest
# _dest = cp.zeros_like(dest)
# _dest[define._EVEN_] = x_e.reshape(
#     dest[define._EVEN_].shape)
# _dest[define._ODD_] = x_o.reshape(
#     dest[define._ODD_].shape)
# print(np.linalg.norm(_dest-dest) /
#       np.linalg.norm(dest))

# End for pyqcu. (pass, don't run this)

In [None]:
# demo.end(set_ptrs=set_ptrs,params=params)