# Init

## Import

In [2]:
from pyquda_utils import core
from pyquda.field import LatticeGauge
import cupy as cp
import numpy as np
from pyqcu.cuda import define, io, qcu, eigen, cg, bistabcg, amg, linalg, gauge
from time import perf_counter
from opt_einsum import contract
from pyqcu.cuda.set import params, argv, set_ptrs



## Give

In [3]:
# Lattice extents: 8 sites along each of X, Y, Z and T.
params[define._LAT_X_] = 8
params[define._LAT_Y_] = 8
params[define._LAT_Z_] = 8
params[define._LAT_T_] = 8
# Cache the total number of lattice sites (X * Y * Z * T).
params[define._LAT_XYZT_] = params[define._LAT_X_] * \
    params[define._LAT_Y_] * params[define._LAT_Z_] * params[define._LAT_T_]
# Select the define._LAT_C64_ data type; in the recorded run
# define.dtype(...) maps it to numpy.complex64.
params[define._DATA_TYPE_] = define._LAT_C64_
# sigma and seed are passed to gauge.give_gauss_SU3 further down in this cell.
sigma = 0.1
seed = 12138
# Node rank and node count as exposed by pyqcu's define module
# (presumably the MPI rank/size -- confirm against pyqcu).
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
# Mass and tolerance entries of the shared argument array.
argv[define._MASS_] = -3.5
argv[define._TOL_] = 1e-12
# kappa = 1 / (2 * mass + 8); with mass = -3.5 this evaluates to 1.0.
# The pdslash_* helpers defined in a later cell read this global at call time.
kappa = 1 / (2 * argv[define._MASS_] + 8)
print(define.dtype(params[define._DATA_TYPE_]))
# Source vector: all ones, (number of sites) * define._LAT_SC_ entries, then
# reordered by io.fermion2psctzyx (recorded shape: (2, 4, 3, 8, 8, 8, 4)).
src = cp.ones(params[define._LAT_XYZT_]*define._LAT_SC_,
              dtype=define.dtype(params[define._DATA_TYPE_]))
src = io.fermion2psctzyx(src, params)
print("Src data:", src.data)
print("Src shape:", src.shape)
# Cast the arguments to the real dtype that matches src (float32 in the
# recorded run).
# NOTE(review): this rebinds only the notebook-level name `argv`; confirm that
# nothing relies on the array object inside pyqcu.cuda.set being updated.
argv = argv.astype(src.real.dtype)
print("Arguments:", argv)
print("Arguments data:", argv.data)
print("Arguments dtype:", argv.dtype)
print("Demo is running...")
# set_ptrs is handed to every qcu.apply* call below; it is all zeros in the
# recorded output at this point.
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)
# Generate the gauge field with gauge.give_gauss_SU3 from the sigma/seed set
# above; the requested size is (number of sites) * define._LAT_D_ and the
# dtype matches src.
# NOTE(review): presumably Gaussian-perturbed SU(3) link matrices, one per
# site and direction -- confirm against pyqcu.gauge.
qcu_gauge = gauge.give_gauss_SU3(sigma=sigma, seed=seed,
                         dtype=src.dtype, size=params[define._LAT_XYZT_]*define._LAT_D_)
# Reorder through the two io layout helpers; the recorded final shape is
# (3, 3, 4, 2, 8, 8, 8, 4).
qcu_gauge = io.gauge2dptzyxcc(qcu_gauge, params)
qcu_gauge = io.dptzyxcc2ccdptzyx(qcu_gauge)
print("qcu_gauge data:", qcu_gauge.data)
print("qcu_gauge shape:", qcu_gauge.shape)
def _init_operator_set(index, plan, parity=None, dagger=None):
    """Create one QCU parameter set and register it via ``qcu.applyInitQcu``.

    The set starts as a copy of the shared ``params`` array, so it inherits
    the lattice extents, data type and node information configured earlier.

    Args:
        index: Value stored under ``define._SET_INDEX_``.
        plan: Value stored under ``define._SET_PLAN_``.
        parity: Value stored under ``define._PARITY_``. When ``None`` the
            entry is left exactly as copied from ``params``.
        dagger: Value stored under ``define._DAGGER_``. When ``None`` the
            entry is left exactly as copied from ``params``.

    Returns:
        The new parameter array, after it has been passed to
        ``qcu.applyInitQcu(set_ptrs, <array>, argv)``.
    """
    set_params = params.copy()
    set_params[define._SET_INDEX_] = index
    set_params[define._SET_PLAN_] = plan
    if parity is not None:
        set_params[define._PARITY_] = parity
    if dagger is not None:
        set_params[define._DAGGER_] = dagger
    qcu.applyInitQcu(set_ptrs, set_params, argv)
    return set_params


# The sets are registered in index order 0..6, exactly as before.
# NOTE(review): judging by the variable names, _SET_PLAN1_ is the CG plan,
# _SET_PLAN0_ the Wilson dslash plan and _SET_PLAN2_ the clover plan --
# confirm against pyqcu.cuda.define.

# Set 0: parity and dagger are not set for this one.
wilson_cg_params = _init_operator_set(0, define._SET_PLAN1_)

# Sets 1-4: Wilson dslash for each (parity, dagger) combination.
wilson_dslash_eo_params = _init_operator_set(
    1, define._SET_PLAN0_, parity=define._EVEN_, dagger=define._NO_USE_)
wilson_dslash_eo_dag_params = _init_operator_set(
    2, define._SET_PLAN0_, parity=define._EVEN_, dagger=define._USE_)
wilson_dslash_oe_params = _init_operator_set(
    3, define._SET_PLAN0_, parity=define._ODD_, dagger=define._NO_USE_)
wilson_dslash_oe_dag_params = _init_operator_set(
    4, define._SET_PLAN0_, parity=define._ODD_, dagger=define._USE_)

# Zero-initialised buffers later filled by qcu.applyCloverQcu, one per parity.
# The last axis is X divided by define._LAT_P_; integer division keeps the
# extent exact instead of round-tripping through a float.
_clover_shape = (
    define._LAT_S_, define._LAT_C_, define._LAT_S_, define._LAT_C_,
    params[define._LAT_T_], params[define._LAT_Z_], params[define._LAT_Y_],
    int(params[define._LAT_X_] // define._LAT_P_),
)
clover_even = cp.zeros(_clover_shape, dtype=src.dtype)
clover_odd = cp.zeros(_clover_shape, dtype=src.dtype)

# Sets 5-6: clover dslash for each parity, no dagger.
clover_dslash_eo_params = _init_operator_set(
    5, define._SET_PLAN2_, parity=define._EVEN_, dagger=define._NO_USE_)
clover_dslash_oe_params = _init_operator_set(
    6, define._SET_PLAN2_, parity=define._ODD_, dagger=define._NO_USE_)
# Random input vector with the same shape and dtype as src.
clover_src = cp.zeros_like(src)
print(clover_src.shape)
_flat_random = linalg.initialize_random_vector(clover_src.flatten())
clover_src = _flat_random.reshape(clover_src.shape)

# Output buffers: _clover_dest receives the qcu.applyCloverDslashQcu result,
# clover_dest the qcu.applyDslashQcu result that is given the clover buffer.
clover_dest = cp.zeros_like(clover_src)
_clover_dest = cp.zeros_like(clover_src)

# For each parity: fill the clover buffer with qcu.applyCloverQcu, run both
# dslash entry points on the same input, and print the norm of the difference
# of their outputs.
# NOTE(review): presumably the two paths should agree, so the printed norms
# should be close to zero -- confirm the intended tolerance.
for _clover_term, _parity_params in (
    (clover_even, clover_dslash_eo_params),
    (clover_odd, clover_dslash_oe_params),
):
    qcu.applyCloverQcu(_clover_term, qcu_gauge, set_ptrs, _parity_params)
    qcu.applyCloverDslashQcu(
        _clover_dest, clover_src, qcu_gauge, set_ptrs, _parity_params)
    qcu.applyDslashQcu(
        clover_dest, clover_src, _clover_term, qcu_gauge, set_ptrs,
        _parity_params)
    print(cp.linalg.norm(_clover_dest - clover_dest))


<class 'numpy.complex64'>
Src data: <MemoryPointer 0x906a60000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7ff4d9cb50b0>>
Src shape: (2, 4, 3, 8, 8, 8, 4)
Arguments: [-3.5e+00  1.0e-12]
Arguments data: <memory at 0x7ff3f4030dc0>
Arguments dtype: float32
Demo is running...
Set pointers: [0 0 0 0 0 0 0 0 0 0]
Set pointers data: <memory at 0x7ff3f4030e80>
U: [ 0.98811775-0.1077546j   0.01724124+0.10598562j  0.01480209+0.01621542j
 -0.01351784+0.10397463j  0.98431575+0.05315351j -0.01137744+0.13105059j
 -0.02779366+0.01569425j -0.00476513+0.12939954j  0.9896003 +0.05390258j]
_U: [ 0.        +0.j          0.        +0.j          0.        +0.j
  0.        +0.j          0.        +0.j          0.        +0.j
 -0.02779366+0.01569424j -0.00476513+0.12939954j  0.9896003 +0.05390257j]
Gauge: 147456
qcu_gauge data: <MemoryPointer 0x906ac0000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7ff4d93547b0>>
qcu_gauge shape: (3, 3, 4, 2, 8, 8, 8, 4)
gridDim.x               :64
bloc

## Define

In [4]:
def pdslash_no_dag(src):
    """Return ``src - kappa**2 * W_oe(W_eo(src))`` without dagger.

    ``W_eo`` / ``W_oe`` stand for ``qcu.applyWilsonDslashQcu`` called with
    ``wilson_dslash_eo_params`` and ``wilson_dslash_oe_params`` respectively.
    ``src`` itself is not written to by this function; ``kappa`` is the
    notebook-level global and is read at call time.

    NOTE(review): this has the form of an even-odd preconditioned Wilson
    operator -- confirm that this is the intent.
    """
    after_eo, after_oe = cp.zeros_like(src), cp.zeros_like(src)
    # First hop: src -> after_eo with the even-parity, no-dagger set.
    qcu.applyWilsonDslashQcu(after_eo, src, qcu_gauge,
                             set_ptrs, wilson_dslash_eo_params)
    # Second hop: after_eo -> after_oe with the odd-parity, no-dagger set.
    qcu.applyWilsonDslashQcu(after_oe, after_eo, qcu_gauge,
                             set_ptrs, wilson_dslash_oe_params)
    hopping_scale = kappa ** 2
    return src - hopping_scale * after_oe


def pdslash_dag(src):
    """Return ``src - kappa**2 * W_oe_dag(W_eo_dag(src))`` (dagger variant).

    ``W_eo_dag`` / ``W_oe_dag`` stand for ``qcu.applyWilsonDslashQcu`` called
    with ``wilson_dslash_eo_dag_params`` and ``wilson_dslash_oe_dag_params``
    respectively. ``src`` itself is not written to by this function;
    ``kappa`` is the notebook-level global and is read at call time.
    """
    after_eo, after_oe = cp.zeros_like(src), cp.zeros_like(src)
    # First hop: src -> after_eo with the even-parity, dagger set.
    qcu.applyWilsonDslashQcu(after_eo, src, qcu_gauge,
                             set_ptrs, wilson_dslash_eo_dag_params)
    # Second hop: after_eo -> after_oe with the odd-parity, dagger set.
    qcu.applyWilsonDslashQcu(after_oe, after_eo, qcu_gauge,
                             set_ptrs, wilson_dslash_oe_dag_params)
    hopping_scale = kappa ** 2
    return src - hopping_scale * after_oe


def cg_dslash(src):
    """Apply ``pdslash_no_dag`` to ``src`` and then ``pdslash_dag`` to the result."""
    forward = pdslash_no_dag(src)
    return pdslash_dag(forward)


def dslash_no_dag(src):
    """Apply the Wilson dslash with the even-parity, no-dagger parameter set.

    Calls ``qcu.applyWilsonDslashQcu`` with ``wilson_dslash_eo_params`` and
    returns the freshly allocated output buffer (same shape and dtype as
    ``src``).
    """
    result = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(result, src, qcu_gauge,
                             set_ptrs, wilson_dslash_eo_params)
    return result


def dslash_dag(src):
    """Apply the Wilson dslash with the even-parity, dagger parameter set.

    Calls ``qcu.applyWilsonDslashQcu`` with ``wilson_dslash_eo_dag_params``
    and returns the freshly allocated output buffer (same shape and dtype as
    ``src``).
    """
    result = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(result, src, qcu_gauge,
                             set_ptrs, wilson_dslash_eo_dag_params)
    return result


def dslash(src):
    """Alias for ``dslash_no_dag``: forwards ``src`` and returns its result."""
    applied = dslash_no_dag(src)
    return applied


def bistabcg_dslash(src):
    """Alias for ``pdslash_no_dag``: forwards ``src`` and returns its result."""
    applied = pdslash_no_dag(src)
    return applied



# Quda

## Init

In [None]:
# Grid size handed to core.init: 1 along every direction.
grid_size = [1, 1, 1, 1]
# Reuse the lattice extents configured for QCU. They are cast to plain Python
# ints because entries of `params` are NumPy integers (the PyQUDA log of the
# original run shows np.int32), whose products wrap around on overflow.
latt_size = [
    int(params[axis])
    for axis in (define._LAT_X_, define._LAT_Y_,
                 define._LAT_Z_, define._LAT_T_)
]
xi_0, nu = 1.0, 1.0
# Derive the QUDA mass from the kappa computed in the setup cell instead of
# rebinding `kappa` to a hard-coded 1.0 here. pdslash_no_dag / pdslash_dag
# read the global `kappa` at call time, so a second definition could silently
# make the QCU and QUDA operators disagree once the mass is changed. With the
# current settings kappa == 1.0 and mass == -3.5, as before.
mass = 1 / (2 * float(kappa)) - 4
coeff = 1.0
coeff_r, coeff_t = 1.0, 1.0
# Third argument (-1) is passed through unchanged from the original cell.
# NOTE(review): presumably the t-boundary condition -- confirm against PyQUDA.
core.init(grid_size, latt_size, -1, xi_0 / nu, resource_path=".cache")
latt_info = core.getDefaultLattice()
Lx, Ly, Lz, Lt = latt_info.size

PyQUDA INFO: Using CUDA backend cupy
PyQUDA INFO: Using the grid size [1, 1, 1, 1]
PyQUDA INFO: Using the default lattice LatticeInfo([np.int32(8), np.int32(8), np.int32(8), np.int32(8)], -1, 1.0)
PyQUDA INFO: Using QUDA_RESOURCE_PATH=.cache
Disabling GPU-Direct RDMA access
Enabling peer-to-peer copy engine and direct load/store access
QUDA 1.1.0 (git 1.1.0-b58f1ecb5-sm_80)
CUDA Driver version = 12080
CUDA Runtime version = 12040
Graphic driver version = 572.42
Found device 0: NVIDIA GeForce RTX 4060 Laptop GPU
 -- This might result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.

Using device 0: NVIDIA GeForce RTX 4060 Laptop GPU
Initializing monitoring on device 0: NVIDIA GeForce RTX 4060 Laptop GPU
Loaded 23 sets of cached parameters from .cache/tunecache.tsv
cublasCreated successfully


## Give

In [9]:

# Build the QUDA Dirac operator from the mass and coefficients set in the
# QUDA init cell; 1e-12 and 1000 are passed positionally as in the original.
# NOTE(review): presumably the solver tolerance and iteration limit -- confirm
# against the PyQUDA signature.
quda_dslash = core.getDefaultDirac(mass, 1e-12, 1000, xi_0, coeff_t, coeff_r)
quda_gauge = LatticeGauge(latt_info=latt_info)

# Show type, dtype and shape of the QCU gauge array and of the QUDA gauge
# buffer, in that order, to make the layout/precision difference visible.
for _field in (qcu_gauge, quda_gauge.data):
    print(type(_field))
    print(_field.dtype)
    print(_field.shape)

# Convert the QCU layout with io.ccdptzyx2dptzyxcc, cast to the dtype of the
# QUDA buffer, and copy the values into that buffer in place.
_gauge_in_quda_layout = io.ccdptzyx2dptzyxcc(qcu_gauge)
quda_gauge.data[:] = _gauge_in_quda_layout.astype(quda_gauge.data.dtype)
print(quda_gauge.data[0])
quda_dslash.loadGauge(quda_gauge)


<class 'cupy.ndarray'>
complex64
(3, 3, 4, 2, 8, 8, 8, 4)
<class 'cupy.ndarray'>
complex128
(4, 2, 8, 8, 8, 4, 3, 3)


: 