# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define, io, qcu, eigen, cg, bistabcg, amg, linalg, gauge, demo
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params,argv
# Lattice geometry: the same extent (4) along x, y, z and t.
for _lat_key in (define._LAT_X_, define._LAT_Y_, define._LAT_Z_, define._LAT_T_):
    params[_lat_key] = 4
# Total number of sites, multiplied in x, y, z, t order.
params[define._LAT_XYZT_] = (
    params[define._LAT_X_]
    * params[define._LAT_Y_]
    * params[define._LAT_Z_]
    * params[define._LAT_T_]
)

# Data type and MPI layout entries of the integer parameter array.
params[define._DATA_TYPE_] = define._LAT_C64_
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size

# Inputs forwarded to demo.give in the next cell.
sigma = 1.0
seed = 12138

# Solver tolerance (later passed to cg.slover).
# NOTE(review): the demo output reports argv as float32 — confirm that a
# tolerance of 1e-12 is actually reachable with this data type.
argv[define._TOL_] = 1e-12
# kappa = 1 / (2 * m + 8), with the mass m read from argv[define._MASS_].
kappa = 1 / (2 * argv[define._MASS_] + 8)



    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=4) that QCU support (when '#define _BLOCK_SIZE_ 32 // for test small lattice')

In [2]:
# demo.give returns nine objects; unpack them one name per line.
(
    U,
    src,
    dest,
    set_ptrs,
    wilson_cg_params,
    wilson_dslash_eo_params,
    wilson_dslash_oe_params,
    wilson_dslash_eo_dag_params,
    wilson_dslash_oe_dag_params,
) = demo.give(params=params, sigma=sigma, seed=seed)

My rank is  0
Parameters: [    4     4     4     4   256     1     1     1     1     0     0     1
     0 10000     3     0     0     4     4     4     8    24]
Src data: <MemoryPointer 0x905600000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7fbce66260b0>>
Src shape: (2, 4, 3, 4, 4, 4, 2)
Arguments: [0.e+00 1.e-12]
Arguments data: <memory at 0x7fbd249b9900>
Arguments dtype: float32
Demo is running...
Set pointers: [0 0 0 0 0 0 0 0 0 0]
Set pointers data: <memory at 0x7fbd249b9840>
U: [ 0.09091689-0.68413734j  0.25713703+0.5643516j  -0.359751  +0.09827108j
  0.10606002+0.43286037j -0.11158494+0.09308038j -0.53892285+0.69987863j
 -0.56455034+0.0797259j  -0.51050884+0.5776273j   0.27054834+0.08635756j]
_U: [ 0.        +0.j          0.        +0.j          0.        +0.j
  0.        +0.j          0.        +0.j          0.        +0.j
 -0.56455034+0.07972592j -0.5105088 +0.57762736j  0.27054828+0.08635758j]
Gauge: 9216
U data: <MemoryPointer 0x905606000 device=0 mem=<cupy.cuda.

# Define the Dslash operators for CG & BISTABCG.
> Preconditioned operator: `src_o - set_ptr->kappa()**2 * dslash_oe(dslash_eo(src_o))`

In [3]:
def pdslash_no_dag(src):
    """Return ``src - kappa**2 * D_oe(D_eo(src))`` using the non-dagger parameter sets.

    ``qcu.applyWilsonDslashQcu`` is called twice: first on ``src`` with
    ``wilson_dslash_eo_params``, then on that result with
    ``wilson_dslash_oe_params``. Each call gets its own zero-initialised
    output buffer shaped like ``src``. ``U``, ``set_ptrs`` and ``kappa`` are
    read from the notebook globals.
    """
    after_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(after_eo, src, U, set_ptrs, wilson_dslash_eo_params)

    after_oe = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(after_oe, after_eo, U, set_ptrs, wilson_dslash_oe_params)

    hopping_term = kappa**2*after_oe
    return src - hopping_term


def pdslash_dag(src):
    """Return ``src - kappa**2 * D_oe^dag(D_eo^dag(src))`` using the dagger parameter sets.

    Same two-step structure as ``pdslash_no_dag``, but the
    ``qcu.applyWilsonDslashQcu`` calls use ``wilson_dslash_eo_dag_params``
    followed by ``wilson_dslash_oe_dag_params``. ``U``, ``set_ptrs`` and
    ``kappa`` are read from the notebook globals.
    """
    after_eo_dag = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_eo_dag, src, U, set_ptrs, wilson_dslash_eo_dag_params)

    after_oe_dag = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_oe_dag, after_eo_dag, U, set_ptrs, wilson_dslash_oe_dag_params)

    hopping_term = kappa**2*after_oe_dag
    return src - hopping_term


def cg_dslash(src):
    """Operator handed to the CG solver: ``pdslash_dag(pdslash_no_dag(src))``."""
    forward = pdslash_no_dag(src)
    return pdslash_dag(forward)


def dslash_no_dag(src):
    """Apply the Wilson Dslash to ``src`` with ``wilson_dslash_eo_params``.

    The result is written into a freshly allocated, zero-initialised array
    shaped like ``src``, which is returned. ``U`` and ``set_ptrs`` come from
    the notebook globals.
    """
    # Named `out` so the notebook-level `dest` is not shadowed.
    out = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(out, src, U, set_ptrs, wilson_dslash_eo_params)
    return out


def dslash_dag(src):
    """Apply the Wilson Dslash to ``src`` with ``wilson_dslash_eo_dag_params``.

    The result is written into a freshly allocated, zero-initialised array
    shaped like ``src``, which is returned. ``U`` and ``set_ptrs`` come from
    the notebook globals.
    """
    # Named `out` so the notebook-level `dest` is not shadowed.
    out = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(out, src, U, set_ptrs, wilson_dslash_eo_dag_params)
    return out


def dslash(src):
    """Default Dslash used by the checks below; forwards to ``dslash_no_dag``."""
    result = dslash_no_dag(src)
    return result


def bistabcg_dslash(src):
    """Operator for the BISTABCG solver: the non-dagger ``pdslash_no_dag``."""
    result = pdslash_no_dag(src)
    return result

def matvec(src):
    """Generic matrix-vector entry point; forwards to ``bistabcg_dslash``."""
    result = bistabcg_dslash(src)
    return result

# Residual of the even half of the reference solution:
#     dest_e - kappa * D_eo(dest_o) - src_e   (expected ~ 0).
# dslash() uses the eo parameter set on dest[_ODD_] and its result is compared
# with dest[_EVEN_], so the matching source term is src[_EVEN_] — the same
# pairing the "check" cell uses when it rebuilds x_e from b_e = src[_EVEN_].
even_residual = (dest[define._EVEN_] - kappa *
                 dslash(dest[define._ODD_])) - src[define._EVEN_]
print(cp.linalg.norm(even_residual))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000272061 sec
6.3079625e-07


In [4]:
# Display dest[define._EVEN_] so it can be compared by eye with the value
# reconstructed in the next cell.
dest[define._EVEN_]

array([[[[[[ 1.22341597e+00-2.11040407e-01j,
             9.48441327e-01+2.92964935e-01j],
           [ 1.07710218e+00+2.51541674e-01j,
             5.96388042e-01-2.02659950e-01j],
           [ 1.28736401e+00-2.23329216e-02j,
             8.78054738e-01+8.11285198e-01j],
           [ 1.00727391e+00+2.47124285e-01j,
             9.96571779e-01-1.11196779e-01j]],

          [[ 1.19774866e+00-6.17209196e-01j,
             1.15524435e+00+2.36841232e-01j],
           [ 1.06326640e+00+2.59047091e-01j,
             1.26831603e+00+1.24824420e-01j],
           [ 1.05423510e+00+1.70442328e-01j,
             3.62512410e-01+4.34609085e-01j],
           [ 7.44824827e-01-5.06322026e-01j,
             9.19366479e-01+1.07257187e-01j]],

          [[ 9.75233316e-01+5.17275572e-01j,
             1.25694084e+00-2.23004952e-01j],
           [ 1.21983457e+00+6.14252687e-01j,
             9.71791148e-01+3.36400211e-01j],
           [ 7.37377286e-01-1.41526639e-01j,
             8.82613540e-01+2.18661755e-0

In [5]:
# Rebuild the even half from the odd half: kappa * D_eo(dest_o) + src_e.
# The eo result is compared with dest[_EVEN_] (previous cell), so the source
# term is src[_EVEN_], matching x_e = b_e + kappa * D_eo(x_o) in the "check" cell.
kappa*dslash(dest[define._ODD_])+src[define._EVEN_]

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000123517 sec


array([[[[[[ 1.22341597e+00-2.11040407e-01j,
             9.48441327e-01+2.92964935e-01j],
           [ 1.07710218e+00+2.51541674e-01j,
             5.96388042e-01-2.02659950e-01j],
           [ 1.28736401e+00-2.23329216e-02j,
             8.78054738e-01+8.11285198e-01j],
           [ 1.00727391e+00+2.47124285e-01j,
             9.96571779e-01-1.11196779e-01j]],

          [[ 1.19774866e+00-6.17209196e-01j,
             1.15524435e+00+2.36841232e-01j],
           [ 1.06326640e+00+2.59047091e-01j,
             1.26831603e+00+1.24824420e-01j],
           [ 1.05423510e+00+1.70442328e-01j,
             3.62512410e-01+4.34609085e-01j],
           [ 7.44824827e-01-5.06322026e-01j,
             9.19366479e-01+1.07257187e-01j]],

          [[ 9.75233316e-01+5.17275572e-01j,
             1.25694084e+00-2.23004952e-01j],
           [ 1.21983457e+00+6.14252687e-01j,
             9.71791148e-01+3.36400211e-01j],
           [ 7.37377286e-01-1.41526639e-01j,
             8.82613540e-01+2.18661755e-0

# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [6]:
# gamma5 as the diagonal 4x4 matrix diag(1, 1, -1, -1), cast to the dtype of src.
gamma5_rows = [
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, -1, 0],
    [0, 0, 0, -1],
]
gamma5 = cp.array(gamma5_rows).astype(src.dtype)
# Print the matrix and its transpose.
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    """Left-multiply a fermion field by ``gamma5`` along its leading axis.

    ``src`` is first converted with ``io.fermion2sctzyx(src, params)``; the
    leading axis of that array is contracted with the second index of
    ``gamma5``:  ``out[a, ...] = sum_b gamma5[a, b] * field[b, ...]``.

    The earlier subscripts ``"ss,sctzyx->sctzyx"`` only read the diagonal of
    ``gamma5`` (an elementwise scaling), which is a matrix-vector product only
    when ``gamma5`` is diagonal. For the diagonal ``gamma5`` defined in this
    notebook the two forms give the same values.
    """
    field = io.fermion2sctzyx(src, params)
    product = contract("ab,bctzyx->actzyx", gamma5, field)
    # Downstream code passes the result on to the QCU kernels, so guarantee a
    # C-contiguous array (no copy is made when it is already contiguous).
    return cp.ascontiguousarray(product)


def vec_gamma5(src):
    """Right-multiply a fermion field by ``gamma5`` along its leading axis.

    ``src`` is first converted with ``io.fermion2sctzyx(src, params)``; the
    leading axis of that array is contracted with the first index of
    ``gamma5``:  ``out[a, ...] = sum_b field[b, ...] * gamma5[b, a]``.

    The earlier subscripts ``"sctzyx,ss->sctzyx"`` only read the diagonal of
    ``gamma5`` (an elementwise scaling), which is a vector-matrix product only
    when ``gamma5`` is diagonal. For the diagonal ``gamma5`` defined in this
    notebook the two forms give the same values.
    """
    field = io.fermion2sctzyx(src, params)
    product = contract("bctzyx,ba->actzyx", field, gamma5)
    # Guarantee a C-contiguous result (no copy is made when it already is).
    return cp.ascontiguousarray(product)


# Test vector for the two checks: dest[define._EVEN_].
_src = dest[define._EVEN_]
print(_src.shape)

# Check 1: gamma5 * D(v) against D^dag(gamma5 * v) for the single Dslash.
lhs_single = gamma5_vec(dslash_no_dag(_src))
rhs_single = dslash_dag(gamma5_vec(_src))
print(cp.linalg.norm(lhs_single - rhs_single))

# Check 2: the same identity for the preconditioned operator pdslash.
lhs_precond = gamma5_vec(pdslash_no_dag(_src))
rhs_precond = pdslash_dag(gamma5_vec(_src))
print(cp.linalg.norm(lhs_precond - rhs_precond))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 4, 4, 4, 2)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000141425 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000150646 sec
0.0
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000072334 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000072634 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000070462 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000155814 sec
0.0


## solver

### give b

In [7]:
# Flattened copies of the two halves of the source.
b_e = src[define._EVEN_].flatten()
b_o = src[define._ODD_].flatten()
# Scratch buffer for the Dslash output; the "check" cell reuses it.
tmp = cp.zeros_like(b_o)
# b__o = b_o + kappa * D_oe(b_e)
# (the former `b__o = cp.zeros_like(b_o)` pre-allocation was dead: the name is
# rebound right here, so the extra device allocation has been dropped)
qcu.applyWilsonDslashQcu(tmp, b_e, U, set_ptrs, wilson_dslash_oe_params)
b__o = b_o + kappa * tmp
# b__o -> pdslash_dag(b__o): right-hand side for the CG operator
# cg_dslash = pdslash_dag(pdslash_no_dag(.)).
b__o = pdslash_dag(b__o)
b = b__o.copy()

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000086864 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000077715 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000074325 sec


### iterate

In [8]:
# Solve cg_dslash(x) = b__o with the CG routine from pyqcu
# (the routine is spelled `slover` in the library API).
x = cg.slover(
    b=b__o,
    matvec=cg_dslash,
    tol=argv[define._TOL_],
)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000137964 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000248783 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000463845 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000088975 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000171343 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000122777 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000110611 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000075868 sec
Iteration 0: Residual = 1.289972e+02, Time = 0.003689 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000075588 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000070915 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000078615 sec
multi-gpu wilson dslash total time: (witho

### check

In [9]:
# Odd half of the solution: a copy of the CG result.
x_o = x.copy()
# Even half: x_e = b_e + kappa * D_eo(x_o).
# `tmp` is the scratch buffer allocated in the "give b" cell.
qcu.applyWilsonDslashQcu(tmp, x_o, U, set_ptrs, wilson_dslash_eo_params)
x_e = b_e + kappa * tmp

# Assemble both halves into an array laid out like `dest`.
qcu_dest = cp.zeros_like(dest)
even_shape = dest[define._EVEN_].shape
odd_shape = dest[define._ODD_].shape
qcu_dest[define._EVEN_] = x_e.reshape(even_shape)
qcu_dest[define._ODD_] = x_o.reshape(odd_shape)

# Relative difference between the reassembled solution and `dest`.
relative_error = cp.linalg.norm(qcu_dest - dest) / cp.linalg.norm(dest)
print(relative_error)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000176327 sec
1.4533688e-07


In [10]:
# Show the define._EVEN_ half of the reassembled solution, using the named
# index rather than a bare 0 to match the rest of the notebook.
qcu_dest[define._EVEN_]

array([[[[[[ 1.22341597e+00-2.11040378e-01j,
             9.48441386e-01+2.92964816e-01j],
           [ 1.07710218e+00+2.51541644e-01j,
             5.96388102e-01-2.02659965e-01j],
           [ 1.28736401e+00-2.23328248e-02j,
             8.78054678e-01+8.11285317e-01j],
           [ 1.00727391e+00+2.47124314e-01j,
             9.96571839e-01-1.11196786e-01j]],

          [[ 1.19774866e+00-6.17209017e-01j,
             1.15524435e+00+2.36841321e-01j],
           [ 1.06326640e+00+2.59047031e-01j,
             1.26831603e+00+1.24824539e-01j],
           [ 1.05423510e+00+1.70442402e-01j,
             3.62512589e-01+4.34609234e-01j],
           [ 7.44824767e-01-5.06321967e-01j,
             9.19366419e-01+1.07257143e-01j]],

          [[ 9.75233257e-01+5.17275572e-01j,
             1.25694084e+00-2.23004878e-01j],
           [ 1.21983445e+00+6.14252687e-01j,
             9.71791148e-01+3.36400181e-01j],
           [ 7.37377346e-01-1.41526580e-01j,
             8.82613540e-01+2.18661919e-0

In [11]:
# Show the define._EVEN_ half of `dest` for a side-by-side comparison with the
# previous cell, using the named index rather than a bare 0.
dest[define._EVEN_]

array([[[[[[ 1.22341597e+00-2.11040407e-01j,
             9.48441327e-01+2.92964935e-01j],
           [ 1.07710218e+00+2.51541674e-01j,
             5.96388042e-01-2.02659950e-01j],
           [ 1.28736401e+00-2.23329216e-02j,
             8.78054738e-01+8.11285198e-01j],
           [ 1.00727391e+00+2.47124285e-01j,
             9.96571779e-01-1.11196779e-01j]],

          [[ 1.19774866e+00-6.17209196e-01j,
             1.15524435e+00+2.36841232e-01j],
           [ 1.06326640e+00+2.59047091e-01j,
             1.26831603e+00+1.24824420e-01j],
           [ 1.05423510e+00+1.70442328e-01j,
             3.62512410e-01+4.34609085e-01j],
           [ 7.44824827e-01-5.06322026e-01j,
             9.19366479e-01+1.07257187e-01j]],

          [[ 9.75233316e-01+5.17275572e-01j,
             1.25694084e+00-2.23004952e-01j],
           [ 1.21983457e+00+6.14252687e-01j,
             9.71791148e-01+3.36400211e-01j],
           [ 7.37377286e-01-1.41526639e-01j,
             8.82613540e-01+2.18661755e-0

# End for pyqcu. (Skipped: do not run the cell below.)

In [12]:
# demo.end(set_ptrs=set_ptrs,params=params)