# Init for pyqcu.

In [1]:

import cupy as cp
import numpy as np
import functools
from pyqcu import define, io, qcu, eigen, cg, bistabcg, amg, linalg
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
XX = 2
YY = 2
ZZ = 2
TT = 2
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
params[define._MG_X_] = int(params[define._LAT_X_]/XX/define._LAT_P_)
params[define._MG_Y_] = int(params[define._LAT_Y_]/YY)
params[define._MG_Z_] = int(params[define._LAT_Z_]/ZZ)
params[define._MG_T_] = int(params[define._LAT_T_]/TT)
print("Parameters:", params)
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

# Read from hdf5 files.

In [2]:
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2sctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)
# testvectors_filename = gauge_filename.replace(
#     "gauge", "testvectors")
# print("Testvectors filename:", testvectors_filename)
# testvectors = io.eigenvectors2sctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=testvectors_filename))
# testvectors = io.xxxtzyx2mg_xxxtzyx(
#     testvectors, params)
# print("Testvectors data:", testvectors.data)
# print("Testvectors shape:", testvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0x928400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [3]:
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

##RANK:0##LOOP:118##Residual:(Fermion out data: <MemoryPointer 0x928400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f25393df5b0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0x922400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f255b05def0>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.785451648 sec
######TIME  :3522.23######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [4]:
def pdslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_params)
    return src-kappa**2*tmp1


def pdslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return pdslash_dag(pdslash_no_dag(src))

def dslash_no_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_params)
    return dest

def dslash_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    return dest

def dslash(src):
    return dslash_no_dag(src)

def bistabcg_dslash(src):
    return pdslash_no_dag(src)

print(cp.linalg.norm((fermion_out[define._EVEN_]-kappa *
               dslash(fermion_out[define._ODD_]))-fermion_in[define._ODD_]))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001893931 sec
0.0009944807


# Give matvec.

In [5]:
# def matvec(src):
#     return gamma5_vec(pdslash_no_dag(src)).reshape(src.shape)
def matvec(src):
    return cg_dslash(src).reshape(src.shape)
# def matvec(src):
#     return bistabcg_dslash(src).reshape(src.shape)

# AMG - SETUP

In [6]:
# _, _testvectors = eigen.solver(n=fermion_in[define._EVEN_].size, k=params[define._LAT_E_],
#                                matvec=matvec, dtype=fermion_in.dtype)
# # _testvectors = amg.setup(n=fermion_in[define._EVEN_].size, k=params[define._LAT_E_],
# #                          matvec=matvec, dtype=fermion_in.dtype)
# testvectors = io.xxxtzyx2mg_xxxtzyx(
#     io.eigenvectors2sctzyx(_testvectors, params), params)
# print("Shape of testvectors: ", testvectors.shape)
# for i in range(len(_testvectors)):
#     if i == 0:
#         pass
#     else:
#         print("Ax/x", bistabcg_dslash(_testvectors[i])/_testvectors[i])
#         projections = cp.dot(_testvectors[:i].conj(), _testvectors[i])
#         max_proj = cp.max(cp.abs(projections)).get()
#         print(f"Maximum projection onto existing basis: {max_proj:.2e}")
#         j = i+1
#         if j == len(_testvectors):
#             j = 0
#         print("Difference between v_i and v_j:", cp.linalg.norm(
#             _testvectors[i]-_testvectors[j])/cp.linalg.norm(_testvectors[i]))
# for T in range(params[define._MG_T_]):
#     for Z in range(params[define._MG_Z_]):
#         for Y in range(params[define._MG_Y_]):
#             for X in range(params[define._MG_X_]):
#                 testvectors[:, :, :, T, :, Z, :, Y, :, X, :] = linalg.orthogonalize_matrix(
#                     testvectors[:, :, :, T, :, Z, :, Y, :, X, :].reshape(params[define._LAT_E_], -1)).reshape(testvectors[:, :, :, T, :, Z, :, Y, :, X, :].shape)

# io.xxx2hdf5_xxx(
#     testvectors, params, gauge_filename.replace("gauge", "testvectors"))

In [7]:
testvectors = io.xxxtzyx2mg_xxxtzyx(
    cp.ones_like(fermion_in[define._EVEN_]), params)
testvectors /= (XX*YY*ZZ*TT)**0.5

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 8, 2)


# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [8]:
gamma5 = cp.array([[1, 0, 0, 0], [0, 1, 0, 0], [
                   0, 0, -1, 0], [0, 0, 0, -1]]).astype(gauge.dtype)
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    return contract("ss,sctzyx->sctzyx", gamma5, io.fermion2sctzyx(src, params))


def vec_gamma5(src):
    return contract("sctzyx,ss->sctzyx", io.fermion2sctzyx(src, params), gamma5)


_src = fermion_out[define._EVEN_]
print(_src.shape)
print(cp.linalg.norm(gamma5_vec(dslash_no_dag(_src))-dslash_dag(gamma5_vec(_src))))
print(cp.linalg.norm(gamma5_vec(pdslash_no_dag(_src))-pdslash_dag(gamma5_vec(_src))))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 32, 32, 32, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002486515 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002775678 sec
0.0
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.003217500 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002350208 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002349393 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002057445 sec
0.0


# MultiGrid - give grids.

In [9]:
# _src = io.xxxtzyx2mg_xxxtzyx(
#     input_array=fermion_in[define._EVEN_], params=params)
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_out[define._EVEN_], params=params)

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 8, 2)


<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [10]:
r_src = _src


def r_vec(src):
    if src.ndim == 1:
        return contract("scTtZzYyXx,scTtZzYyXx->scTZYX", cp.conj(testvectors), io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()
    return contract("scTtZzYyXx,scTtZzYyXx->scTZYX", cp.conj(testvectors), src)


r_dest = r_vec(r_src)
p_src = r_dest


def p_vec(src):
    if src.ndim == 1:
        return contract("scTtZzYyXx,scTZYX->scTtZzYyXx", testvectors, io.xxx2scTZYX(src, params)).flatten()
    return contract("scTtZzYyXx,scTZYX->scTtZzYyXx", testvectors, src)


p_dest = p_vec(p_src)
_mat = contract("scTtZzYyXx,scTtZzYyXx->scTZYX",
                cp.conj(testvectors), testvectors).flatten()
print(r_src.flatten()[:50])
print(p_dest.flatten()[:50])
print(_mat[:50])
print(
    p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50])
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))
print(cp.linalg.norm(r_src)/cp.linalg.norm(p_dest))
print(cp.linalg.norm(_mat))
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))
print(cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src))))))))
                     )/cp.linalg.norm(r_src))

[85.06271 +92.72717j  83.909325+79.079544j 75.17135 +74.97637j
 59.96822 +70.10574j  62.8976  +83.06627j  85.20416 +82.942444j
 83.96615 +87.257614j 71.37396 +59.93318j  83.66846 +84.91199j
 83.30059 +87.48178j  69.11276 +83.136505j 71.507126+84.716995j
 73.07465 +66.849815j 86.232895+76.366714j 68.57071 +73.81118j
 76.25281 +74.22365j  75.36408 +84.71486j  81.36057 +74.8402j
 77.86105 +73.528595j 80.72326 +63.705765j 72.48973 +74.28853j
 81.32994 +84.71786j  81.19293 +81.71508j  77.24133 +72.61786j
 89.19397 +68.572105j 72.29457 +76.29749j  73.593895+77.0461j
 82.057526+67.759445j 81.17849 +63.603233j 60.194786+82.343704j
 82.05291 +75.51457j  86.49907 +80.46945j  85.169395+80.00699j
 76.6777  +73.08816j  60.411148+79.13046j  88.5255  +77.89182j
 74.886246+70.65077j  74.13893 +83.50812j  75.94304 +77.715065j
 84.23813 +64.963425j 77.93952 +73.07037j  73.26339 +78.20688j
 82.10977 +78.06038j  74.57256 +86.661995j 84.6911  +73.90882j
 66.845245+68.52022j  70.55449 +84.12346j  66.370224+

<!-- # MultiGrid - verify above.
![](./image2-dev40.png) -->

# MultiGrid - R*matvec\*P.

In [11]:
def r_matvec_p(src, matvec):
    return r_vec(matvec(p_vec(src)))


D_r_src = matvec(r_src)
D_p_r_dest = matvec(p_vec(r_vec(r_src)))
p_r_D_p_r_dest = p_vec(r_vec(D_p_r_dest))
print(D_r_src.flatten()[:50])
print(D_p_r_dest.flatten()[:50])
print(p_r_D_p_r_dest.flatten()[:50])
print(cp.linalg.norm(D_r_src-D_p_r_dest)/cp.linalg.norm(D_r_src))
print(cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002660042 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001793353 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001844322 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002924080 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001905277 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002218327 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002207711 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002569800 sec
[ 13.797158   +2.5360427j  10.694167   -6.876222j
  14.132966   -6.1755176j  -8.67095   -11.078879j
 -20.776997   +8.843664j    5.4541526  +6.743944j
   5.952403  +12.309807j  -11.452175   -4.104568j
  -3.3467534  -3.6771512j   0.23284459+18.9892j
 -15.404661  +24.731049j    9.54553   +18.425533j
 -10.883911   +8.169712j   17.499891   +1.1326

# Give matvec_c

In [12]:
matvec_c = functools.partial(r_matvec_p, matvec=matvec)

# AMG-2-LEVEL

## give b

In [13]:
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
b__o = cp.zeros_like(b_o)
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp
# b__o -> Dslash^dag b__o
b__o = pdslash_dag(b__o)
b = b__o.copy()

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002123444 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002455629 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002131483 sec


## solver

### give x_a

In [14]:
x_a = bistabcg.slover(
    b=b, matvec=matvec, tol=1e-1)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002250757 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002106030 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002302029 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001895893 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002183158 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002139617 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002697028 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002116299 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002233729 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002136185 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001849136 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002555661 sec
Iteration 0: Res

### iterate

In [15]:
mg = True
for i in range(params[define._MAX_ITER_]):
    ######
    if i == 0:
        x_f = x_a.copy()
        r_f = b-matvec(x_f)
    if (mg):
        ######
        r_c = r_vec(r_f)
        ######
        if i == 0:
            e_c = cp.zeros_like(r_c)
        r_e_c = r_c-matvec_c(e_c)
        tol_c = linalg.norm2(r_e_c)
        print(f"CCCFFF Iteration {i}, tol_c={tol_c} FFFCCC")
        ######
        e_c = cg.slover(
            b=r_c, matvec=matvec_c, max_iter=20, x0=e_c)
        ######
        r_e_c = r_c-matvec_c(e_c)
        tol_c = linalg.norm2(r_e_c)
        print(f"CCCCCC Iteration {i}, tol_c={tol_c} CCCCCC")
        ######
        e_f = p_vec(e_c)
        ######
        x_f += e_f
        ######
    r_f = b-matvec(x_f)
    tol_f = linalg.norm2(r_f)
    print(f"FFFCCC Iteration {i}, tol_f={tol_f} CCCFFF")
    ######
    x_f = cg.slover(
        b=b, matvec=matvec, max_iter=10, x0=x_f)
    ######
    r_f = b-matvec(x_f)
    tol_f = linalg.norm2(r_f)
    print(f"FFFFFF Iteration {i}, tol_f={tol_f} FFFFFF")
    ######
    if tol_f < argv[define._TOL_]:
        break
    ######

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002469159 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001829247 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001887373 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002177189 sec
Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 8, 2)
Input Array Shape: (393216,)
Dest Shape: (4, 3, 16, 16, 16, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001859856 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002155572 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001821254 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001891092 sec
Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 8, 2)
CCCFFF Iteration 0, tol_c=0.04753053933382034 FFFCCC
Input Array Shape: (393216,)
Dest Shape: (4, 3, 16, 

KeyboardInterrupt: 

## refer

In [None]:
# x = cg.slover(
#     b=b__o, matvec=matvec, tol=1e-10)

## check

In [None]:
x_o = x.copy()
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)
print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
      np.linalg.norm(quda_fermion_out))

# MG-BISTABCG

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=matvec, tol=1e-10)
# # io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')
# # mg version
# mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
#     io.fermion2sctzyx(b__o, params), params)).flatten()
# mg_x_o = bistabcg.slover(
#     b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=matvec), tol=1e-10)
# _x_o = io.array2xxx(p_vec(io.xxx2scTZYX(mg_x_o, params)))
# # io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))
# # x_o = io.hdf5_xxx2xxx(params, 'x_o.h5')
# # _x_o = io.hdf5_xxx2xxx(params, '_x_o.h5')
# print(x_o.flatten()[:50])
# print(_x_o.flatten()[:50])
# print(np.linalg.norm(_x_o-x_o) /
#       np.linalg.norm(x_o))

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)