# Init for pyqcu.

In [1]:

import cupy as cp
import numpy as np
import functools
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg, bistabcg, amg, linalg
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
# Record this process's rank and the world size in the shared parameter array.
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
# kappa = 1 / (2 * mass + 8), with the mass taken from argv.
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
# File name encodes lattice extents, grid layout, parity, rank/size and dagger flag.
gauge_filename = (
    f"quda_wilson-bistabcg-gauge_"
    f"-{params[define._LAT_X_]}-{params[define._LAT_Y_]}"
    f"-{params[define._LAT_Z_]}-{params[define._LAT_T_]}"
    f"-{params[define._LAT_XYZT_]}"
    f"-{params[define._GRID_X_]}-{params[define._GRID_Y_]}"
    f"-{params[define._GRID_Z_]}-{params[define._GRID_T_]}"
    f"-{params[define._PARITY_]}"
    f"-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}"
    f"-{params[define._DAGGER_]}"
    f"-f.h5"
)
# Multigrid extents per direction (set after the file name has been built).
for mg_key, mg_extent in ((define._MG_X_, 8),
                          (define._MG_Y_, 8),
                          (define._MG_Z_, 8),
                          (define._MG_T_, 16)):
    params[mg_key] = mg_extent
print("Parameters:", params)


def _init_param_set(set_index, set_plan, parity=None, dagger=None):
    """Copy ``params``, specialise the copy, and register it with QCU.

    ``parity`` / ``dagger`` are only written when given, so the copy keeps
    the values already stored in ``params`` otherwise.  Returns the
    specialised copy after ``qcu.applyInitQcu`` has been called on it.
    """
    param_set = params.copy()
    param_set[define._SET_INDEX_] = set_index
    param_set[define._SET_PLAN_] = set_plan
    if parity is not None:
        param_set[define._PARITY_] = parity
    if dagger is not None:
        param_set[define._DAGGER_] = dagger
    qcu.applyInitQcu(set_ptrs, param_set, argv)
    return param_set


# Set 0: solver parameters (plan 1); sets 1-4: dslash parameters (plan 0)
# for each combination of parity (even/odd) and dagger (off/on).
wilson_cg_params = _init_param_set(0, define._SET_PLAN1_)
wilson_dslash_eo_params = _init_param_set(
    1, define._SET_PLAN0_, parity=define._EVEN_, dagger=define._NO_USE_)
wilson_dslash_eo_dag_params = _init_param_set(
    2, define._SET_PLAN0_, parity=define._EVEN_, dagger=define._USE_)
wilson_dslash_oe_params = _init_param_set(
    3, define._SET_PLAN0_, parity=define._ODD_, dagger=define._NO_USE_)
wilson_dslash_oe_dag_params = _init_param_set(
    4, define._SET_PLAN0_, parity=define._ODD_, dagger=define._USE_)
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)
# Overrides the module-level constant; it is later passed as `k` to amg.setup.
define._LAT_E_ = 16


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

# Read from hdf5 files.

In [2]:
# The fermion files reuse the gauge file name with "gauge" replaced by a tag.
fermion_in_filename, fermion_out_filename = (
    gauge_filename.replace("gauge", tag) for tag in ("fermion-in", "fermion-out"))

print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)

print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(params, fermion_in_filename)

# The stored "fermion-out" file is the QUDA result used as the reference.
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(params, fermion_out_filename)

# Zero-initialised destination for the QCU solve, same shape/dtype as the source.
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)
# testvectors_filename = gauge_filename.replace(
#     "gauge", "testvectors")
# print("Testvectors filename:", testvectors_filename)
# testvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=testvectors_filename))
# print("Testvectors data:", testvectors.data)
# print("Testvectors shape:", testvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0x928400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run the Wilson BiStabCG solver from PyQCU and compare with the QUDA result.

In [3]:
# Solve into fermion_out using the solver parameter set (index 0).
qcu.applyWilsonBistabCgQcu(
    fermion_out, fermion_in, gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
for label, field in (("Fermion out", fermion_out),
                     ("QUDA Fermion out", quda_fermion_out)):
    print(f"{label} data:", field.data)
    print(f"{label} shape:", field.shape)
# Relative difference between the QCU solution and the QUDA reference.
relative_difference = (cp.linalg.norm(fermion_out - quda_fermion_out)
                       / cp.linalg.norm(quda_fermion_out))
print("Difference:", relative_difference)

##RANK:0##LOOP:118Fermion out data: <MemoryPointer 0x928400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f4790bccbb0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0x922400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f47414f29f0>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
##Residual:(2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.674596736 sec
######TIME  :2582.75######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [4]:
def pdslash_no_dag(src):
    """Return ``src - kappa**2 * D_oe(D_eo(src))`` without dagger.

    ``D_eo`` / ``D_oe`` are the QCU Wilson dslash calls configured by
    ``wilson_dslash_eo_params`` and ``wilson_dslash_oe_params``.
    """
    after_eo = cp.zeros_like(src)
    after_oe = cp.zeros_like(src)
    # First hop with the "eo" parameter set, second hop with the "oe" set.
    qcu.applyWilsonDslashQcu(
        after_eo, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        after_oe, after_eo, gauge, set_ptrs, wilson_dslash_oe_params)
    double_hop = kappa**2*after_oe
    return src-double_hop


def pdslash_dag(src):
    """Return ``src - kappa**2 * D_oe(D_eo(src))`` using the dagger sets.

    Same structure as the non-dagger variant, but both dslash calls use the
    parameter sets that have the dagger flag enabled
    (``wilson_dslash_eo_dag_params`` then ``wilson_dslash_oe_dag_params``).
    """
    after_eo = cp.zeros_like(src)
    after_oe = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_eo, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        after_oe, after_eo, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    double_hop = kappa**2*after_oe
    return src-double_hop


def cg_dslash(src):
    """Apply ``pdslash_no_dag`` followed by ``pdslash_dag`` to ``src``."""
    forward = pdslash_no_dag(src)
    return pdslash_dag(forward)

def dslash_no_dag(src):
    """Apply one Wilson dslash with ``wilson_dslash_eo_params`` (dagger off).

    The result is written into a freshly zeroed array shaped like ``src``,
    which is returned.
    """
    result = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        result, src, gauge, set_ptrs, wilson_dslash_eo_params)
    return result

def dslash_dag(src):
    """Apply one Wilson dslash with ``wilson_dslash_eo_dag_params`` (dagger on).

    The result is written into a freshly zeroed array shaped like ``src``,
    which is returned.
    """
    result = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        result, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    return result

def dslash(src):
    """Default single-hop dslash: forwards to ``dslash_no_dag``."""
    hopped = dslash_no_dag(src)
    return hopped

def bistabcg_dslash(src):
    """Operator used for the BiStabCG solves: forwards to ``pdslash_no_dag``."""
    applied = pdslash_no_dag(src)
    return applied

# Even-site residual of the full system: x_e - kappa * D_eo(x_o) - b_e.
# `dslash` uses the "eo" parameter set (parity EVEN), so the right-hand side
# must be the EVEN half of fermion_in; the original subtracted the ODD half,
# which only goes unnoticed when both halves of the source are identical.
print(cp.linalg.norm((fermion_out[define._EVEN_]-kappa *
               dslash(fermion_out[define._ODD_]))-fermion_in[define._EVEN_]))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001851473 sec
0.0009944807


# Give matvec.

In [5]:
# def matvec(src):
#     return gamma5_vec(pdslash_no_dag(src)).reshape(src.shape)
# def matvec(src):
#     return cg_dslash(src).reshape(src.shape)
def matvec(src):
    """Fine-level operator handed to the solvers.

    Applies ``bistabcg_dslash`` and reshapes the result back to the shape of
    ``src``.
    """
    applied = bistabcg_dslash(src)
    return applied.reshape(src.shape)

# AMG - SETUP

In [6]:
# Generate define._LAT_E_ test vectors for the fine operator `matvec`.
_testvectors = amg.setup(n=fermion_in[define._EVEN_].size, k=define._LAT_E_,
                         matvec=matvec, dtype=fermion_in.dtype)
# Convert the flat vectors to the blocked multigrid layout.
testvectors = io.xxxtzyx2mg_xxxtzyx(
    io.eigenvectors2esctzyx(_testvectors, params), params)
print("Shape of testvectors: ", testvectors.shape)

# Diagnostics for every vector except the first one.
num_testvectors = len(_testvectors)
for i in range(1, num_testvectors):
    current = _testvectors[i]
    print("Ax/x", bistabcg_dslash(current)/current)
    projections = cp.dot(_testvectors[:i].conj(), current)
    max_proj = cp.max(cp.abs(projections)).get()
    print(f"Maximum projection onto existing basis: {max_proj:.2e}")
    # Compare with the next vector, wrapping around to vector 0 at the end.
    j = (i+1) % num_testvectors
    print("Difference between v_i and v_j:", cp.linalg.norm(
        current-_testvectors[j])/cp.linalg.norm(current))

# Orthogonalise the test vectors independently on every (S, C, T, Z, Y, X)
# block: each block is flattened to (define._LAT_E_, -1), orthogonalised,
# and written back in place.  np.ndindex iterates in the same order as the
# equivalent nested loops (S outermost, X innermost).
for S, C, T, Z, Y, X in np.ndindex(define._LAT_S_,
                                   define._LAT_C_,
                                   params[define._MG_T_],
                                   params[define._MG_Z_],
                                   params[define._MG_Y_],
                                   params[define._MG_X_]):
    block = testvectors[:, S, C, T, :, Z, :, Y, :, X, :]
    testvectors[:, S, C, T, :, Z, :, Y, :, X, :] = linalg.orthogonalize_matrix(
        block.reshape(define._LAT_E_, -1)).reshape(block.shape)
# io.xxx2hdf5_xxx(
#     testvectors, params, gauge_filename.replace("gauge", "testvectors"))
# testvectors*=(define._LAT_SC_*params[define._MG_T_]*params[define._MG_Z_]*params[define._MG_Y_]*params[define._MG_X_]/define._LAT_E_)**0.5

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001802939 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001761956 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001857496 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001769334 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001853179 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001745672 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001799006 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001746545 sec
Iteration 0: Residual = 9.999999e-01, Time = 0.026360 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001875573 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001796486 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001820981 sec
multi-gpu wilson dslash total time: (witho

# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [7]:
# Diagonal gamma5 = diag(1, 1, -1, -1), stored in the gauge field's dtype.
gamma5 = cp.diag(cp.array([1, 1, -1, -1])).astype(gauge.dtype)
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    """Return ``gamma5 @ src`` acting on the spin index.

    ``src`` is first brought to (s, c, t, z, y, x) layout with
    ``io.fermion2sctzyx``.  The contraction is a genuine matrix-vector
    product over spin; the previous ``"ss,..."`` subscript only used the
    diagonal of ``gamma5`` and would silently give wrong results for a
    non-diagonal matrix.  For the diagonal ``gamma5`` defined in this
    notebook the result is unchanged.
    """
    return contract("rs,sctzyx->rctzyx", gamma5, io.fermion2sctzyx(src, params))


def vec_gamma5(src):
    """Return ``src @ gamma5`` acting on the spin index.

    ``src`` is first brought to (s, c, t, z, y, x) layout with
    ``io.fermion2sctzyx``.  The contraction sums over the row index of
    ``gamma5``; the previous ``"...,ss"`` subscript only used its diagonal,
    which is correct only while ``gamma5`` is diagonal.  For the diagonal
    ``gamma5`` defined in this notebook the result is unchanged.
    """
    return contract("sctzyx,sr->rctzyx", io.fermion2sctzyx(src, params), gamma5)


_src = fermion_out[define._EVEN_]
print(_src.shape)
# Check gamma5 * D(src) == D^dagger(gamma5 * src), first for the single-hop
# dslash and then for the two-hop operator; each norm should be ~0.
for apply_plain, apply_dagger in ((dslash_no_dag, dslash_dag),
                                  (pdslash_no_dag, pdslash_dag)):
    gamma5_after = gamma5_vec(apply_plain(_src))
    gamma5_before = apply_dagger(gamma5_vec(_src))
    print(cp.linalg.norm(gamma5_after-gamma5_before))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 32, 32, 32, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.049719824 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.049708906 sec
0.0
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.049512107 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.047964333 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.013205686 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001903151 sec
0.0


# MultiGrid - give grids.

In [8]:
# Even half of the source, rearranged into the blocked multigrid layout.
even_source = fermion_in[define._EVEN_]
_src = io.xxxtzyx2mg_xxxtzyx(input_array=even_source, params=params)
# _src = io.xxxtzyx2mg_xxxtzyx(
#     input_array=fermion_out[define._EVEN_], params=params)

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 8, 4, 8, 4, 8, 2)


<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [9]:
# Input of the restriction test below; this is an alias of `_src`, not a copy.
r_src = _src


def r_vec(src):
    """Restrict a fine vector to the coarse grid with the test vectors.

    Contracts ``conj(testvectors)`` with ``src`` over the intra-block
    indices (t, z, y, x), leaving (e, s, c, T, Z, Y, X).  A 1-D ``src`` is
    first converted to the blocked layout and the result is returned
    flattened; any other input is assumed to already be in the blocked
    layout and the result keeps its multi-dimensional shape.
    """
    is_flat = src.ndim == 1
    if is_flat:
        fine = io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)
    else:
        fine = src
    coarse = contract("escTtZzYyXx,scTtZzYyXx->escTZYX",
                      cp.conj(testvectors), fine)
    return coarse.flatten() if is_flat else coarse


# Restrict the source once; the same array doubles as the prolongation input.
p_src = r_dest = r_vec(r_src)


def p_vec(src):
    """Prolong a coarse vector back to the fine (blocked) layout.

    Contracts ``testvectors`` with ``src`` over the test-vector index ``e``.
    A 1-D ``src`` is first reshaped with ``io.xxx2escTZYX`` and the result
    is returned flattened; any other input is used as-is and the result
    keeps its multi-dimensional shape.
    """
    is_flat = src.ndim == 1
    coarse = io.xxx2escTZYX(src, params) if is_flat else src
    fine = contract("escTtZzYyXx,escTZYX->scTtZzYyXx", testvectors, coarse)
    return fine.flatten() if is_flat else fine


p_dest = p_vec(p_src)
_mat = contract("escTtZzYyXx,escTtZzYyXx->etzyx",
                testvectors, cp.conj(testvectors)).flatten()
# Apply P*R four times in a row.  The original evaluated this eight-operator
# chain twice (once for the printout, once for the norm); compute it once.
pr4_src = r_src
for _pass in range(4):
    pr4_src = p_vec(r_vec(pr4_src))
print(r_src.flatten()[:50])
print(p_dest.flatten()[:50])
print(_mat[:100])
print(pr4_src.flatten()[:50])
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))
print(cp.linalg.norm(r_src)/cp.linalg.norm(p_dest))
print(cp.linalg.norm(_mat))
# Relative error after one P*R round trip, then after four.
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))
print(cp.linalg.norm(r_src-pr4_src)/cp.linalg.norm(r_src))

[1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j]
[0.99847037+1.0150651j  0.9551952 +1.0530452j  0.9749415 +1.0516231j
 0.9590564 +1.0150403j  1.0146737 +0.97664136j 1.0259534 +0.9947815j
 0.9780872 +0.98839426j 1.0029829 +0.9940521j  1.0186476 +0.9928596j
 0.9706033 +1.0046748j  1.0513219 +0.94429445j 1.0071788 +0.95983416j
 1.0699451 +0.95032763j 0.9837946 +1.0070251j  0.98745394+1.0118475j
 1.0160251 +0.97682536j 0.99901366+0.98785883j 1.0270921 +1.011621j
 1.0348208 +0.96832347j 1.05068   +0.92627877j 0.9939937 +1.002839j
 0.9733298 +0.9908544j  0.97595197+0.9914867j  1.034774  +0.97258896j
 0.99201095+0.9952733j  1.0273341 +1.0036982j  0.9553104 +1.0103217j
 0.99636173+0.96510273j

<!-- # MultiGrid - verify above.
![](./image2-dev40.png) -->

# MultiGrid - R*matvec\*P.

In [10]:
def r_matvec_p(src, matvec):
    """Coarse operator ``R * matvec * P`` applied to ``src``.

    ``src`` is prolonged with ``p_vec``, the given ``matvec`` is applied,
    and the result is restricted with ``r_vec``.
    """
    prolonged = p_vec(src)
    applied = matvec(prolonged)
    return r_vec(applied)


# Compare D*src with P*(R*D*P)*R*src on the fine grid.
D_r_src = matvec(r_src)
coarse_src = r_vec(r_src)
coarse_applied = r_matvec_p(coarse_src, matvec=matvec)
p_r_D_p_r_dest = p_vec(coarse_applied)
for shown in (D_r_src, p_r_D_p_r_dest):
    print(shown.flatten()[:50])
relative_gap = cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src)
print(relative_gap)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001810865 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001755288 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001911174 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001854739 sec
[ 0.09344786-0.10208309j  0.04712683-0.05375636j  0.20205879-0.02301359j
  0.10501933-0.06024492j -0.03274214-0.00194919j -0.03296566+0.01795989j
 -0.00965858-0.00275993j -0.04963386+0.21724534j -0.11307013-0.06256664j
 -0.06049633+0.11239302j -0.04292512+0.18137288j  0.11644292+0.15530562j
 -0.1051693 +0.20847821j  0.17299628+0.01849371j  0.08634686+0.01981819j
  0.01468253-0.00553358j  0.03066605+0.04353732j  0.02836114+0.15360475j
 -0.07397485-0.06877553j -0.07699823+0.1980232j   0.09499002+0.06847632j
 -0.00526285-0.02889919j -0.2506256 +0.03647596j -0.01345444+0.23340106j
  0.08127064-0.06814575j -0.06371009+0.1326667j   0.11834753+0.02462935j
 -0.0511992 -0.

# Give matvec_c

In [11]:
# Coarse-grid operator: r_matvec_p with the fine `matvec` bound now (partial
# captures the current `matvec` object, not the name).
matvec_c = functools.partial(r_matvec_p, matvec=matvec)

# AMG-2-LEVEL

## give b

In [12]:
# Flattened even and odd halves of the source.
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
# Scratch buffer for dslash output (reused by later cells).
tmp = cp.zeros_like(b_o)
# b__o = b_o + kappa * D_oe(b_e)
# (the former `b__o = cp.zeros_like(b_o)` was a dead allocation: it was
# overwritten here before ever being read)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp
# Independent copy used as the right-hand side of the solves below.
b = b__o.copy()

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001812956 sec


## solver

### give x_a

In [13]:
# Loose first solve on the fine grid; its result seeds the two-level iteration.
initial_solve_tol = 1e-1
x_a = bistabcg.slover(b=b, matvec=matvec, tol=initial_solve_tol)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001821403 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001759271 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001843982 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001742831 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001849184 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001734691 sec
Iteration 0: Residual = 4.974845e+07, Time = 0.018949 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001871843 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001748489 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001821548 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001753696 sec
Iteration 1: Residual = 3.800154e+07, Time = 0.017067 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.

### iterate

In [16]:
# Factor by which each fine solve must reduce the current fine residual.
shrink_factor_f = 0.1
# Only used by the commented-out adaptive coarse tolerance further down.
shrink_factor_c = 0.1
# Start from the loose fine-grid solution.  Initialising outside the loop
# also keeps x_f defined when the iteration count is zero.
x_f = x_a.copy()
e_c = None
for i in range(params[define._MAX_ITER_]):
    ###### fine residual before smoothing
    r_f = b-matvec(x_f)
    tol_f = linalg.norm2(r_f)
    print(f"FFFCCC Iteration {i}, tol_f={tol_f} CCCFFF")
    ###### fine solve down to a fraction of the current residual
    x_f = bistabcg.slover(
        b=b, matvec=matvec, tol=tol_f*shrink_factor_f, x0=x_f)
    ###### fine residual after smoothing
    r_f = b-matvec(x_f)
    tol_f = linalg.norm2(r_f)
    print(f"FFFFFF Iteration {i}, tol_f={tol_f} FFFFFF")
    ###### convergence test
    # Test here, while tol_f still describes x_f.  The original tested after
    # `x_f += e_f`, i.e. with a residual that no longer matched the returned
    # iterate, and only after paying for one more coarse solve.
    if tol_f < argv[define._TOL_]:
        break
    ###### restrict the fine residual to the coarse grid
    r_c = r_vec(r_f)
    ###### first pass: seed the coarse error with the coarse residual; later
    ###### passes reuse the previous coarse solution as the initial guess
    if e_c is None:
        e_c = r_c.copy()
    ######
    r_e_c = r_c-matvec_c(e_c)
    tol_c = linalg.norm2(r_e_c)
    print(f"CCCFFF Iteration {i}, tol_c={tol_c} FFFCCC")
    ###### coarse solve
    # e_c = bistabcg.slover(
    #     b=r_c, matvec=matvec_c, tol=tol_c*shrink_factor_c, x0=e_c)
    e_c = bistabcg.slover(
        b=r_c, matvec=matvec_c, tol=1e-10, x0=e_c)
    ######
    r_e_c = r_c-matvec_c(e_c)
    tol_c = linalg.norm2(r_e_c)
    print(f"CCCCCC Iteration {i}, tol_c={tol_c} CCCCCC")
    ###### prolong the coarse error and correct the fine solution
    e_f = p_vec(e_c)
    x_f += e_f
    ######

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.049801284 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.047862330 sec
FFFCCC Iteration 0, tol_f=0.05778887867927551 CCCFFF
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.049716393 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.047965356 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001963087 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001923970 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002008389 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001853620 sec
Iteration 0: Residual = 5.778888e-02, Time = 0.122697 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001979375 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001901232 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001

KeyboardInterrupt: 

## Reference solve (plain BiStabCG, currently disabled)

In [None]:
# x = bistabcg.slover(
#     b=b__o, matvec=matvec, tol=1e-10)

## Check against the QUDA result

In [None]:
# Use the odd-site solution produced by the two-level iteration.  The original
# read `x`, which is only assigned in a commented-out cell and therefore
# raises NameError on a fresh kernel.
x_o = x_f.copy()
# Reconstruct the even sites: x_e = b_e + kappa * D_eo(x_o).
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)
# Relative difference to the QUDA reference; cp.linalg.norm matches the
# earlier comparison cell and keeps the computation on the device arrays.
print(cp.linalg.norm(qcu_fermion_out-quda_fermion_out) /
      cp.linalg.norm(quda_fermion_out))

# MG-BISTABCG

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=matvec, tol=1e-10)
# # io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')
# # mg version
# mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
#     io.fermion2sctzyx(b__o, params), params)).flatten()
# mg_x_o = bistabcg.slover(
#     b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=matvec), tol=1e-10)
# _x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
# # io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))
# # x_o = io.hdf5_xxx2xxx(params, 'x_o.h5')
# # _x_o = io.hdf5_xxx2xxx(params, '_x_o.h5')
# print(x_o.flatten()[:50])
# print(_x_o.flatten()[:50])
# print(np.linalg.norm(_x_o-x_o) /
#       np.linalg.norm(x_o))

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)