# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg, bistabcg
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
# Publish this process's rank and the total process count in the shared parameter array.
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
# Hopping parameter kappa = 1 / (2 * mass + 8), with the mass read from argv.
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
# Name of the reference gauge file: a fixed prefix, thirteen dash-separated parameter
# values (lattice extents, volume, grid extents, parity, rank, size, dagger) and a suffix.
gauge_filename = "quda_wilson-bistabcg-gauge_-" + "-".join(
    f"{params[key]}"
    for key in (
        define._LAT_X_, define._LAT_Y_, define._LAT_Z_, define._LAT_T_,
        define._LAT_XYZT_,
        define._GRID_X_, define._GRID_Y_, define._GRID_Z_, define._GRID_T_,
        define._PARITY_, define._NODE_RANK_, define._NODE_SIZE_, define._DAGGER_,
    )
) + "-f.h5"
print("Parameters:", params)
# Multigrid settings, applied after the file name is built and the parameters are printed.
# NOTE(review): the later "Dest Shape" outputs suggest these are the number of coarse
# blocks per direction — confirm against pyqcu.
params[define._MG_X_] = 8
params[define._MG_Y_] = 8
params[define._MG_Z_] = 8
params[define._MG_T_] = 8


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

In [2]:
# Parameter set 0: a copy of the global params selecting plan _SET_PLAN1_.
# This set is the one handed to qcu.applyWilsonBistabCgQcu later in the notebook.
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
# Initialise QCU for this set; the call prints the resulting host-side parameters.
# NOTE(review): presumably it also stores a handle for the set in set_ptrs — confirm.
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:0
host_params[_SET_PLAN_] :1
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [3]:
# Parameter set 1: plan _SET_PLAN0_, even parity, dagger disabled.
# Used as the first hop inside cg_dslash_no_dag.
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
# Initialise QCU for this set; the call prints the resulting host-side parameters.
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:1
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [4]:
# Parameter set 2: plan _SET_PLAN0_, even parity, dagger enabled.
# Used as the first hop inside cg_dslash_dag.
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
# Initialise QCU for this set; the call prints the resulting host-side parameters.
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:2
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [5]:
# Parameter set 3: plan _SET_PLAN0_, odd parity, dagger disabled.
# Used as the second hop inside cg_dslash_no_dag and when preparing the
# preconditioned right-hand side b__o further down.
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
# Initialise QCU for this set; the call prints the resulting host-side parameters.
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:3
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [6]:
# Parameter set 4: plan _SET_PLAN0_, odd parity, dagger enabled.
# Used as the second hop inside cg_dslash_dag.
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
# Initialise QCU for this set; the call prints the resulting host-side parameters.
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:4
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [7]:
# Inspect set_ptrs after the five applyInitQcu calls above: the array contents and
# its underlying buffer object.
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

Set pointers: [94417755244528 94417765034000 94417765209152 94417765385552
 94417765523696              0              0              0
              0              0]
Set pointers data: <memory at 0x7f80e429b1c0>


# Read from hdf5 files.

In [8]:
# Load the gauge field, then derive the fermion file names from the gauge file name
# by swapping the "gauge" token.
# NOTE(review): the "quda_" prefix suggests these files were produced by QUDA — confirm.
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
# Source (right-hand side) fermion field.
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
# Stored solution that the QCU result is compared against in the solver cell.
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
# Zero-initialised output buffer with the same shape and dtype as the source.
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# Disabled: loading precomputed eigenvalues / eigenvectors from HDF5.
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [9]:
# Solve with QCU's built-in Wilson BiStabCG using parameter set 0; the result is
# written into fermion_out.
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# Disabled alternative: the CG solver with the same arguments.
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
# Relative difference ||x_qcu - x_ref|| / ||x_ref|| against the stored reference solution.
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

##RANK:Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f80dc555df0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0xb22400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f80650ad670>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
0##LOOP:118##Residual:(2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.641860992 sec
######TIME  :2773.97######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [10]:
def cg_dslash_no_dag(src):
    """Apply the non-daggered preconditioned operator to ``src``.

    Evaluates ``src - kappa**2 * D_oe(D_eo(src))`` with two Wilson dslash calls:
    first with ``wilson_dslash_eo_params``, then with ``wilson_dslash_oe_params``.
    Reads the notebook globals ``gauge``, ``set_ptrs`` and ``kappa``.

    Parameters
    ----------
    src : cupy.ndarray
        Input vector; it is not modified.

    Returns
    -------
    cupy.ndarray
        New array with the same shape and dtype as ``src``.
    """
    # First hop, written into a fresh zero buffer.
    after_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_eo, src, gauge, set_ptrs, wilson_dslash_eo_params)
    # Second hop, applied to the result of the first.
    after_oe_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_oe_eo, after_eo, gauge, set_ptrs, wilson_dslash_oe_params)
    return src - kappa**2 * after_oe_eo


def cg_dslash_dag(src):
    """Apply the daggered preconditioned operator to ``src``.

    Same structure as the non-daggered version, but both Wilson dslash calls use
    the dagger-enabled parameter sets (``wilson_dslash_eo_dag_params`` followed by
    ``wilson_dslash_oe_dag_params``). Reads the notebook globals ``gauge``,
    ``set_ptrs`` and ``kappa``.

    Parameters
    ----------
    src : cupy.ndarray
        Input vector; it is not modified.

    Returns
    -------
    cupy.ndarray
        ``src - kappa**2 * hop2(hop1(src))`` as a new array.
    """
    # First daggered hop, written into a fresh zero buffer.
    after_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_eo, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    # Second daggered hop, applied to the result of the first.
    after_oe_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_oe_eo, after_eo, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src - kappa**2 * after_oe_eo


def cg_dslash(src):
    """Operator handed to CG: the daggered operator applied after the plain one.

    Returns ``cg_dslash_dag(cg_dslash_no_dag(src))``.
    """
    forward = cg_dslash_no_dag(src)
    return cg_dslash_dag(forward)


def bistabcg_dslash(src):
    """Operator handed to BiStabCG: the non-daggered operator on its own."""
    result = cg_dslash_no_dag(src)
    return result

# Give matvec.

In [11]:
def matvec(src):
    """Default matrix-vector product used by the checks below: ``cg_dslash(src)``."""
    product = cg_dslash(src)
    return product

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test. (pass, don't run this)

In [12]:
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = cg_dslash(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

# Write the gauge field's eigenvalues and eigenvectors to HDF5 files. (pass, don't run this)

In [13]:
# eigenvalues, eigenvectors = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=define._LAT_Ne_,matvec=cg_dslash,dtype=gauge.dtype)
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Original CG. (pass, don't run this)

In [14]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Original BISTABCG. (pass, don't run this)

In [15]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# Write the gauge field's orth_eigenvectors to HDF5 files. (pass, don't run this)

In [16]:
# Disabled: use precomputed eigenvectors as test vectors instead of random ones.
# _eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=eigenvectors, params=params)

# Build define._LAT_E_ unit-norm random test vectors.
# NOTE: no RNG seed is set, so the vectors (and the HDF5 file written below) differ
# from run to run.
# NOTE: randn produces real numbers; if fermion_in.dtype is complex the cast leaves
# every imaginary part at zero.
# Integer division keeps the halved X extent an exact integer.
vector_shape = (
    define._LAT_S_, define._LAT_C_,
    params[define._LAT_T_], params[define._LAT_Z_], params[define._LAT_Y_],
    int(params[define._LAT_X_] // define._LAT_P_),
)
randomvectors = []
for i in range(define._LAT_E_):
    # A descriptive name avoids rebinding `_`, which IPython uses for the last output.
    random_vector = cp.random.randn(*vector_shape).astype(fermion_in.dtype)
    random_vector /= cp.linalg.norm(random_vector)
    randomvectors.append(random_vector)
# Stack along a new leading axis (one entry per vector) directly on the device.
randomvectors = cp.stack(randomvectors)
_eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=randomvectors, params=params)
print(_eigenvectors.shape)  # escTtZzYyXx


def orthogonalize(eigenvectors, verbose=True):
    """Orthonormalise the vectors independently inside every coarse-grid block.

    For each coarse site ``(T, Z, Y, X)`` the vectors restricted to that block are
    flattened into the rows of a matrix and replaced by the orthonormal basis
    obtained from a reduced QR factorisation of its transpose.

    Parameters
    ----------
    eigenvectors : cupy.ndarray
        Array with 11 axes unpacked as ``e, s, c, T, t, Z, z, Y, y, X, x``.
        It is not modified.
    verbose : bool, optional
        When true (the default, matching the historical behaviour) print the array
        sizes and, for every block, the condition number and the overlap of the
        first and last vector before and after the factorisation. The diagnostics
        copy each block to the host, so disable them for large lattices.

    Returns
    -------
    cupy.ndarray
        A copy of ``eigenvectors`` whose vectors are orthonormal within each block.

    Raises
    ------
    ValueError
        If a block has fewer degrees of freedom than there are vectors, in which
        case the reduced QR factor cannot be reshaped back into the block.
    """
    orthonormal = eigenvectors.copy()
    size_e, size_s, size_c, size_T, size_t, size_Z, size_z, size_Y, size_y, size_X, size_x = eigenvectors.shape
    if verbose:
        print(size_e, size_s, size_c, size_T, size_t,
              size_Z, size_z, size_Y, size_y, size_X, size_x)
    block_dof = size_s * size_c * size_t * size_z * size_y * size_x
    if block_dof < size_e:
        raise ValueError(
            f"cannot orthonormalise {size_e} vectors in a block with only "
            f"{block_dof} degrees of freedom")
    for T, Z, Y, X in np.ndindex(size_T, size_Z, size_Y, size_X):
        block = eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :]
        block_shape = block.shape
        # Rows are the vectors, columns the fine degrees of freedom of this block.
        block_matrix = block.reshape(size_e, -1)
        if verbose:
            # The printed label is Chinese for "matrix condition number".
            condition_number = np.linalg.cond(block_matrix.get())
            print(f"矩阵条件数: {condition_number}")
            # Overlap of the first and last *vector* (rows), so that it is directly
            # comparable with the post-QR overlap printed below.
            print(cp.dot(block_matrix[0].conj(), block_matrix[-1]))
        # Reduced QR of the transpose: the columns of q_factor are orthonormal.
        q_factor = cp.linalg.qr(block_matrix.T)[0]
        if verbose:
            condition_number = np.linalg.cond(q_factor.get())
            print(f"矩阵条件数: {condition_number}")
            print(cp.dot(q_factor[:, 0].conj(), q_factor[:, -1]))
        orthonormal[:, :, :, T, :, Z, :, Y, :, X, :] = q_factor.T.reshape(
            block_shape)
    return orthonormal


# Orthonormalise the test vectors block by block.
orth_eigenvectors = orthogonalize(_eigenvectors)

# Store the result next to the gauge file, with "gauge" replaced by
# "orth_eigenvectors" in the file name.
io.xxx2hdf5_xxx(
    orth_eigenvectors, params, gauge_filename.replace("gauge", "orth_eigenvectors"))

Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 8, 4, 8, 4, 8, 4, 8, 2)
(24, 4, 3, 8, 4, 8, 4, 8, 4, 8, 2)
24 4 3 8 4 8 4 8 4 8 2
矩阵条件数: 1.2663525342941284
(1.3179876e-07+0j)
矩阵条件数: 1.0000003576278687
(-6.9849193e-10+0j)
矩阵条件数: 1.2569712400436401
(-5.112532e-07+0j)
矩阵条件数: 1.000000238418579
(9.313226e-09+0j)
矩阵条件数: 1.26030695438385
(-1.6259874e-07+0j)
矩阵条件数: 1.000000238418579
(-6.0535967e-09+0j)
矩阵条件数: 1.2394077777862549
(1.7132288e-07+0j)
矩阵条件数: 1.000000238418579
(9.313226e-09+0j)
矩阵条件数: 1.3086947202682495
(-5.427343e-07+0j)
矩阵条件数: 1.000000238418579
(6.519258e-09+0j)
矩阵条件数: 1.2775850296020508
(-3.700456e-07+0j)
矩阵条件数: 1.000000238418579
(6.0535967e-09+0j)
矩阵条件数: 1.3217713832855225
(8.1171964e-07+0j)
矩阵条件数: 1.0000003576278687
(2.0954758e-08+0j)
矩阵条件数: 1.2444708347320557
(-3.86742e-07+0j)
矩阵条件数: 1.000000238418579
(3.259629e-09+0j)
矩阵条件数: 1.264024257659912
(-6.5603007e-07+0j)
矩阵条件数: 1.000000238418579
(4.656613e-10+0j)
矩阵条件数: 1.2455568313598633
(1.2874823e-06+0j)
矩阵条件数:

# MultiGrid - give grids.

In [17]:
# Reload the block-orthonormalised vectors from the HDF5 file written in the previous step.
orth_eigenvectors_filename = gauge_filename.replace(
    "gauge", "orth_eigenvectors")
print("Orth orth_eigenvectors filename:", orth_eigenvectors_filename)
orth_eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=orth_eigenvectors_filename))
print("Orth orth_eigenvectors data:", orth_eigenvectors.data)
print("Orth orth_eigenvectors shape:", orth_eigenvectors.shape)
# Test vectors in multigrid layout; these are the global `testvectors` read by
# r_vec and p_vec.
testvectors = io.xxxtzyx2mg_xxxtzyx(
    input_array=orth_eigenvectors, params=params)
# Fine-grid source for the restriction/prolongation checks: the even-parity half
# of fermion_in, in multigrid layout.
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_in[define._EVEN_], params=params)
# Disabled alternative: use the solved field as the source instead.
# _src = io.xxxtzyx2mg_xxxtzyx(
#     input_array=fermion_out[define._EVEN_], params=params)

Orth orth_eigenvectors filename: quda_wilson-bistabcg-orth_eigenvectors_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Dest Shape: (24, 4, 3, 8, 4, 8, 4, 8, 4, 8, 2)
Orth orth_eigenvectors data: <MemoryPointer 0xc0b800000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f809275a570>>
Orth orth_eigenvectors shape: (24, 4, 3, 32, 32, 32, 16)
Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 8, 4, 8, 4, 8, 4, 8, 2)
Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 8, 4, 8, 4, 8, 2)


<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [18]:
# Source for the restriction test: the fine-grid vector in multigrid layout.
r_src = _src


def r_vec(src):
    """Restrict a fine-grid vector to the coarse grid.

    Contracts the global ``testvectors`` (axes ``e s c T t Z z Y y X x``) with
    ``src`` (axes ``s c T t Z z Y y X x``) over spin, colour and the intra-block
    coordinates, leaving one coefficient per test vector and coarse site
    (axes ``e T Z Y X``). The test vectors are used without conjugation.
    """
    restriction_subscripts = "escTtZzYyXx,scTtZzYyXx->eTZYX"
    return contract(restriction_subscripts, testvectors, src)


# Coarse-grid image of the source.
r_dest = r_vec(r_src)

In [19]:
r_dest.shape

(24, 8, 8, 8, 8)

<!-- # MultiGrid - P*vector.
![](./image1-dev40.png) -->


In [20]:
# Source for the prolongation test: the restricted vector from the previous step.
p_src = r_dest


def p_vec(src):
    """Prolongate a coarse-grid vector back to the fine grid.

    Contracts the complex conjugate of the global ``testvectors``
    (axes ``e s c T t Z z Y y X x``) with ``src`` (axes ``e T Z Y X``) over the
    test-vector index, giving a fine-grid vector with axes ``s c T t Z z Y y X x``.
    """
    prolongation_subscripts = "escTtZzYyXx,eTZYX->scTtZzYyXx"
    conjugated_basis = cp.conj(testvectors)
    return contract(prolongation_subscripts, conjugated_basis, src)


# Fine-grid vector rebuilt from the coarse coefficients.
p_dest = p_vec(p_src)

In [21]:
p_dest.shape

(4, 3, 8, 4, 8, 4, 8, 4, 8, 2)

<!-- # MultiGrid - verify above.
![](./image2-dev40.png) -->

In [22]:
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))

3547.24
443.84796


In [23]:
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

0.99214107


In [24]:
print(cp.linalg.norm(r_src-p_vec(r_vec(r_src)))/cp.linalg.norm(r_src))

0.99214107


In [25]:
r_src.flatten()[:50]

array([1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j], dtype=complex64)

In [26]:
p_dest.flatten()[:50]

array([ 0.22030568+0.22030568j,  0.1036678 +0.1036678j ,
       -0.052515  -0.052515j  ,  0.01765973+0.01765973j,
        0.00691776+0.00691776j,  0.00387617+0.00387617j,
       -0.00456921-0.00456921j, -0.06809614-0.06809614j,
        0.03807012+0.03807012j,  0.05219358+0.05219358j,
       -0.05855714-0.05855714j,  0.17254657+0.17254657j,
        0.0329134 +0.0329134j ,  0.00505831+0.00505831j,
        0.31718278+0.31718278j, -0.3125062 -0.3125062j ,
        0.08071575+0.08071575j,  0.17364031+0.17364031j,
        0.060215  +0.060215j  , -0.05349008-0.05349008j,
        0.04073057+0.04073057j, -0.06979492-0.06979492j,
       -0.04026631-0.04026631j,  0.0773974 +0.0773974j ,
        0.11830069+0.11830069j, -0.00398359-0.00398359j,
       -0.17708266-0.17708266j, -0.15074813-0.15074813j,
        0.08689383+0.08689383j,  0.09463979+0.09463979j,
        0.07302242+0.07302242j,  0.23716535+0.23716535j,
       -0.0831931 -0.0831931j , -0.23619159-0.23619159j,
        0.12460781+0.12460781j,

In [27]:
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src)

array(0.99214107, dtype=float32)

In [28]:
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(p_dest)

array(7.9292073, dtype=float32)

In [29]:
p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50]

array([ 0.22030565+0.22030565j,  0.10366784+0.10366784j,
       -0.05251499-0.05251499j,  0.01765975+0.01765975j,
        0.00691765+0.00691765j,  0.00387621+0.00387621j,
       -0.00456918-0.00456918j, -0.06809614-0.06809614j,
        0.03807014+0.03807014j,  0.05219363+0.05219363j,
       -0.05855711-0.05855711j,  0.17254663+0.17254663j,
        0.03291348+0.03291348j,  0.00505838+0.00505838j,
        0.31718287+0.31718287j, -0.31250623-0.31250623j,
        0.08071569+0.08071569j,  0.17364027+0.17364027j,
        0.06021494+0.06021494j, -0.05349007-0.05349007j,
        0.04073056+0.04073056j, -0.06979495-0.06979495j,
       -0.04026628-0.04026628j,  0.07739742+0.07739742j,
        0.11830075+0.11830075j, -0.00398357-0.00398357j,
       -0.17708275-0.17708275j, -0.1507482 -0.1507482j ,
        0.08689383+0.08689383j,  0.09463985+0.09463985j,
        0.07302243+0.07302243j,  0.23716548+0.23716548j,
       -0.08319309-0.08319309j, -0.2361916 -0.2361916j ,
        0.12460778+0.12460778j,

In [30]:
# Applying P*R four times gives the same relative error as applying it once
# (0.99214107 in both cells), i.e. P*R acts as a projector on this vector.
# The large value is consistent with the sizes involved rather than with a bug:
# testvectors holds 24 vectors per coarse block, while one block contains
# 4*3*4*4*4*2 = 1536 fine degrees of freedom.  A generic vector therefore keeps only
# about sqrt(24/1536) = 0.125 of its norm (443.85 / 3547.24 above), and the part
# outside the span is about sqrt(1 - 0.125**2) ~= 0.992 of the norm.
cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src))))))))
               )/cp.linalg.norm(r_src)  # same value as a single P*R application

array(0.99214107, dtype=float32)

In [31]:
# _mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
#                 testvectors, cp.conj(testvectors)).flatten()
# print(cp.linalg.norm(_mat))
# print(_mat[:100])

In [32]:
testvectors.shape

(24, 4, 3, 8, 4, 8, 4, 8, 4, 8, 2)

# MultiGrid - R*matvec\*P.

In [33]:
def _r_matvec_p(src, matvec):
    """Apply the coarse operator ``R * matvec * P`` and return it in coarse layout.

    ``src`` is first converted with ``io.xxx2eTZYX`` (the captured output shows it
    mapping a flat array to shape ``(24, 8, 8, 8, 8)``), prolongated with ``p_vec``,
    passed through ``matvec`` and restricted again with ``r_vec``.
    """
    coarse_vector = io.xxx2eTZYX(src, params)
    fine_vector = p_vec(coarse_vector)
    return r_vec(matvec(fine_vector))


def r_matvec_p(src, matvec):
    """Coarse operator ``R * matvec * P`` in the form handed to the iterative solver.

    Wraps ``_r_matvec_p`` and converts its result with ``io.array2xxx``.
    NOTE(review): presumably this flattens the coarse array back to the 1-D layout
    the solver works with — confirm against pyqcu.io.
    """
    coarse_result = _r_matvec_p(src, matvec)
    return io.array2xxx(coarse_result)

# MultiGrid - verify above.

In [34]:
D_r_src = matvec(r_src)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001867446 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001812483 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001940759 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001794467 sec


In [35]:
D_r_src.flatten()[:50]

array([ 0.09512203-0.15065098j,  0.02459466-0.10281426j,
        0.14802423-0.04478627j,  0.05378214-0.09193704j,
       -0.08474712+0.0208632j , -0.06649091+0.06668942j,
       -0.03820895+0.03143284j, -0.06110204+0.1998555j ,
       -0.09466349-0.10954653j, -0.09903856+0.15328293j,
       -0.11581516+0.23665299j,  0.15433274+0.0833789j ,
       -0.11228501+0.18325019j,  0.07416645+0.00099519j,
        0.05309803-0.03660914j,  0.034646  -0.03932991j,
       -0.00817882-0.03089109j,  0.03485045+0.13626698j,
       -0.12274602-0.06901661j, -0.10658678+0.15540114j,
        0.09019466+0.07479304j,  0.00329622+0.01262958j,
       -0.2735389 +0.08964469j, -0.02719212+0.18662325j,
        0.07507145-0.0618236j , -0.09112616+0.10951558j,
        0.08583128+0.02523299j, -0.0734987 -0.1731855j ,
       -0.09989857+0.04082112j, -0.1259279 +0.17906842j,
        0.00141903+0.04172344j, -0.08249421-0.07952813j,
        0.06730361-0.03395787j, -0.16046013+0.01053373j,
       -0.04409004+0.2682615j ,

In [36]:
p_r_D_p_r_dest = p_vec(_r_matvec_p(r_vec(r_src), matvec=cg_dslash))

Input Array Shape: (24, 8, 8, 8, 8)
Dest Shape: (24, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001870485 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001887327 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001833671 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001780016 sec


In [37]:
p_r_D_p_r_dest.flatten()[:50]

array([ 0.2254407 +0.22452563j,  0.10434461+0.09758894j,
       -0.05224188-0.04904595j,  0.01622486+0.01353379j,
        0.00323931+0.00878014j,  0.00833237+0.00610614j,
       -0.00771397-0.00834465j, -0.06994345-0.0813576j ,
        0.04029226+0.03836358j,  0.05450076+0.05455438j,
       -0.06012176-0.05796909j,  0.16580567+0.15752393j,
        0.03855509+0.04457482j,  0.0059777 -0.00065769j,
        0.29969904+0.2944092j , -0.31363454-0.31242678j,
        0.08829886+0.07943392j,  0.17026867+0.16646066j,
        0.05623528+0.06585549j, -0.04894788-0.05222949j,
        0.03910854+0.03570346j, -0.07901458-0.07410745j,
       -0.03847758-0.03685958j,  0.07518822+0.07526012j,
        0.10393713+0.12870374j, -0.00252504-0.00457246j,
       -0.18201658-0.17865714j, -0.14445733-0.14678448j,
        0.07703903+0.08228311j,  0.10650371+0.09538928j,
        0.07602903+0.0757048j ,  0.24184161+0.24110861j,
       -0.07374686-0.07516983j, -0.2426892 -0.2432037j ,
        0.12066089+0.11931169j,

In [38]:
cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src)

array(1.6521049, dtype=float32)

# MultiGrid - BISTABCG (TESTING......)

In [39]:
# Split the source into its even and odd halves as flat vectors.
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
# Scratch buffer that receives the dslash result.
tmp = cp.zeros_like(b_o)
# Preconditioned right-hand side: b__o = b_o + kappa * D_oe(b_e).
# (The former zero-initialisation of b__o was dropped: it allocated a full device
# vector that was rebound here before ever being read.)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001840743 sec


In [40]:
# Fine-grid reference solve: find x_o with bistabcg_dslash(x_o) = b__o using the
# Python BiStabCG implementation, then store the solution in 'x_o.h5'.
x_o = bistabcg.slover(
    b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001913016 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001810400 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001936940 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001850442 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001842961 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001793969 sec
Iteration 0: Residual = 4.974845e+07, Time = 0.024006 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001876132 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001797835 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001932600 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001813079 sec
Iteration 1: Residual = 3.800154e+07, Time = 0.018391 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.

In [41]:
# Multigrid (coarse-grid) version of the solve above.
# 1. Restrict the right-hand side: convert b__o to multigrid layout, apply R, flatten.
mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
    io.fermion2sctzyx(b__o, params), params)).flatten()
# 2. Solve the coarse system (R * bistabcg_dslash * P) mg_x_o = mg_b__o.
mg_x_o = bistabcg.slover(
    b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=bistabcg_dslash), tol=1e-10, max_iter=1000000)
# 3. Prolongate the coarse solution back to the fine grid and store it in '_x_o.h5'.
_x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 8, 4, 8, 4, 8, 2)
Input Array Shape: (98304,)
Dest Shape: (24, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001894953 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001822582 sec
Input Array Shape: (98304,)
Dest Shape: (24, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001873537 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001734729 sec
Input Array Shape: (98304,)
Dest Shape: (24, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001873968 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001771506 sec
Iteration 0: Residual = 7.789024e+05, Time = 0.181265 s
Input Array Shape: (98304,)
Dest Shape: (24, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001855462 sec
multi-gpu wilson dslash total time: (without malloc free 

# MG-BISTABCG

In [42]:
def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
    """Experimental BiCGStab-style loop with a coarse-grid correction (draft).

    Every pass runs the BiCGStab recurrences for ``p``, ``v``, ``s``, ``t``
    and ``r``; it then maps ``r`` through the notebook-level ``r_vec``, solves
    the mapped system with ``bistabcg.slover`` (``tol=1e-2``,
    ``max_iter=100``), maps the result back through ``p_vec`` and adds it
    to ``x``.

    NOTE(review): ``x`` only ever receives the mapped-back coarse correction.
    The Krylov update ``x + p*alpha + s*omega`` is intentionally left
    commented out (the author marked it as an open question), and ``r`` is
    not recomputed after ``x`` changes, so ``r`` is not the residual of the
    returned ``x``. Treat this as a scratch variant and confirm before
    relying on its output.

    Args:
        b: Right-hand side; expected to be a flat CuPy array (its ``size``
            and ``dtype`` define the work vectors).
        matvec: Callable applying the fine operator to a flat vector.
        max_iter: Maximum number of outer iterations.
        tol: Threshold compared against the *squared* residual norm
            ``dot(r, r).real`` measured at the start of each iteration.
        x0: Optional initial guess; when omitted a unit-norm random vector
            is used.

    Returns:
        A copy of the final iterate ``x``.
    """
    n = b.size
    dtype = b.dtype
    # Preallocate only the vectors that are read before being rebound:
    # x (initial guess), r_tilde (shadow residual) and p, v (zero on pass 0).
    x, r_tilde, p, v = (cp.zeros(n, dtype=dtype) for _ in range(4))

    def initialize_random_vector(vec):
        # Gaussian real and imaginary parts, then scale to unit norm in place.
        vec.real, vec.imag = cp.random.randn(n).astype(
            vec.real.dtype), cp.random.randn(n).astype(vec.imag.dtype)
        norm = cp.linalg.norm(vec)
        if norm > 0:
            cp.divide(vec, norm, out=vec)
        return vec

    def dot(x, y):
        # Conjugate-linear in the first argument.
        return cp.sum(x.conj() * y)

    def _r_vec(src):
        # Fine -> coarse: layout conversions followed by r_vec, flattened.
        return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

    def _p_vec(src):
        # Coarse -> fine: layout conversion followed by p_vec, flattened.
        return p_vec(io.xxx2eTZYX(src, params)).flatten()

    def _r_matvec_p(src):
        # Coarse operator built as r_vec . matvec . p_vec.
        return _r_vec(matvec(_p_vec(src)))

    if x0 is not None:
        # copyto already copies the data; no intermediate x0.copy() is needed.
        cp.copyto(x, x0)
    else:
        initialize_random_vector(x)
    r = b - matvec(x)
    cp.copyto(r_tilde, r)
    rho_prev = 1.0
    alpha = 1.0
    omega = 1.0
    start_time = perf_counter()
    iter_times = []
    for i in range(max_iter):
        iter_start_time = perf_counter()
        rho = dot(r_tilde, r)
        beta = (rho/rho_prev)*(alpha/omega)
        rho_prev = rho
        p = r+(p-v*omega)*beta
        # Residual entering this iteration; used for reporting and stopping.
        r_norm2 = dot(r, r)
        v = matvec(p)
        alpha = rho / dot(r_tilde, v)
        s = r-v*alpha
        t = matvec(s)
        omega = dot(t, s)/dot(t, t)
        r = s-t*omega  # update r
        # COARSE START
        r_c = _r_vec(r)
        e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
                              tol=1e-2, max_iter=100)
        e = _p_vec(e_c)
        # COARSE END
        # FINE START
        # x = x+p*alpha+s*omega # update x # don't use ?
        x += e  # just this like?
        # FINE END
        iter_time = perf_counter() - iter_start_time
        print(
            f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
        iter_times.append(iter_time)
        if r_norm2.real < tol:
            print(
                f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
            break
    total_time = perf_counter() - start_time
    # Guard against max_iter == 0, where no iteration was timed.
    avg_iter_time = sum(iter_times) / len(iter_times) if iter_times else 0.0
    print("\nPerformance Statistics:")
    print(f"Total time: {total_time:.6f} s")
    print(f"Average time per iteration: {avg_iter_time:.6f} s")
    return x.copy()

In [43]:
def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
    """BiCGStab step followed by a coarse-grid correction, repeated.

    Each outer iteration:

    1. performs one BiCGStab update of ``x`` (``x + p*alpha + s*omega``);
    2. recomputes the true residual ``r = b - matvec(x)``;
    3. maps ``r`` through the notebook-level ``r_vec``, solves the mapped
       system with ``bistabcg.slover`` (``tol=1e-2``, ``max_iter=100``) and
       maps the result back through ``p_vec``;
    4. adds that correction to ``x`` and recomputes the true residual again.

    The stopping test and the printed residual use the squared norm of the
    residual from step 4, i.e. the residual of the iterate that is returned.

    NOTE(review): ``p`` and ``v`` carried into the next pass were built from
    the pre-correction residual; whether the recurrences remain valid after
    the correction has not been verified here.

    Args:
        b: Right-hand side; expected to be a flat CuPy array (its ``size``
            and ``dtype`` define the work vectors).
        matvec: Callable applying the fine operator to a flat vector.
        max_iter: Maximum number of outer iterations.
        tol: Threshold compared against the *squared* residual norm
            ``dot(r, r).real``.
        x0: Optional initial guess; when omitted a unit-norm random vector
            is used.

    Returns:
        A copy of the final iterate ``x``.
    """
    n = b.size
    dtype = b.dtype
    # Preallocate only the vectors that are read before being rebound:
    # x (initial guess), r_tilde (shadow residual) and p, v (zero on pass 0).
    x, r_tilde, p, v = (cp.zeros(n, dtype=dtype) for _ in range(4))

    def initialize_random_vector(vec):
        # Gaussian real and imaginary parts, then scale to unit norm in place.
        vec.real, vec.imag = cp.random.randn(n).astype(
            vec.real.dtype), cp.random.randn(n).astype(vec.imag.dtype)
        norm = cp.linalg.norm(vec)
        if norm > 0:
            cp.divide(vec, norm, out=vec)
        return vec

    def dot(x, y):
        # Conjugate-linear in the first argument.
        return cp.sum(x.conj() * y)

    def _r_vec(src):
        # Fine -> coarse: layout conversions followed by r_vec, flattened.
        return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

    def _p_vec(src):
        # Coarse -> fine: layout conversion followed by p_vec, flattened.
        return p_vec(io.xxx2eTZYX(src, params)).flatten()

    def _r_matvec_p(src):
        # Coarse operator built as r_vec . matvec . p_vec.
        return _r_vec(matvec(_p_vec(src)))

    if x0 is not None:
        # copyto already copies the data; no intermediate x0.copy() is needed.
        cp.copyto(x, x0)
    else:
        initialize_random_vector(x)
    r = b - matvec(x)
    cp.copyto(r_tilde, r)
    rho_prev = 1.0
    alpha = 1.0
    omega = 1.0
    start_time = perf_counter()
    iter_times = []
    for i in range(max_iter):
        iter_start_time = perf_counter()
        rho = dot(r_tilde, r)
        beta = (rho/rho_prev)*(alpha/omega)
        rho_prev = rho
        p = r+(p-v*omega)*beta
        v = matvec(p)
        alpha = rho / dot(r_tilde, v)
        s = r-v*alpha
        t = matvec(s)
        omega = dot(t, s)/dot(t, t)
        # The recurrence residual s - t*omega is not formed: the true
        # residual is recomputed from x immediately below.
        x = x+p*alpha+s*omega
        # COARSE START
        r = b-matvec(x)
        r_c = _r_vec(r)
        e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
                              tol=1e-2, max_iter=100)
        # COARSE END
        # FINE START
        e = _p_vec(e_c)
        x += e  # or just this like?
        r = b-matvec(x)
        # FINE END
        # Measure convergence on the residual of the corrected iterate so the
        # loop stops as soon as x is good enough instead of one pass later.
        r_norm2 = dot(r, r)
        iter_time = perf_counter() - iter_start_time
        print(
            f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
        iter_times.append(iter_time)
        if r_norm2.real < tol:
            print(
                f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
            break
    total_time = perf_counter() - start_time
    # Guard against max_iter == 0, where no iteration was timed.
    avg_iter_time = sum(iter_times) / len(iter_times) if iter_times else 0.0
    print("\nPerformance Statistics:")
    print(f"Total time: {total_time:.6f} s")
    print(f"Average time per iteration: {avg_iter_time:.6f} s")
    return x.copy()

In [44]:
def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
    """Two-level cycle: short fine-grid solve, then a coarse-grid correction.

    Each outer iteration:

    1. runs ``bistabcg.slover`` on the fine system for at most 5 iterations,
       warm-started from the current ``x``;
    2. computes the residual ``r = b - matvec(x)`` and maps it through the
       notebook-level ``r_vec``;
    3. runs ``bistabcg.slover`` on the mapped system (operator
       ``r_vec . matvec . p_vec``) for at most 5 iterations;
    4. maps the result back through ``p_vec``, adds it to ``x`` and
       recomputes the residual, whose squared norm drives the stopping test.

    NOTE(review): ``_tol`` (the "wanted tol", shrunk by ``tol_factor`` every
    pass) is only printed; both inner solves are given ``tol``. If the
    intent was an adaptive inner tolerance, it is not wired up -- confirm.

    Args:
        b: Right-hand side CuPy array.
        matvec: Callable applying the fine operator to a vector.
        max_iter: Maximum number of outer iterations.
        tol: Threshold compared against the *squared* residual norm
            ``dot(r, r).real``; also forwarded to both inner solves.
        x0: Optional initial guess; when omitted a unit-norm random vector
            shaped like ``b`` is used.

    Returns:
        A copy of the final iterate ``x``.
    """
    tol_factor = 0.1

    def initialize_random_vector(v):
        # Gaussian real and imaginary parts, then scale to unit norm in place.
        n = b.size
        v.real, v.imag = cp.random.randn(n).astype(
            v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
        norm = cp.linalg.norm(v)
        if norm > 0:
            cp.divide(v, norm, out=v)
        return v

    def dot(x, y):
        # Conjugate-linear in the first argument.
        return cp.sum(x.conj() * y)

    def _r_vec(src):
        # Fine -> coarse: layout conversions followed by r_vec, flattened.
        return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

    def _p_vec(src):
        # Coarse -> fine: layout conversion followed by p_vec, flattened.
        return p_vec(io.xxx2eTZYX(src, params)).flatten()

    def _r_matvec_p(src):
        # Coarse operator built as r_vec . matvec . p_vec.
        return _r_vec(matvec(_p_vec(src)))

    if x0 is not None:
        x = x0
    else:
        x0 = cp.zeros_like(b)
        initialize_random_vector(x0)
        x = x0
    start_time = perf_counter()
    iter_times = []
    r = b-matvec(x)
    _tol = dot(r, r).real
    for i in range(max_iter):
        iter_start_time = perf_counter()
        _tol *= tol_factor
        print(f"@@@wanted tol: {_tol}")
        # Fine-level solve: a few BiCGStab iterations from the current x.
        x = bistabcg.slover(
            b=b, matvec=matvec, tol=tol, max_iter=5, x0=x)
        # COARSE START
        r = b-matvec(x)
        r_c = _r_vec(r)
        e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
                              tol=tol, max_iter=5)
        # COARSE END
        # FINE START
        e = _p_vec(e_c)
        x += e  # or just this like?
        # FINE END
        r = b-matvec(x)
        r_norm2 = dot(r, r)
        _tol = max(_tol, r_norm2.real)
        iter_time = perf_counter() - iter_start_time
        print(
            f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
        iter_times.append(iter_time)
        if r_norm2.real < tol:
            print(
                f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
            break
    total_time = perf_counter() - start_time
    # Guard against max_iter == 0, where no iteration was timed.
    avg_iter_time = sum(iter_times) / len(iter_times) if iter_times else 0.0
    print("\nPerformance Statistics:")
    print(f"Total time: {total_time:.6f} s")
    print(f"Average time per iteration: {avg_iter_time:.6f} s")
    return x.copy()

In [45]:
# Solve for x_o with the most recently defined `slover`, using the fine
# operator `bistabcg_dslash` and right-hand side `b__o`.
x_o = slover(
    b=b__o,
    matvec=bistabcg_dslash,
    tol=1e-10,
    max_iter=1000000,
)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001897292 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001767785 sec
@@@wanted tol: 4974845.5
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001827766 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001755609 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001907408 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001772351 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001945089 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001743224 sec
Iteration 0: Residual = 4.974845e+07, Time = 0.019767 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001963586 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001770668 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001961212 sec
multi-gpu wilson 

KeyboardInterrupt: 

# MultiGrid - verify above.

In [None]:
# Rebuild the even half from the odd solution: x_e = b_e + kappa * D_eo(x_o).
# The dslash call leaves D_eo(x_o) in `tmp`.
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e + kappa * tmp
# Assemble qcu_fermion_out with the same shape as the QUDA reference field.
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
for parity, half_field in ((define._EVEN_, x_e), (define._ODD_, x_o)):
    qcu_fermion_out[parity] = half_field.reshape(
        quda_fermion_out[parity].shape)
# Relative L2 difference against the reference.
diff_norm = np.linalg.norm(qcu_fermion_out - quda_fermion_out)
ref_norm = np.linalg.norm(quda_fermion_out)
print(diff_norm / ref_norm)

In [None]:
# Reload both solutions from disk for comparison. Note that this rebinds
# `x_o` and `_x_o`. '_x_o.h5' is the file written by the MG-version cell;
# 'x_o.h5' is presumably written by an earlier cell not shown here -- confirm.
x_o = io.hdf5_xxx2xxx(params, 'x_o.h5')
_x_o = io.hdf5_xxx2xxx(params, '_x_o.h5')

In [None]:
# Show the first 50 entries of x_o for a visual comparison with _x_o.
x_o.flatten()[:50]

In [None]:
# Show the first 50 entries of _x_o for a visual comparison with x_o.
_x_o.flatten()[:50]

In [None]:
# Relative L2 difference of _x_o with respect to x_o.
diff_norm = np.linalg.norm(_x_o - x_o)
ref_norm = np.linalg.norm(x_o)
print(diff_norm / ref_norm)

# End for CG & BISTABCG. (Skip this section; do not run it.)

In [None]:
# cg_solver.end()
# bistabcg_solver.end()

# End for pyqcu. (Skip this section; do not run it.)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)