# Init for pyqcu.

In [2]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg, bistabcg
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
# Record this process's rank and the total process count in the shared
# parameter array; both values are also embedded in the file name below.
params[define._NODE_SIZE_] = define.size
params[define._NODE_RANK_] = define.rank
print('My rank is ', define.rank)
# Hopping parameter derived from the configured mass: kappa = 1 / (2*m + 8).
kappa = 1 / (8 + 2 * argv[define._MASS_])
# The gauge file name is the fixed prefix followed by thirteen parameter
# values joined with '-': lattice extents, lattice volume, grid extents,
# parity, node rank, node size and the dagger flag, then the '-f.h5' suffix.
gauge_filename = "quda_wilson-bistabcg-gauge_-" + "-".join(
    str(params[field])
    for field in (
        define._LAT_X_, define._LAT_Y_, define._LAT_Z_, define._LAT_T_,
        define._LAT_XYZT_,
        define._GRID_X_, define._GRID_Y_, define._GRID_Z_, define._GRID_T_,
        define._PARITY_, define._NODE_RANK_, define._NODE_SIZE_,
        define._DAGGER_,
    )
) + "-f.h5"
# Printed before the multigrid entries are filled in, exactly as before.
print("Parameters:", params)
# Multigrid sizes per direction.
# NOTE(review): presumably the coarse-lattice extent in each direction (the
# later reshape output splits 32 -> 16 x 2) - confirm against pyqcu.
params[define._MG_T_] = 16
params[define._MG_Z_] = 16
params[define._MG_Y_] = 16
params[define._MG_X_] = 16


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

In [3]:
# Parameter set 0: a private copy of the global params, switched to plan 1.
# This set is the one later passed to qcu.applyWilsonBistabCgQcu.
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
wilson_cg_params[define._SET_INDEX_] = 0
# Initialise QCU for this set; the printed set_ptrs later shows one nonzero
# entry per initialised set.
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:0
host_params[_SET_PLAN_] :1
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [4]:
# Parameter set 1: plan 0, even parity, dagger switched off.
# Used as the first hop inside cg_dslash_no_dag.
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._SET_INDEX_] = 1
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:1
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [5]:
# Parameter set 2: plan 0, even parity, dagger switched on.
# Used as the first hop inside cg_dslash_dag.
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:2
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [6]:
# Parameter set 3: plan 0, odd parity, dagger switched off.
# Used as the second hop inside cg_dslash_no_dag and when preparing the
# odd-site right-hand side.
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._SET_INDEX_] = 3
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:3
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [7]:
# Parameter set 4: plan 0, odd parity, dagger switched on.
# Used as the second hop inside cg_dslash_dag.
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:4
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [8]:
# Show the set_ptrs table after the five applyInitQcu calls, followed by the
# buffer object exposed through its .data attribute.
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

Set pointers: [94701774088992 94701783878992 94701784054208 94701784230608
 94701784368560              0              0              0
              0              0]
Set pointers data: <memory at 0x7fddea2ada80>


# Read from hdf5 files.

In [9]:
# Load the gauge field; the two fermion file names are derived from the gauge
# file name by replacing the "gauge" substring.
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
# Source fermion for the solve.
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
# Reference solution read from file; the QCU result is compared against it
# in the BiStabCG test cell.
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
# Zero-initialised output buffer with the same shape and dtype as the source.
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)
# Orthogonalised eigenvectors, used below as the multigrid test vectors.
# NOTE(review): the (commented-out) cell that writes this file saves the
# array in the multigrid layout; confirm that eigenvectors2esctzyx followed
# by xxxtzyx2mg_xxxtzyx reproduces that layout.
orth_eigenvectors_filename = gauge_filename.replace("gauge", "orth_eigenvectors")
print("Orth orth_eigenvectors filename:", orth_eigenvectors_filename)
orth_eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=orth_eigenvectors_filename))
print("Orth orth_eigenvectors data:", orth_eigenvectors.data)
print("Orth orth_eigenvectors shape:", orth_eigenvectors.shape)


Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [None]:
# Solve with QCU's built-in BiStabCG using parameter set 0; the result is
# written into the preallocated fermion_out buffer.
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
# Relative L2 difference between the QCU result and the reference solution
# loaded from the fermion-out file.
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [11]:
def cg_dslash_no_dag(src):
    """Apply the non-daggered operator: src - kappa**2 * D_oe(D_eo(src)).

    Two Wilson dslash calls are chained, first with the even-parity
    parameter set and then with the odd-parity one, each writing into a
    freshly zeroed buffer shaped like ``src``. ``src`` is not modified.
    """
    hop_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        hop_eo, src, gauge, set_ptrs, wilson_dslash_eo_params)
    hop_oe_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        hop_oe_eo, hop_eo, gauge, set_ptrs, wilson_dslash_oe_params)
    double_hop = (kappa ** 2) * hop_oe_eo
    return src - double_hop


def cg_dslash_dag(src):
    """Apply the daggered operator: src - kappa**2 * D_oe^dag(D_eo^dag(src)).

    Same structure as ``cg_dslash_no_dag`` but with the two parameter sets
    that have the dagger flag enabled. ``src`` is not modified.
    """
    hop_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        hop_eo, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    hop_oe_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        hop_oe_eo, hop_eo, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    double_hop = (kappa ** 2) * hop_oe_eo
    return src - double_hop


def cg_dslash(src):
    """Operator used for CG: the daggered operator applied after the plain one."""
    forward = cg_dslash_no_dag(src)
    return cg_dslash_dag(forward)

def bistabcg_dslash(src):
    """Operator used for BiStabCG: the plain (non-daggered) operator only."""
    result = cg_dslash_no_dag(src)
    return result



# Give matvec.

In [12]:
def matvec(src):
    """Matrix-vector product used by the checks below; delegates to cg_dslash."""
    product = cg_dslash(src)
    return product

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test. (pass, don't run this)

In [13]:
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = cg_dslash(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

# Save the gauge's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [14]:
# eigen_solver = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=define._LAT_Ne_,matvec=cg_dslash,dtype=gauge.dtype)
# eigenvalues, eigenvectors = eigen_solver.run()
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Original CG. (pass, don't run this)

In [15]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# cg_solver = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# x_o = cg_solver.run()
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Original BISTABCG. (pass, don't run this)

In [16]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# bistabcg_solver = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# x_o = bistabcg_solver.run()
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# Save the gauge's orth_eigenvectors to hdf5 files. (pass, don't run this)

In [17]:
# _eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=eigenvectors, params=params)
# _eigenvectors.shape  # escTtZzYyXx
# def orthogonalize(eigenvectors):
#     _eigenvectors = eigenvectors.copy()
#     size_e, size_s, size_c, size_T, size_t, size_Z, size_z, size_Y, size_y, size_X, size_x = eigenvectors.shape
#     print(size_e, size_s, size_c, size_T, size_t,
#           size_Z, size_z, size_Y, size_y, size_X, size_x)
#     for T in range(size_T):
#         for Z in range(size_Z):
#             for Y in range(size_Y):
#                 for X in range(size_X):
#                     origin_matrix = eigenvectors[:,
#                                                  :, :, T, :, Z, :, Y, :, X, :]
#                     _shape = origin_matrix.shape
#                     _origin_matrix = origin_matrix.reshape(size_e, -1)
#                     condition_number = np.linalg.cond(_origin_matrix.get())
#                     print(f"矩阵条件数: {condition_number}")
#                     a = _origin_matrix[:, 0]
#                     b = _origin_matrix[:, -1]
#                     print(cp.dot(a.conj(), b))
#                     Q = cp.linalg.qr(_origin_matrix.T)[0]
#                     condition_number = np.linalg.cond(Q.get())
#                     print(f"矩阵条件数: {condition_number}")
#                     a = Q[:, 0]
#                     b = Q[:, -1]
#                     print(cp.dot(a.conj(), b))
#                     _eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :] = Q.T.reshape(
#                         _shape)
#     return _eigenvectors
# orth_eigenvectors = orthogonalize(_eigenvectors)
# io.xxx2hdf5_xxx(
#     orth_eigenvectors, params, gauge_filename.replace("gauge", "orth_eigenvectors"))

# MultiGrid - give grids.

In [18]:
# Reshape the orthogonalised eigenvectors into the multigrid layout; they
# serve as the test vectors for restriction and prolongation.
testvectors = io.xxxtzyx2mg_xxxtzyx(
    input_array=orth_eigenvectors, params=params)
# Same reshape for the even-parity half of the source fermion, which is the
# fine-grid vector used in the checks below.
_src = io.xxxtzyx2mg_xxxtzyx(input_array=fermion_in[define._EVEN_], params=params)

Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 16, 2, 16, 2, 16, 2, 16, 1)
Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 16, 1)


# MultiGrid - R*vector.
![](./image0-dev40.png)

In [19]:
# Fine-grid vector fed into the restriction check.
r_src = _src


def r_vec(src):
    """Restrict a fine-grid vector to the coarse grid.

    Contracts ``testvectors`` (indices ``escTtZzYyXx``) with ``src``
    (indices ``scTtZzYyXx``), summing over spin, colour and the lowercase
    block indices, and returns an array indexed ``eTZYX``.

    NOTE(review): ``testvectors`` enters unconjugated here, while ``p_vec``
    conjugates it. The usual convention puts the conjugate on the
    restriction - confirm this is intended.
    """
    restrict_subscripts = "escTtZzYyXx,scTtZzYyXx->eTZYX"
    return contract(restrict_subscripts, testvectors, src)


# Coarse-grid image of r_src.
r_dest = r_vec(r_src)

In [20]:
# Shape of the restricted (coarse) vector.
r_dest.shape

(24, 16, 16, 16, 16)

# MultiGrid - P*vector.
![](./image1-dev40.png)


In [21]:
# Coarse-grid vector fed into the prolongation check.
p_src = r_dest


def p_vec(src):
    """Prolong a coarse-grid vector back to the fine grid.

    Contracts the complex conjugate of ``testvectors`` (indices
    ``escTtZzYyXx``) with ``src`` (indices ``eTZYX``), summing over ``e``,
    and returns an array indexed ``scTtZzYyXx``.

    NOTE(review): the conjugate is taken here rather than in ``r_vec``.
    The usual convention prolongs with the unconjugated test vectors -
    confirm this is intended.
    """
    conj_testvectors = cp.conj(testvectors)
    return contract("escTtZzYyXx,eTZYX->scTtZzYyXx", conj_testvectors, src)


# Fine-grid image of the restricted vector, i.e. p_vec(r_vec(r_src)).
p_dest = p_vec(p_src)

In [22]:
# Shape of the prolonged (fine) vector.
p_dest.shape

(4, 3, 16, 2, 16, 2, 16, 2, 16, 1)

# MultiGrid - verify above.
![](./image2-dev40.png)

In [23]:
# Norm of the source versus the norm after one restrict -> prolong round trip.
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))

3547.24
59.900368


In [24]:
# Relative difference between the source and its round trip p_dest; a value
# close to 1 means the round trip recovers almost none of the source.
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

0.9849297


In [25]:
# Same relative difference, recomputed through p_vec(r_vec(.)) instead of
# reusing the stored p_dest.
print(cp.linalg.norm(r_src-p_vec(r_vec(r_src)))/cp.linalg.norm(r_src))

0.9849297


In [26]:
# First 50 entries of the flattened source vector.
r_src.flatten()[:50]

array([1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j], dtype=complex64)

In [27]:
# Last 50 entries of the flattened source vector.
r_src.flatten()[-50:]

array([1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j], dtype=complex64)

In [28]:
# First 50 entries of the flattened round-trip vector.
p_dest.flatten()[:50]

array([ 0.04659319+0.02324029j,  0.02172935+0.02373764j,
        0.00433014+0.01751j   , -0.00296394+0.0202329j ,
        0.01604445+0.02883552j,  0.01156181+0.02185278j,
        0.01796439+0.02046544j,  0.03806579+0.02856953j,
       -0.00057994+0.02139519j,  0.00814394+0.01970419j,
        0.02377587+0.02053813j,  0.04644986+0.0209235j ,
        0.02449472+0.0263091j ,  0.02686587+0.02050207j,
        0.01920257+0.02244986j,  0.01384766+0.02883052j,
        0.04893504+0.01918519j,  0.02309111+0.0152785j ,
        0.00582137+0.01310601j, -0.00259548+0.01449173j,
        0.01643368+0.02228693j,  0.01126553+0.01567948j,
        0.01503985+0.01776963j,  0.03252835+0.01923173j,
       -0.00255426+0.01460834j,  0.00750565+0.01568521j,
        0.02139773+0.01291577j,  0.04278   +0.01793526j,
        0.02640039+0.0215553j ,  0.0235587 +0.02106245j,
        0.01814134+0.01824667j,  0.01417138+0.02420709j,
        0.03960485+0.00448066j,  0.02083965+0.01245028j,
        0.00711764+0.01032599j,

In [29]:
# Last 50 entries of the flattened round-trip vector.
p_dest.flatten()[-50:]

array([ 0.01094885+0.01570752j,  0.01983576+0.020632j  ,
       -0.00574602+0.01129889j,  0.00797576+0.01618678j,
        0.01372513+0.02112185j,  0.00853069+0.0224162j ,
        0.02052952+0.01594587j,  0.01078657+0.02281301j,
        0.00635708+0.0228156j ,  0.00165828+0.02389975j,
        0.02050215+0.02315249j,  0.00875455+0.02168846j,
        0.00200658+0.01898067j,  0.0008822 +0.00909485j,
        0.00746235+0.02880097j,  0.0055498 +0.02184863j,
        0.01128549+0.0184437j ,  0.02195598+0.02432124j,
        0.01021375+0.02255759j,  0.0214696 +0.02921664j,
        0.02400528+0.03360877j,  0.015725  +0.04037272j,
        0.03828564+0.03577142j,  0.0207401 +0.04201239j,
        0.00896282+0.03697643j,  0.00637373+0.03573557j,
        0.02691852+0.03914295j,  0.02166207+0.03265171j,
        0.01566513+0.02397102j,  0.0190839 +0.01345441j,
        0.00699722+0.03539582j,  0.01016165+0.02843208j,
        0.02356643+0.03060293j,  0.03815781+0.04208371j,
        0.01397155+0.02071342j,

In [30]:
# Round-trip difference normalised by the source norm (same quantity as the
# printed check earlier, shown here as a cell result).
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src)

array(0.9849297, dtype=float32)

In [31]:
# Round-trip difference normalised by the round-trip vector's own norm.
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(p_dest)

array(58.326553, dtype=float32)

In [32]:
# First 50 entries after four consecutive restrict -> prolong round trips.
# NOTE(review): if p_vec(r_vec(.)) were a projector, repeated application
# would leave the first round trip unchanged.
p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50]

array([ 2.0849543e-06+7.0168392e-07j,  6.1799312e-08+3.5346488e-07j,
       -3.5061737e-07+2.7540869e-07j, -1.0533192e-06+6.4866674e-07j,
       -1.7180119e-08+9.7527732e-07j, -7.1109511e-08+2.4397053e-07j,
        4.5579938e-08+3.1661494e-07j,  1.8433783e-06+1.0852751e-06j,
       -8.3046461e-07+5.8203534e-07j, -1.5043167e-07+2.6044762e-07j,
        4.2583912e-07+2.8543712e-07j,  1.5054635e-06+5.7549403e-07j,
        4.3504215e-07+1.2200624e-06j,  4.7519507e-07+4.2979565e-07j,
        1.5095910e-07+4.3753877e-07j, -5.4562104e-08+1.0110193e-06j,
        2.2571689e-06+6.9206726e-07j,  9.5457906e-08+2.0293641e-07j,
       -3.4790759e-07+1.8645432e-07j, -8.3304406e-07+5.0699094e-07j,
        1.9302160e-08+1.0836305e-06j, -6.3569090e-08+2.3137167e-07j,
        4.2084711e-08+2.6805753e-07j,  1.5825755e-06+7.4145015e-07j,
       -8.9206083e-07+4.6192935e-07j, -1.1955872e-07+1.9013939e-07j,
        4.0238859e-07+1.2770616e-07j,  1.3909023e-06+3.6574977e-07j,
        5.1001996e-07+9.2893941e-0

In [33]:
# Relative difference between the source and four consecutive round trips.
# Flagged as unexplained ("???") by the original author.
cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))))/cp.linalg.norm(r_src)

array(0.99999994, dtype=float32)

In [34]:
# _mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
#                 testvectors, cp.conj(testvectors)).flatten()
# print(cp.linalg.norm(_mat))
# print(_mat[:100])

# MultiGrid - R*matvec\*P.

In [35]:
def _r_matvec_p(src, matvec):
    """Apply the coarse operator R * matvec * P and return the coarse array.

    ``src`` is first converted with ``io.xxx2eTZYX`` (using the global
    ``params``), prolonged with ``p_vec``, passed through ``matvec`` and
    restricted again with ``r_vec``.
    """
    coarse_in = io.xxx2eTZYX(src, params)
    fine_in = p_vec(coarse_in)
    fine_out = matvec(fine_in)
    return r_vec(fine_out)


def r_matvec_p(src, matvec):
    """Coarse operator in the form handed to the solvers.

    Runs ``_r_matvec_p`` and passes its result through ``io.array2xxx``.
    NOTE(review): array2xxx presumably flattens the coarse array so it
    matches the flattened right-hand side given to the solver - confirm.
    """
    coarse_out = _r_matvec_p(src, matvec)
    return io.array2xxx(coarse_out)

# MultiGrid - verify above.

In [36]:
# Apply the fine-grid operator (matvec, i.e. cg_dslash) directly to the source.
D_r_src = matvec(r_src)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001938783 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001865870 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001924071 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001891892 sec


In [37]:
# First 50 entries of the fine-grid operator applied to the source.
D_r_src.flatten()[:50]

array([ 0.09512203-0.15065098j,  0.02459466-0.10281426j,
        0.14802423-0.04478627j,  0.05378214-0.09193704j,
       -0.08474712+0.0208632j , -0.06649091+0.06668942j,
       -0.03820895+0.03143284j, -0.06110204+0.1998555j ,
       -0.09466349-0.10954653j, -0.09903856+0.15328293j,
       -0.11581516+0.23665299j,  0.15433274+0.0833789j ,
       -0.11228501+0.18325019j,  0.07416645+0.00099519j,
        0.05309803-0.03660914j,  0.034646  -0.03932991j,
       -0.00817882-0.03089109j,  0.03485045+0.13626698j,
       -0.12274602-0.06901661j, -0.10658678+0.15540114j,
        0.09019466+0.07479304j,  0.00329622+0.01262958j,
       -0.2735389 +0.08964469j, -0.02719212+0.18662325j,
        0.07507145-0.0618236j , -0.09112616+0.10951558j,
        0.08583128+0.02523299j, -0.0734987 -0.1731855j ,
       -0.09989857+0.04082112j, -0.1259279 +0.17906842j,
        0.00141903+0.04172344j, -0.08249421-0.07952813j,
        0.06730361-0.03395787j, -0.16046013+0.01053373j,
       -0.04409004+0.2682615j ,

In [38]:
# Coarse-grid counterpart of the previous result: apply R * cg_dslash * P to
# the restricted source and prolong the result back to the fine grid.
p_r_D_p_r_dest=p_vec(_r_matvec_p(r_dest,matvec=cg_dslash))

Input Array Shape: (24, 16, 16, 16, 16)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001988235 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001953379 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001918504 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001908448 sec


In [39]:
# First 50 entries of the prolonged coarse-operator result.
p_r_D_p_r_dest.flatten()[:50]

array([ 4.86242701e-04+1.15070325e-04j,  1.94462009e-05-8.00223097e-06j,
       -1.28275366e-04-1.10325218e-05j, -3.54040007e-04-3.14918907e-06j,
        3.76428652e-05+2.09686317e-04j, -1.45988961e-05-2.06654440e-05j,
        1.09659668e-05+2.51320562e-05j,  4.41023614e-04+7.67709789e-05j,
       -3.90621193e-04+5.70648954e-05j, -8.83624889e-05-2.40500922e-05j,
        1.00018566e-04+8.05497184e-05j,  4.49191139e-04+4.10566572e-05j,
       -1.59319297e-05+9.45660431e-05j,  5.00679162e-05+9.86587984e-05j,
        2.91853848e-05+9.79452307e-05j, -8.77865168e-05+1.49863772e-04j,
        5.23884781e-04+1.14719958e-04j,  1.76138201e-05-2.61575606e-05j,
       -1.28341548e-04-3.21850166e-05j, -2.82396097e-04-1.01822279e-05j,
        5.37518317e-05+2.27699697e-04j, -2.58081309e-05-2.42325532e-05j,
        8.07831748e-06+7.38699600e-06j,  3.83968523e-04+2.04991338e-05j,
       -3.87999637e-04+1.67712551e-05j, -7.25566642e-05-4.39972646e-05j,
        9.37112636e-05+2.87809671e-05j,  4.07712592

In [40]:
# Relative difference between the fine-grid operator result and the
# prolonged coarse-operator result.
cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src)

array(0.99997723, dtype=float32)

In [41]:
# First 50 entries of the flattened restricted vector.
r_dest.flatten()[:50]

array([-0.23515025-0.0043213j , -0.22336413-0.02604285j,
       -0.23544359-0.02095123j, -0.2075752 -0.01506144j,
        0.20574223+0.01090441j,  0.20943129-0.00165882j,
        0.20778269+0.01311309j,  0.19190224+0.03145706j,
       -0.20874417-0.02377207j, -0.20707351-0.02077318j,
       -0.19330534-0.01891523j, -0.21399519-0.04410193j,
        0.21663521+0.04601838j,  0.20285456+0.03325263j,
        0.2086901 +0.00763768j,  0.21433663+0.032929j  ,
       -0.23770127-0.01773039j, -0.22200549-0.02634005j,
       -0.19559099-0.0249592j , -0.23870035-0.00976155j,
        0.2251733 +0.02126457j,  0.18922749-0.01719255j,
        0.19232392+0.01558745j,  0.19756025+0.02490024j,
       -0.20747781-0.03059887j, -0.20759493-0.04224508j,
       -0.20937707-0.01805812j, -0.19636576-0.03145631j,
        0.19385953+0.03192898j,  0.2194323 +0.025717j  ,
        0.21178111+0.00622029j,  0.2182849 +0.01917719j,
       -0.19124664-0.01461449j, -0.20485187-0.02368497j,
       -0.19561641-0.02218995j,

In [42]:
# First 50 entries of p_vec(r_dest); this recomputes p_dest, which was
# defined as p_vec(p_src) with p_src = r_dest.
p_vec(r_dest).flatten()[:50]

array([ 0.04659319+0.02324029j,  0.02172935+0.02373764j,
        0.00433014+0.01751j   , -0.00296394+0.0202329j ,
        0.01604445+0.02883552j,  0.01156181+0.02185278j,
        0.01796439+0.02046544j,  0.03806579+0.02856953j,
       -0.00057994+0.02139519j,  0.00814394+0.01970419j,
        0.02377587+0.02053813j,  0.04644986+0.0209235j ,
        0.02449472+0.0263091j ,  0.02686587+0.02050207j,
        0.01920257+0.02244986j,  0.01384766+0.02883052j,
        0.04893504+0.01918519j,  0.02309111+0.0152785j ,
        0.00582137+0.01310601j, -0.00259548+0.01449173j,
        0.01643368+0.02228693j,  0.01126553+0.01567948j,
        0.01503985+0.01776963j,  0.03252835+0.01923173j,
       -0.00255426+0.01460834j,  0.00750565+0.01568521j,
        0.02139773+0.01291577j,  0.04278   +0.01793526j,
        0.02640039+0.0215553j ,  0.0235587 +0.02106245j,
        0.01814134+0.01824667j,  0.01417138+0.02420709j,
        0.03960485+0.00448066j,  0.02083965+0.01245028j,
        0.00711764+0.01032599j,

# MultiGrid - CG (BUG!!!)

In [43]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)

In [44]:
# # # Dslash(x_o)=b__o
# # cg_solver = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# # x_o = cg_solver.run()

# # mg version
# mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(b__o, params), params)).flatten()
# cg_solver = cg.slover(b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=cg_dslash),
#                       tol=1e-5, max_iter=1000000)
# mg_x_o = cg_solver.run()

In [45]:
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)


In [46]:
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# MultiGrid - BISTABCG (TESTING......)

In [47]:
# Split the source fermion into flattened even-site and odd-site halves.
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
# Output buffer for the dslash call below. The former
# `b__o = cp.zeros_like(b_o)` pre-allocation was removed: b__o was rebound
# two statements later without ever being read, so it only cost a
# full-size GPU allocation.
tmp = cp.zeros_like(b_o)
# Odd-site right-hand side: b__o = b_o + kappa * D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001982987 sec


In [48]:
# # Dslash(x_o)=b__o
# bistabcg_solver = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# x_o = bistabcg_solver.run()
# io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')

In [49]:
# mg version
mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(b__o, params), params)).flatten()
bistabcg_solver = bistabcg.slover(
    b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=bistabcg_dslash), tol=1e-10, max_iter=1000000)
mg_x_o = bistabcg_solver.run()
_x_o=io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')


Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 16, 1)
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002225302 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002038236 sec
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.040553618 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001925606 sec
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.045482421 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001935734 sec
Iteration 0: Residual = 7.499789e+05, Time = 0.484485 s
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.039964035 sec
multi-gpu wilson dslash total

KeyboardInterrupt: 

In [None]:
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

In [None]:
# Reload the fine-grid solution and the multigrid solution from disk.
# NOTE(review): in this notebook 'x_o.h5' is only written by a cell that is
# currently commented out, so this cell depends on a file left over from an
# earlier run.
x_o=io.hdf5_xxx2xxx(params,'x_o.h5')
_x_o=io.hdf5_xxx2xxx(params,'_x_o.h5')

Dest Shape: (6291456,)
Dest Shape: (6291456,)


In [None]:
# First 50 entries of the solution loaded from 'x_o.h5'.
x_o.flatten()[:50]

array([74.27604 +92.02284j , 75.06629 +87.020996j, 63.950462+81.42317j ,
       71.90431 +81.19496j , 82.32697 +77.19529j , 82.58721 +76.42146j ,
       75.92041 +76.61929j , 83.29001 +63.37289j , 90.26471 +89.01645j ,
       85.437584+67.62439j , 82.527954+61.06994j , 66.877235+63.152378j,
       86.607956+60.486282j, 69.97304 +78.853806j, 74.98956 +78.34286j ,
       82.30312 +82.18618j , 80.42873 +82.24485j , 75.13653 +69.27514j ,
       84.342064+79.84675j , 83.38979 +62.740913j, 70.93342 +69.10556j ,
       78.41749 +76.8612j  , 92.75483 +71.3723j  , 76.95801 +63.98175j ,
       76.83701 +84.04088j , 86.23326 +71.82796j , 74.12482 +74.47032j ,
       80.77271 +88.2747j  , 88.081   +68.75456j , 86.93835 +63.496307j,
       76.293976+74.005066j, 85.7808  +80.60354j , 73.14485 +80.3956j  ,
       86.552826+75.09205j , 84.15033 +56.783974j, 82.566315+92.946976j,
       80.47373 +77.690834j, 69.55831 +75.1817j  , 69.10493 +87.87933j ,
       78.504234+86.637924j, 85.1525  +73.68484j , 

In [None]:
# First 50 entries of the multigrid solution loaded from '_x_o.h5'.
_x_o.flatten()[:50]

array([66.847145+62.071053j, 65.7746  +64.32228j , 62.77684 +55.22659j ,
       64.532135+57.384136j, 61.747215+63.420807j, 54.253204+63.536137j,
       65.64795 +59.847565j, 49.11704 +66.15434j , 64.961174+63.937435j,
       55.86785 +66.95802j , 48.697647+61.199223j, 59.995064+51.557964j,
       62.02426 +65.27819j , 62.55652 +51.941917j, 54.948822+52.38839j ,
       62.121773+61.34056j , 51.890274+64.851j   , 53.718792+65.21905j ,
       63.779926+71.71518j , 57.84163 +63.586067j, 61.04749 +60.77568j ,
       56.099327+62.94026j , 55.647743+74.57672j , 49.90782 +60.134953j,
       66.65774 +58.41134j , 60.748177+67.344574j, 66.397095+62.058064j,
       72.403046+62.990658j, 62.56825 +63.934586j, 58.13651 +61.227074j,
       52.613888+62.562634j, 65.98464 +65.92459j , 64.20118 +53.003407j,
       60.09849 +71.15988j , 47.549126+70.186874j, 69.97476 +58.859016j,
       69.200745+63.028435j, 63.339504+58.396515j, 67.01509 +58.644043j,
       71.88318 +51.202713j, 62.149536+71.344025j, 

In [None]:
# Relative L2 difference between the multigrid solution and the solution
# loaded from 'x_o.h5'. This uses np.linalg.norm, whereas the other checks
# in the notebook use cp.linalg.norm.
print(np.linalg.norm(_x_o-x_o) /
      np.linalg.norm(x_o))

0.23569165


# End for CG & BISTABCG. (pass, don't run this)

In [None]:
# cg_solver.end()
# bistabcg_solver.end()

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)