# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
# Record this process's rank and the total process count in the shared
# parameter array before any per-kernel parameter set is copied from it.
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
# kappa = 1 / (2 * mass + 8); used by the even-odd operator cells below.
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
# The data files are named after the run configuration: the parameter entries
# listed here are joined with "-" in exactly this order.
gauge_filename = "quda_wilson-bistabcg-gauge_-" + "-".join(
    str(params[field])
    for field in (
        define._LAT_X_,
        define._LAT_Y_,
        define._LAT_Z_,
        define._LAT_T_,
        define._LAT_XYZT_,
        define._GRID_X_,
        define._GRID_Y_,
        define._GRID_Z_,
        define._GRID_T_,
        define._PARITY_,
        define._NODE_RANK_,
        define._NODE_SIZE_,
        define._DAGGER_,
    )
) + "-f.h5"
print("Parameters:", params)


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

In [2]:
# Parameter set later handed to the Wilson BiStabCG solver call: a separate
# copy of the global params tagged with set index 0 and plan _SET_PLAN1_.
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
# Native initialisation for this parameter set; it prints the resolved
# configuration shown in the cell output.
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:0
host_params[_SET_PLAN_] :1
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [3]:
# Parameter set for the "eo" dslash without dagger: set index 1, plan
# _SET_PLAN0_, parity _EVEN_, dagger flag _NO_USE_.
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
# Native initialisation for this parameter set (prints its configuration).
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:1
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [4]:
# Parameter set for the "eo" dslash with dagger: set index 2, plan
# _SET_PLAN0_, parity _EVEN_, dagger flag _USE_.
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
# Native initialisation for this parameter set (prints its configuration).
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:2
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [5]:
# Parameter set for the "oe" dslash without dagger: set index 3, plan
# _SET_PLAN0_, parity _ODD_, dagger flag _NO_USE_.
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
# Native initialisation for this parameter set (prints its configuration).
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:3
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [6]:
# Parameter set for the "oe" dslash with dagger: set index 4, plan
# _SET_PLAN0_, parity _ODD_, dagger flag _USE_.
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
# Native initialisation for this parameter set (prints its configuration).
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:4
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [7]:
# Inspect the handle array passed to every applyInitQcu call above
# (presumably one entry is filled per set index — confirm against pyqcu),
# then the buffer object backing it.
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

Set pointers: [94894152579456 94894162366848 94894162544864 94894162720352
 94894162858624              0              0              0
              0              0]
Set pointers data: <memory at 0x7f7fce218e80>


# Read from hdf5 files.

In [8]:
# Load the gauge field; the sibling files are found by swapping the "gauge"
# tag inside the file name for another tag.
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
# Source fermion field for the solves below.
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
# Reference solution stored on disk; used only for comparison with the
# results computed in this notebook.
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
# Zero-initialised buffer that the QCU solver call writes its result into.
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# Loading of the raw eigenvalues/eigenvectors is disabled; only the
# orthogonalised vectors below are needed by the multigrid cells.
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)
# Orthogonalised eigenvectors, later converted into the multigrid test vectors.
orth_eigenvectors_filename = gauge_filename.replace("gauge", "orth_eigenvectors")
print("Orth orth_eigenvectors filename:", orth_eigenvectors_filename)
orth_eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=orth_eigenvectors_filename))
print("Orth orth_eigenvectors data:", orth_eigenvectors.data)
print("Orth orth_eigenvectors shape:", orth_eigenvectors.shape)


Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [9]:
# Solve with QCU's Wilson BiStabCG kernel using the parameter set registered
# under set index 0; fermion_out is the buffer passed in for the result.
qcu.applyWilsonBistabCgQcu(
    fermion_out, fermion_in, gauge, set_ptrs, wilson_cg_params)
# Alternative solver call, kept for reference:
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
# Norm of the deviation from the reference solution loaded from file,
# relative to the norm of that reference.
print(
    "Difference:",
    cp.linalg.norm(fermion_out - quda_fermion_out)
    / cp.linalg.norm(quda_fermion_out),
)

##RANK:0##LOOP:118##Residual:(2.27222e-10,1.97371e-23i)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f7fce6fa930>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0xb22400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f7fd0324a70>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.823967360 sec
######TIME  :4070.66######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG Dslash
> `src_o - kappa**2 * dslash_oe(dslash_eo(src_o))`

In [10]:
def _apply_eo_then_oe(src, eo_params, oe_params):
    """Return ``src - kappa**2 * D2(D1(src))`` for two chained dslash calls.

    ``D1`` is the Wilson dslash run with ``eo_params`` and ``D2`` the one run
    with ``oe_params``. Shared by the plain and daggered operators, which
    differ only in the parameter sets they pass.

    Args:
        src: input vector; both intermediate buffers are allocated with
            ``cp.zeros_like(src)``.
        eo_params: parameter set for the first dslash call.
        oe_params: parameter set for the second dslash call.

    Returns:
        A new array holding ``src - kappa**2 * D2(D1(src))``.
    """
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    # The dslash call takes the destination buffer first, then the source.
    qcu.applyWilsonDslashQcu(tmp0, src, gauge, set_ptrs, eo_params)
    qcu.applyWilsonDslashQcu(tmp1, tmp0, gauge, set_ptrs, oe_params)
    return src-kappa**2*tmp1


def cg_dslash_no_dag(src):
    """Apply the operator without dagger.

    Chains the dslash for ``wilson_dslash_eo_params`` and then
    ``wilson_dslash_oe_params`` and returns ``src - kappa**2 * result``.
    """
    return _apply_eo_then_oe(
        src, wilson_dslash_eo_params, wilson_dslash_oe_params)


def cg_dslash_dag(src):
    """Apply the operator with dagger.

    Same call sequence as ``cg_dslash_no_dag`` but with the two parameter sets
    whose dagger flag is ``_USE_`` (``wilson_dslash_eo_dag_params`` first, then
    ``wilson_dslash_oe_dag_params``).
    """
    return _apply_eo_then_oe(
        src, wilson_dslash_eo_dag_params, wilson_dslash_oe_dag_params)


def cg_dslash(src):
    """Apply ``cg_dslash_no_dag`` and then ``cg_dslash_dag`` to ``src``.

    This composed operator is what the CG solver below receives as ``matvec``.
    """
    forward = cg_dslash_no_dag(src)
    return cg_dslash_dag(forward)


def matvec(src):
    """Matrix-vector product handed to the iterative solvers.

    Thin alias for ``cg_dslash`` so the solver-facing name stays fixed.
    """
    product = cg_dslash(src)
    return product

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test. (pass, don't run this)

In [11]:
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = matvec(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

# Write the gauge's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [12]:
# eigen_solver = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=define._LAT_Ne_,matvec=matvec,dtype=gauge.dtype)
# eigenvalues, eigenvectors = eigen_solver.run()
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Origin CG

In [13]:
# Reference solve with the Python CG driver, for comparison against the
# reference solution loaded from file.
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
# Scratch buffer that receives the output of each single dslash call below.
# (b__o needs no pre-allocation: it is created by the expression that
# first assigns it.)
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp
# b__o -> Dslash^dag b__o
# (the right-hand side gets the daggered operator because matvec applies
# the plain operator followed by the daggered one)
b__o = cg_dslash_dag(b__o)
# Dslash(x_o)=b__o
cg_solver = cg.slover(b=b__o, matvec=matvec, tol=1e-10, max_iter=1000000)
x_o = cg_solver.run()
# x_e  =b_e+kappa*D_eo(x_o)
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out: put both halves back into the layout of the
# reference solution so the two can be compared element-wise.
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001907357 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001858916 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001860560 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001881355 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001778297 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001814943 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001747972 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001854331 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001805147 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001886233 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001787760 sec
Iteration 0: Residual = 4.704832e+05, Time = 0.016955 s
multi-gpu wilson dslash total time: (witho

In [14]:
# Relative deviation of the Python-CG solution from the reference solution.
# Both operands are CuPy arrays, so call cp.linalg.norm directly (as the other
# cells do) instead of relying on NumPy dispatching the call to CuPy.
cp.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
    cp.linalg.norm(quda_fermion_out)

array(3.4577965e-07, dtype=float32)

# Write the gauge's orth_eigenvectors to hdf5 files. (pass, don't run this)

In [15]:
# _eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=eigenvectors, params=params)
# _eigenvectors.shape  # escTtZzYyXx
# def orthogonalize(eigenvectors):
#     _eigenvectors = eigenvectors.copy()
#     size_e, size_s, size_c, size_T, size_t, size_Z, size_z, size_Y, size_y, size_X, size_x = eigenvectors.shape
#     print(size_e, size_s, size_c, size_T, size_t,
#           size_Z, size_z, size_Y, size_y, size_X, size_x)
#     for T in range(size_T):
#         for Z in range(size_Z):
#             for Y in range(size_Y):
#                 for X in range(size_X):
#                     origin_matrix = eigenvectors[:,
#                                                  :, :, T, :, Z, :, Y, :, X, :]
#                     _shape = origin_matrix.shape
#                     _origin_matrix = origin_matrix.reshape(size_e, -1)
#                     condition_number = np.linalg.cond(_origin_matrix.get())
#                     print(f"矩阵条件数: {condition_number}")
#                     a = _origin_matrix[:, 0]
#                     b = _origin_matrix[:, -1]
#                     print(cp.dot(a.conj(), b))
#                     Q = cp.linalg.qr(_origin_matrix.T)[0]
#                     condition_number = np.linalg.cond(Q.get())
#                     print(f"矩阵条件数: {condition_number}")
#                     a = Q[:, 0]
#                     b = Q[:, -1]
#                     print(cp.dot(a.conj(), b))
#                     _eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :] = Q.T.reshape(
#                         _shape)
#     return _eigenvectors
# orth_eigenvectors = orthogonalize(_eigenvectors)
# io.xxx2hdf5_xxx(
#     orth_eigenvectors, params, gauge_filename.replace("gauge", "orth_eigenvectors"))

# MultiGrid - give grids

In [16]:
# Convert the orthogonalised eigenvectors and the even-parity half of the QCU
# solution into the blocked layout consumed by the einsum contractions below
# (resulting shapes are printed in the cell output).
testvectors = io.xxxtzyx2mg_xxxtzyx(input_array=orth_eigenvectors, params=params)
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_out[define._EVEN_], params=params)

Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 8, 4, 4, 8, 4, 8, 4, 4)
Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 4, 8, 4, 8, 4, 4)


# MultiGrid - R*vector
![](./image0-dev40.png)

In [17]:
# Fine-grid vector to restrict: the blocked even-parity solution.
r_src = _src


def r_vec(src):
    """Restrict a blocked fine-grid vector with the test vectors.

    Contracts ``testvectors`` (indices ``escTtZzYyXx``) with ``src`` (indices
    ``scTtZzYyXx``): the ``s``, ``c`` and lower-case indices are summed, while
    ``e`` and the upper-case indices remain, giving an ``eTZYX`` result.
    """
    restriction = "escTtZzYyXx,scTtZzYyXx->eTZYX"
    return contract(restriction, testvectors, src)


r_dest = r_vec(r_src)

In [18]:
# Shape of the restricted vector, in eTZYX index order.
r_dest.shape

(24, 8, 4, 4, 4)

# MultiGrid - P*vector
![](./image1-dev40.png)


In [19]:
# Coarse-grid vector to prolong: the output of the restriction step.
p_src = r_dest


def p_vec(src):
    """Prolong a coarse vector back to the blocked fine-grid layout.

    Contracts the complex conjugate of ``testvectors`` (indices
    ``escTtZzYyXx``) with ``src`` (indices ``eTZYX``), summing over ``e`` and
    returning an ``scTtZzYyXx`` array.

    NOTE(review): the conjugate sits on the prolongation side while ``r_vec``
    uses the test vectors unconjugated — confirm this matches the convention
    under which the vectors were orthogonalised.
    """
    prolongation = "escTtZzYyXx,eTZYX->scTtZzYyXx"
    conj_testvectors = cp.conj(testvectors)
    return contract(prolongation, conj_testvectors, src)


p_dest = p_vec(p_src)

In [20]:
# Shape of the prolonged vector, in scTtZzYyXx index order.
p_dest.shape

(4, 3, 8, 4, 4, 8, 4, 8, 4, 4)

# MultiGrid - verify above
![](./image2-dev40.png)

In [21]:
# Compare the norm of the original fine-grid vector with the norm of its
# restrict-then-prolong image.
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))

275995.97
273596.62


In [22]:
# Relative difference between the original vector and P(R(original)).
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

0.13157189


In [23]:
# Same check with the round trip recomputed inline rather than read from
# p_dest.
print(cp.linalg.norm(r_src-p_vec(r_vec(r_src)))/cp.linalg.norm(r_src))

0.13157189


In [24]:
# First 50 entries of the original vector, for eyeballing against p_dest.
r_src.flatten()[:50]

array([85.06271 +92.72717j , 83.909325+79.079544j, 75.17135 +74.97637j ,
       59.96822 +70.10574j , 62.8976  +83.06627j , 85.20416 +82.942444j,
       83.96615 +87.257614j, 71.37396 +59.93318j , 83.66846 +84.91199j ,
       83.30059 +87.48178j , 69.11276 +83.136505j, 71.507126+84.716995j,
       73.07465 +66.849815j, 86.232895+76.366714j, 68.57071 +73.81118j ,
       76.25281 +74.22365j , 75.36408 +84.71486j , 81.36057 +74.8402j  ,
       77.86105 +73.528595j, 80.72326 +63.705765j, 72.48973 +74.28853j ,
       81.32994 +84.71786j , 81.19293 +81.71508j , 77.24133 +72.61786j ,
       89.19397 +68.572105j, 72.29457 +76.29749j , 73.593895+77.0461j  ,
       82.057526+67.759445j, 81.17849 +63.603233j, 60.194786+82.343704j,
       82.05291 +75.51457j , 86.49907 +80.46945j , 85.169395+80.00699j ,
       76.6777  +73.08816j , 60.411148+79.13046j , 88.5255  +77.89182j ,
       74.886246+70.65077j , 74.13893 +83.50812j , 75.94304 +77.715065j,
       84.23813 +64.963425j, 77.93952 +73.07037j , 

In [25]:
# First 50 entries of the restrict-then-prolong image of r_src.
p_dest.flatten()[:50]

array([93.41222 +72.98684j , 88.589966+73.473206j, 82.77342 +63.466263j,
       82.88548 +72.25285j , 76.22191 +81.9991j  , 75.8486  +82.14936j ,
       75.48599 +74.86893j , 62.339397+82.30746j , 89.61882 +87.05483j ,
       68.38684 +82.4664j  , 62.103523+79.18884j , 63.96153 +62.853924j,
       58.15811 +86.18837j , 76.942505+70.193405j, 77.75906 +75.29802j ,
       81.30434 +81.61147j , 83.67299 +79.39325j , 70.98912 +73.93057j ,
       81.20628 +83.96094j , 64.60312 +83.39235j , 68.45625 +70.36452j ,
       75.966736+77.71112j , 70.8271  +91.58108j , 63.370636+75.864685j,
       83.87373 +74.52101j , 72.73335 +83.38083j , 74.80433 +70.6549j  ,
       88.76382 +76.26838j , 67.34412 +88.166664j, 61.886147+86.6857j  ,
       73.00889 +76.401474j, 80.51925 +85.03609j , 81.63306 +72.03488j ,
       76.5295  +85.57402j , 58.5947  +83.89573j , 94.43098 +82.90075j ,
       77.051895+79.797424j, 74.051605+68.64515j , 86.53514 +67.580055j,
       84.909874+77.39846j , 74.08455 +82.0465j  , 

In [26]:
# Relative difference between r_src and p_dest, shown as the cell's value.
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src)

array(0.13157189, dtype=float32)

In [37]:
# Apply the restrict-then-prolong round trip four times in a row and show the
# first 50 entries, to compare with a single round trip (p_dest).
p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50]

array([93.412254+72.98685j , 88.58998 +73.47323j , 82.77342 +63.466278j,
       82.88546 +72.25287j , 76.22201 +81.999054j, 75.848694+82.14933j ,
       75.48607 +74.8689j  , 62.339462+82.30742j , 89.61886 +87.05488j ,
       68.38687 +82.46645j , 62.103558+79.18888j , 63.961567+62.85396j ,
       58.15818 +86.1884j  , 76.94257 +70.19341j , 77.759125+75.29804j ,
       81.304375+81.61149j , 83.67303 +79.39326j , 70.989136+73.93059j ,
       81.20628 +83.960976j, 64.6031  +83.392365j, 68.45633 +70.36447j ,
       75.96683 +77.71108j , 70.82718 +91.581055j, 63.3707  +75.86465j ,
       83.87375 +74.52105j , 72.73338 +83.380875j, 74.80437 +70.654945j,
       88.76386 +76.26842j , 67.34419 +88.166695j, 61.886215+86.68573j ,
       73.00894 +76.40149j , 80.5193  +85.0361j  , 81.63309 +72.03489j ,
       76.52951 +85.57404j , 58.59469 +83.89575j , 94.430954+82.900764j,
       77.05197 +79.797386j, 74.05167 +68.64513j , 86.53522 +67.58002j ,
       84.90994 +77.39843j , 74.08458 +82.04655j , 

In [None]:
# Relative difference after four round trips. The displayed value equals the
# single round-trip result, which is what one expects if p_vec(r_vec(.)) acts
# as a projector (applying it again changes nothing).
# NOTE(review): confirm that this was the open question marked "???" here.
cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))))/cp.linalg.norm(r_src)

array(0.13157189, dtype=float32)

In [27]:
# Sum over the e index of testvectors * conj(testvectors), i.e. the squared
# magnitudes of the test-vector entries accumulated per fine-grid element,
# flattened for printing.
_mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
                testvectors, cp.conj(testvectors)).flatten()
print(cp.linalg.norm(_mat))
print(_mat[:100])

5.0557094
[0.003666  +6.02348900e-12j 0.0026554 -8.21720123e-12j
 0.0022105 -7.35900316e-12j 0.00270242+1.13765421e-12j
 0.00259649-6.19093318e-12j 0.00240936+2.38682446e-12j
 0.00266349-1.01298908e-11j 0.00373225+5.74245209e-12j
 0.00217243-3.18564576e-12j 0.0023548 -3.06997114e-12j
 0.0029909 -8.19643745e-12j 0.00424045-1.10721016e-11j
 0.00315794+9.08987469e-12j 0.00292291+2.12531034e-12j
 0.00271305-1.86014355e-12j 0.00257359-3.79370233e-12j
 0.00369942-2.46072303e-12j 0.00266941-2.66278297e-12j
 0.00205646+7.77967274e-12j 0.00201446+2.45456867e-14j
 0.00254975-2.53985960e-12j 0.00198968+6.13845086e-12j
 0.00208983+5.04393966e-12j 0.00287932+6.26981236e-12j
 0.00205674+3.95417856e-12j 0.00202364-7.40934224e-12j
 0.00230501+4.54526911e-12j 0.00310857+8.04914295e-12j
 0.00304014-2.63735214e-12j 0.00251305-2.97544086e-12j
 0.00230759+2.04160083e-12j 0.0022495 +7.09273352e-12j
 0.00287219+8.99250986e-12j 0.00211243+5.41409105e-12j
 0.00170461-1.72869466e-12j 0.00233439-2.47441564e-12j


# End for pyqcu. (pass, don't run this)

In [28]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)