# Init for pyqcu.

In [None]:

import cupy as cp
import numpy as np
import functools
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg, bistabcg, amg, linalg
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
params[define._MG_X_] = 8
params[define._MG_Y_] = 8
params[define._MG_Z_] = 8
params[define._MG_T_] = 8
print("Parameters:", params)
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)
params[define._LAT_E_] = 12


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

# Read from hdf5 files.

In [2]:
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)
# testvectors_filename = gauge_filename.replace(
#     "gauge", "testvectors")
# print("Testvectors filename:", testvectors_filename)
# testvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=testvectors_filename))
# print("Testvectors data:", testvectors.data)
# print("Testvectors shape:", testvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0x928400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [3]:
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

Fermion out data: <MemoryPointer 0x928400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f7a5c3d2cf0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0x922400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f7a150da2b0>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
##RANK:0##LOOP:118##Residual:(2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.640832000 sec
######TIME  :2515.33######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [4]:
def pdslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_params)
    return src-kappa**2*tmp1


def pdslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return pdslash_dag(pdslash_no_dag(src))

def dslash_no_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_params)
    return dest

def dslash_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    return dest

def dslash(src):
    return dslash_no_dag(src)

def bistabcg_dslash(src):
    return pdslash_no_dag(src)

print(cp.linalg.norm((fermion_out[define._EVEN_]-kappa *
               dslash(fermion_out[define._ODD_]))-fermion_in[define._ODD_]))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001818535 sec
0.0009944807


# Give matvec.

In [5]:
# def matvec(src):
#     return gamma5_vec(pdslash_no_dag(src)).reshape(src.shape)
# def matvec(src):
#     return cg_dslash(src).reshape(src.shape)
def matvec(src):
    return bistabcg_dslash(src).reshape(src.shape)

# AMG - SETUP

In [None]:
_testvectors = amg.setup(n=fermion_in[define._EVEN_].size, k=params[define._LAT_E_],
                         matvec=matvec, dtype=fermion_in.dtype)
testvectors = io.xxxtzyx2mg_xxxtzyx(
    io.eigenvectors2esctzyx(_testvectors, params), params)
print("Shape of testvectors: ", testvectors.shape)
for i in range(len(_testvectors)):
    if i==0:
        pass
    else:
        print("Ax/x", bistabcg_dslash(_testvectors[i])/_testvectors[i])
        projections = cp.dot(_testvectors[:i].conj(), _testvectors[i])
        max_proj = cp.max(cp.abs(projections)).get()
        print(f"Maximum projection onto existing basis: {max_proj:.2e}")
        j = i+1
        if j == len(_testvectors):
            j = 0
        print("Difference between v_i and v_j:", cp.linalg.norm(
            _testvectors[i]-_testvectors[j])/cp.linalg.norm(_testvectors[i]))
for T in range(params[define._MG_T_]):
    for Z in range(params[define._MG_Z_]):
        for Y in range(params[define._MG_Y_]):
            for X in range(params[define._MG_X_]):
                testvectors[:, :, :, T, :, Z, :, Y, :, X, :] = linalg.orthogonalize_matrix(
                    testvectors[:, :, :, T, :, Z, :, Y, :, X, :].reshape(params[define._LAT_E_], -1)).reshape(testvectors[:, :, :, T, :, Z, :, Y, :, X, :].shape)
# io.xxx2hdf5_xxx(
#     testvectors, params, gauge_filename.replace("gauge", "testvectors"))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001828490 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001798783 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001844768 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001740281 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001806349 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001744247 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001825618 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001732968 sec
Iteration 0: Residual = 9.999999e-01, Time = 0.024683 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001827388 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001764970 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001803810 sec
multi-gpu wilson dslash total time: (witho

# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [7]:
gamma5 = cp.array([[1, 0, 0, 0], [0, 1, 0, 0], [
                   0, 0, -1, 0], [0, 0, 0, -1]]).astype(gauge.dtype)
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    return contract("ss,sctzyx->sctzyx", gamma5, io.fermion2sctzyx(src, params))


def vec_gamma5(src):
    return contract("sctzyx,ss->sctzyx", io.fermion2sctzyx(src, params), gamma5)


_src = fermion_out[define._EVEN_]
print(_src.shape)
print(cp.linalg.norm(gamma5_vec(dslash_no_dag(_src))-dslash_dag(gamma5_vec(_src))))
print(cp.linalg.norm(gamma5_vec(pdslash_no_dag(_src))-pdslash_dag(gamma5_vec(_src))))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 32, 32, 32, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.021350110 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.021432283 sec
0.0
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.021277244 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.020650024 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.021397111 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.020637922 sec
0.0


# Give guage's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [None]:
# eigenvalues, _eigenvectors = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=params[define._LAT_E_], matvec=matvec, dtype=gauge.dtype)
# print(eigenvalues)
# eigenvectors = io.xxxtzyx2mg_xxxtzyx(
#     io.eigenvectors2esctzyx(_eigenvectors, params), params)
# print("Shape of eigenvectors: ", eigenvectors.shape)
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = _eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = matvec(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(_eigenvectors[i] - _eigenvectors[j])/cp.linalg.norm(_eigenvectors[i]):.2e}")
# # for T in range(params[define._MG_T_]):
# #     for Z in range(params[define._MG_Z_]):
# #         for Y in range(params[define._MG_Y_]):
# #             for X in range(params[define._MG_X_]):
# #                 eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :] = linalg.orthogonalize_matrix(
# #                     eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :].reshape(params[define._LAT_E_], -1)).reshape(eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :].shape)
# #                 print(f"Condition number of eigenvectors[:, :, :, {T}, :, {Z}, :, {Y}, :, {X}, :].T.get(): {np.linalg.cond(eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :].reshape(params[define._LAT_E_], -1).T.get())}")
# testvectors = linalg.orthogonalize_matrix(eigenvectors)
# # testvectors = eigenvectors
# # io.xxx2hdf5_xxx(
# #     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# # io.xxx2hdf5_xxx(
# #     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))
# print(_eigenvectors.flatten()[:50])
# print(eigenvectors.flatten()[:50])

# Origin CG. (pass, don't run this)

In [9]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=matvec, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Origin BISTABCG. (pass, don't run this)

In [10]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=matvec, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# MultiGrid - give grids.

In [11]:
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_in[define._EVEN_], params=params)
# _src = io.xxxtzyx2mg_xxxtzyx(
#     input_array=fermion_out[define._EVEN_], params=params)

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 8, 4, 8, 4, 8, 2)


<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [12]:
r_src = _src


def r_vec(src):
    return contract("escTtZzYyXx,scTtZzYyXx->eTZYX", testvectors, src)


r_dest = r_vec(r_src)
p_src = r_dest


def p_vec(src):
    return contract("escTtZzYyXx,eTZYX->scTtZzYyXx", cp.conj(testvectors), src)


p_dest = p_vec(p_src)
_mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
                testvectors, cp.conj(testvectors)).flatten()
print(r_src.flatten()[:50])
print(p_dest.flatten()[:50])
print(_mat[:100])
print(
    p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50])
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))
print(cp.linalg.norm(_mat))
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))
print(cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src))))))))
                     )/cp.linalg.norm(r_src))

[1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j
 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j 1.+1.j]
[1.1316085 +0.97290325j 1.1677741 +0.9811758j  1.0206206 +0.7987683j
 1.0778488 +0.92771906j 0.98684543+1.0332992j  0.944275  +1.0448174j
 0.93170226+0.94410336j 0.6917768 +1.0461594j  1.0637015 +1.1192406j
 0.72527033+1.0725586j  0.71114624+1.0591164j  0.7222965 +0.8292029j
 0.6715984 +1.0946276j  0.9244542 +0.87174964j 0.9114419 +0.9528378j
 1.0126066 +1.0277419j  0.9679123 +1.0423104j  0.9109542 +0.98289186j
 0.9881547 +1.0248761j  0.8166652 +1.0404435j  0.8938703 +0.85892653j
 0.98097944+0.9828001j  0.93107706+1.1786513j  0.7692192 +0.9553598j
 1.0539947 +0.94583845j 0.8320094 +1.0883316j  0.9273122 +0.96285963j
 1.0941741 +1.015300

<!-- # MultiGrid - verify above.
![](./image2-dev40.png) -->

# MultiGrid - R*matvec\*P.

In [13]:
def _r_matvec_p(src, matvec):
    return r_vec(matvec(p_vec(io.xxx2eTZYX(src, params))))


def r_matvec_p(src, matvec):
    return io.array2xxx(_r_matvec_p(src, matvec))
D_r_src = matvec(r_src)
p_r_D_p_r_dest = p_vec(_r_matvec_p(r_vec(r_src), matvec=matvec))
print(D_r_src.flatten()[:50])
print(p_r_D_p_r_dest.flatten()[:50])
print(cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002022697 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001781170 sec
Input Array Shape: (12, 8, 8, 8, 8)
Dest Shape: (12, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001887234 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001813882 sec
[ 0.09344786-0.10208309j  0.04712683-0.05375636j  0.20205879-0.02301359j
  0.10501933-0.06024492j -0.03274214-0.00194919j -0.03296566+0.01795989j
 -0.00965858-0.00275993j -0.04963386+0.21724534j -0.11307013-0.06256664j
 -0.06049633+0.11239302j -0.04292512+0.18137288j  0.11644292+0.15530562j
 -0.1051693 +0.20847821j  0.17299628+0.01849371j  0.08634686+0.01981819j
  0.01468253-0.00553358j  0.03066605+0.04353732j  0.02836114+0.15360475j
 -0.07397485-0.06877553j -0.07699823+0.1980232j   0.09499002+0.06847632j
 -0.00526285-0.02889919j -0.2506256 +0.03647596j -0.01345444+0.23340106j
  0.08127064-0.06814575

# MG-BISTABCG

In [14]:
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
b__o = cp.zeros_like(b_o)
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001843234 sec


In [15]:
# # Dslash(x_o)=b__o
x_o = bistabcg.slover(
    b=b__o, matvec=matvec, tol=1e-10, max_iter=1000000)
# io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001883581 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001889823 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001872768 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001772165 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001812918 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001754664 sec
Iteration 0: Residual = 4.974845e+07, Time = 0.018261 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001864390 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001762029 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001837736 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001750759 sec
Iteration 1: Residual = 3.800150e+07, Time = 0.017150 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.

In [None]:
# mg version
mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
    io.fermion2sctzyx(b__o, params), params)).flatten()
mg_x_o = bistabcg.slover(
    b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=matvec), tol=1e-10, max_iter=1000000)
_x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
# io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 8, 4, 8, 4, 8, 2)
Input Array Shape: (49152,)
Dest Shape: (12, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001902214 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001785878 sec
Input Array Shape: (49152,)
Dest Shape: (12, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001951766 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001784565 sec
Input Array Shape: (49152,)
Dest Shape: (12, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001854221 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001783141 sec
Iteration 0: Residual = 4.906880e+07, Time = 2.397107 s
Input Array Shape: (49152,)
Dest Shape: (12, 8, 8, 8, 8)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001975740 sec
multi-gpu wilson dslash total time: (without malloc free 

In [None]:
# x_e  =b_e+kappa*D_eo(x_o)
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)
print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
      np.linalg.norm(quda_fermion_out))
# x_o = io.hdf5_xxx2xxx(params, 'x_o.h5')
# _x_o = io.hdf5_xxx2xxx(params, '_x_o.h5')
print(x_o.flatten()[:50])
print(_x_o.flatten()[:50])
print(np.linalg.norm(_x_o-x_o) /
      np.linalg.norm(x_o))

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)