# Init for pyqcu.

In [2]:
import cupy as cp
import numpy as np
import functools
from pyqcu.cuda import define
from pyqcu.cuda import io
from pyqcu.cuda import qcu
from pyqcu.cuda import eigen, cg, bistabcg
from time import perf_counter
from opt_einsum import contract
from pyqcu.cuda.set import params, argv, set_ptrs
params[define._MAX_ITER_] = 1000
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
params[define._DATA_TYPE_] = define._LAT_C128_
argv = argv.astype(define.dtype_half(params[define._DATA_TYPE_]))
argv[define._MASS_] = 0.0
argv[define._TOL_] = 1e-8
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
params[define._MG_X_] = int(params[define._LAT_X_]/params[define._LAT_P_])
params[define._MG_Y_] = params[define._LAT_Y_]
params[define._MG_Z_] = params[define._LAT_Z_]
params[define._MG_T_] = int(params[define._LAT_T_]/2)
print("Parameters:", params)
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

My rank is  0
Parameters: [     32      32      32      32 1048576       1       1       1       1
       0       0       1       0    1000       4       0       0       1
      32      32      16      24]
gridDim.x               :16384
blockDim.x              :32
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :1000
host_params[_DATA_TYPE_]:4
host_params[_SET_INDEX_]:0
host_params[_SET_PLAN_] :1
host_params[_MG_X_]     :1
host_params[_MG_Y_]     :32
host_params[_MG_Z_]     :32
host_params[_MG_T_]     :16
host_params[_LAT_E_]    :24
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-08
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]      

# Read from hdf5 files.

In [4]:
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename).astype(define.dtype(params[define._DATA_TYPE_]))
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename).astype(define.dtype(params[define._DATA_TYPE_]))
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename).astype(define.dtype(params[define._DATA_TYPE_]))
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename).astype(define.dtype(params[define._DATA_TYPE_]))
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename)).astype(define.dtype(params[define._DATA_TYPE_]))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0x951200000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [5]:
# qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
qcu.applyWilsonCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

Fermion out data: <MemoryPointer 0x951200000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7ff758db8b30>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0x945200000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7ff714f12db0>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
##RANK:0##LOOP:143##Residual:(7.79475e-09,-1.03008e-28i)
multi-gpu wilson Cg total time: (without malloc free memcpy) :5.836207206 sec
######TIME  :17439.8######
##RANK      :0
##LOOP      :-2
##tmp0      :(4.51227e-09,4.03897e-27i)
##rho_prev  :(1.02483e-08,-6.16084e-29i)
##rho       :(7.79475e-09,-1.03008e-28i)
##alpha     :(2.27121,-2.04663e-18i)
##beta      :(0.760587,-5.4789e-21i)
##send_tmp  :(7.79475e-09,-1.88834e-29i)
##norm2_tmp :(492090,-3.22025e-15i)
##diff_tmp  :(1.58401e-14,6.52841e-35i)
##lat_4dim  :(524288,0i)
Difference: 3.071569685546415e-08


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [None]:
def pdslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_params)
    return src-kappa**2*tmp1


def pdslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return pdslash_dag(pdslash_no_dag(src))

def dslash_no_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_params)
    return dest

def dslash_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    return dest

def dslash(src):
    return dslash_no_dag(src)

def bistabcg_dslash(src):
    return pdslash_no_dag(src)

print(cp.linalg.norm((fermion_out[define._EVEN_]-kappa *
               dslash(fermion_out[define._ODD_]))-fermion_in[define._ODD_]))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.007938659 sec
1.852514506009482e-12


# Give matvec.

In [None]:
# def matvec(src):
#     return gamma5_vec(pdslash_no_dag(src)).reshape(src.shape)
# def matvec(src):
#     return cg_dslash(src).reshape(src.shape)
def matvec(src):
    return bistabcg_dslash(src).reshape(src.shape)

# Origin CG. (pass, don't run this)

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Origin BISTABCG. (pass, don't run this)

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)