# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg, bistabcg, amg
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
params[define._MG_X_] = 16
params[define._MG_Y_] = 16
params[define._MG_Z_] = 16
params[define._MG_T_] = 16
print("Parameters:", params)
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

# Read from hdf5 files.

In [2]:
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [3]:
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

##RANK:0##LOOP:118##Residual:(Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7fd44034dfb0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0xb22400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7fd3c5d8b3b0>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.731905664 sec
######TIME  :3150.76######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [4]:
def pdslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_params)
    return src-kappa**2*tmp1


def pdslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return pdslash_dag(pdslash_no_dag(src))

def dslash_no_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_params)
    return dest

def dslash_dag(src):
    dest = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        dest, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    return dest

def dslash(src):
    return dslash_no_dag(src)

def bistabcg_dslash(src):
    return pdslash_no_dag(src)

print(cp.linalg.norm((fermion_out[define._EVEN_]-kappa *
               dslash(fermion_out[define._ODD_]))-fermion_in[define._ODD_]))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001949831 sec
0.0009944807


# AMG - SETUP

In [5]:
n = fermion_in[define._EVEN_].size
_testvectors = amg.setup(n=n, k=define._LAT_E_,
                         matvec=pdslash_no_dag, dtype=fermion_in.dtype)
testvectors = io.xxxtzyx2mg_xxxtzyx(
    io.eigenvectors2esctzyx(_testvectors, params), params)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001929854 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001860885 sec
(given) rayleigh_quotient_current: 0.99995327
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001977945 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001878685 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002001874 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001890927 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001934815 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001876469 sec
Iteration 0: Residual = 2.085259e+00, Time = 0.075897 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001979902 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001879900 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001927619 

In [6]:
cp.linalg.norm(_testvectors[-13])

array(1.0000001, dtype=float32)

In [7]:
_testvectors[-13]@_testvectors[:-13].T

array([ 0.00066658+3.51069611e-04j, -0.00040457-1.72454747e-05j,
       -0.00154197-5.36637308e-05j, -0.00034652-4.44134319e-04j,
       -0.00060299-1.97542555e-04j, -0.000424  +1.04021645e-04j,
        0.00099202-1.94274195e-04j,  0.00015209+6.35773060e-04j,
       -0.0008137 +2.16922694e-04j,  0.00028883-1.70680418e-04j,
       -0.00020731-2.59080669e-04j], dtype=complex64)

In [8]:
testvectors.shape

(24, 4, 3, 16, 2, 16, 2, 16, 2, 16, 1)

# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [9]:
gamma5 = cp.array([[1, 0, 0, 0], [0, 1, 0, 0], [
                   0, 0, -1, 0], [0, 0, 0, -1]]).astype(gauge.dtype)
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    return contract("ss,sctzyx->sctzyx", gamma5, io.fermion2sctzyx(src, params))


def vec_gamma5(src):
    return contract("sctzyx,ss->sctzyx", io.fermion2sctzyx(src, params), gamma5)


_src = fermion_out[define._EVEN_]
print(_src.shape)
print(cp.linalg.norm(gamma5_vec(dslash_no_dag(_src))-dslash_dag(gamma5_vec(_src))))
print(cp.linalg.norm(gamma5_vec(pdslash_no_dag(_src))-pdslash_dag(gamma5_vec(_src))))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 32, 32, 32, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001914988 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001928246 sec
0.0
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001935973 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001924819 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001934898 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001890085 sec
0.0


# Give matvec.

In [10]:
# def matvec(src):
#     return gamma5_vec(pdslash_no_dag(src)).reshape(src.shape)
# def matvec(src):
#     return cg_dslash(src).reshape(src.shape)
def matvec(src):
    return bistabcg_dslash(src).reshape(src.shape)

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test. (pass, don't run this)

In [None]:
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = cg_dslash(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

# Give guage's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [None]:
# eigenvalues, eigenvectors = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=define._LAT_Ne_,matvec=matvec,dtype=gauge.dtype)
# print(eigenvalues)
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Origin CG. (pass, don't run this)

In [13]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Origin BISTABCG. (pass, don't run this)

In [14]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# MultiGrid - give grids.

In [15]:
# orth_eigenvectors_filename = gauge_filename.replace(
#     "gauge", "orth_eigenvectors")
# print("Orth orth_eigenvectors filename:", orth_eigenvectors_filename)
# orth_eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=orth_eigenvectors_filename))
# print("Orth orth_eigenvectors data:", orth_eigenvectors.data)
# print("Orth orth_eigenvectors shape:", orth_eigenvectors.shape)
# testvectors = io.xxxtzyx2mg_xxxtzyx(
#     input_array=orth_eigenvectors, params=params)
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_in[define._EVEN_], params=params)
# _src = io.xxxtzyx2mg_xxxtzyx(
#     input_array=fermion_out[define._EVEN_], params=params)

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 16, 1)


<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [16]:
r_src = _src


def r_vec(src):
    return contract("escTtZzYyXx,scTtZzYyXx->eTZYX", testvectors, src)


r_dest = r_vec(r_src)

In [17]:
r_dest.shape

(24, 16, 16, 16, 16)

<!-- # MultiGrid - P*vector.
![](./image1-dev40.png) -->


In [18]:
p_src = r_dest


def p_vec(src):
    return contract("escTtZzYyXx,eTZYX->scTtZzYyXx", cp.conj(testvectors), src)


p_dest = p_vec(p_src)

In [19]:
p_dest.shape

(4, 3, 16, 2, 16, 2, 16, 2, 16, 1)

<!-- # MultiGrid - verify above.
![](./image2-dev40.png) -->

In [20]:
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))

3547.24
0.04556928


In [21]:
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

0.9999928


In [22]:
print(cp.linalg.norm(r_src-p_vec(r_vec(r_src)))/cp.linalg.norm(r_src))

0.9999928


In [23]:
r_src.flatten()[:50]

array([1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j], dtype=complex64)

In [24]:
p_dest.flatten()[:50]

array([ 9.8280534e-06+2.13875546e-06j,  1.2020784e-05+9.33812680e-06j,
        8.6367018e-06+6.87919191e-06j,  2.2185774e-05+2.79535016e-05j,
       -2.1204751e-06-7.54559414e-06j,  8.4795829e-06+3.54005033e-06j,
        4.6840360e-06+7.38702829e-06j, -6.2731601e-07+5.87946943e-06j,
        7.9138772e-07-2.83349664e-07j, -7.0038282e-06+1.00682628e-05j,
        1.7665079e-05+1.03346265e-05j,  9.1276152e-06+7.23122639e-07j,
       -8.0410227e-06-1.69860505e-06j,  2.2245382e-05-7.52337110e-06j,
        1.2951352e-05+2.22641866e-05j,  3.7844563e-06+8.89122930e-07j,
       -8.3091309e-06-1.17092841e-05j,  2.5563602e-06+9.99297754e-07j,
        1.5905367e-05+2.50184421e-05j,  1.3350997e-06+4.47402226e-06j,
        1.8189432e-05+6.59383477e-06j,  1.1853133e-05-1.59068691e-06j,
        8.0790414e-06+2.42871829e-06j,  1.4985970e-05+1.91627187e-05j,
        7.1873737e-06+1.84489309e-05j,  2.2814813e-06-8.79565050e-07j,
        1.5670848e-05+6.99466273e-06j,  4.2752226e-06+5.89124738e-06j,
      

In [25]:
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src)

array(0.9999928, dtype=float32)

In [26]:
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(p_dest)

array(77842.234, dtype=float32)

In [27]:
p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50]

array([ 3.20225758e-19-1.1616201e-21j,  3.51078613e-19+2.9758303e-19j,
        1.64551272e-19+5.1993336e-20j,  3.01923870e-19+6.5192854e-19j,
       -3.97236687e-20-3.1076429e-19j, -2.76687941e-20-8.1621013e-20j,
       -4.82241063e-20+1.9219554e-19j, -6.00452151e-20+1.8236166e-20j,
        1.25432537e-20+3.2312544e-20j, -1.21151535e-19+9.2140062e-20j,
        3.10587062e-19+1.8356185e-19j,  1.22861304e-19-1.5905494e-19j,
       -1.69958771e-19-1.1520190e-20j,  7.73605779e-19-1.2991389e-21j,
        1.95629688e-19+5.4727986e-19j,  3.48927995e-19-1.0403851e-19j,
       -2.99120633e-19-2.3777490e-19j,  1.62808919e-20+7.7277546e-20j,
        1.57483466e-19+2.7197394e-19j, -2.76851212e-21+4.1988939e-19j,
        9.46454784e-19+3.4933985e-19j,  2.70085947e-19-1.1804405e-19j,
        2.54681931e-19+3.0298837e-19j,  2.95693856e-19+3.1631009e-19j,
        1.27176693e-19+4.0691263e-19j, -1.08682576e-19+2.8979322e-19j,
        6.70471181e-20+7.7143084e-20j,  9.33622085e-20+8.9543484e-20j,
      

In [28]:
cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src))))))))
               )/cp.linalg.norm(r_src)  # ???

array(1., dtype=float32)

In [29]:
# _mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
#                 testvectors, cp.conj(testvectors)).flatten()
# print(cp.linalg.norm(_mat))
# print(_mat[:100])

In [30]:
testvectors.shape

(24, 4, 3, 16, 2, 16, 2, 16, 2, 16, 1)

# MultiGrid - R*matvec\*P.

In [31]:
def _r_matvec_p(src, matvec):
    return r_vec(matvec(p_vec(io.xxx2eTZYX(src, params))))


def r_matvec_p(src, matvec):
    return io.array2xxx(_r_matvec_p(src, matvec))

# MultiGrid - verify above.

In [32]:
D_r_src = matvec(r_src)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001925405 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001892595 sec


In [33]:
D_r_src.flatten()[:50]

array([ 0.09344786-0.10208309j,  0.04712683-0.05375636j,
        0.20205879-0.02301359j,  0.10501933-0.06024492j,
       -0.03274214-0.00194919j, -0.03296566+0.01795989j,
       -0.00965858-0.00275993j, -0.04963386+0.21724534j,
       -0.11307013-0.06256664j, -0.06049633+0.11239302j,
       -0.04292512+0.18137288j,  0.11644292+0.15530562j,
       -0.1051693 +0.20847821j,  0.17299628+0.01849371j,
        0.08634686+0.01981819j,  0.01468253-0.00553358j,
        0.03066605+0.04353732j,  0.02836114+0.15360475j,
       -0.07397485-0.06877553j, -0.07699823+0.1980232j ,
        0.09499002+0.06847632j, -0.00526285-0.02889919j,
       -0.2506256 +0.03647596j, -0.01345444+0.23340106j,
        0.08127064-0.06814575j, -0.06371009+0.1326667j ,
        0.11834753+0.02462935j, -0.0511992 -0.15033877j,
       -0.08291769+0.12582374j, -0.09224725+0.18539107j,
        0.08126312+0.03072464j, -0.07312202+0.00698102j,
        0.0677011 -0.00123036j, -0.11635554+0.05332637j,
       -0.05891597+0.29164827j,

In [34]:
p_r_D_p_r_dest = p_vec(_r_matvec_p(r_vec(r_src), matvec=cg_dslash))

Input Array Shape: (24, 16, 16, 16, 16)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001952794 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001862184 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001937750 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001862098 sec


In [35]:
p_r_D_p_r_dest.flatten()[:50]

array([ 2.4705099e-10+2.7500609e-11j,  2.7614100e-10+2.3256376e-10j,
        2.1988401e-10+1.1617701e-10j,  3.2444411e-10+5.0834303e-10j,
       -7.8977519e-12-1.3582532e-10j,  8.8488869e-11+7.1201725e-13j,
        3.7503722e-11+1.3244526e-10j,  4.0911718e-11+4.0693445e-11j,
       -2.2598291e-11+5.8589016e-11j, -1.4055586e-10+8.7654085e-11j,
        3.1214614e-10+1.3325280e-10j,  2.2358697e-10-1.3280646e-10j,
       -2.5329228e-10-1.1105254e-10j,  5.2802362e-10-1.2029833e-10j,
        3.2708969e-10+3.8772546e-10j,  1.3672721e-10-3.6987198e-11j,
       -2.2518061e-10-2.1356571e-10j,  1.4158660e-11+5.3229431e-11j,
        2.1343705e-10+3.7969611e-10j, -5.7623364e-11+1.8627491e-10j,
        4.7261040e-10+1.8385529e-10j,  1.2525139e-10-1.7879644e-10j,
        2.3160895e-10+1.7174365e-10j,  3.0073491e-10+3.1478192e-10j,
        1.0038795e-10+3.7001893e-10j,  1.1378589e-11+1.4817472e-10j,
        1.3691008e-10+3.3308793e-11j,  3.7961384e-11+3.3680558e-11j,
        2.7506220e-11-1.4245588e-1

In [36]:
cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src)

array(1., dtype=float32)

# MultiGrid - BISTABCG (TESTING......)

In [37]:
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
b__o = cp.zeros_like(b_o)
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001926168 sec


In [38]:
# # Dslash(x_o)=b__o
x_o = bistabcg.slover(
    b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001923338 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001887478 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001954660 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001847139 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001942688 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001891565 sec
Iteration 0: Residual = 4.974845e+07, Time = 0.019613 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001945675 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001863208 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001907643 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001854719 sec
Iteration 1: Residual = 3.800150e+07, Time = 0.017820 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.

# MG-BISTABCG

In [39]:
# mg version
mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
    io.fermion2sctzyx(b__o, params), params)).flatten()
mg_x_o = bistabcg.slover(
    b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=bistabcg_dslash), tol=1e-10, max_iter=1000000)
_x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 16, 2, 16, 2, 16, 2, 16, 1)


Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001954053 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001888669 sec
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001968232 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001901492 sec
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001971823 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001866908 sec
Iteration 0: Residual = 3.617238e+02, Time = 0.279610 s
Input Array Shape: (1572864,)
Dest Shape: (24, 16, 16, 16, 16)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001969319 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001902785 sec
Input Array Shape: (1572864,)
Dest S

In [40]:
# def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
#     n = b.size
#     dtype = b.dtype
#     buffers = {key: cp.zeros(n, dtype=dtype)
#                for key in ['r', 'r_tilde', 'p', 'v', 's', 't', 'x']}
#     x0 = None if x0 is None else x0.copy()

#     def initialize_random_vector(v):
#         v.real, v.imag = cp.random.randn(n).astype(
#             v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
#         norm = cp.linalg.norm(v)
#         if norm > 0:
#             cp.divide(v, norm, out=v)
#         return v

#     def dot(x, y):
#         return cp.sum(x.conj() * y)

#     def _r_vec(src):
#         return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

#     def _p_vec(src):
#         return p_vec(io.xxx2eTZYX(src, params)).flatten()

#     def _r_matvec_p(src):
#         return _r_vec(matvec(_p_vec(src)))

#     x, r, r_tilde, p, v, s, t = buffers['x'], buffers['r'], buffers[
#         'r_tilde'], buffers['p'], buffers['v'], buffers['s'], buffers['t']
#     if x0 is not None:
#         cp.copyto(x, x0)
#     else:
#         initialize_random_vector(x)
#     r = b - matvec(x)
#     cp.copyto(r_tilde, r)
#     rho_prev = 1.0
#     alpha = 1.0
#     omega = 1.0
#     start_time = perf_counter()
#     iter_times = []
#     for i in range(max_iter):
#         iter_start_time = perf_counter()
#         rho = dot(r_tilde, r)
#         beta = (rho/rho_prev)*(alpha/omega)
#         rho_prev = rho
#         p = r+(p-v*omega)*beta
#         r_norm2 = dot(r, r)
#         v = matvec(p)
#         alpha = rho / dot(r_tilde, v)
#         s = r-v*alpha
#         t = matvec(s)
#         omega = dot(t, s)/dot(t, t)
#         r = s-t*omega  # update r
#         # COARSE START
#         r_c = _r_vec(r)
#         e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
#                               tol=1e-2, max_iter=100)
#         e = _p_vec(e_c)
#         # COARSE END
#         # FINE START
#         # x = x+p*alpha+s*omega # update x # don't use ?
#         x += e  # just this like?
#         # FINE END
#         iter_time = perf_counter() - iter_start_time
#         print(
#             f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
#         iter_times.append(iter_time)
#         if r_norm2.real < tol:
#             print(
#                 f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
#             break
#     total_time = perf_counter() - start_time
#     avg_iter_time = sum(iter_times) / len(iter_times)
#     print("\nPerformance Statistics:")
#     print(f"Total time: {total_time:.6f} s")
#     print(f"Average time per iteration: {avg_iter_time:.6f} s")
#     return x.copy()

In [41]:
# def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
#     n = b.size
#     dtype = b.dtype
#     buffers = {key: cp.zeros(n, dtype=dtype)
#                for key in ['r', 'r_tilde', 'p', 'v', 's', 't', 'x']}
#     x0 = None if x0 is None else x0.copy()

#     def initialize_random_vector(v):
#         v.real, v.imag = cp.random.randn(n).astype(
#             v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
#         norm = cp.linalg.norm(v)
#         if norm > 0:
#             cp.divide(v, norm, out=v)
#         return v

#     def dot(x, y):
#         return cp.sum(x.conj() * y)

#     def _r_vec(src):
#         return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

#     def _p_vec(src):
#         return p_vec(io.xxx2eTZYX(src, params)).flatten()

#     def _r_matvec_p(src):
#         return _r_vec(matvec(_p_vec(src)))

#     x, r, r_tilde, p, v, s, t = buffers['x'], buffers['r'], buffers[
#         'r_tilde'], buffers['p'], buffers['v'], buffers['s'], buffers['t']
#     if x0 is not None:
#         cp.copyto(x, x0)
#     else:
#         initialize_random_vector(x)
#     r = b - matvec(x)
#     cp.copyto(r_tilde, r)
#     rho_prev = 1.0
#     alpha = 1.0
#     omega = 1.0
#     start_time = perf_counter()
#     iter_times = []
#     for i in range(max_iter):
#         iter_start_time = perf_counter()
#         rho = dot(r_tilde, r)
#         beta = (rho/rho_prev)*(alpha/omega)
#         rho_prev = rho
#         p = r+(p-v*omega)*beta
#         r_norm2 = dot(r, r)
#         v = matvec(p)
#         alpha = rho / dot(r_tilde, v)
#         s = r-v*alpha
#         t = matvec(s)
#         omega = dot(t, s)/dot(t, t)
#         r = s-t*omega
#         x = x+p*alpha+s*omega
#         # COARSE START
#         r = b-matvec(x)
#         r_c = _r_vec(r)
#         e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
#                               tol=1e-2, max_iter=100)
#         # COARSE END
#         # FINE START
#         e = _p_vec(e_c)
#         x += e  # or just this like?
#         r = b-matvec(x)
#         # FINE END
#         iter_time = perf_counter() - iter_start_time
#         print(
#             f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
#         iter_times.append(iter_time)
#         if r_norm2.real < tol:
#             print(
#                 f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
#             break
#     total_time = perf_counter() - start_time
#     avg_iter_time = sum(iter_times) / len(iter_times)
#     print("\nPerformance Statistics:")
#     print(f"Total time: {total_time:.6f} s")
#     print(f"Average time per iteration: {avg_iter_time:.6f} s")
#     return x.copy()

In [42]:
# def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
#     tol_factor = 0.1

#     def initialize_random_vector(v):
#         n = b.size
#         v.real, v.imag = cp.random.randn(n).astype(
#             v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
#         norm = cp.linalg.norm(v)
#         if norm > 0:
#             cp.divide(v, norm, out=v)
#         return v

#     def dot(x, y):
#         return cp.sum(x.conj() * y)

#     def _r_vec(src):
#         return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

#     def _p_vec(src):
#         return p_vec(io.xxx2eTZYX(src, params)).flatten()

#     def _r_matvec_p(src):
#         return _r_vec(matvec(_p_vec(src)))

#     if x0 is not None:
#         x = x0
#     else:
#         x0 = cp.zeros_like(b)
#         initialize_random_vector(x0)
#         x = x0
#     start_time = perf_counter()
#     iter_times = []
#     r = b-matvec(x)
#     _tol = dot(r, r).real
#     for i in range(max_iter):
#         iter_start_time = perf_counter()
#         _tol *= tol_factor
#         print(f"@@@wanted tol: {_tol}")
#         x = bistabcg.slover(
#             b=b, matvec=matvec, tol=tol, max_iter=5, x0=x)
#         # COARSE START
#         r = b-matvec(x)
#         r_c = _r_vec(r)
#         e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
#                               tol=tol, max_iter=5)
#         # COARSE END
#         # FINE START
#         e = _p_vec(e_c)
#         x += e  # or just this like?
#         # FINE END
#         r = b-matvec(x)
#         r_norm2 = dot(r, r)
#         _tol = max(_tol, r_norm2.real)
#         iter_time = perf_counter() - iter_start_time
#         print(
#             f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
#         iter_times.append(iter_time)
#         if r_norm2.real < tol:
#             print(
#                 f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
#             break
#     total_time = perf_counter() - start_time
#     avg_iter_time = sum(iter_times) / len(iter_times)
#     print("\nPerformance Statistics:")
#     print(f"Total time: {total_time:.6f} s")
#     print(f"Average time per iteration: {avg_iter_time:.6f} s")
#     return x.copy()

In [43]:
# x_o = slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)

# MultiGrid - verify above.

In [44]:
# x_e  =b_e+kappa*D_eo(x_o)
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)
print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
      np.linalg.norm(quda_fermion_out))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001871660 sec
3.26263e-07


In [45]:
x_o = io.hdf5_xxx2xxx(params, 'x_o.h5')
_x_o = io.hdf5_xxx2xxx(params, '_x_o.h5')

Dest Shape: (6291456,)
Dest Shape: (6291456,)


In [46]:
x_o.flatten()[:50]

array([74.276   +92.022934j, 75.066284+87.021j   , 63.95041 +81.42323j ,
       71.90429 +81.19499j , 82.326904+77.19525j , 82.587204+76.421486j,
       75.92041 +76.61926j , 83.29    +63.37289j , 90.26462 +89.01651j ,
       85.437645+67.62443j , 82.5279  +61.069904j, 66.8772  +63.152367j,
       86.60795 +60.486275j, 69.973   +78.85382j , 74.98958 +78.34287j ,
       82.30311 +82.186226j, 80.42874 +82.24484j , 75.13649 +69.2751j  ,
       84.34209 +79.84676j , 83.38982 +62.74091j , 70.93335 +69.10554j ,
       78.417465+76.86116j , 92.754814+71.372284j, 76.95796 +63.981724j,
       76.83697 +84.040886j, 86.23323 +71.827896j, 74.124794+74.47034j ,
       80.772736+88.274734j, 88.08098 +68.75462j , 86.938446+63.496304j,
       76.293945+74.005066j, 85.780846+80.60355j , 73.144844+80.3956j  ,
       86.5529  +75.092064j, 84.150314+56.783943j, 82.56631 +92.947044j,
       80.47365 +77.69081j , 69.55825 +75.1817j  , 69.10492 +87.87934j ,
       78.5043  +86.63787j , 85.15253 +73.68485j , 

In [47]:
_x_o.flatten()[:50]

array([ 0.94202995+1.455163j  ,  0.6810659 +1.0243286j ,
        0.14904276+0.48175567j,  3.404803  +3.847834j  ,
        0.45091546-0.4487326j ,  2.8191705 +1.1257694j ,
        1.1920974 +1.8817868j , -0.725287  +1.7963102j ,
       -0.12183613-0.7326988j , -0.35258937+3.0394516j ,
        2.4121218 +1.4984748j ,  0.34131056+1.1852884j ,
       -0.62651443+0.970388j  ,  3.7424202 -1.9315913j ,
        0.91722494+2.7300196j , -0.06451736+0.6157532j ,
        0.42403102-0.9506295j ,  0.8377355 -0.25334197j,
        3.015805  +4.275037j  ,  1.1172382 -0.11041683j,
        2.3676262 +1.7360263j ,  2.1270936 +1.1524255j ,
        0.08459899+0.45597854j,  1.3422663 +3.1583004j ,
        1.6892697 +2.4024334j , -0.25711954-0.7631059j ,
        4.5965257 +2.5142102j ,  0.65963167+1.6117074j ,
        0.7383895 +2.7356882j ,  0.7340044 +0.9513806j ,
        0.20289344+1.5902905j ,  2.2223902 +3.4720485j ,
        1.0129685 +1.741838j  ,  1.3476112 +3.4223146j ,
        0.16725719+2.2519631j ,

In [48]:
print(np.linalg.norm(_x_o-x_o) /
      np.linalg.norm(x_o))

0.98388505


# End for CG & BISTABCG. (pass, don't run this)

In [49]:
# cg_solver.end()
# bistabcg_solver.end()

# End for pyqcu. (pass, don't run this)

In [50]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)