# Init for pyqcu.

In [None]:
import cupy as cp
import numpy as np
import functools
from pyqcu.cuda import define
from pyqcu.cuda import io
from pyqcu.cuda import qcu
from pyqcu.cuda import eigen, cg, bistabcg
from opt_einsum import contract
from pyqcu.cuda.set import params, argv, set_ptrs
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
print("Parameters:", params)

In [None]:
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)

In [None]:
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)

In [None]:
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)

In [None]:
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)

In [None]:
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)

In [None]:
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

# Read from hdf5 files.

In [8]:
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
print("Eigenvectors filename:", eigenvectors_filename)
eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
print("Eigenvectors data:", eigenvectors.data)
print("Eigenvectors shape:", eigenvectors.shape)


Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f4a827305b0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
Eigenvectors filename: quda_wilson-bistabcg-eigenvectors_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Dest Shape: (24, 6291456)
Eigenvectors data: <MemoryPointer 0xb2e400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f4a840b5230>>
Eigenvectors shape: (24, 4, 3, 32, 32, 32, 16)


# Run wilson bistabcg from pyqcu test.

In [9]:
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f4a827305b0>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0xb22400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f4aae2e14b0>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
##RANK:0##LOOP:118##Residual:(2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.649020416 sec
######TIME  :5122.88######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [10]:
def cg_dslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_params)
    return src-kappa**2*tmp1


def cg_dslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return cg_dslash_dag(cg_dslash_no_dag(src))

def bistabcg_dslash(src):
    return cg_dslash_no_dag(src)



# Give matvec.

In [11]:
def matvec(src):
    return cg_dslash(src)

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test. (pass, don't run this)

In [12]:
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = cg_dslash(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

# Give guage's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [13]:
# eigenvalues, eigenvectors = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=params[define._LAT_E_],matvec=cg_dslash,dtype=gauge.dtype)
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Origin CG. (pass, don't run this)

In [14]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Origin BISTABCG. (pass, don't run this)

In [15]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# Give guage's orth_eigenvectors to hdf5 files. (pass, don't run this)

In [16]:
_eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=eigenvectors, params=params)
_eigenvectors.shape  # escTtZzYyXx
def orthogonalize(eigenvectors):
    _eigenvectors = eigenvectors.copy()
    size_e, size_s, size_c, size_T, size_t, size_Z, size_z, size_Y, size_y, size_X, size_x = eigenvectors.shape
    print(size_e, size_s, size_c, size_T, size_t,
          size_Z, size_z, size_Y, size_y, size_X, size_x)
    for T in range(size_T):
        for Z in range(size_Z):
            for Y in range(size_Y):
                for X in range(size_X):
                    origin_matrix = eigenvectors[:,
                                                 :, :, T, :, Z, :, Y, :, X, :]
                    _shape = origin_matrix.shape
                    _origin_matrix = origin_matrix.reshape(size_e, -1)
                    condition_number = np.linalg.cond(_origin_matrix.get())
                    print(f"矩阵条件数: {condition_number}")
                    a = _origin_matrix[:, 0]
                    b = _origin_matrix[:, -1]
                    print(cp.dot(a.conj(), b))
                    Q = cp.linalg.qr(_origin_matrix.T)[0]
                    condition_number = np.linalg.cond(Q.get())
                    print(f"矩阵条件数: {condition_number}")
                    a = Q[:, 0]
                    b = Q[:, -1]
                    print(cp.dot(a.conj(), b))
                    _eigenvectors[:, :, :, T, :, Z, :, Y, :, X, :] = Q.T.reshape(
                        _shape)
    return _eigenvectors
orth_eigenvectors = orthogonalize(_eigenvectors)
io.xxx2hdf5_xxx(
    orth_eigenvectors, params, gauge_filename.replace("gauge", "orth_eigenvectors"))

Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 8, 4, 4, 8, 4, 8, 4, 4)
24 4 3 8 4 4 8 4 8 4 4
矩阵条件数: 10.946216583251953
(1.00406524e-07+6.1820253e-07j)
矩阵条件数: 1.0000003576278687
(3.7252903e-09-2.3283064e-09j)
矩阵条件数: 12.363529205322266
(2.787988e-07+8.899269e-08j)
矩阵条件数: 1.000000238418579
(-9.313226e-10-2.3283064e-09j)
矩阵条件数: 12.361640930175781
(1.916895e-08-5.809656e-08j)
矩阵条件数: 1.000000238418579
(1.1641532e-09+2.3283064e-10j)
矩阵条件数: 11.339791297912598
(9.6724875e-08+2.9815442e-07j)
矩阵条件数: 1.000000238418579
(1.6298145e-09-1.8626451e-09j)
矩阵条件数: 13.33226490020752
(2.9658764e-07+2.3333916e-07j)
矩阵条件数: 1.000000238418579
(-8.1490725e-10+6.0535967e-09j)
矩阵条件数: 11.425483703613281
(4.158734e-07-1.5032597e-07j)
矩阵条件数: 1.0000003576278687
(2.3283064e-10-3.4924597e-09j)
矩阵条件数: 13.077044486999512
(1.8342804e-07-6.2940956e-08j)
矩阵条件数: 1.0000003576278687
(3.1432137e-09-1.2805685e-09j)
矩阵条件数: 12.491043090820312
(2.0362663e-07+2.2336714e-08j)
矩阵条件数: 1.000000238418579
(-5.122274e-

# MultiGrid - give grids.

In [17]:
orth_eigenvectors_filename = gauge_filename.replace("gauge", "orth_eigenvectors")
print("Orth orth_eigenvectors filename:", orth_eigenvectors_filename)
orth_eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=orth_eigenvectors_filename))
print("Orth orth_eigenvectors data:", orth_eigenvectors.data)
print("Orth orth_eigenvectors shape:", orth_eigenvectors.shape)
testvectors = io.xxxtzyx2mg_xxxtzyx(input_array=orth_eigenvectors, params=params)
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_in[define._EVEN_], params=params)

Orth orth_eigenvectors filename: quda_wilson-bistabcg-orth_eigenvectors_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Dest Shape: (24, 4, 3, 8, 4, 4, 8, 4, 8, 4, 4)
Orth orth_eigenvectors data: <MemoryPointer 0xbc9800000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f4a80d3ebf0>>
Orth orth_eigenvectors shape: (24, 4, 3, 32, 32, 32, 16)
Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 8, 4, 4, 8, 4, 8, 4, 4)
Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 4, 8, 4, 8, 4, 4)


<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [18]:
r_src = _src


def r_vec(src):
    return contract("escTtZzYyXx,scTtZzYyXx->eTZYX", testvectors, src)


r_dest = r_vec(r_src)

In [19]:
r_dest.shape

(24, 8, 4, 4, 4)

<!-- # MultiGrid - P*vector.
![](./image1-dev40.png) -->


In [20]:
p_src = r_dest


def p_vec(src):
    return contract("escTtZzYyXx,eTZYX->scTtZzYyXx", cp.conj(testvectors), src)


p_dest = p_vec(p_src)

In [21]:
p_dest.shape

(4, 3, 8, 4, 4, 8, 4, 8, 4, 4)

In [22]:
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))

3547.24
3531.9172


In [23]:
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

0.09284756


In [24]:
print(cp.linalg.norm(r_src-p_vec(r_vec(r_src)))/cp.linalg.norm(r_src))

0.09284756


In [25]:
r_src.flatten()[:50]

array([1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j], dtype=complex64)

In [26]:
p_dest.flatten()[:50]

array([1.1924026 +0.96220475j, 1.1314651 +0.9596404j ,
       1.0552521 +0.8234522j , 1.0549538 +0.92514884j,
       0.9615977 +1.0393802j , 0.95300674+1.0415285j ,
       0.9523357 +0.9495436j , 0.78334856+1.0426004j ,
       1.1427466 +1.1257019j , 0.8673905 +1.072619j  ,
       0.7877028 +1.033166j  , 0.81615794+0.8263445j ,
       0.7418158 +1.0979842j , 0.9804515 +0.89743346j,
       0.99102736+0.9627855j , 1.045865  +1.0467168j ,
       1.0703461 +1.0456425j , 0.90627563+0.97132605j,
       1.0373149 +1.0902432j , 0.82147443+1.0707421j ,
       0.86649394+0.89142334j, 0.95891976+0.98738074j,
       0.8901605 +1.1672864j , 0.79725903+0.9623244j ,
       1.0696645 +0.9613617j , 0.9215935 +1.0815051j ,
       0.9532187 +0.92094266j, 1.1357502 +1.0021203j ,
       0.85560083+1.1228731j , 0.78500533+1.1069124j ,
       0.92780966+0.97881925j, 1.028975  +1.0910051j ,
       1.0465842 +0.9475676j , 0.9770593 +1.115533j  ,
       0.74214095+1.0821452j , 1.2050415 +1.0633073j ,
       0.9

In [27]:
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src)

array(0.09284756, dtype=float32)

In [28]:
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(p_dest)

array(0.09325037, dtype=float32)

In [29]:
p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50]

array([1.1924028 +0.9622046j , 1.131465  +0.9596402j ,
       1.055252  +0.82345235j, 1.0549535 +0.9251491j ,
       0.96159875+1.0393807j , 0.9530078 +1.0415289j ,
       0.9523366 +0.9495442j , 0.7833492 +1.0426009j ,
       1.1427467 +1.1257019j , 0.8673905 +1.0726188j ,
       0.78770274+1.0331659j , 0.8161577 +0.8263445j ,
       0.74181604+1.0979842j , 0.9804518 +0.8974335j ,
       0.9910274 +0.96278584j, 1.0458647 +1.0467169j ,
       1.0703461 +1.0456423j , 0.9062754 +0.971326j  ,
       1.0373149 +1.0902435j , 0.8214741 +1.0707424j ,
       0.866495  +0.89142364j, 0.95892066+0.9873811j ,
       0.8901612 +1.167287j  , 0.79725975+0.96232486j,
       1.0696646 +0.9613615j , 0.9215934 +1.081505j  ,
       0.9532185 +0.92094254j, 1.13575   +1.0021201j ,
       0.85560113+1.1228732j , 0.78500557+1.1069125j ,
       0.9278096 +0.97881943j, 1.028975  +1.0910052j ,
       1.0465842 +0.94756746j, 0.97705925+1.1155329j ,
       0.7421407 +1.0821455j , 1.2050409 +1.0633075j ,
       0.9

In [30]:
cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))))/cp.linalg.norm(r_src) #???

array(0.09284756, dtype=float32)

In [31]:
# _mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
#                 testvectors, cp.conj(testvectors)).flatten()
# print(cp.linalg.norm(_mat))
# print(_mat[:100])

# MultiGrid - R*matvec\*P.

In [32]:
def _r_matvec_p(src, matvec):
    return r_vec(matvec(p_vec(io.xxx2eTZYX(src, params))))


def r_matvec_p(src, matvec):
    return io.array2xxx(_r_matvec_p(src, matvec))

# MultiGrid - verify above.

In [33]:
D_r_src = matvec(r_src)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001831541 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001781509 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002017780 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001753978 sec


In [34]:
D_r_src.flatten()[:50]

array([ 0.09512203-0.15065098j,  0.02459466-0.10281426j,
        0.14802423-0.04478627j,  0.05378214-0.09193704j,
       -0.08474712+0.0208632j , -0.06649091+0.06668942j,
       -0.03820895+0.03143284j, -0.06110204+0.1998555j ,
       -0.09466349-0.10954653j, -0.09903856+0.15328293j,
       -0.11581516+0.23665299j,  0.15433274+0.0833789j ,
       -0.11228501+0.18325019j,  0.07416645+0.00099519j,
        0.05309803-0.03660914j,  0.034646  -0.03932991j,
       -0.00817882-0.03089109j,  0.03485045+0.13626698j,
       -0.12274602-0.06901661j, -0.10658678+0.15540114j,
        0.09019466+0.07479304j,  0.00329622+0.01262958j,
       -0.2735389 +0.08964469j, -0.02719212+0.18662325j,
        0.07507145-0.0618236j , -0.09112616+0.10951558j,
        0.08583128+0.02523299j, -0.0734987 -0.1731855j ,
       -0.09989857+0.04082112j, -0.1259279 +0.17906842j,
        0.00141903+0.04172344j, -0.08249421-0.07952813j,
        0.06730361-0.03395787j, -0.16046013+0.01053373j,
       -0.04409004+0.2682615j ,

In [35]:
p_r_D_p_r_dest=p_vec(_r_matvec_p(r_dest,matvec=cg_dslash))

Input Array Shape: (24, 8, 4, 4, 4)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002042453 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001754898 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001808091 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001740555 sec


In [36]:
p_r_D_p_r_dest.flatten()[:50]

array([0.01284266+0.00043204j, 0.01207354+0.00220424j,
       0.0121465 +0.0025559j , 0.01484103+0.00671565j,
       0.00851426+0.0052227j , 0.00844077+0.00481056j,
       0.00867679+0.00409095j, 0.00811415+0.00531656j,
       0.00683777+0.00171679j, 0.0057756 +0.00181463j,
       0.00582205+0.00221079j, 0.00453556+0.00140342j,
       0.00929137+0.00846156j, 0.01103785+0.00450657j,
       0.01158802+0.00349668j, 0.0132275 +0.00333044j,
       0.01015243+0.00213161j, 0.00825791+0.00220426j,
       0.01114164+0.00498186j, 0.01093537+0.00712627j,
       0.00751002+0.00507913j, 0.00815575+0.00541701j,
       0.00889412+0.00674953j, 0.00828097+0.00466273j,
       0.00554528+0.00108429j, 0.00750764+0.00175549j,
       0.00710701+0.00027447j, 0.00992816+0.00175117j,
       0.01150915+0.01007911j, 0.00842286+0.00877829j,
       0.00990541+0.00592939j, 0.01135234+0.0060305j ,
       0.00789523+0.00191099j, 0.00920649+0.00426514j,
       0.00813269+0.00681124j, 0.0152209 +0.00934403j,
       0.0

In [37]:
cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src)

array(0.9956338, dtype=float32)

In [38]:
r_dest.flatten()[:50]

array([-26.228638-2.6980443j,  26.825924+2.2300706j,
       -27.157005-2.886311j ,  27.309052+2.1306527j,
        26.245262+3.1135411j,  26.968142+2.7365303j,
        27.338694+2.955482j ,  26.714493+2.8954268j,
       -26.562653-3.4426003j,  27.022251+3.223345j ,
        27.010738+2.8851447j,  26.904491+2.7066288j,
        26.563435+2.7313137j, -26.834703-2.6179743j,
        26.203568+2.8582335j,  26.13324 +2.7540636j,
       -26.757036-2.5445836j,  26.579767+2.9527426j,
        26.924713+2.852963j ,  26.084093+2.554791j ,
        26.118988+3.0851789j,  27.292454+2.5094728j,
        27.045673+2.9704423j,  25.87228 +2.7780027j,
        26.435436+3.0725877j,  26.982637+3.2305465j,
        26.135303+2.6885123j,  26.39829 +2.4337387j,
        26.226591+1.8348447j,  26.38916 +2.9997063j,
       -26.302986-2.5908453j, -26.14183 -1.8610659j,
        26.349154+3.348411j ,  26.792229+2.4964402j,
        26.288826+2.7947683j,  26.808006+2.8376613j,
        26.121277+3.1562634j,  27.091488+2.345

In [39]:
p_vec(r_dest).flatten()[:50]

array([1.1924026 +0.96220475j, 1.1314651 +0.9596404j ,
       1.0552521 +0.8234522j , 1.0549538 +0.92514884j,
       0.9615977 +1.0393802j , 0.95300674+1.0415285j ,
       0.9523357 +0.9495436j , 0.78334856+1.0426004j ,
       1.1427466 +1.1257019j , 0.8673905 +1.072619j  ,
       0.7877028 +1.033166j  , 0.81615794+0.8263445j ,
       0.7418158 +1.0979842j , 0.9804515 +0.89743346j,
       0.99102736+0.9627855j , 1.045865  +1.0467168j ,
       1.0703461 +1.0456425j , 0.90627563+0.97132605j,
       1.0373149 +1.0902432j , 0.82147443+1.0707421j ,
       0.86649394+0.89142334j, 0.95891976+0.98738074j,
       0.8901605 +1.1672864j , 0.79725903+0.9623244j ,
       1.0696645 +0.9613617j , 0.9215935 +1.0815051j ,
       0.9532187 +0.92094266j, 1.1357502 +1.0021203j ,
       0.85560083+1.1228731j , 0.78500533+1.1069124j ,
       0.92780966+0.97881925j, 1.028975  +1.0910051j ,
       1.0465842 +0.9475676j , 0.9770593 +1.115533j  ,
       0.74214095+1.0821452j , 1.2050415 +1.0633073j ,
       0.9

# MultiGrid - CG (BUG!!!)

In [40]:
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
b__o = cp.zeros_like(b_o)
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp
# b__o -> Dslash^dag b__o
b__o = cg_dslash_dag(b__o)
print(b__o.flatten()[:50])

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001815357 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001825504 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001834390 sec
[ 0.06233358+0.15173662j  0.32942653+0.06118965j -0.04789817-0.00079036j
 -0.05084395-0.16727257j  0.03047681+0.36633623j -0.06406617+0.09027088j
  0.14260948+0.22115517j  0.18566191-0.05795419j  0.45213223+0.05551624j
  0.0242672 +0.15382648j -0.17152047-0.05546987j  0.0933491 -0.00444984j
  0.14769971+0.17339706j -0.06226981+0.10680568j -0.26130497-0.03517842j
  0.03391361+0.08685231j  0.00804639-0.21704197j  0.19566107-0.10458302j
 -0.13376355+0.18717241j  0.042871  -0.2088939j   0.11243784+0.12554109j
  0.00457525+0.21281981j  0.28957486+0.05365086j  0.15917301-0.27011013j
 -0.01850295+0.28555632j  0.07021451-0.0173465j   0.11242735+0.05856681j
  0.18075407+0.17617095j  0.0450139 -0.12209082j  0.10786152+0.02099669j
 -0.0124023 -0.0336045j 

In [41]:
# # Dslash(x_o)=b__o
x_o = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001940321 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001775059 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001803437 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001740248 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001944689 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001870753 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001913263 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001792419 sec
Iteration 0: Residual = 6.809580e+04, Time = 0.017800 s
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001851408 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001851849 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001839451 sec
multi-gpu wilson dslash total time: (witho

In [42]:
# mg version
mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(b__o, params), params)).flatten()
print(mg_b__o.flatten()[:50])

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 4, 8, 4, 8, 4, 4)
[-1.6048808 -0.19050843j  1.4453042 +0.5755182j  -1.3121517 -0.22736463j
  1.1189554 -0.06788515j  1.205285  +0.11489812j  1.4770985 +0.52863j
  1.6383854 +0.5712879j   1.2981555 +0.13938954j -1.3145074 -0.48618785j
  1.1190689 +0.33740783j  1.4299953 +0.35779268j  1.4583635 +0.34325194j
  1.3344676 +0.08935298j -1.0335841 +0.11224735j  1.3183542 +0.29898208j
  1.3941326 +0.34783325j -1.2098143 -0.12871222j  1.4660698 +0.25616106j
  0.95744395+0.18327132j  1.2752951 +0.2026344j   1.4468336 +0.24979019j
  1.298039  -0.17197014j  1.3627579 +0.2638182j   1.2071009 +0.05129927j
  0.9886751 +0.44078037j  1.4041576 +0.11589839j  1.4587599 +0.36406872j
  1.4496286 +0.18874273j  1.6078446 +0.0376209j   1.0907787 -0.17589304j
 -1.2909334 -0.15439248j -1.1368427 -0.27375048j  1.604973  +0.21888134j
  1.1608192 -0.0560858j   1.2630206 +0.1871444j   1.339143  +0.0864124j
  1.3402606 +0.33621055j  1.1882145 -0.2313

In [43]:
mg_x_o = cg.slover(b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=cg_dslash),
                      tol=1e-5, max_iter=1000000)
# mg_x_o = bistabcg.slover(b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=cg_dslash),
#                       tol=1e-5, max_iter=1000000)
# _x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))

Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001925962 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002019345 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001824757 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002117865 sec
Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002149885 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001852440 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001815689 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001747550 sec
Iteration 0: Residual = 3.178048e+05, Time = 0.058277 s
Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001858014 sec
multi-gpu wilson dslash total time:

KeyboardInterrupt: 

In [51]:
mg_eigenvalues, mg_eigenvectors = eigen.solver(
    n=mg_b__o.size, k=2,matvec=functools.partial(r_matvec_p, matvec=cg_dslash),dtype=gauge.dtype,max_iter=10000)

Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001967839 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001891241 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001986343 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001899013 sec
Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002021483 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001884914 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001960977 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001885932 sec
Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001971828 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001882593 sec
multi-gpu

In [52]:
print(mg_eigenvalues)
for i, ev in enumerate(mg_eigenvalues):
    print(f"λ_{i} = {ev:.2e}")
    # Verify eigenvector
    v = mg_eigenvectors[i]
    w = cp.zeros_like(v)
    w = r_matvec_p(v, matvec=cg_dslash)
    error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
    print(f"Relative error: {error:.2e}")
    j = i+1
    if j == len(mg_eigenvalues):
        j = 0
    print(
        f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(mg_eigenvectors[i] - mg_eigenvectors[j])/cp.linalg.norm(mg_eigenvectors[i]):.2e}")

[-1.5609118e-04+0.j  5.5776393e-01+0.j]
λ_0 = -1.56e-04+0.00e+00j
Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002027681 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001933523 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001906477 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001872385 sec
Relative error: 1.00e+00
Diff between λ_0 and λ_1: 1.41e+00
λ_1 = 5.58e-01+0.00e+00j
Input Array Shape: (12288,)
Dest Shape: (24, 8, 4, 4, 4)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002117850 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001865113 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001892883 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001862160 sec
Relative error: 2.39e-02
Diff between λ_1 and λ_0: 1.41e+00


In [None]:
# x_e  =b_e+kappa*D_eo(x_o)
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)


In [None]:
print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
    np.linalg.norm(quda_fermion_out))

# MultiGrid - BISTABCG (TESTING......)

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp

In [None]:
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')

In [None]:
# # mg version
# mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
#     io.fermion2sctzyx(b__o, params), params)).flatten()
# mg_x_o = bistabcg.slover(
#     b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=bistabcg_dslash), tol=1e-10, max_iter=1000000)
# _x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
# io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')

In [None]:
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

In [None]:
# x_o=io.hdf5_xxx2xxx(params,'x_o.h5')
# _x_o=io.hdf5_xxx2xxx(params,'_x_o.h5')

In [None]:
# x_o.flatten()[:50]

In [None]:
# _x_o.flatten()[:50]

In [None]:
# print(np.linalg.norm(_x_o-x_o) /
#       np.linalg.norm(x_o))

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)