# Init for pyqcu.

In [None]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg, bistabcg
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
# Record this process's rank and the total number of processes in the shared
# parameter array.
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
# kappa is derived from the mass stored in argv.
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
# The file name encodes the lattice extents, grid layout, parity, rank/size
# and dagger flag, so every configuration maps to its own file.
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
# define._LAT_P_ is used as a plain divisor elsewhere in this file (the X
# extent of a random test vector), not as an index into params, so divide by
# the constant itself here as well.
params[define._MG_X_] = int(params[define._LAT_X_]/define._LAT_P_)
params[define._MG_Y_] = params[define._LAT_Y_]
params[define._MG_Z_] = params[define._LAT_Z_]
params[define._MG_T_] = int(params[define._LAT_T_]/2)
print("Parameters:", params)
def _init_qcu_set(set_index, set_plan, parity=None, dagger=None):
    """Build one parameter set from ``params`` and register it with QCU.

    A copy of ``params`` receives the set index and plan; parity and dagger
    are written only when given. The copy is then passed to
    ``qcu.applyInitQcu`` together with ``set_ptrs`` and ``argv``.

    Returns:
        The filled-in copy of ``params``.
    """
    set_params = params.copy()
    set_params[define._SET_INDEX_] = set_index
    set_params[define._SET_PLAN_] = set_plan
    if parity is not None:
        set_params[define._PARITY_] = parity
    if dagger is not None:
        set_params[define._DAGGER_] = dagger
    qcu.applyInitQcu(set_ptrs, set_params, argv)
    return set_params


# Set 0: solver set (plan 1); parity and dagger are inherited from params.
wilson_cg_params = _init_qcu_set(0, define._SET_PLAN1_)
# Sets 1-4: dslash sets (plan 0) for each parity / dagger combination.
wilson_dslash_eo_params = _init_qcu_set(
    1, define._SET_PLAN0_, define._EVEN_, define._NO_USE_)
wilson_dslash_eo_dag_params = _init_qcu_set(
    2, define._SET_PLAN0_, define._EVEN_, define._USE_)
wilson_dslash_oe_params = _init_qcu_set(
    3, define._SET_PLAN0_, define._ODD_, define._NO_USE_)
wilson_dslash_oe_dag_params = _init_qcu_set(
    4, define._SET_PLAN0_, define._ODD_, define._USE_)
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

# Read from hdf5 files.

In [None]:
# Load the gauge field from the file whose name was assembled from params.
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
# The fermion files reuse the gauge file name with "gauge" swapped for a tag.
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
# Stored solution, used later as the reference the computed result is
# compared against (presumably produced by QUDA, going by the name).
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
# Zero-filled output buffer with the same shape and dtype as the source.
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
# eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
# print("Eigenvalues filename:", eigenvalues_filename)
# eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
# print("Eigenvalues data:", eigenvalues.data)
# print("Eigenvalues shape:", eigenvalues.shape)
# eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
# print("Eigenvectors filename:", eigenvectors_filename)
# eigenvectors = io.eigenvectors2esctzyx(
#     params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
# print("Eigenvectors data:", eigenvectors.data)
# print("Eigenvectors shape:", eigenvectors.shape)

# Run wilson bistabcg from pyqcu test.

In [None]:
# Run the library solver; fermion_out is passed as the output argument and
# the solver parameter set (index 0) selects the configuration.
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
# Relative difference between the computed and the stored reference solution.
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [None]:
def pdslash_no_dag(src):
    """Return ``src - kappa**2 * D_oe(D_eo(src))`` using the non-dagger sets.

    The two hops are applied one after the other through
    ``qcu.applyWilsonDslashQcu``, each writing into a fresh zero buffer.
    """
    after_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_eo, src, gauge, set_ptrs, wilson_dslash_eo_params)
    after_oe = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_oe, after_eo, gauge, set_ptrs, wilson_dslash_oe_params)
    hopping_term = kappa**2*after_oe
    return src-hopping_term


def pdslash_dag(src):
    """Return ``src - kappa**2 * D_oe(D_eo(src))`` using the dagger sets.

    Same structure as the non-dagger variant, but both hops use the
    parameter sets whose dagger flag is enabled.
    """
    after_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_eo, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    after_oe = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        after_oe, after_eo, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    hopping_term = kappa**2*after_oe
    return src-hopping_term


def cg_dslash(src):
    """Apply the non-dagger operator followed by the dagger operator."""
    forward = pdslash_no_dag(src)
    return pdslash_dag(forward)

def dslash_no_dag(src):
    """Apply one Wilson dslash hop with the eo, non-dagger parameter set."""
    result = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        result, src, gauge, set_ptrs, wilson_dslash_eo_params)
    return result

def dslash_dag(src):
    """Apply one Wilson dslash hop with the eo, dagger parameter set."""
    result = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        result, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    return result

def dslash(src):
    """Default single-hop dslash: the non-dagger variant."""
    hopped = dslash_no_dag(src)
    return hopped

def bistabcg_dslash(src):
    """Operator handed to the BiCGStab solver: the non-dagger operator."""
    applied = pdslash_no_dag(src)
    return applied

# Check the even-site reconstruction x_e = b_e + kappa * D_eo(x_o): the
# residual x_e - kappa * D_eo(x_o) - b_e should be close to zero, so the
# right-hand side subtracted here is the EVEN half of fermion_in.
print(cp.linalg.norm((fermion_out[define._EVEN_]-kappa *
               dslash(fermion_out[define._ODD_]))-fermion_in[define._EVEN_]))

# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [None]:
# gamma5 as the 4x4 matrix diag(1, 1, -1, -1), cast to the gauge field dtype.
_gamma5_rows = [
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, -1, 0],
    [0, 0, 0, -1],
]
gamma5 = cp.array(_gamma5_rows).astype(gauge.dtype)
# Show the matrix and its transpose.
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    """Multiply a fermion field by gamma5 along its spin index.

    The repeated ``ss`` subscript picks the diagonal of ``gamma5``, so each
    spin component is scaled by the matching diagonal entry.
    """
    spinor = io.fermion2sctzyx(src, params)
    return contract("ss,sctzyx->sctzyx", gamma5, spinor)


def vec_gamma5(src):
    """Multiply a fermion field by gamma5 from the right along its spin index.

    As in the left-multiplying variant, the ``ss`` subscript uses only the
    diagonal of ``gamma5``.
    """
    spinor = io.fermion2sctzyx(src, params)
    return contract("sctzyx,ss->sctzyx", spinor, gamma5)


# Numerical check of gamma5-hermiticity on one half of the solved field:
# gamma5 * D * v should equal D^dagger * gamma5 * v, so both norms printed
# below are expected to be close to zero.
_src = fermion_out[define._EVEN_]
print(_src.shape)
# Single-hop dslash.
print(cp.linalg.norm(gamma5_vec(dslash_no_dag(_src))-dslash_dag(gamma5_vec(_src))))
# Full operator src - kappa**2 * D_oe(D_eo(src)).
print(cp.linalg.norm(gamma5_vec(pdslash_no_dag(_src))-pdslash_dag(gamma5_vec(_src))))

# Give matvec.

In [1]:
# def matvec(src):
#     return gamma5_vec(pdslash_no_dag(src)).reshape(src.shape)
# def matvec(src):
#     return cg_dslash(src).reshape(src.shape)
def matvec(src):
    """Apply the BiCGStab operator and return the result in ``src``'s shape."""
    applied = bistabcg_dslash(src)
    return applied.reshape(src.shape)

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test. (pass, don't run this)

In [None]:
# for i, ev in enumerate(eigenvalues):
#     print(f"λ_{i} = {ev:.2e}")
#     # Verify eigenvector
#     v = eigenvectors[i]
#     w = cp.zeros_like(v)
#     w = cg_dslash(v)
#     error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
#     print(f"Relative error: {error:.2e}")
#     j = i+1
#     if j == len(eigenvalues):
#         j = 0
#     print(
#         f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

# Give gauge's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [None]:
# eigenvalues, eigenvectors = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=define._LAT_Ne_,matvec=matvec,dtype=gauge.dtype)
# print(eigenvalues)
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Original CG. (pass, don't run this)

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = cg_dslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=cg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
#       np.linalg.norm(quda_fermion_out))

# Original BISTABCG. (pass, don't run this)

In [None]:
# b_e = fermion_in[define._EVEN_].flatten()
# b_o = fermion_in[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # Dslash(x_o)=b__o
# x_o = bistabcg.slover(
#     b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give qcu_fermion_out
# qcu_fermion_out = cp.zeros_like(quda_fermion_out)
# qcu_fermion_out[define._EVEN_] = x_e.reshape(
#     quda_fermion_out[define._EVEN_].shape)
# qcu_fermion_out[define._ODD_] = x_o.reshape(
#     quda_fermion_out[define._ODD_].shape)
# print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
#     np.linalg.norm(quda_fermion_out))

# Give gauge's orth_eigenvectors to hdf5 files. (pass, don't run this)

In [None]:
# _eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=eigenvectors, params=params)
def _random_unit_vector():
    """Draw one random field and scale it to unit norm.

    The extents are (define._LAT_S_, define._LAT_C_, T, Z, Y,
    X / define._LAT_P_), and the samples are cast to the dtype of
    ``fermion_in``.
    """
    extents = (
        define._LAT_S_,
        define._LAT_C_,
        params[define._LAT_T_],
        params[define._LAT_Z_],
        params[define._LAT_Y_],
        int(params[define._LAT_X_]/define._LAT_P_),
    )
    vector = cp.random.randn(*extents).astype(fermion_in.dtype)
    vector /= cp.linalg.norm(vector)
    return vector


# One normalised random vector per test-vector slot, stacked on a new
# leading axis, then rearranged into the multigrid layout.
randomvectors = cp.array(
    [_random_unit_vector() for _ in range(define._LAT_E_)])
_eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=randomvectors, params=params)
print(_eigenvectors.shape)  # escTtZzYyXx


def orthogonalize(eigenvectors):
    """Orthonormalise the test vectors independently inside every coarse block.

    For each coarse site (T, Z, Y, X) the sub-array over the remaining axes
    is flattened to an (e, n) matrix whose rows are the vectors restricted to
    that block. A QR factorisation of its transpose yields orthonormal
    columns, which are written back as the new rows.

    Args:
        eigenvectors: 11-dimensional array laid out as ``escTtZzYyXx``.

    Returns:
        A new array of the same shape holding the block-wise orthonormalised
        vectors; the input is left untouched.

    Side effects:
        For every block, prints the condition number and the inner product
        of the first and last vector, both before and after the QR step.
    """
    _eigenvectors = eigenvectors.copy()
    size_e, size_s, size_c, size_T, size_t, size_Z, size_z, size_Y, size_y, size_X, size_x = eigenvectors.shape
    print(size_e, size_s, size_c, size_T, size_t,
          size_Z, size_z, size_Y, size_y, size_X, size_x)
    every = slice(None)
    # np.ndindex visits the coarse sites in the same order as four nested
    # loops over T, Z, Y, X.
    for T, Z, Y, X in np.ndindex(size_T, size_Z, size_Y, size_X):
        block = (every, every, every, T, every, Z, every, Y, every, X, every)
        origin_matrix = eigenvectors[block]
        _shape = origin_matrix.shape
        _origin_matrix = origin_matrix.reshape(size_e, -1)
        condition_number = np.linalg.cond(_origin_matrix.get())
        print(f"矩阵条件数: {condition_number}")
        # Rows of the (e, n) matrix are the vectors, so compare the first
        # and the last ROW here; this matches the post-QR check, which
        # compares the first and last column of Q.
        a = _origin_matrix[0]
        b = _origin_matrix[-1]
        print(cp.dot(a.conj(), b))
        # QR of the (n, e) transpose: the columns of Q are orthonormal.
        Q = cp.linalg.qr(_origin_matrix.T)[0]
        condition_number = np.linalg.cond(Q.get())
        print(f"矩阵条件数: {condition_number}")
        a = Q[:, 0]
        b = Q[:, -1]
        print(cp.dot(a.conj(), b))
        _eigenvectors[block] = Q.T.reshape(_shape)
    return _eigenvectors


# Orthonormalise the test vectors block by block ...
orth_eigenvectors = orthogonalize(_eigenvectors)

# ... and store them next to the gauge file, with "gauge" replaced by
# "orth_eigenvectors" in the file name.
io.xxx2hdf5_xxx(
    orth_eigenvectors, params, gauge_filename.replace("gauge", "orth_eigenvectors"))

# MultiGrid - give grids.

In [None]:
# Reload the orthonormalised test vectors written by the previous cell.
orth_eigenvectors_filename = gauge_filename.replace(
    "gauge", "orth_eigenvectors")
print("Orth orth_eigenvectors filename:", orth_eigenvectors_filename)
orth_eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=orth_eigenvectors_filename))
print("Orth orth_eigenvectors data:", orth_eigenvectors.data)
print("Orth orth_eigenvectors shape:", orth_eigenvectors.shape)
# Bring the test vectors and one half of the source field into the
# multigrid layout used by the restriction / prolongation contractions.
testvectors = io.xxxtzyx2mg_xxxtzyx(
    input_array=orth_eigenvectors, params=params)
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_in[define._EVEN_], params=params)
# _src = io.xxxtzyx2mg_xxxtzyx(
#     input_array=fermion_out[define._EVEN_], params=params)

<!-- # MultiGrid - R*vector.
![](./image0-dev40.png) -->

In [None]:
def r_vec(src):
    """Restrict a fine field to the coarse grid.

    Contracts ``testvectors`` with ``src`` over the spin, colour and
    fine-site indices, leaving one coefficient per test vector and coarse
    site (``eTZYX``).
    """
    coarse = contract("escTtZzYyXx,scTtZzYyXx->eTZYX", testvectors, src)
    return coarse


# Restrict the multigrid-layout source prepared above.
r_src = _src
r_dest = r_vec(r_src)

In [None]:
r_dest.shape

<!-- # MultiGrid - P*vector.
![](./image1-dev40.png) -->


In [None]:
def p_vec(src):
    """Prolong coarse coefficients back to the fine grid.

    Contracts the complex conjugate of ``testvectors`` with ``src`` over the
    test-vector and coarse-site indices, producing a field in the
    ``scTtZzYyXx`` layout.
    """
    conjugated = cp.conj(testvectors)
    return contract("escTtZzYyXx,eTZYX->scTtZzYyXx", conjugated, src)


# Prolong the coefficients obtained from the restriction step.
p_src = r_dest
p_dest = p_vec(p_src)

In [None]:
p_dest.shape

<!-- # MultiGrid - verify above.
![](./image2-dev40.png) -->

In [None]:
# Norms of the fine vector before restriction and after restrict-then-prolong.
print(cp.linalg.norm(r_src))
print(cp.linalg.norm(p_dest))

In [None]:
# Relative error of one restrict/prolong round trip, using the stored p_dest.
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

In [None]:
# Same round-trip error, recomputed from scratch.
print(cp.linalg.norm(r_src-p_vec(r_vec(r_src)))/cp.linalg.norm(r_src))

In [None]:
# First 50 entries of the original fine vector.
r_src.flatten()[:50]

In [None]:
# First 50 entries after the round trip, for comparison by eye.
p_dest.flatten()[:50]

In [None]:
# Round-trip error relative to the norm of the original vector.
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src)

In [None]:
# Round-trip error relative to the norm of the round-tripped vector.
cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(p_dest)

In [None]:
# Four consecutive restrict/prolong round trips.
p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src)))))))).flatten()[:50]

In [None]:
# Relative error after four round trips.
# NOTE(review): if the test vectors are orthonormal inside every coarse
# block, p_vec(r_vec(.)) should act as a projector and this value should
# match the single round-trip error above - TODO confirm.
cp.linalg.norm(r_src-p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(p_vec(r_vec(r_src))))))))
               )/cp.linalg.norm(r_src)  # ???

In [None]:
# _mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
#                 testvectors, cp.conj(testvectors)).flatten()
# print(cp.linalg.norm(_mat))
# print(_mat[:100])

In [None]:
testvectors.shape

# MultiGrid - R*matvec\*P.

In [None]:
def _r_matvec_p(src, matvec):
    """Coarse operator: prolong ``src``, apply ``matvec``, restrict the result.

    ``src`` is first converted to the ``eTZYX`` layout with
    ``io.xxx2eTZYX``.
    """
    coarse = io.xxx2eTZYX(src, params)
    fine = p_vec(coarse)
    return r_vec(matvec(fine))


def r_matvec_p(src, matvec):
    """Apply the coarse operator and convert the result with ``io.array2xxx``."""
    coarse_result = _r_matvec_p(src, matvec)
    return io.array2xxx(coarse_result)

# MultiGrid - verify above.

In [None]:
D_r_src = matvec(r_src)

In [None]:
D_r_src.flatten()[:50]

In [None]:
# Restrict, apply the coarse operator, prolong. The reference D_r_src is
# computed with `matvec`, so the same operator must be used here for the
# relative difference below to be a like-for-like comparison.
p_r_D_p_r_dest = p_vec(_r_matvec_p(r_vec(r_src), matvec=matvec))

In [None]:
p_r_D_p_r_dest.flatten()[:50]

In [None]:
cp.linalg.norm(D_r_src-p_r_D_p_r_dest)/cp.linalg.norm(D_r_src)

# MultiGrid - BISTABCG (TESTING......)

In [None]:
# Flattened even and odd halves of the source field.
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
# Placeholder only: b__o is reassigned below before it is ever read.
b__o = cp.zeros_like(b_o)
# Scratch buffer that receives the dslash output.
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp

In [None]:
# # Dslash(x_o)=b__o
# Solve Dslash(x_o) = b__o on the fine grid with the library BiCGStab
# solver, then store the solution so it can be compared with the multigrid
# result later.
x_o = bistabcg.slover(
    b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
io.xxx2hdf5_xxx(x_o, params, 'x_o.h5')

In [None]:
# mg version
mg_b__o = r_vec(io.xxxtzyx2mg_xxxtzyx(
    io.fermion2sctzyx(b__o, params), params)).flatten()
mg_x_o = bistabcg.slover(
    b=mg_b__o, matvec=functools.partial(r_matvec_p, matvec=bistabcg_dslash), tol=1e-10, max_iter=1000000)
_x_o = io.array2xxx(p_vec(io.xxx2eTZYX(mg_x_o, params)))
io.xxx2hdf5_xxx(_x_o, params, '_x_o.h5')

# MG-BISTABCG

In [None]:
def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
    """Experimental BiCGStab-style iteration with a coarse-grid correction.

    NOTE: ``slover`` is defined two more times further down in this file, so
    when the file is run top to bottom a later definition replaces this one.

    Args:
        b: Right-hand side vector; its size and dtype size the work buffers.
        matvec: Callable applying the fine operator to a flat vector.
        max_iter: Maximum number of outer iterations.
        tol: Threshold compared against ``dot(r, r).real`` (a squared norm).
        x0: Optional initial guess, copied before use. When omitted, a
            normalised random complex vector is used.

    Returns:
        A copy of the final iterate ``x``.

    Raises:
        ZeroDivisionError: if no iteration runs (``max_iter == 0``), because
            the average iteration time divides by the iteration count.
    """
    n = b.size
    dtype = b.dtype
    # One zero-initialised work vector per BiCGStab quantity.
    buffers = {key: cp.zeros(n, dtype=dtype)
               for key in ['r', 'r_tilde', 'p', 'v', 's', 't', 'x']}
    x0 = None if x0 is None else x0.copy()

    def initialize_random_vector(v):
        # Fill real and imaginary parts with normal samples, then normalise.
        v.real, v.imag = cp.random.randn(n).astype(
            v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
        norm = cp.linalg.norm(v)
        if norm > 0:
            cp.divide(v, norm, out=v)
        return v

    def dot(x, y):
        # Inner product, conjugating the first argument.
        return cp.sum(x.conj() * y)

    def _r_vec(src):
        # Flat fine vector -> flat coarse vector.
        return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

    def _p_vec(src):
        # Flat coarse vector -> flat fine vector.
        return p_vec(io.xxx2eTZYX(src, params)).flatten()

    def _r_matvec_p(src):
        # Coarse operator: restrict * matvec * prolong (shadows the
        # module-level function of the same name inside this solver).
        return _r_vec(matvec(_p_vec(src)))

    x, r, r_tilde, p, v, s, t = buffers['x'], buffers['r'], buffers[
        'r_tilde'], buffers['p'], buffers['v'], buffers['s'], buffers['t']
    if x0 is not None:
        cp.copyto(x, x0)
    else:
        initialize_random_vector(x)
    # Initial residual; r_tilde is the fixed shadow residual.
    r = b - matvec(x)
    cp.copyto(r_tilde, r)
    rho_prev = 1.0
    alpha = 1.0
    omega = 1.0
    start_time = perf_counter()
    iter_times = []
    for i in range(max_iter):
        iter_start_time = perf_counter()
        rho = dot(r_tilde, r)
        beta = (rho/rho_prev)*(alpha/omega)
        rho_prev = rho
        p = r+(p-v*omega)*beta
        # Measured before r is updated below, so the value printed and
        # tested at the end of this iteration is the residual the iteration
        # started with.
        r_norm2 = dot(r, r)
        v = matvec(p)
        alpha = rho / dot(r_tilde, v)
        s = r-v*alpha
        t = matvec(s)
        omega = dot(t, s)/dot(t, t)
        r = s-t*omega  # update r
        # COARSE START
        # Approximately solve the coarse residual equation with the library
        # solver (loose tolerance, at most 100 iterations).
        r_c = _r_vec(r)
        e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
                              tol=1e-2, max_iter=100)
        e = _p_vec(e_c)
        # COARSE END
        # FINE START
        # x = x+p*alpha+s*omega # update x # don't use ?
        # NOTE(review): x only receives the prolonged coarse correction, and
        # r is not recomputed afterwards, so r may no longer equal
        # b - matvec(x) - confirm this is intended.
        x += e  # just this like?
        # FINE END
        iter_time = perf_counter() - iter_start_time
        print(
            f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
        iter_times.append(iter_time)
        if r_norm2.real < tol:
            print(
                f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
            break
    total_time = perf_counter() - start_time
    avg_iter_time = sum(iter_times) / len(iter_times)
    print("\nPerformance Statistics:")
    print(f"Total time: {total_time:.6f} s")
    print(f"Average time per iteration: {avg_iter_time:.6f} s")
    return x.copy()

In [None]:
def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
    """Experimental BiCGStab step followed by a coarse-grid correction.

    Each outer iteration performs one BiCGStab update of ``x``, recomputes
    the true residual, solves the restricted residual equation approximately
    on the coarse grid, adds the prolonged correction to ``x`` and recomputes
    the residual once more.

    NOTE: ``slover`` is defined again further down in this file, so when the
    file is run top to bottom the later definition replaces this one.

    Args:
        b: Right-hand side vector; its size and dtype size the work buffers.
        matvec: Callable applying the fine operator to a flat vector.
        max_iter: Maximum number of outer iterations.
        tol: Threshold compared against ``dot(r, r).real`` (a squared norm).
        x0: Optional initial guess, copied before use. When omitted, a
            normalised random complex vector is used.

    Returns:
        A copy of the final iterate ``x``.

    Raises:
        ZeroDivisionError: if no iteration runs (``max_iter == 0``), because
            the average iteration time divides by the iteration count.
    """
    n = b.size
    dtype = b.dtype
    # One zero-initialised work vector per BiCGStab quantity.
    buffers = {key: cp.zeros(n, dtype=dtype)
               for key in ['r', 'r_tilde', 'p', 'v', 's', 't', 'x']}
    x0 = None if x0 is None else x0.copy()

    def initialize_random_vector(v):
        # Fill real and imaginary parts with normal samples, then normalise.
        v.real, v.imag = cp.random.randn(n).astype(
            v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
        norm = cp.linalg.norm(v)
        if norm > 0:
            cp.divide(v, norm, out=v)
        return v

    def dot(x, y):
        # Inner product, conjugating the first argument.
        return cp.sum(x.conj() * y)

    def _r_vec(src):
        # Flat fine vector -> flat coarse vector.
        return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

    def _p_vec(src):
        # Flat coarse vector -> flat fine vector.
        return p_vec(io.xxx2eTZYX(src, params)).flatten()

    def _r_matvec_p(src):
        # Coarse operator: restrict * matvec * prolong (shadows the
        # module-level function of the same name inside this solver).
        return _r_vec(matvec(_p_vec(src)))

    x, r, r_tilde, p, v, s, t = buffers['x'], buffers['r'], buffers[
        'r_tilde'], buffers['p'], buffers['v'], buffers['s'], buffers['t']
    if x0 is not None:
        cp.copyto(x, x0)
    else:
        initialize_random_vector(x)
    # Initial residual; r_tilde is the fixed shadow residual.
    r = b - matvec(x)
    cp.copyto(r_tilde, r)
    rho_prev = 1.0
    alpha = 1.0
    omega = 1.0
    start_time = perf_counter()
    iter_times = []
    for i in range(max_iter):
        iter_start_time = perf_counter()
        rho = dot(r_tilde, r)
        beta = (rho/rho_prev)*(alpha/omega)
        rho_prev = rho
        p = r+(p-v*omega)*beta
        # Residual at the start of this iteration; this is the value that is
        # printed and tested against tol below.
        r_norm2 = dot(r, r)
        v = matvec(p)
        alpha = rho / dot(r_tilde, v)
        s = r-v*alpha
        t = matvec(s)
        omega = dot(t, s)/dot(t, t)
        r = s-t*omega
        x = x+p*alpha+s*omega
        # COARSE START
        # Replace the recursively updated residual with the true one before
        # restricting it.
        r = b-matvec(x)
        r_c = _r_vec(r)
        e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
                              tol=1e-2, max_iter=100)
        # COARSE END
        # FINE START
        e = _p_vec(e_c)
        x += e  # or just this like?
        # True residual after the coarse correction.
        # NOTE(review): p, v, rho_prev, alpha and omega still stem from the
        # pre-correction recurrence - confirm that carrying them over is
        # intended.
        r = b-matvec(x)
        # FINE END
        iter_time = perf_counter() - iter_start_time
        print(
            f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
        iter_times.append(iter_time)
        if r_norm2.real < tol:
            print(
                f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
            break
    total_time = perf_counter() - start_time
    avg_iter_time = sum(iter_times) / len(iter_times)
    print("\nPerformance Statistics:")
    print(f"Total time: {total_time:.6f} s")
    print(f"Average time per iteration: {avg_iter_time:.6f} s")
    return x.copy()

In [None]:
def slover(b, matvec, max_iter=1000, tol=1e-9, x0=None):
    """Two-level iteration: a few fine BiCGStab sweeps plus a coarse correction.

    Every outer iteration runs the library BiCGStab solver for at most five
    iterations on the fine system (warm-started from the current iterate),
    then solves the restricted residual equation for at most five iterations
    on the coarse grid and adds the prolonged correction to the iterate.

    Args:
        b: Right-hand side vector.
        matvec: Callable applying the fine operator to a flat vector.
        max_iter: Maximum number of outer iterations.
        tol: Threshold compared against ``dot(r, r).real`` (a squared norm);
            also forwarded as ``tol`` to both inner solves.
        x0: Optional initial guess. It is copied, so the caller's array is
            never modified. When omitted, a normalised random complex vector
            is used.

    Returns:
        A copy of the final iterate.
    """
    tol_factor = 0.1

    def initialize_random_vector(v):
        # Fill real and imaginary parts with normal samples, then normalise.
        n = b.size
        v.real, v.imag = cp.random.randn(n).astype(
            v.real.dtype), cp.random.randn(n).astype(v.imag.dtype)
        norm = cp.linalg.norm(v)
        if norm > 0:
            cp.divide(v, norm, out=v)
        return v

    def dot(x, y):
        # Inner product, conjugating the first argument.
        return cp.sum(x.conj() * y)

    def _r_vec(src):
        # Flat fine vector -> flat coarse vector.
        return r_vec(io.xxxtzyx2mg_xxxtzyx(io.fermion2sctzyx(src, params), params)).flatten()

    def _p_vec(src):
        # Flat coarse vector -> flat fine vector.
        return p_vec(io.xxx2eTZYX(src, params)).flatten()

    def _r_matvec_p(src):
        # Coarse operator: restrict * matvec * prolong.
        return _r_vec(matvec(_p_vec(src)))

    if x0 is not None:
        # Work on a private copy so the in-place correction below can never
        # write into the caller's array.
        x = x0.copy()
    else:
        x = initialize_random_vector(cp.zeros_like(b))
    start_time = perf_counter()
    iter_times = []
    r = b-matvec(x)
    _tol = dot(r, r).real
    for i in range(max_iter):
        iter_start_time = perf_counter()
        # NOTE(review): _tol is only printed; both inner solves receive the
        # outer `tol`. Confirm whether the shrinking target was meant to be
        # passed to them instead.
        _tol *= tol_factor
        print(f"@@@wanted tol: {_tol}")
        # Fine-grid sweeps, warm-started from the current iterate.
        x = bistabcg.slover(
            b=b, matvec=matvec, tol=tol, max_iter=5, x0=x)
        # COARSE START
        r = b-matvec(x)
        r_c = _r_vec(r)
        e_c = bistabcg.slover(b=r_c, matvec=_r_matvec_p,
                              tol=tol, max_iter=5)
        # COARSE END
        # FINE START
        e = _p_vec(e_c)
        x += e  # or just this like?
        # FINE END
        r = b-matvec(x)
        r_norm2 = dot(r, r)
        _tol = max(_tol, r_norm2.real)
        iter_time = perf_counter() - iter_start_time
        print(
            f"@@@Iteration {i}: Residual = {r_norm2.real:.6e}, Time = {iter_time:.6f} s")
        iter_times.append(iter_time)
        if r_norm2.real < tol:
            print(
                f"@@@Converged at iteration {i} with residual {r_norm2.real:.6e}")
            break
    total_time = perf_counter() - start_time
    # Guard against max_iter == 0, where no iteration time was recorded.
    avg_iter_time = sum(iter_times) / len(iter_times) if iter_times else 0.0
    print("\nPerformance Statistics:")
    print(f"Total time: {total_time:.6f} s")
    print(f"Average time per iteration: {avg_iter_time:.6f} s")
    return x.copy()

In [None]:
# Solve Dslash(x_o) = b__o with the notebook-level slover defined above
# (the most recently executed definition is the one that runs).
x_o = slover(
    b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)

# MultiGrid - verify above.

In [None]:
# x_e  =b_e+kappa*D_eo(x_o)
# tmp is the scratch buffer allocated when the right-hand side was prepared.
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
# Assemble the full solution from its even and odd halves, reshaped to
# match the reference layout.
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)
# Relative difference to the stored reference solution.
print(np.linalg.norm(qcu_fermion_out-quda_fermion_out) /
      np.linalg.norm(quda_fermion_out))

In [None]:
# Reload the fine-grid and the coarse-grid solutions saved earlier.
# NOTE(review): the other call to io.hdf5_xxx2xxx in this file passes only
# file_name=...; confirm the function accepts params as a first positional
# argument.
x_o = io.hdf5_xxx2xxx(params, 'x_o.h5')
_x_o = io.hdf5_xxx2xxx(params, '_x_o.h5')

In [None]:
x_o.flatten()[:50]

In [None]:
_x_o.flatten()[:50]

In [None]:
# Relative difference between the prolonged coarse solution and the
# fine-grid solution.
print(np.linalg.norm(_x_o-x_o) /
      np.linalg.norm(x_o))

# End for CG & BISTABCG. (pass, don't run this)

In [None]:
# cg_solver.end()
# bistabcg_solver.end()

# End for pyqcu. (pass, don't run this)

In [None]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)