# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
import functools
from pyqcu import define, io, qcu, eigen, cg, bistabcg, amg, linalg, gauge, demo
from time import perf_counter
from opt_einsum import contract
from pyqcu.set import params, argv
# Lattice extent: the same size is used along x, y, z and t.
for _axis in (define._LAT_X_, define._LAT_Y_, define._LAT_Z_, define._LAT_T_):
    params[_axis] = 4

# Total number of lattice sites, stored alongside the per-direction extents.
lat_x, lat_y, lat_z, lat_t = (
    params[key]
    for key in (define._LAT_X_, define._LAT_Y_, define._LAT_Z_, define._LAT_T_)
)
params[define._LAT_XYZT_] = lat_x * lat_y * lat_z * lat_t

params[define._DATA_TYPE_] = define._LAT_C64_

# Inputs forwarded to demo.give below.
sigma = 1.0
seed = 12138

# Node rank / node count as exposed by the `define` module.
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size

# NOTE(review): the data type selected above is _LAT_C64_ while the tolerance
# is 1e-12 -- confirm this tolerance is intended for that precision.
argv[define._TOL_] = 1e-12

# kappa = 1 / (2 * mass + 8), with the mass taken from argv.
kappa = 1 / (2 * argv[define._MASS_] + 8)

# demo.give returns the gauge field, source/destination fermions, the set
# pointers and one parameter array per Dslash variant (eo, oe, eo-dag, oe-dag).
(
    U,
    src,
    dest,
    set_ptrs,
    wilson_cg_params,
    wilson_dslash_eo_params,
    wilson_dslash_oe_params,
    wilson_dslash_eo_dag_params,
    wilson_dslash_oe_dag_params,
) = demo.give(params=params, sigma=sigma, seed=seed)


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=4) that QCU support (when '#define _BLOCK_SIZE_ 32 // for test small lattice')

# Give CG & BISTABCG Dslash.
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [2]:
def pdslash_no_dag(src):
    """Return ``src - kappa**2 * D_oe(D_eo(src))``.

    Two chained calls to ``qcu.applyWilsonDslashQcu`` are made: first with
    ``wilson_dslash_eo_params`` on ``src``, then with
    ``wilson_dslash_oe_params`` on the intermediate result. The first argument
    of each call is a zero-initialised buffer shaped like ``src`` that is read
    afterwards, i.e. it is used as the output buffer.

    Args:
        src: input fermion array (cupy); it is not modified here.

    Returns:
        A new array with the same shape as ``src``.
    """
    hop_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(hop_eo, src, U, set_ptrs, wilson_dslash_eo_params)

    hop_oe_eo = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(hop_oe_eo, hop_eo, U, set_ptrs, wilson_dslash_oe_params)

    kappa_sq = kappa**2
    return src-kappa_sq*hop_oe_eo
def pdslash_dag(src):
    """Return ``src - kappa**2 * D_oe^dag(D_eo^dag(src))``.

    Same structure as the non-dagger variant, but the two
    ``qcu.applyWilsonDslashQcu`` calls use ``wilson_dslash_eo_dag_params`` and
    then ``wilson_dslash_oe_dag_params``.

    Args:
        src: input fermion array (cupy); it is not modified here.

    Returns:
        A new array with the same shape as ``src``.
    """
    hop_eo_dag = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        hop_eo_dag, src, U, set_ptrs, wilson_dslash_eo_dag_params)

    hop_oe_eo_dag = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        hop_oe_eo_dag, hop_eo_dag, U, set_ptrs, wilson_dslash_oe_dag_params)

    kappa_sq = kappa**2
    return src-kappa_sq*hop_oe_eo_dag
def cg_dslash(src):
    """Apply ``pdslash_no_dag`` followed by ``pdslash_dag`` to ``src``.

    Returns:
        ``pdslash_dag(pdslash_no_dag(src))``.
    """
    forward = pdslash_no_dag(src)
    return pdslash_dag(forward)
def dslash_no_dag(src):
    """Apply a single Wilson Dslash with ``wilson_dslash_eo_params``.

    Args:
        src: input fermion array (cupy); it is not modified here.

    Returns:
        A freshly allocated array shaped like ``src`` holding the result.
    """
    # Named `out` so the module-level `dest` is not shadowed.
    out = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(out, src, U, set_ptrs, wilson_dslash_eo_params)
    return out
def dslash_dag(src):
    """Apply a single Wilson Dslash with ``wilson_dslash_eo_dag_params``.

    Args:
        src: input fermion array (cupy); it is not modified here.

    Returns:
        A freshly allocated array shaped like ``src`` holding the result.
    """
    # Named `out` so the module-level `dest` is not shadowed.
    out = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(out, src, U, set_ptrs, wilson_dslash_eo_dag_params)
    return out
def dslash(src):
    """Default Dslash used by the checks in this notebook: the non-dagger variant."""
    result = dslash_no_dag(src)
    return result
def bistabcg_dslash(src):
    """Operator handed to the solvers as ``matvec``: forwards to ``pdslash_no_dag``."""
    applied = pdslash_no_dag(src)
    return applied
# Norm of dest[EVEN] - kappa * dslash(dest[ODD]) - src[ODD].
# Presumably a consistency check of `dest` against `src` as returned by
# demo.give -- the parity indices are kept exactly as in the original check.
parity_residual = (
    dest[define._EVEN_] - kappa * dslash(dest[define._ODD_])
) - src[define._ODD_]
print(cp.linalg.norm(parity_residual))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000211548 sec
6.3079625e-07


# Verify $(\gamma_5 D)^\dag = D^\dag {\gamma_5}^\dag = D^\dag \gamma_5  = \gamma_5 D$

In [3]:
# gamma5 as a diagonal 4x4 matrix diag(1, 1, -1, -1), cast to the fermion dtype.
gamma5 = cp.array(
    [
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, -1, 0],
        [0, 0, 0, -1],
    ]
).astype(src.dtype)

# Show the matrix and its transpose side by side.
print(gamma5)
print(gamma5.T)


def gamma5_vec(src):
    """Left-multiply a fermion field by ``gamma5`` in spin space.

    ``src`` is first converted with ``io.fermion2sctzyx(src, params)``; the
    contraction then treats the leading index of that result as spin.

    The contraction is a genuine matrix-vector product over the spin index
    (``out[S, ...] = sum_s gamma5[S, s] * psi[s, ...]``). The previous
    subscript string ``"ss,sctzyx->sctzyx"`` used only the diagonal of
    ``gamma5`` and would silently ignore off-diagonal entries; for the diagonal
    ``gamma5`` defined in this notebook both forms give the same result.

    Args:
        src: fermion array accepted by ``io.fermion2sctzyx``.

    Returns:
        The transformed field in the layout returned by ``io.fermion2sctzyx``.
    """
    psi = io.fermion2sctzyx(src, params)
    return contract("Ss,sctzyx->Sctzyx", gamma5, psi)


def vec_gamma5(src):
    """Right-multiply a fermion field by ``gamma5`` in spin space.

    ``src`` is first converted with ``io.fermion2sctzyx(src, params)``; the
    contraction then treats the leading index of that result as spin.

    The contraction is a genuine vector-matrix product over the spin index
    (``out[S, ...] = sum_s psi[s, ...] * gamma5[s, S]``). The previous
    subscript string ``"sctzyx,ss->sctzyx"`` used only the diagonal of
    ``gamma5`` and would silently ignore off-diagonal entries; for the diagonal
    ``gamma5`` defined in this notebook both forms give the same result.

    Args:
        src: fermion array accepted by ``io.fermion2sctzyx``.

    Returns:
        The transformed field in the layout returned by ``io.fermion2sctzyx``.
    """
    psi = io.fermion2sctzyx(src, params)
    return contract("sctzyx,sS->Sctzyx", psi, gamma5)


# Test vector: the even-parity half of `dest`.
_src = dest[define._EVEN_]
print(_src.shape)

# Single Dslash: gamma5 * D(x) compared with D^dag(gamma5 * x).
_g5_d = gamma5_vec(dslash_no_dag(_src))
_ddag_g5 = dslash_dag(gamma5_vec(_src))
print(cp.linalg.norm(_g5_d-_ddag_g5))

# Preconditioned operator: same comparison with pdslash_no_dag / pdslash_dag.
_g5_pd = gamma5_vec(pdslash_no_dag(_src))
_pddag_g5 = pdslash_dag(gamma5_vec(_src))
print(cp.linalg.norm(_g5_pd-_pddag_g5))

[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
[[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  1.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j -1.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j -1.+0.j]]
(4, 3, 4, 4, 4, 2)
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000247625 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000132138 sec
0.0
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000648837 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000133217 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000113166 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000079544 sec
0.0


# Original CG. (skip; don't run this)

In [4]:
# b_e = src[define._EVEN_].flatten()
# b_o = src[define._ODD_].flatten()
# b__o = cp.zeros_like(b_o)
# tmp = cp.zeros_like(b_o)
# # b__o=b_o+kappa*D_oe(b_e)
# qcu.applyWilsonDslashQcu(tmp, b_e, U, set_ptrs, wilson_dslash_oe_params)
# b__o = b_o+kappa*tmp
# # b__o -> Dslash^dag b__o
# b__o = pdslash_dag(b__o)
# # Dslash(x_o)=b__o
# x_o = cg.slover(b=b__o, matvec=bistabcg_dslash, tol=1e-10, max_iter=1000000)
# # x_e  =b_e+kappa*D_eo(x_o)
# qcu.applyWilsonDslashQcu(tmp, x_o, U, set_ptrs, wilson_dslash_eo_params)
# x_e = b_e+kappa*tmp
# # give _dest
# _dest = cp.zeros_like(dest)
# _dest[define._EVEN_] = x_e.reshape(
#     dest[define._EVEN_].shape)
# _dest[define._ODD_] = x_o.reshape(
#     dest[define._ODD_].shape)
# print(np.linalg.norm(_dest-dest) /
#       np.linalg.norm(dest))

# EIGEN

In [5]:
# eigenvalues, eigenvectors = eigen.solver(n=src[define._EVEN_].size, k=512,
#                                matvec=bistabcg_dslash, dtype=src.dtype)
# eigenvalues, eigenvectors = eigen.cupyx_solver(n=src[define._EVEN_].size, k=src[define._EVEN_].size-1,
#                                matvec=bistabcg_dslash, dtype=src.dtype)
# _testvectors = amg.setup(n=src[define._EVEN_].size, k=params[define._LAT_E_],
#                          matvec=matvec, dtype=src.dtype)
# testvectors = io.xxxtzyx2mg_xxxtzyx(
#     io.eigenvectors2esctzyx(_testvectors, params), params)
# print("Shape of testvectors: ", testvectors.shape)
# for i in range(len(_testvectors)):
#     if i == 0:
#         pass
#     else:
#         print("Ax/x", bistabcg_dslash(_testvectors[i])/_testvectors[i])
#         projections = cp.dot(_testvectors[:i].conj(), _testvectors[i])
#         max_proj = cp.max(cp.abs(projections)).get()
#         print(f"Maximum projection onto existing basis: {max_proj:.2e}")
#         j = i+1
#         if j == len(_testvectors):
#             j = 0
#         print("Difference between v_i and v_j:", cp.linalg.norm(
#             _testvectors[i]-_testvectors[j])/cp.linalg.norm(_testvectors[i]))
# for T in range(params[define._MG_T_]):
#     for Z in range(params[define._MG_Z_]):
#         for Y in range(params[define._MG_Y_]):
#             for X in range(params[define._MG_X_]):
#                 testvectors[:, :, :, T, :, Z, :, Y, :, X, :] = linalg.orthogonalize_matrix(
#                     testvectors[:, :, :, T, :, Z, :, Y, :, X, :].reshape(params[define._LAT_E_], -1)).reshape(testvectors[:, :, :, T, :, Z, :, Y, :, X, :].shape)

In [6]:
# Number of entries in the even-parity half of `src`; the eigensolver cell
# below passes this same value as `n`.
src[define._EVEN_].size

1536

In [None]:
# Request k=512 eigenpairs of the preconditioned operator `bistabcg_dslash`,
# acting on vectors of length src[EVEN].size.
eigenvalues, eigenvectors = eigen.scipy_solver(
    n=src[define._EVEN_].size,
    k=512,
    matvec=bistabcg_dslash,
    dtype=src.dtype,
)

dtype: complex64, plan: SM, max_iter: 1000.0, tol: 1e-06
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000112042 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000088304 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000121488 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000080019 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000083054 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000073645 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000147009 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000684195 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000198641 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000434395 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000167135 sec
multi-gpu wilson dslash total time: (with

In [None]:
# Number of eigenvalues returned by the solver.
len(eigenvalues)

512

In [None]:
# Display the eigenvalue array returned by the solver.
eigenvalues

array([0.7775175 -4.66301441e-02j, 0.7775185 +4.66296077e-02j,
       0.7799749 +7.51549006e-03j, 0.77998024-7.51631707e-03j,
       0.7905007 -7.34329820e-02j, 0.7904988 +7.34337643e-02j,
       0.79959327+1.01972461e-01j, 0.7995953 -1.01974919e-01j,
       0.7870419 +3.57627869e-07j, 0.79005206-3.66720781e-02j,
       0.8086537 -1.16287954e-01j, 0.79004014+3.66734602e-02j,
       0.80865365+1.16288744e-01j, 0.8382619 +1.61945641e-01j,
       0.8382681 -1.61946490e-01j, 0.86676043-1.84186101e-01j,
       0.86676055+1.84184313e-01j, 0.8013411 -7.49483109e-02j,
       0.8637685 -1.71585128e-01j, 0.8466768 -1.54574335e-01j,
       0.81587493-1.05333261e-01j, 0.83257157-1.33156449e-01j,
       0.9424247 -2.21234411e-01j, 0.8247837 -1.18559487e-01j,
       0.8013427 +7.49511495e-02j, 0.8466726 +1.54574081e-01j,
       0.86376846+1.71584651e-01j, 0.81587434+1.05333947e-01j,
       0.83257   +1.33157045e-01j, 0.8120809 -9.17596966e-02j,
       0.8009661 -6.21814691e-02j, 0.85412353-1.5673242

In [None]:
# Shape of the eigenvector array; the cells below index it by row
# (eigenvectors[i]), pairing row i with eigenvalues[i].
eigenvectors.shape

(512, 1536)

In [None]:
# Per-vector diagnostics; index 0 is skipped because there is no earlier
# vector to project onto.
num_vectors = len(eigenvectors)
for i in range(1, num_vectors):
    vec = eigenvectors[i]

    # Element-wise ratio (A v) / v.
    print("Ax/x", bistabcg_dslash(vec)/vec)

    # Largest overlap of v_i with any earlier vector v_0 .. v_{i-1}.
    projections = cp.dot(eigenvectors[:i].conj(), vec)
    max_proj = cp.max(cp.abs(projections)).get()
    print(f"Maximum projection onto existing basis: {max_proj:.2e}")

    # Relative distance to the next vector, wrapping around at the end.
    j = (i+1) % num_vectors
    print("Difference between v_i and v_j:", cp.linalg.norm(
        vec-eigenvectors[j])/cp.linalg.norm(vec))

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000198985 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000211569 sec
Ax/x [0.7775636 +0.04666879j 0.77752715+0.04663951j 0.7775203 +0.04662768j ...
 0.77753705+0.04663166j 0.7775329 +0.04663357j 0.77754897+0.04663493j]
Maximum projection onto existing basis: 9.87e-02
Difference between v_i and v_j: 1.58522
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000109569 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000100909 sec
Ax/x [0.77999806+0.00753481j 0.77998334+0.0074868j  0.77997184+0.00746958j ...
 0.77999455+0.00755945j 0.7799861 +0.00753549j 0.77998734+0.0075124j ]
Maximum projection onto existing basis: 4.13e-01
Difference between v_i and v_j: 1.5712961
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000183110 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000440569 sec
Ax/x [0.78001946-0.00757217j

In [None]:
# Check every eigenpair: print the eigenvalue, the relative residual
# ||A v - lambda v|| / ||A v||, and the relative distance between this
# eigenvector and the next one (wrapping around at the end).
num_eigenvalues = len(eigenvalues)
for i, ev in enumerate(eigenvalues):
    print(f"λ_{i} = {ev:.2e}")
    # Verify eigenvector
    v = eigenvectors[i]
    # bistabcg_dslash returns a new array, so no buffer is pre-allocated here.
    w = bistabcg_dslash(v)
    error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
    print(f"Relative error: {error:.2e}")
    j = (i+1) % num_eigenvalues
    # Note: the label below says λ, but the printed quantity is the relative
    # distance between the eigenvectors v_i and v_j.
    vector_diff = cp.linalg.norm(
        eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i])
    print(
        f"Diff between λ_{i} and λ_{j}: {vector_diff:.2e}")

λ_0 = 7.78e-01-4.66e-02j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000117886 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000076568 sec
Relative error: 1.20e-05
Diff between λ_0 and λ_1: 1.38e+00
λ_1 = 7.78e-01+4.66e-02j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000084466 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000133616 sec
Relative error: 1.43e-05
Diff between λ_1 and λ_2: 1.59e+00
λ_2 = 7.80e-01+7.52e-03j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000090477 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000079145 sec
Relative error: 3.07e-05
Diff between λ_2 and λ_3: 1.57e+00
λ_3 = 7.80e-01-7.52e-03j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000072780 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000123923 sec
Relative error: 3.21e-05
Diff between λ_3 and λ_4: 1.46e+00
λ_4 

Diff between λ_42 and λ_43: 1.39e+00
λ_43 = 8.42e-01+1.39e-01j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000129805 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000103221 sec
Relative error: 1.33e-05
Diff between λ_43 and λ_44: 1.42e+00
λ_44 = 9.16e-01+2.03e-01j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000185937 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000143208 sec
Relative error: 1.14e-05
Diff between λ_44 and λ_45: 1.12e+00
λ_45 = 9.22e-01+2.06e-01j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000108897 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000085442 sec
Relative error: 9.59e-06
Diff between λ_45 and λ_46: 1.41e+00
λ_46 = 8.93e-01+1.88e-01j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000166945 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000181009 sec
Relative error: 9

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Synthetic data (complex eigenvalues) used to draw the figure.
# It is stored under its own name so the solver's `eigenvalues` computed
# earlier in the notebook is not overwritten: the cells above pair
# eigenvalues[i] with eigenvectors[i] and would break on re-run otherwise.
theta = np.linspace(0, 2 * np.pi, 3000)
r = 1 + 0.5 * np.cos(6 * theta)
x = 2 + r * np.cos(theta)
y = r * np.sin(theta)
mock_eigenvalues = x + 1j * y

# Plot the points in the complex plane.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(mock_eigenvalues.real, mock_eigenvalues.imag,
           s=1, label=r'$D_{DW}$')

ax.set_title(r"Eigenvalues, $24^2$, $\beta = 10.0$, $m = 0.05$, $L_s = 8$")
ax.set_xlabel(r'Re$(\lambda)$')
ax.set_ylabel(r'Im$(\lambda)$')
ax.set_xlim(-1, 5)
ax.set_ylim(-2, 2)
ax.legend()
fig.tight_layout()
plt.show()


AttributeError: `np.Inf` was removed in the NumPy 2.0 release. Use `np.inf` instead.

AttributeError: `np.Inf` was removed in the NumPy 2.0 release. Use `np.inf` instead.

<Figure size 800x600 with 1 Axes>

# End of pyqcu. (skip; don't run this)

In [None]:
# demo.end(set_ptrs=set_ptrs,params=params)