# Init for pyqcu.

In [1]:
import cupy as cp
import numpy as np
from pyqcu import define
from pyqcu import io
from pyqcu import qcu
from pyqcu import eigen, cg
from opt_einsum import contract
from pyqcu.set import params, argv, set_ptrs
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
kappa = 1 / (2 * argv[define._MASS_] + 8)
print('My rank is ', define.rank)
gauge_filename = f"quda_wilson-bistabcg-gauge_-{params[define._LAT_X_]}-{params[define._LAT_Y_]}-{params  [define._LAT_Z_]}-{params[define._LAT_T_]}-{params[define._LAT_XYZT_]}-{params[define._GRID_X_]}-{params[define._GRID_Y_]}-{params[define._GRID_Z_]}-{params[define._GRID_T_]}-{params[define._PARITY_]}-{params[define._NODE_RANK_]}-{params[define._NODE_SIZE_]}-{params[define._DAGGER_]}-f.h5"
print("Parameters:", params)


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (x=4,y=4,z=4,t=8) that QCU support.
    @@@@@@######QCU NOTES END######@@@@@@@
    
Parameter

In [2]:
wilson_cg_params = params.copy()
wilson_cg_params[define._SET_INDEX_] = 0
wilson_cg_params[define._SET_PLAN_] = define._SET_PLAN1_
qcu.applyInitQcu(set_ptrs, wilson_cg_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:0
host_params[_SET_PLAN_] :1
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [3]:
wilson_dslash_eo_params = params.copy()
wilson_dslash_eo_params[define._SET_INDEX_] = 1
wilson_dslash_eo_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:1
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [4]:
wilson_dslash_eo_dag_params = params.copy()
wilson_dslash_eo_dag_params[define._SET_INDEX_] = 2
wilson_dslash_eo_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_eo_dag_params[define._PARITY_] = define._EVEN_
wilson_dslash_eo_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_eo_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:2
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [5]:
wilson_dslash_oe_params = params.copy()
wilson_dslash_oe_params[define._SET_INDEX_] = 3
wilson_dslash_oe_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_params[define._DAGGER_] = define._NO_USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:3
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [6]:
wilson_dslash_oe_dag_params = params.copy()
wilson_dslash_oe_dag_params[define._SET_INDEX_] = 4
wilson_dslash_oe_dag_params[define._SET_PLAN_] = define._SET_PLAN0_
wilson_dslash_oe_dag_params[define._PARITY_] = define._ODD_
wilson_dslash_oe_dag_params[define._DAGGER_] = define._USE_
qcu.applyInitQcu(set_ptrs, wilson_dslash_oe_dag_params, argv)

gridDim.x               :4096
blockDim.x              :128
host_params[_LAT_X_]    :16
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :32
host_params[_LAT_XYZT_] :524288
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :1
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :1
host_params[_MAX_ITER_] :10000
host_params[_SET_INDEX_]:4
host_params[_SET_PLAN_] :0
host_argv[_MASS_]       :0.000000e+00
host_argv[_TOL_]        :1.000000e-09
lat_2dim[_XY_]          :512
lat_2dim[_XZ_]          :512
lat_2dim[_XT_]          :512
lat_2dim[_YZ_]          :1024
lat_2dim[_YT_]          :1024
lat_2dim[_ZT_]          :1024
lat_3dim[_YZT_]         :32768
lat_3dim[_XZT_]         :16384
lat_3dim[_XYT_]         :16384
lat_3dim[_XYZ_]         :16384
lat_4dim                :524288
grid_2dim[_XY_]         :1
grid_2dim[_XZ_]         :1
grid_2dim[_XT_]         :1
grid_2

In [7]:
print("Set pointers:", set_ptrs)
print("Set pointers data:", set_ptrs.data)

Set pointers: [94026758578144 94026768365600 94026768540752 94026768717152
 94026768855744              0              0              0
              0              0]
Set pointers data: <memory at 0x7f392e964280>


# Read from hdf5 files.

In [8]:
print("Gauge filename:", gauge_filename)
gauge = io.hdf5_xxxtzyx2grid_xxxtzyx(params, gauge_filename)
fermion_in_filename = gauge_filename.replace("gauge", "fermion-in")
print("Fermion in filename:", fermion_in_filename)
fermion_in = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_in_filename)
fermion_out_filename = gauge_filename.replace("gauge", "fermion-out")
print("Fermion out filename:", fermion_out_filename)
quda_fermion_out = io.hdf5_xxxtzyx2grid_xxxtzyx(
    params, fermion_out_filename)
fermion_out = cp.zeros_like(fermion_in)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
eigenvalues_filename = gauge_filename.replace("gauge", "eigenvalues")
print("Eigenvalues filename:", eigenvalues_filename)
eigenvalues = io.hdf5_xxx2xxx(file_name=eigenvalues_filename)
print("Eigenvalues data:", eigenvalues.data)
print("Eigenvalues shape:", eigenvalues.shape)
eigenvectors_filename = gauge_filename.replace("gauge", "eigenvectors")
print("Eigenvectors filename:", eigenvectors_filename)
eigenvectors = io.eigenvectors2esctzyx(
    params=params, eigenvectors=io.hdf5_xxx2xxx(file_name=eigenvectors_filename))
print("Eigenvectors data:", eigenvectors.data)
print("Eigenvectors shape:", eigenvectors.shape)

Gauge filename: quda_wilson-bistabcg-gauge_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Dest Shape: (3, 3, 4, 2, 32, 32, 32, 16)
Fermion in filename: quda_wilson-bistabcg-fermion-in_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out filename: quda_wilson-bistabcg-fermion-out_-32-32-32-32-1048576-1-1-1-1-0-0-1-0-f.h5
Grid Index T: 0, Grid Index Z: 0, Grid Index Y: 0, Grid Index X: 0
Grid Lat T: 32, Grid Lat Z: 32, Grid Lat Y: 32, Grid Lat X: 16
All Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Dest Shape: (2, 4, 3, 32, 32, 32, 16)
Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemor

# Run wilson bistabcg from pyqcu test.

In [9]:
qcu.applyWilsonBistabCgQcu(fermion_out, fermion_in,
                           gauge, set_ptrs, wilson_cg_params)
# qcu.applyWilsonCgQcu(fermion_out, fermion_in,
#                            gauge, set_ptrs, wilson_cg_params)
print("Fermion out data:", fermion_out.data)
print("Fermion out shape:", fermion_out.shape)
print("QUDA Fermion out data:", quda_fermion_out.data)
print("QUDA Fermion out shape:", quda_fermion_out.shape)
print("Difference:", cp.linalg.norm(fermion_out -
      quda_fermion_out)/cp.linalg.norm(quda_fermion_out))

Fermion out data: <MemoryPointer 0xb28400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f39ac3cf830>>
Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
QUDA Fermion out data: <MemoryPointer 0xb22400000 device=0 mem=<cupy.cuda.memory.PooledMemory object at 0x7f39302a2030>>
QUDA Fermion out shape: (2, 4, 3, 32, 32, 32, 16)
##RANK:0##LOOP:118##Residual:(2.27222e-10,1.97371e-23i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :1.935418240 sec
######TIME  :6195.46######
##RANK      :0
##LOOP      :999
##tmp0      :(1.03257e-11,2.49512e-12i)
##tmp1      :(4.79284e-12,-2.12052e-23i)
##rho_prev  :(-2.31288e-06,4.83391e-06i)
##rho       :(-2.31288e-06,4.83391e-06i)
##alpha     :(0.629024,-0.434716i)
##beta      :(0.059529,-0.0243195i)
##omega     :(2.1544,0.520593i)
##send_tmp  :(0.00984323,0i)
##norm2_tmp :(4.97484e+07,0.000224118i)
##diff_tmp  :(1.9786e-10,-8.91365e-22i)
##lat_4dim  :(524288,0i)
Difference: 3.056118e-07


# Give CG Dslash
> src_o-set_ptr->kappa()**2*dslash_oe(dslash_eo(src_o))

In [10]:
def cg_dslash_no_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_params)
    return src-kappa**2*tmp1


def cg_dslash_dag(src):
    tmp0 = cp.zeros_like(src)
    tmp1 = cp.zeros_like(src)
    qcu.applyWilsonDslashQcu(
        tmp0, src, gauge, set_ptrs, wilson_dslash_eo_dag_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, gauge, set_ptrs, wilson_dslash_oe_dag_params)
    return src-kappa**2*tmp1


def cg_dslash(src):
    return cg_dslash_dag(cg_dslash_no_dag(src))

# cg_dslash(fermion_in[0])

# def matvec(src):
#     dest = cp.zeros_like(src)
#     qcu.applyWilsonCgDslashQcu(
#         dest, src, gauge, set_ptrs, wilson_cg_params)
#     return dest


def matvec(src):
    return cg_dslash(src)

# matvec(fermion_in[0])

# Run matvec(eigenvector[.]) ?= eigenvalue[.]*eigenvector[.] for eigen test

In [11]:
for i, ev in enumerate(eigenvalues):
    print(f"λ_{i} = {ev:.2e}")
    # Verify eigenvector
    v = eigenvectors[i]
    w = cp.zeros_like(v)
    w = matvec(v)
    error = cp.linalg.norm(w - ev * v) / cp.linalg.norm(w)
    print(f"Relative error: {error:.2e}")
    j = i+1
    if j == len(eigenvalues):
        j = 0
    print(
        f"Diff between λ_{i} and λ_{j}: {cp.linalg.norm(eigenvectors[i] - eigenvectors[j])/cp.linalg.norm(eigenvectors[i]):.2e}")

λ_0 = 6.46e-04+0.00e+00j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001922066 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001790122 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001867794 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001809958 sec
Relative error: 8.08e-03
Diff between λ_0 and λ_1: 1.41e+00
λ_1 = 6.46e-04+0.00e+00j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001798238 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001773787 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001854559 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001732535 sec
Relative error: 9.35e-03
Diff between λ_1 and λ_2: 1.41e+00
λ_2 = 6.46e-04+0.00e+00j
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001850852 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0

# Give guage's eigenvalues and eigenvectors to hdf5 files. (pass, don't run this)

In [12]:
# eigen_solver = eigen.solver(
#     n=params[define._LAT_XYZT_] * define._LAT_HALF_SC_, k=define._LAT_Ne_,matvec=matvec,dtype=gauge.dtype)
# eigenvalues, eigenvectors = eigen_solver.run()
# io.xxx2hdf5_xxx(
#     eigenvalues, params, gauge_filename.replace("gauge", "eigenvalues"))
# io.xxx2hdf5_xxx(
#     eigenvectors, params, gauge_filename.replace("gauge", "eigenvectors"))

# Origin CG

In [13]:
b_e = fermion_in[define._EVEN_].flatten()
b_o = fermion_in[define._ODD_].flatten()
b__o = cp.zeros_like(b_o)
tmp = cp.zeros_like(b_o)
# b__o=b_o+kappa*D_oe(b_e)
qcu.applyWilsonDslashQcu(tmp, b_e, gauge, set_ptrs, wilson_dslash_oe_params)
b__o = b_o+kappa*tmp
# b__o -> Dslash^dag b__o
b__o = cg_dslash_dag(b__o)
# Dslash(x_o)=b__o
cg_solver = cg.slover(b=b__o, matvec=matvec, tol=1e-10, max_iter=1000000)
x_o = cg_solver.run()
# x_e  =b_e+kappa*D_eo(x_o)
qcu.applyWilsonDslashQcu(tmp, x_o, gauge, set_ptrs, wilson_dslash_eo_params)
x_e = b_e+kappa*tmp
# give qcu_fermion_out
qcu_fermion_out = cp.zeros_like(quda_fermion_out)
qcu_fermion_out[define._EVEN_] = x_e.reshape(
    quda_fermion_out[define._EVEN_].shape)
qcu_fermion_out[define._ODD_] = x_o.reshape(
    quda_fermion_out[define._ODD_].shape)

multi-gpu wilson dslash total time: (without malloc free memcpy) :0.002014467 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001958300 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001853369 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001820334 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001755793 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001818637 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001739819 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001837596 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001764317 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001799836 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.001749311 sec
Iteration 0: Residual = 4.702738e+05, Time = 0.014057 s
multi-gpu wilson dslash total time: (witho

In [14]:
np.linalg.norm(qcu_fermion_out-quda_fermion_out) / \
    np.linalg.norm(quda_fermion_out)

array(3.4778685e-07, dtype=float32)

# MultiGrid - give grids

In [None]:
_eigenvectors = io.xxxtzyx2mg_xxxtzyx(input_array=eigenvectors, params=params)
orth_eigenvectors = None

Input Array Shape: (24, 4, 3, 32, 32, 32, 16)
Dest Shape: (24, 4, 3, 8, 4, 4, 8, 4, 8, 4, 4)


In [None]:
_eigenvectors.shape  # escTtZzYyXx

(24, 4, 3, 8, 4, 4, 8, 4, 8, 4, 4)

In [47]:
import cupy as cp

# 创建一个二维的 cupy 数组（矩阵）
A = cp.array([[1, 2], [3, 4], [5, 6]], dtype=cp.float64)
print("原始矩阵 A 的形状：", A.shape)
# 进行 QR 分解
Q, R = cp.linalg.qr(A)

print("原始矩阵 A:")
print(A)
print("正交矩阵 Q:")
print(Q)
print("上三角矩阵 R:")
print(R)


原始矩阵 A 的形状： (3, 2)
原始矩阵 A:
[[1. 2.]
 [3. 4.]
 [5. 6.]]
正交矩阵 Q:
[[-0.16903085  0.89708523]
 [-0.50709255  0.27602622]
 [-0.84515425 -0.34503278]]
上三角矩阵 R:
[[-5.91607978 -7.43735744]
 [ 0.          0.82807867]]


In [48]:
a=Q[:,0]
b=Q[:,1]

In [None]:
cp.dot(a,b)

array(5.55111512e-17)

In [64]:
def orthogonalize(eigenvectors):
    size_e, size_s, size_c, size_T, size_t, size_Z, size_z, size_Y, size_y, size_X, size_x = eigenvectors.shape
    print(size_e, size_s, size_c, size_T, size_t,
          size_Z, size_z, size_Y, size_y, size_X, size_x)
    for T in range(size_T):
        for Z in range(size_Z):
            for Y in range(size_Y):
                for X in range(size_X):
                    origin_matrix = eigenvectors[:,
                                                 :, :, T, :, Z, :, Y, :, X, :]
                    _shape = origin_matrix.shape
                    _origin_matrix = origin_matrix.reshape(size_e, -1)
                    Q, _ = cp.linalg.qr(_origin_matrix.T)
                    a=Q[:,0]
                    b=Q[:,1]
                    print(cp.dot(a,b))
                    return origin_matrix, Q.T.reshape(_shape)

In [67]:
_origin_matrix, _Q = orthogonalize(_eigenvectors)
test_origin_matrix=_origin_matrix.reshape(24,-1)
print(cp.dot(test_origin_matrix[0],test_origin_matrix[1]))
test_Q=_Q.reshape(24,-1)
print(cp.dot(test_Q[0],test_Q[1]))
print(cp.dot(test_Q[:,0],test_Q[:,1]))

24 4 3 8 4 4 8 4 8 4 4
(0.01932884-0.03835904j)
(3.8411163e-05-6.872704e-05j)
(0.01932884-0.03835904j)
(5.2099902e-05-0.000634855j)


In [None]:
import cupy as cp

def gram_schmidt(vectors):
    num_vectors = vectors.shape[1]
    orthogonal_vectors = cp.copy(vectors)
    for i in range(num_vectors):
        orthogonal_vectors[:, i] /= cp.linalg.norm(orthogonal_vectors[:, i])
        for j in range(i + 1, num_vectors):
            projection = cp.dot(orthogonal_vectors[:, j], orthogonal_vectors[:, i])
            orthogonal_vectors[:, j] -= projection * orthogonal_vectors[:, i]
    return orthogonal_vectors

# 示例矩阵
_origin_matrix = cp.array([[1, 2], [3, 4]], dtype=cp.float64)
# 进行修正的格拉姆 - 施密特正交化
Q = modified_gram_schmidt(_origin_matrix)
a = Q[:, 0]
b = Q[:, 1]
print(cp.dot(a, b))


-3.3306690738754696e-16


In [68]:
import cupy as cp

_origin_matrix = cp.array([[1, 2], [3, 4]], dtype=cp.float64)
Q, _ = cp.linalg.qr(_origin_matrix.T)
a = Q[:, 0]
b = Q[:, 1]
print(cp.dot(a, b))


-2.220446049250313e-16


In [19]:

mg_eigenvectors = _eigenvectors/(define._LAT_E_**2/_eigenvectors.size)**0.5
_src = io.xxxtzyx2mg_xxxtzyx(
    input_array=fermion_in[define._EVEN_], params=params)

Input Array Shape: (4, 3, 32, 32, 32, 16)
Dest Shape: (4, 3, 8, 4, 4, 8, 4, 8, 4, 4)


In [20]:
params

array([     32,      32,      32,      32, 1048576,       1,       1,
             1,       1,       0,       0,       1,       0,   10000,
             0,       0,       0,       4,       4,       4,       8],
      dtype=int32)

# MultiGrid - R*vector
![](./image0-dev40.png)

In [21]:
r_src = _src


def r_vec(src):
    return contract("escTtZzYyXx,scTtZzYyXx->eTZYX", mg_eigenvectors, src)


r_dest = r_vec(r_src)

In [22]:
fermion_in.shape

(2, 4, 3, 32, 32, 32, 16)

In [23]:
r_src.shape

(4, 3, 8, 4, 4, 8, 4, 8, 4, 4)

In [24]:
r_dest.shape

(24, 8, 4, 4, 4)

# MultiGrid - P*vector
![](./image1-dev40.png)


In [25]:
p_src = r_dest


def p_vec(src):
    return contract("escTtZzYyXx,eTZYX->scTtZzYyXx", cp.conj(mg_eigenvectors), src)


p_dest = p_vec(p_src)

In [26]:
p_dest.shape

(4, 3, 8, 4, 4, 8, 4, 8, 4, 4)

# MultiGrid - verify above
![](./image2-dev40.png)

In [27]:
print(cp.linalg.norm(r_src-p_dest)/cp.linalg.norm(r_src))

1013.7322


In [28]:
r_src.flatten()[:50]

array([1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j,
       1.+1.j, 1.+1.j], dtype=complex64)

In [29]:
p_dest.flatten()[:50]

array([ 246.04144  +894.07495j,  399.79956  +881.2871j ,
        452.3036   +665.55884j,  517.0934   +735.2545j ,
        839.16284 +1064.4465j ,  921.3453  +1166.526j  ,
        888.3568  +1078.7959j ,  600.0752  +1216.5171j ,
        832.3099  +1398.6096j ,  533.64984 +1297.9133j ,
        329.09045 +1065.7424j ,  206.53877  +884.70935j,
       -138.98935  +869.5852j ,  -40.69879  +792.6736j ,
        -55.968536 +769.5004j ,   23.505737 +718.9712j ,
         65.5307  +1022.7233j ,  127.41443  +917.787j  ,
        456.77524  +935.182j  ,  363.32425  +920.3023j ,
        755.70013 +1070.4946j ,  967.64246 +1082.2573j ,
        893.83563 +1358.3857j ,  680.5658  +1201.3472j ,
        787.989   +1337.4756j ,  711.11224 +1443.2031j ,
        608.5442  +1159.7468j ,  674.03204 +1075.7908j ,
       -159.86273 +1120.3923j , -103.05679 +1014.368j  ,
         31.039398 +934.3166j ,   44.285156+1007.33484j,
        204.70578 +1008.7954j ,  319.82    +1128.8835j ,
        301.73724 +1026.9133j ,

In [30]:
_mat = contract("escTtZzYyXx,escTtZzYyXx->scTtZzYyXx",
                mg_eigenvectors, cp.conj(mg_eigenvectors)).flatten()
print(cp.linalg.norm(_mat))
print(_mat[:100])

2531.2222
[0.81065047+1.99118877e-09j 0.7881021 -8.38526582e-10j
 0.77472997+1.63276792e-09j 0.7932915 -3.23339155e-09j
 0.79547673+2.51676724e-09j 0.88196594+1.98862105e-09j
 0.861972  -2.12312123e-09j 0.9398663 +2.02619876e-09j
 0.92097104+1.97333105e-09j 0.8955735 -1.44655843e-09j
 0.8646207 -1.15720045e-10j 0.9179826 -6.78705481e-10j
 0.8908733 +1.58189706e-09j 0.8973061 +3.20887450e-09j
 0.91152143-1.21407329e-10j 0.85536456-1.63423475e-09j
 0.8016379 +5.17214604e-10j 0.8052037 +2.27316987e-09j
 0.7541182 -2.08433493e-09j 0.7315155 -7.01129821e-10j
 0.7753762 +1.64873371e-09j 0.80678964+3.39131523e-09j
 0.8949382 +1.49327906e-09j 0.9193432 -2.08113571e-09j
 0.926426  +2.01821981e-09j 0.95401335-2.83163248e-09j
 0.88492525+5.45218715e-09j 0.8645886 +1.25165156e-09j
 0.9198135 +3.06654613e-09j 0.8474449 -1.46472146e-09j
 0.8619629 +7.59942331e-10j 0.88894194-1.72849868e-09j
 0.77467096+3.58292396e-09j 0.7737134 +3.67311626e-10j
 0.7124592 -7.13699322e-10j 0.75399554-2.16126173e-10j


In [31]:
_norm = cp.linalg.norm(mg_eigenvectors, axis=0).flatten()
print(cp.linalg.norm(_norm))

2508.2776


In [32]:
_norm.shape

(6291456,)

In [33]:
_norm[:100]**2

array([0.81065047, 0.78810203, 0.77472997, 0.79329145, 0.7954767 ,
       0.88196594, 0.86197203, 0.93986636, 0.920971  , 0.89557356,
       0.8646207 , 0.9179825 , 0.89087325, 0.8973061 , 0.9115214 ,
       0.85536456, 0.8016378 , 0.8052037 , 0.7541182 , 0.73151547,
       0.77537614, 0.80678964, 0.89493805, 0.9193432 , 0.92642605,
       0.95401347, 0.88492525, 0.8645885 , 0.9198135 , 0.8474449 ,
       0.8619629 , 0.8889418 , 0.77467096, 0.77371335, 0.7124592 ,
       0.75399554, 0.77764374, 0.8803033 , 0.94582295, 0.9478899 ,
       0.93970734, 0.88534635, 0.9381994 , 0.8681243 , 0.84519804,
       0.8096056 , 0.82115966, 0.79026216, 0.7674842 , 0.73126936,
       0.7034451 , 0.69353855, 0.79352456, 0.81033045, 0.93573385,
       0.9734145 , 0.9478038 , 0.99160093, 0.9309007 , 0.8553509 ,
       0.86110777, 0.8190731 , 0.80228066, 0.80027574, 0.78812003,
       0.7716602 , 0.74145067, 0.77566683, 0.76472366, 0.9089576 ,
       0.99338806, 0.9481497 , 0.9527801 , 1.006373  , 0.91488

# End for pyqcu. (pass, don't run this)

In [34]:
# qcu.applyEndQcu(set_ptrs, params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_eo_dag_params)
# qcu.applyEndQcu(set_ptrs, wilson_dslash_oe_dag_params)