In [1]:
from pyqcu.gmg import *
from pyqcu import define, gauge, io, qcu, set
import cupy as cp
import numpy as np


class WilsonCase:
    """Wilson solver test case evaluated on a sequence of coarsened lattices.

    ``run`` starts from the lattice described by ``params``, repeatedly halves
    the Y and Z extents, and on every level initialises QCU, builds an
    all-ones source vector and calls ``qcu.applyWilsonBistabCgQcu``.

    Args:
        params: Optional QCU parameter array; copied on entry. When ``None`` a
            4x8x8x4 (X, Y, Z, T) complex64 lattice is configured from
            ``set.params``.
        argv: Optional QCU argument array; copied on entry. When ``None``
            ``set.argv`` is copied and the mass is set to 0.0.
        min_size: Levels are generated while both the Y and the Z extent are
            at least this value.
        max_levels: Upper bound on the number of levels.
        seed: Seed applied to both the numpy and the cupy global RNGs.
    """

    def __init__(self, params=None, argv=None, min_size=4, max_levels=10, seed=12138):
        if params is None:
            self.params = set.params.copy()
            self.params[define._LAT_X_] = 4
            self.params[define._LAT_Y_] = 8
            self.params[define._LAT_Z_] = 8
            self.params[define._LAT_T_] = 4
            self.params[define._LAT_XYZT_] = self._volume(self.params)
            self.params[define._DATA_TYPE_] = define._LAT_C64_
            self.params[define._NODE_RANK_] = define.rank
            self.params[define._NODE_SIZE_] = define.size
        else:
            self.params = params.copy()
        if argv is None:
            self.argv = set.argv.copy()
            self.argv[define._MASS_] = 0.0
        else:
            self.argv = argv.copy()
        self.min_size = min_size
        self.max_levels = max_levels
        self.seed = seed
        # Seeds the *global* numpy/cupy generators, so this affects other code too.
        np.random.seed(seed)
        cp.random.seed(seed)

    @staticmethod
    def _volume(params):
        """Return the product of the X, Y, Z and T extents stored in ``params``."""
        return params[define._LAT_X_] * \
            params[define._LAT_Y_] * params[define._LAT_Z_] * \
            params[define._LAT_T_]

    @staticmethod
    def _merge_link_pairs(links, axis):
        """Halve ``axis`` by matrix-multiplying neighbouring link matrices.

        ``links`` is an 8-D array whose two leading axes hold the matrix
        indices. Entry ``k`` of the result along ``axis`` is
        ``links[2k] @ links[2k+1]``; a trailing unpaired entry (odd extent)
        is dropped, matching integer halving of the extent.
        """
        pairs = links.shape[axis] // 2
        picker = [slice(None)] * links.ndim
        picker[axis] = slice(0, 2 * pairs, 2)
        first = links[tuple(picker)]
        picker[axis] = slice(1, 2 * pairs, 2)
        second = links[tuple(picker)]
        # One batched contraction instead of one tiny matmul per lattice site.
        merged = cp.einsum('abdptzyx,bcdptzyx->acdptzyx', first, second)
        # The result is handed to qcu, so keep it C-contiguous.
        # NOTE(review): assumes qcu reads the raw buffer - confirm.
        return cp.ascontiguousarray(merged)

    def give_b(self, params):
        """Return an all-ones source vector for the lattice described by ``params``.

        The flat vector has ``_LAT_XYZT_ * _LAT_SC_`` entries of the dtype
        selected by ``params[_DATA_TYPE_]`` and is passed through
        ``io.fermion2psctzyx`` before being returned.
        """
        b = cp.ones(params[define._LAT_XYZT_]*define._LAT_SC_,
                    dtype=define.dtype(params[define._DATA_TYPE_]))
        b = io.fermion2psctzyx(b, params)
        return b

    def run(self):
        """Solve on every level of the hierarchy and collect the results.

        Returns:
            dict with keys ``dest_params``, ``src_params`` and ``U_params``
            (per-level host copies of solution, source and gauge field),
            ``params_params`` (per-level copies of the parameter array),
            ``set_ptrs`` (the pointer array passed to QCU) and ``kappa``
            (``1 / (2 * mass + 8)``).
        """
        grid_params = []
        U_params = []
        src_params = []
        dest_params = []
        params_params = []
        set_ptrs = set.set_ptrs.copy()
        params = self.params.copy()
        argv = self.argv.copy()
        # Naming follows the grid convention used here: "nx"/"ny" are the
        # lattice Y/Z extents, "nz" lumps X, T and the spin-colour size.
        current_nx = self.params[define._LAT_Y_]
        current_ny = self.params[define._LAT_Z_]
        current_nz = self.params[define._LAT_X_] * \
            self.params[define._LAT_T_]*define._LAT_SC_
        print(
            f"current_nx: {current_nx}, current_ny: {current_ny}, current_nz: {current_nz}")
        # NOTE(review): the max(2, ...) clamp only takes effect when
        # min_size <= 2; the gauge field below is still halved every level,
        # so extents and field shape would diverge in that case - confirm.
        while min(current_nx, current_ny) >= self.min_size and len(grid_params) < self.max_levels:
            grid_params.append((current_nx, current_ny, current_nz))
            print(
                f"  Level {len(grid_params)-1}: {current_nx}x{current_ny}x{current_nz}")
            current_nx = max(2, current_nx // 2)
            current_ny = max(2, current_ny // 2)
        for i, (nx, ny, nz) in enumerate(grid_params):
            params[define._SET_INDEX_] = i
            params[define._SET_PLAN_] = define._SET_PLAN1_
            params[define._LAT_Y_] = nx
            params[define._LAT_Z_] = ny
            params[define._LAT_XYZT_] = self._volume(params)
            if i == 0:
                U = gauge.give_gauge(params=params)
            else:
                # Coarsen the previous level's gauge field: first along the
                # second-to-last axis (y), then along the third-to-last (z).
                U = self._merge_link_pairs(U, axis=-2)
                U = self._merge_link_pairs(U, axis=-3)
            qcu.applyInitQcu(set_ptrs, params, argv)
            src = self.give_b(params)
            dest = cp.zeros_like(src)
            qcu.applyWilsonBistabCgQcu(dest, src,
                                       U, set_ptrs, params)
            # Keep host copies; ``params`` is reused, so snapshot it per level.
            dest_params.append(dest.get())
            src_params.append(src.get())
            U_params.append(U.get())
            params_params.append(params.copy())
        return dict(dest_params=dest_params, src_params=src_params, U_params=U_params, set_ptrs=set_ptrs, params_params=params_params, kappa=1 / (2 * argv[define._MASS_] + 8))


# Build the default Wilson test case and run it on every lattice level.
# ``case_result_dict`` is read as a module-level global by the functions below.
case = WilsonCase()
case_result_dict = case.run()


    @@@@@@######QCU NOTES START######@@@@@@@
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. libqcu.so is compiled into download_path/PyQCU/lib during pyqcu setup; please add this path to your LD_LIBRARY_PATH.
    2. QCU (PyQCU) splits the grid by x->y->z->t, and the lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. QUDA (PyQUDA) splits the grid by t->z->y->x, and the lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. QCU takes params as a numpy array (dtype=np.int32), argv as a numpy array (dtype=np.float32 or np.float64), set_ptrs as a numpy array (dtype=np.int64), and all other inputs as cupy arrays (dtype=cp.complex64 or cp.complex128).
    5. The smallest lattice size is (wilson:x=4,y=4,z=4,t=4;clover:x=8,y=8,z=8,t=8) that QCU support (when '#define _BLOCK_SIZE_ 

In [5]:
def dslash_eo(src, index):
    """Apply the Wilson dslash with even parity on lattice level ``index``.

    Args:
        src: Source field (host or device array); it is copied to the device.
        index: Level index into the lists stored in ``case_result_dict``.

    Returns:
        The result of ``qcu.applyWilsonDslashQcu`` copied back to the host.
    """
    # Work on a private copy so the parity/dagger flags set here do not leak
    # into the cached per-level parameters shared with other callers.
    eo_params = case_result_dict['params_params'][index].copy()
    eo_params[define._PARITY_] = define._EVEN_
    eo_params[define._DAGGER_] = define._NO_USE_
    # cp.array already makes a fresh device copy of ``src``.
    _src = cp.array(src)
    _dest = cp.zeros_like(_src)
    _U = cp.array(case_result_dict['U_params'][index])
    _set_ptrs = case_result_dict['set_ptrs']
    qcu.applyWilsonDslashQcu(
        _dest, _src, _U, _set_ptrs, eo_params)
    return _dest.get()


def dslash_oe(src, index):
    """Apply the Wilson dslash with odd parity on lattice level ``index``.

    Args:
        src: Source field (host or device array); it is copied to the device.
        index: Level index into the lists stored in ``case_result_dict``.

    Returns:
        The result of ``qcu.applyWilsonDslashQcu`` copied back to the host.
    """
    # Work on a private copy so the parity/dagger flags set here do not leak
    # into the cached per-level parameters shared with other callers.
    oe_params = case_result_dict['params_params'][index].copy()
    oe_params[define._PARITY_] = define._ODD_
    oe_params[define._DAGGER_] = define._NO_USE_
    # cp.array already makes a fresh device copy of ``src``.
    _src = cp.array(src)
    _dest = cp.zeros_like(_src)
    _U = cp.array(case_result_dict['U_params'][index])
    _set_ptrs = case_result_dict['set_ptrs']
    qcu.applyWilsonDslashQcu(
        _dest, _src, _U, _set_ptrs, oe_params)
    return _dest.get()


def dslash(src, index):
    """Return ``src - kappa**2 * D_odd(D_even(src))`` on lattice level ``index``.

    The dslash kernel is applied twice: first with even parity on ``src``,
    then with odd parity on that intermediate result.

    Args:
        src: Source field (host or device array); it is copied to the device.
        index: Level index into the lists stored in ``case_result_dict``.

    Returns:
        The combined result copied back to the host.
    """
    level_params = case_result_dict['params_params'][index]
    # Two independent copies: configuring both parities on the same cached
    # array would leave the "even" call running with the odd-parity flag.
    eo_params = level_params.copy()
    eo_params[define._PARITY_] = define._EVEN_
    eo_params[define._DAGGER_] = define._NO_USE_
    oe_params = level_params.copy()
    oe_params[define._PARITY_] = define._ODD_
    oe_params[define._DAGGER_] = define._NO_USE_
    # cp.array already makes a fresh device copy of ``src``.
    _src = cp.array(src)
    tmp0 = cp.zeros_like(_src)
    tmp1 = cp.zeros_like(_src)
    _U = cp.array(case_result_dict['U_params'][index])
    _set_ptrs = case_result_dict['set_ptrs']
    qcu.applyWilsonDslashQcu(
        tmp0, _src, _U, _set_ptrs, eo_params)
    qcu.applyWilsonDslashQcu(
        tmp1, tmp0, _U, _set_ptrs, oe_params)
    _dest = _src-case_result_dict['kappa']**2*tmp1
    return _dest.get()


def check_result():
    """Re-run the solver on every stored level and print consistency figures.

    Per level this prints, in order: the shapes of source, solution and gauge
    field; the norm of the fresh solution; the norm of the stored solution;
    their relative difference; and the relative residuals of the even and odd
    halves reconstructed via ``dslash_eo`` / ``dslash_oe``.
    """
    results = case_result_dict
    kappa = results['kappa']
    for level, host_src in enumerate(results['src_params']):
        stored_solution = cp.array(results['dest_params'][level])
        rhs = cp.array(host_src)
        solution = cp.zeros_like(rhs)
        links = cp.array(results['U_params'][level])
        for field in (rhs, solution, links):
            print(field.shape)
        qcu.applyWilsonBistabCgQcu(
            solution, rhs, links,
            results['set_ptrs'], results['params_params'][level])
        solution_norm = cp.linalg.norm(solution)
        print(solution_norm)
        print(cp.linalg.norm(stored_solution))
        print(cp.linalg.norm(solution - stored_solution) / solution_norm)
        # Host-side halves of the right-hand side and of the fresh solution.
        rhs_even = host_src[define._EVEN_].copy()
        rhs_odd = host_src[define._ODD_].copy()
        sol_even = solution[define._EVEN_].get()
        sol_odd = solution[define._ODD_].get()
        # Keep this call order: even reconstruction first, then odd.
        rebuilt_even = sol_even - kappa * dslash_eo(sol_odd, level)
        rebuilt_odd = sol_odd - kappa * dslash_oe(sol_even, level)
        print(np.linalg.norm(rebuilt_even - rhs_even) / np.linalg.norm(rhs_even))
        print(np.linalg.norm(rebuilt_odd - rhs_odd) / np.linalg.norm(rhs_odd))


# Run the verification over all levels; the figures are printed to stdout.
check_result()

(2, 4, 3, 4, 8, 8, 2)
(2, 4, 3, 4, 8, 8, 2)
(3, 3, 4, 2, 4, 8, 8, 2)
##RANK:0##LOOP:42##Residual:(7.57267e-10,3.72852e-20i)
multi-gpu wilson bistabcg total time: (without malloc free memcpy) :0.046136008 sec
######TIME  :151123######
##RANK      :0
##LOOP      :999
##tmp0      :(7.30128e-10,3.42483e-10i)
##tmp1      :(6.77767e-10,-1.7108e-19i)
##rho_prev  :(3.55129e-07,-2.20223e-07i)
##rho       :(3.55129e-07,-2.20223e-07i)
##alpha     :(4.77053,-4.49048i)
##beta      :(0.00665572,-0.00832452i)
##omega     :(1.07725,0.505311i)
##send_tmp  :(6.35495e-07,0i)
##norm2_tmp :(24005.2,3.37487e-07i)
##diff_tmp  :(2.64733e-11,-3.72185e-22i)
##lat_4dim  :(512,0i)
4224.6313
473.92096
0.89727837
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000143212 sec
multi-gpu wilson dslash total time: (without malloc free memcpy) :0.000163397 sec
2.7205672e-07
6.202054
(2, 4, 3, 4, 4, 4, 2)
(2, 4, 3, 4, 4, 4, 2)
(3, 3, 4, 2, 4, 4, 4, 2)
##RANK:0##LOOP:16##Residual:(1.7e-10,-3.42488e-20i)