In [1]:
from opt_einsum import contract
import cupy as cp
import numpy as np
from time import perf_counter
from pyqcu.cuda import define
from pyqcu.cuda import io
from pyqcu.cuda import qcu
#############################
# ---- Integer run parameters handed to the QCU backend ----
params = np.zeros(define._PARAMS_SIZE_, dtype=np.int32)
params[define._LAT_X_] = 32
params[define._LAT_Y_] = 32
params[define._LAT_Z_] = 32
params[define._LAT_T_] = 1
params[define._LAT_XYZT_] = (
    params[define._LAT_X_] * params[define._LAT_Y_]
    * params[define._LAT_Z_] * params[define._LAT_T_]
)
(
    params[define._GRID_X_],
    params[define._GRID_Y_],
    params[define._GRID_Z_],
    params[define._GRID_T_],
) = define.split_into_four_factors(define.size)
params[define._PARITY_] = 0
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
params[define._DAGGER_] = 0
params[define._MAX_ITER_] = 1000
params[define._DATA_TYPE_] = define._LAT_C128_
params[define._SET_INDEX_] = 5
params[define._SET_PLAN_] = -1
params[define._MG_X_] = 1
params[define._MG_Y_] = 1
params[define._MG_Z_] = 1
params[define._MG_T_] = 1
params[define._LAT_E_] = 24
params[define._VERBOSE_] = 1
params[define._SEED_] = 42

# ---- Floating-point run parameters ----
# NOTE(review): dtype comes from define.dtype_half while _DATA_TYPE_ is
# _LAT_C128_; confirm this is the element type qcu.applyLaplacianQcu expects
# for the buffers allocated below.
dtype = define.dtype_half(params[define._DATA_TYPE_])
argv = np.zeros(define._ARGV_SIZE_, dtype=dtype)
argv[define._MASS_] = 0.05
argv[define._TOL_] = 1e-12
argv[define._SIGMA_] = 0.1
set_ptrs = np.zeros(10, dtype=np.int64)  # maybe more than 10?

# ---- Device fields ----
# The field shapes below carry no t axis, so the element counts only match
# _LAT_XYZT_ when the lattice has a single time slice.
if params[define._LAT_T_] != 1:
    raise ValueError("Laplacian fields are 3-D here: _LAT_T_ must be 1")
# Plain Python ints for shapes/sizes so the products are not done in int32.
lat_x = int(params[define._LAT_X_])
lat_y = int(params[define._LAT_Y_])
lat_z = int(params[define._LAT_Z_])
volume = int(params[define._LAT_XYZT_])
field_shape = (define._LAT_C_, lat_z, lat_y, lat_x)
gauge_shape = (define._LAT_C_, define._LAT_C_, define._LAT_D_,
               lat_z, lat_y, lat_x)
# Input is all ones, output starts at zero; both are allocated directly on
# the device instead of being derived from a throw-away host-side range.
laplacian_in = cp.ones(field_shape, dtype=dtype)
laplacian_out = cp.zeros(field_shape, dtype=dtype)
# Synthetic gauge data: the sequence 0, 1, 2, ... generated on the device and
# viewed as (c, c, d, z, y, x).
gauge = cp.arange(define._LAT_DCC_ * volume, dtype=dtype).reshape(gauge_shape)
#############################


def _Laplacian(F, U):
    """Apply the 3-D gauge-covariant Laplacian with CuPy as a reference.

    Computes ``6 * F - sum_mu [ U_mu(x) F(x + mu) + U_mu^dag(x - mu) F(x - mu) ]``
    over the three spatial directions. Neighbours are fetched with
    ``cp.roll``, so the lattice wraps around at its edges.

    Lattice extents are read from the module-level ``params`` array.

    Args:
        F: field to act on; it is reshaped to ``(Lz, Ly, Lx, C, -1)`` with
            ``C = define._LAT_C_``.
        U: 6-D gauge array indexed as ``U[mu]`` with remaining axes
            ``(z, y, x, a, b)``; only ``mu = 0, 1, 2`` are used.

    Returns:
        The result flattened to shape ``(Lz * Ly * Lx * C, -1)``.

    Side effects:
        Prints the elapsed wall-clock time of the GPU computation.
    """
    Lx, Ly, Lz = params[define._LAT_X_], params[define._LAT_Y_], params[define._LAT_Z_]
    # Hermitian conjugate of every link: swap the two colour axes, then conj.
    U_dag = U.transpose(0, 1, 2, 3, 5, 4).conj()
    F = F.reshape(Lz, Ly, Lx, define._LAT_C_, -1)
    hop = "zyxab,zyxbc->zyxac"  # per-site colour matrix times field
    # Direction mu lives on this axis of F: mu=0 -> x (axis 2), 1 -> y, 2 -> z.
    axes = (2, 1, 0)
    # CuPy launches kernels asynchronously; drain the device before and after
    # the timed region so perf_counter measures the actual GPU work.
    cp.cuda.Device().synchronize()
    t0 = perf_counter()
    forward = [contract(hop, U[mu], cp.roll(F, -1, axis))
               for mu, axis in enumerate(axes)]
    backward = [cp.roll(contract(hop, U_dag[mu], F), 1, axis)
                for mu, axis in enumerate(axes)]
    hopping = (forward[0] + forward[1] + forward[2]
               + backward[0] + backward[1] + backward[2])
    # - for SA with evals , + for LA with (12 - evals)
    dest = (6 * F - hopping).reshape(Lz * Ly * Lx * define._LAT_C_, -1)
    cp.cuda.Device().synchronize()
    t1 = perf_counter()
    print(f'cupy cost time: {t1 - t0} sec')
    return dest
#############################

@My Rank:0/1, Local Rank:0@



In [2]:
# Display the synthetic gauge array built in the setup cell
# (sequential values reshaped to (c, c, d, z, y, x)).
gauge

array([[[[[[0.000000e+00, 1.000000e+00, 2.000000e+00, ...,
            2.900000e+01, 3.000000e+01, 3.100000e+01],
           [3.200000e+01, 3.300000e+01, 3.400000e+01, ...,
            6.100000e+01, 6.200000e+01, 6.300000e+01],
           [6.400000e+01, 6.500000e+01, 6.600000e+01, ...,
            9.300000e+01, 9.400000e+01, 9.500000e+01],
           ...,
           [9.280000e+02, 9.290000e+02, 9.300000e+02, ...,
            9.570000e+02, 9.580000e+02, 9.590000e+02],
           [9.600000e+02, 9.610000e+02, 9.620000e+02, ...,
            9.890000e+02, 9.900000e+02, 9.910000e+02],
           [9.920000e+02, 9.930000e+02, 9.940000e+02, ...,
            1.021000e+03, 1.022000e+03, 1.023000e+03]],

          [[1.024000e+03, 1.025000e+03, 1.026000e+03, ...,
            1.053000e+03, 1.054000e+03, 1.055000e+03],
           [1.056000e+03, 1.057000e+03, 1.058000e+03, ...,
            1.085000e+03, 1.086000e+03, 1.087000e+03],
           [1.088000e+03, 1.089000e+03, 1.090000e+03, ...,
           

In [3]:
# Display the Laplacian input; the setup cell fills it with ones.
laplacian_in

array([[[[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],

        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],

        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
    

In [4]:
#############################
# Run the PyQCU Laplacian 10 times on the same input and report the time
# spent inside the operator only. The norm printouts before/after each call
# are diagnostics and are deliberately kept outside the timed region.
qcu.applyInitQcu(set_ptrs, params, argv)
qcu_elapsed = 0.0
try:
    for iteration in range(10):
        print("_norm of Laplacian gauge:", cp.linalg.norm(gauge))
        print("_norm of Laplacian in:", cp.linalg.norm(laplacian_in))
        print("_norm of Laplacian out:", cp.linalg.norm(laplacian_out))
        # Drain the device on both sides of the call so that pending work
        # from the diagnostics is not billed to the operator and the
        # operator's own kernels have finished before the clock stops.
        cp.cuda.Device().synchronize()
        t0 = perf_counter()
        qcu.applyLaplacianQcu(laplacian_out, laplacian_in,
                              gauge, set_ptrs, params)
        cp.cuda.Device().synchronize()
        t1 = perf_counter()
        qcu_elapsed += t1 - t0
        print("norm of Laplacian gauge:", cp.linalg.norm(gauge))
        print("norm of Laplacian in:", cp.linalg.norm(laplacian_in))
        print("norm of Laplacian out:", cp.linalg.norm(laplacian_out))
finally:
    # Always hand the backend state back, even if an iteration raised.
    qcu.applyEndQcu(set_ptrs, params)
print(f'PyQCU cost time: {qcu_elapsed} sec')
#############################
# Convert the fields with the pyqcu io helpers into the layout consumed by
# _Laplacian (it indexes the gauge as U[mu] with axes z, y, x, a, b and
# reshapes the field to (Lz, Ly, Lx, C, -1)), then time one reference call.
_gauge = io.ccdzyx2dzyxcc(io.gauge2ccdzyx(gauge, params))
_laplacian_in = io.czyx2zyxc(io.laplacian2czyx(laplacian_in, params))
# CuPy work is asynchronous: synchronise so the layout conversions above are
# not billed to the reference call and its kernels are done when we stop.
cp.cuda.Device().synchronize()
t0 = perf_counter()
_laplacian_out = _Laplacian(_laplacian_in, _gauge)
cp.cuda.Device().synchronize()
t1 = perf_counter()
print(f'PyQUDA cost time: {t1 - t0} sec')
print("norm of PyQuda Laplacian out:",
      cp.linalg.norm(_laplacian_out))
#############################

just for laplacian, lat_t = 1, no even-odd
set_ptr:0x562ebc6dba80
long long set_ptr:94758729792128
gridDim.x               :2048
blockDim.x              :16
host_params[_LAT_X_]    :32
host_params[_LAT_Y_]    :32
host_params[_LAT_Z_]    :32
host_params[_LAT_T_]    :1
host_params[_LAT_XYZT_] :32768
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :1000
host_params[_DATA_TYPE_]:4
host_params[_SET_INDEX_]:5
host_params[_SET_PLAN_] :-1
host_params[_MG_X_]     :1
host_params[_MG_Y_]     :1
host_params[_MG_Z_]     :1
host_params[_MG_T_]     :1
host_params[_LAT_E_]    :24
host_params[_VERBOSE_]  :1
host_argv[_MASS_]       :5.000000e-02
host_argv[_TOL_]        :1.000000e-12
host_argv[_SIGMA_]      :1.000000e-01
lat_2dim[_XY_]          :1024
lat_2dim[_XZ_]          :1024
lat_2dim[_XT_]          :32
lat_2dim

In [5]:
# Display the gauge array after the io.gauge2ccdzyx -> io.ccdzyx2dzyxcc
# conversion, i.e. in the form passed to the CuPy reference _Laplacian.
_gauge

array([[[[[[0.000000e+00, 1.310720e+05, 2.621440e+05],
           [3.932160e+05, 5.242880e+05, 6.553600e+05],
           [7.864320e+05, 9.175040e+05, 1.048576e+06]],

          [[1.000000e+00, 1.310730e+05, 2.621450e+05],
           [3.932170e+05, 5.242890e+05, 6.553610e+05],
           [7.864330e+05, 9.175050e+05, 1.048577e+06]],

          [[2.000000e+00, 1.310740e+05, 2.621460e+05],
           [3.932180e+05, 5.242900e+05, 6.553620e+05],
           [7.864340e+05, 9.175060e+05, 1.048578e+06]],

          ...,

          [[2.900000e+01, 1.311010e+05, 2.621730e+05],
           [3.932450e+05, 5.243170e+05, 6.553890e+05],
           [7.864610e+05, 9.175330e+05, 1.048605e+06]],

          [[3.000000e+01, 1.311020e+05, 2.621740e+05],
           [3.932460e+05, 5.243180e+05, 6.553900e+05],
           [7.864620e+05, 9.175340e+05, 1.048606e+06]],

          [[3.100000e+01, 1.311030e+05, 2.621750e+05],
           [3.932470e+05, 5.243190e+05, 6.553910e+05],
           [7.864630e+05, 9.175350e+05,

In [6]:
# Display the input field after the io.laplacian2czyx -> io.czyx2zyxc
# conversion, i.e. in the form passed to the CuPy reference _Laplacian.
_laplacian_in

array([[[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        ...,

        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         ...,
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]],


       [[[1., 1., 1.],
         [1., 1., 1.],
         [1., 

In [7]:
# Display the CuPy reference result exactly as _Laplacian returned it:
# still flattened to 2-D, before the comparison cell re-lays it out.
_laplacian_out

array([[ -5406711.],
       [-10125303.],
       [-14843895.],
       ...,
       [ -5895045.],
       [-10613637.],
       [-15332229.]])

In [8]:
#############################
# Re-lay-out the flattened CuPy reference with the pyqcu io helpers
# (presumably into the (c, z, y, x) layout of laplacian_out -- confirm) and
# report the relative difference to the PyQCU result.
# The shape guard keeps this cell idempotent: once converted, re-running the
# cell must not push the already converted array through the helpers again.
# NOTE(review): the recorded run of this cell reported a relative difference
# of ~4.6e4, i.e. the two implementations disagree; root cause not
# established here.
if _laplacian_out.shape != laplacian_out.shape:
    _laplacian_out = io.zyxc2czyx(io.laplacian2zyxc(_laplacian_out, params))
print("Difference between QUDA and PyQuda Laplacian out:",
      cp.linalg.norm(_laplacian_out - laplacian_out)/cp.linalg.norm(_laplacian_out))
#############################

Difference between QUDA and PyQuda Laplacian out: 46430.05744397814


In [9]:
# Display laplacian_in again, now after the PyQCU calls
# (presumably to check the operator left its input unchanged -- confirm).
laplacian_in

array([[[[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],

        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],

        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
    

In [10]:
# Display the output buffer written by qcu.applyLaplacianQcu.
laplacian_out

array([[[[-2.53145774e+11, -4.66623793e+11, -2.53150214e+11, ...,
          -4.66745027e+11, -2.53212377e+11, -4.66753687e+11],
         [-2.53216817e+11, -4.66762347e+11, -2.53221257e+11, ...,
          -4.66883592e+11, -2.53283424e+11, -4.66825136e+11],
         [-2.53287856e+11, -4.66900914e+11, -2.53292297e+11, ...,
          -4.67022169e+11, -2.53354467e+11, -4.67030831e+11],
         ...,
         [-2.55207683e+11, -4.70646850e+11, -2.55212131e+11, ...,
          -4.70768395e+11, -2.55274398e+11, -4.70709846e+11],
         [-2.55278845e+11, -4.70785760e+11, -2.55283293e+11, ...,
          -4.70907317e+11, -2.55345563e+11, -4.70916000e+11],
         [-2.55350011e+11, -4.70924683e+11, -2.55354459e+11, ...,
          -4.71046250e+11, -2.55416733e+11, -4.70987694e+11]],

        [[-2.55421182e+11, -4.71063618e+11, -2.55425630e+11, ...,
          -4.71185196e+11, -2.55487907e+11, -4.71193880e+11],
         [-2.55492356e+11, -4.71202565e+11, -2.55496804e+11, ...,
          -4.71324154e

In [11]:
# Display the CuPy reference result after the comparison cell replaced it
# with its io.laplacian2zyxc -> io.zyxc2czyx converted form.
_laplacian_out

array([[[[ -5406711.,  -5406633.,  -5406651., ...,  -5407137.,
           -5407155.,  -5407173.],
         [ -5404215.,  -5404137.,  -5404155., ...,  -5404641.,
           -5404659.,  -5404677.],
         [ -5404791.,  -5404713.,  -5404731., ...,  -5405217.,
           -5405235.,  -5405253.],
         ...,
         [ -5420343.,  -5420265.,  -5420283., ...,  -5420769.,
           -5420787.,  -5420805.],
         [ -5420919.,  -5420841.,  -5420859., ...,  -5421345.,
           -5421363.,  -5421381.],
         [ -5421495.,  -5421417.,  -5421435., ...,  -5421921.,
           -5421939.,  -5421957.]],

        [[ -5326839.,  -5326761.,  -5326779., ...,  -5327265.,
           -5327283.,  -5327301.],
         [ -5324343.,  -5324265.,  -5324283., ...,  -5324769.,
           -5324787.,  -5324805.],
         [ -5324919.,  -5324841.,  -5324859., ...,  -5325345.,
           -5325363.,  -5325381.],
         ...,
         [ -5340471.,  -5340393.,  -5340411., ...,  -5340897.,
           -5340915.,  -5

In [12]:
# Element-wise difference: CuPy reference minus PyQCU output.
_laplacian_out-laplacian_out

array([[[[ 2.53140367e+11,  4.66618387e+11,  2.53144807e+11, ...,
           4.66739620e+11,  2.53206970e+11,  4.66748280e+11],
         [ 2.53211413e+11,  4.66756943e+11,  2.53215853e+11, ...,
           4.66878187e+11,  2.53278019e+11,  4.66819731e+11],
         [ 2.53282452e+11,  4.66895509e+11,  2.53286892e+11, ...,
           4.67016764e+11,  2.53349062e+11,  4.67025425e+11],
         ...,
         [ 2.55202263e+11,  4.70641429e+11,  2.55206710e+11, ...,
           4.70762975e+11,  2.55268977e+11,  4.70704425e+11],
         [ 2.55273425e+11,  4.70780339e+11,  2.55277872e+11, ...,
           4.70901895e+11,  2.55340142e+11,  4.70910578e+11],
         [ 2.55344590e+11,  4.70919261e+11,  2.55349038e+11, ...,
           4.71040828e+11,  2.55411311e+11,  4.70982272e+11]],

        [[ 2.55415855e+11,  4.71058291e+11,  2.55420303e+11, ...,
           4.71179869e+11,  2.55482580e+11,  4.71188553e+11],
         [ 2.55487032e+11,  4.71197241e+11,  2.55491480e+11, ...,
           4.71318829e