In [13]:
from opt_einsum import contract
import cupy as cp
import numpy as np
from time import perf_counter
from pyqcu.cuda import define
from pyqcu.cuda import io
from pyqcu.cuda import qcu
#############################
params = np.array([0]*define._PARAMS_SIZE_, dtype=np.int32)
params[define._LAT_X_] = 8
params[define._LAT_Y_] = 4
params[define._LAT_Z_] = 4
params[define._LAT_T_] = 1
params[define._LAT_XYZT_] = params[define._LAT_X_] * \
    params[define._LAT_Y_]*params[define._LAT_Z_]*params[define._LAT_T_]
params[define._GRID_X_], params[define._GRID_Y_], params[define._GRID_Z_], params[
    define._GRID_T_] = define.split_into_four_factors(define.size)
params[define._PARITY_] = 0
params[define._NODE_RANK_] = define.rank
params[define._NODE_SIZE_] = define.size
params[define._DAGGER_] = 0
params[define._MAX_ITER_] = 1000
params[define._DATA_TYPE_] = define._LAT_C128_
params[define._SET_INDEX_] = 5
params[define._SET_PLAN_] = -1
params[define._MG_X_] = 1
params[define._MG_Y_] = 1
params[define._MG_Z_] = 1
params[define._MG_T_] = 1
params[define._LAT_E_] = 24
params[define._VERBOSE_] = 1
params[define._SEED_] = 42
dtype = define.dtype_half(params[define._DATA_TYPE_])
argv = np.array([0.0]*define._ARGV_SIZE_,
                dtype=dtype)
argv[define._MASS_] = 0.05
argv[define._TOL_] = 1e-12
argv[define._SIGMA_] = 0.1
set_ptrs = np.array(10*[0], dtype=np.int64)  # maybe more than 10?
#############################
laplacian_in = cp.array([range(define._LAT_C_*params[define._LAT_XYZT_])], dtype=dtype).reshape(
    define._LAT_C_, params[define._LAT_Z_], params[define._LAT_Y_], params[define._LAT_X_]).copy()
laplacian_out = cp.zeros_like(laplacian_in).copy()
gauge = cp.array([range(define._LAT_DCC_*params[define._LAT_XYZT_])], dtype=dtype).reshape(
    define._LAT_C_, define._LAT_C_, define._LAT_D_, params[define._LAT_Z_], params[define._LAT_Y_], params[define._LAT_X_]).copy()
laplacian_in = cp.ones_like(laplacian_in).copy()
# gauge = cp.zeros_like(gauge).copy()
#############################


def _Laplacian(F, U):
    Lx, Ly, Lz = params[define._LAT_X_], params[define._LAT_Y_], params[define._LAT_Z_]
    U_dag = U.transpose(0, 1, 2, 3, 5, 4).conj()
    F = F.reshape(Lz, Ly, Lx, define._LAT_C_, -1)
    t0 = perf_counter()
    dest = (
        # - for SA with evals , + for LA with (12 - evals)
        6 * F
        - (
            contract("zyxab,zyxbc->zyxac", U[0], cp.roll(F, -1, 2))
            + contract("zyxab,zyxbc->zyxac", U[1], cp.roll(F, -1, 1))
            + contract("zyxab,zyxbc->zyxac", U[2], cp.roll(F, -1, 0))
            + cp.roll(contract("zyxab,zyxbc->zyxac", U_dag[0], F), 1, 2)
            + cp.roll(contract("zyxab,zyxbc->zyxac", U_dag[1], F), 1, 1)
            + cp.roll(contract("zyxab,zyxbc->zyxac", U_dag[2], F), 1, 0)
        )
    ).reshape(Lz * Ly * Lx * define._LAT_C_, -1)
    t1 = perf_counter()
    print(f'cupy cost time: {t1 - t0} sec')
    return dest
#############################

In [14]:
gauge

array([[[[[[0.000e+00, 1.000e+00, 2.000e+00, ..., 5.000e+00,
            6.000e+00, 7.000e+00],
           [8.000e+00, 9.000e+00, 1.000e+01, ..., 1.300e+01,
            1.400e+01, 1.500e+01],
           [1.600e+01, 1.700e+01, 1.800e+01, ..., 2.100e+01,
            2.200e+01, 2.300e+01],
           [2.400e+01, 2.500e+01, 2.600e+01, ..., 2.900e+01,
            3.000e+01, 3.100e+01]],

          [[3.200e+01, 3.300e+01, 3.400e+01, ..., 3.700e+01,
            3.800e+01, 3.900e+01],
           [4.000e+01, 4.100e+01, 4.200e+01, ..., 4.500e+01,
            4.600e+01, 4.700e+01],
           [4.800e+01, 4.900e+01, 5.000e+01, ..., 5.300e+01,
            5.400e+01, 5.500e+01],
           [5.600e+01, 5.700e+01, 5.800e+01, ..., 6.100e+01,
            6.200e+01, 6.300e+01]],

          [[6.400e+01, 6.500e+01, 6.600e+01, ..., 6.900e+01,
            7.000e+01, 7.100e+01],
           [7.200e+01, 7.300e+01, 7.400e+01, ..., 7.700e+01,
            7.800e+01, 7.900e+01],
           [8.000e+01, 8.100e+01, 8.

In [15]:
laplacian_in

array([[[[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],

In [16]:
#############################
qcu.applyInitQcu(set_ptrs, params, argv)
t0 = perf_counter()
for i in range(10):
    print("_norm of Laplacian gauge:", cp.linalg.norm(gauge))
    print("_norm of Laplacian in:", cp.linalg.norm(laplacian_in))
    print("_norm of Laplacian out:", cp.linalg.norm(laplacian_out))
    qcu.applyLaplacianQcu(laplacian_out, laplacian_in,
                      gauge, set_ptrs, params)
    print("norm of Laplacian gauge:", cp.linalg.norm(gauge))
    print("norm of Laplacian in:", cp.linalg.norm(laplacian_in))
    print("norm of Laplacian out:", cp.linalg.norm(laplacian_out))
t1 = perf_counter()
qcu.applyEndQcu(set_ptrs, params)
print(f'PyQCU cost time: {t1 - t0} sec')
#############################
_gauge = io.ccdzyx2dzyxcc(io.gauge2ccdzyx(
    gauge, params))
_laplacian_in = io.czyx2zyxc(io.laplacian2czyx(
    laplacian_in, params))
t0 = perf_counter()
_laplacian_out = _Laplacian(
    _laplacian_in, _gauge)
t1 = perf_counter()
print(f'PyQUDA cost time: {t1 - t0} sec')
print("norm of PyQuda Laplacian out:",
      cp.linalg.norm(_laplacian_out))
#############################

_norm of Laplacian gauge:just for laplacian, lat_t = 1, no even-odd
set_ptr:0x555950a7ada0
long long set_ptr:93842093616544
 gridDim.x               :8
blockDim.x              :16
host_params[_LAT_X_]    :8
host_params[_LAT_Y_]    :4
host_params[_LAT_Z_]    :4
host_params[_LAT_T_]    :1
host_params[_LAT_XYZT_] :128
host_params[_GRID_X_]   :1
host_params[_GRID_Y_]   :1
host_params[_GRID_Z_]   :1
host_params[_GRID_T_]   :1
host_params[_PARITY_]   :0
host_params[_NODE_RANK_]:0
host_params[_NODE_SIZE_]:1
host_params[_DAGGER_]   :0
host_params[_MAX_ITER_] :1000
host_params[_DATA_TYPE_]:4
host_params[_SET_INDEX_]:5
host_params[_SET_PLAN_] :-1
host_params[_MG_X_]     :1
host_params[_MG_Y_]     :1
host_params[_MG_Z_]     :1
host_params[_MG_T_]     :1
host_params[_LAT_E_]    :24
host_params[_VERBOSE_]  :1
host_argv[_MASS_]       :5.000000e-02
host_argv[_TOL_]        :1.000000e-12
host_argv[_SIGMA_]      :1.000000e-01
lat_2dim[_XY_]          :32
lat_2dim[_XZ_]          :32
lat_2dim[_XT_]        

In [17]:
_gauge

array([[[[[[0.000e+00, 5.120e+02, 1.024e+03],
           [1.536e+03, 2.048e+03, 2.560e+03],
           [3.072e+03, 3.584e+03, 4.096e+03]],

          [[1.000e+00, 5.130e+02, 1.025e+03],
           [1.537e+03, 2.049e+03, 2.561e+03],
           [3.073e+03, 3.585e+03, 4.097e+03]],

          [[2.000e+00, 5.140e+02, 1.026e+03],
           [1.538e+03, 2.050e+03, 2.562e+03],
           [3.074e+03, 3.586e+03, 4.098e+03]],

          ...,

          [[5.000e+00, 5.170e+02, 1.029e+03],
           [1.541e+03, 2.053e+03, 2.565e+03],
           [3.077e+03, 3.589e+03, 4.101e+03]],

          [[6.000e+00, 5.180e+02, 1.030e+03],
           [1.542e+03, 2.054e+03, 2.566e+03],
           [3.078e+03, 3.590e+03, 4.102e+03]],

          [[7.000e+00, 5.190e+02, 1.031e+03],
           [1.543e+03, 2.055e+03, 2.567e+03],
           [3.079e+03, 3.591e+03, 4.103e+03]]],


         [[[8.000e+00, 5.200e+02, 1.032e+03],
           [1.544e+03, 2.056e+03, 2.568e+03],
           [3.080e+03, 3.592e+03, 4.104e+03]],

  

In [18]:
_laplacian_in

array([[[[ 2.20567384e+50,  2.20575669e+48,  1.37650863e+50],
         [-3.17036318e+49,  1.60837255e+49, -1.97253158e+49],
         [-1.51418458e+47,  7.10633620e+48, -7.15135956e+46],
         [-1.06310001e+50,  3.22887723e+46, -6.58795098e+49],
         [-1.03021154e+49,  1.29595859e+50, -1.64374403e+49],
         [-7.07704486e+49, -1.85336639e+49, -1.14154567e+50],
         [-3.45819689e+49, -3.37855773e+46, -5.55497769e+49],
         [ 1.04410273e+47, -6.28955703e+49,  8.35301253e+46]],

        [[-1.94046165e+48, -6.14263898e+48, -1.91639898e+49],
         [ 3.09991307e+47, -4.27162130e+49,  2.77938101e+48],
         [ 1.37528226e+46, -2.08562954e+49,  2.15309468e+46],
         [ 9.80008684e+47,  2.55674408e+46,  9.37783063e+48],
         [ 1.27349231e+49,  3.92538441e+49,  3.21878258e+48],
         [ 8.90748488e+49, -5.62929291e+48,  2.25723249e+49],
         [ 4.22813035e+49, -3.15825195e+46,  1.07106647e+49],
         [-5.22802694e+46, -1.86719324e+49, -8.52586833e+45]],

    

In [19]:
_laplacian_out

array([[ 1.04237560e+53],
       [-6.82154312e+52],
       [-2.38545582e+53],
       [-1.65943249e+53],
       [-8.68145399e+52],
       [-8.18740951e+51],
       [-7.25061972e+52],
       [ 1.91335219e+53],
       [ 4.55090022e+53],
       [ 2.57820389e+53],
       [ 2.92548718e+53],
       [ 3.26243523e+53],
       [ 4.36345197e+53],
       [ 7.57410560e+53],
       [ 1.07676033e+54],
       [ 1.05813209e+53],
       [ 2.91522688e+53],
       [ 4.76345021e+53],
       [ 7.49206198e+52],
       [ 1.50501941e+53],
       [ 2.25542878e+53],
       [ 1.61769446e+53],
       [-2.62788467e+53],
       [-6.86590504e+53],
       [-3.51045886e+53],
       [-4.60536708e+53],
       [-5.70080446e+53],
       [ 1.75576001e+53],
       [ 4.47597598e+53],
       [ 7.20150326e+53],
       [-9.17969315e+52],
       [-3.65010030e+53],
       [-6.37972641e+53],
       [ 4.00309441e+53],
       [ 7.85033260e+53],
       [ 1.16981892e+54],
       [-2.30521356e+53],
       [-4.92004191e+53],
       [-7.5

In [20]:
#############################
_laplacian_out = io.zyxc2czyx(io.laplacian2zyxc(_laplacian_out, params))
print("Difference between QUDA and PyQuda Laplacian out:",
      cp.linalg.norm(_laplacian_out - laplacian_out)/cp.linalg.norm(_laplacian_out))
#############################

Difference between QUDA and PyQuda Laplacian out: 0.9999142816389075


In [21]:
laplacian_in

array([[[[ 2.20567384e+50, -3.17036318e+49, -1.51418458e+47,
          -1.06310001e+50, -1.03021154e+49, -7.07704486e+49,
          -3.45819689e+49,  1.04410273e+47],
         [-1.94046165e+48,  3.09991307e+47,  1.37528226e+46,
           9.80008684e+47,  1.27349231e+49,  8.90748488e+49,
           4.22813035e+49, -5.22802694e+46],
         [-1.42414921e+47, -8.32629376e+49, -1.00488001e+49,
          -6.84369410e+49, -3.33710981e+49,  1.46288189e+47,
           3.92547945e+48, -4.87889080e+47],
         [ 3.31207932e+46, -1.85998211e+48,  5.17419723e+48,
           3.61161257e+49,  1.72756331e+49, -3.13993740e+46,
           1.75925411e+50, -2.53090601e+49]],

        [[ 1.88935370e+48,  1.46286383e+49,  6.82913312e+48,
           9.11241111e+46,  5.55979149e+49, -7.89062338e+48,
           1.49941930e+46, -2.70162575e+49],
         [-1.28904204e+47, -8.92122840e+47, -4.88083930e+47,
          -3.75333957e+45,  7.56429447e+49, -1.08838521e+49,
          -7.05744436e+46, -3.59304947e+4

In [22]:
laplacian_out

array([[[[-1.31880057e+51,  2.23469054e+50,  1.68539804e+49,
           6.37918801e+50,  1.76994476e+50,  4.08180385e+50,
           2.07475109e+50, -5.65858812e+49],
         [ 4.58617163e+48, -5.09627474e+49, -2.40631176e+49,
          -5.85316358e+48, -6.33871658e+49, -5.36350230e+50,
          -2.53716515e+50, -5.86349041e+48],
         [ 3.40423541e+49,  4.99590421e+50,  2.64439705e+50,
           3.81392356e+50,  2.00152027e+50, -9.96212832e+49,
          -4.11894208e+49, -1.19680609e+50],
         [-5.99804812e+49,  1.12375791e+49, -5.06315853e+49,
          -2.13885211e+50, -1.03646504e+50,  9.75931842e+48,
          -1.04532344e+51,  2.23958394e+50]],

        [[ 1.69663865e+50, -1.13736120e+50, -4.10699054e+49,
          -8.70138794e+49, -3.55427708e+50, -1.04164737e+50,
          -7.37549637e+49,  1.62216873e+50],
         [-3.07010638e+49,  9.90765028e+48,  2.96089498e+48,
           1.53995232e+49, -4.47736877e+50,  1.08292366e+50,
           2.09349854e+49,  2.15577313e+5

In [23]:
_laplacian_out

array([[[[ 1.04237560e+53, -1.65943249e+53, -7.25061972e+52,
           2.57820389e+53,  4.36345197e+53,  1.05813209e+53,
           7.49206198e+52,  1.61769446e+53],
         [-3.51045886e+53,  1.75576001e+53, -9.17969315e+52,
           4.00309441e+53, -2.30521356e+53,  3.86311028e+53,
          -1.47573896e+53,  2.83202958e+53],
         [-3.22866379e+52,  2.76672054e+52,  2.85476377e+53,
           7.15179273e+50,  4.05894205e+53,  5.08470969e+53,
           8.37264017e+52,  3.03564804e+53],
         [-2.47689852e+53,  3.45975364e+53, -3.58949877e+53,
           6.56037468e+53,  1.70690756e+52,  1.83462601e+53,
           1.22194922e+53, -1.14820128e+53]],

        [[-3.84071645e+53, -1.01827667e+53,  2.13163341e+53,
           5.15658415e+53, -8.20796871e+51,  6.59088233e+53,
           1.27864140e+53,  2.85608472e+53],
         [-1.01513145e+53,  5.29144591e+53,  9.39412037e+52,
           1.35115584e+53,  2.59226373e+52, -1.44818491e+52,
          -5.58867601e+52, -7.53452395e+5

In [24]:
_laplacian_out-laplacian_out

array([[[[ 1.05556360e+53, -1.66166718e+53, -7.25230511e+52,
           2.57182470e+53,  4.36168203e+53,  1.05405029e+53,
           7.47131447e+52,  1.61826031e+53],
         [-3.51050472e+53,  1.75626963e+53, -9.17728684e+52,
           4.00315295e+53, -2.30457969e+53,  3.86847378e+53,
          -1.47320180e+53,  2.83208822e+53],
         [-3.23206803e+52,  2.71676150e+52,  2.85211937e+53,
           3.33786917e+50,  4.05694053e+53,  5.08570590e+53,
           8.37675912e+52,  3.03684485e+53],
         [-2.47629871e+53,  3.45964126e+53, -3.58899246e+53,
           6.56251353e+53,  1.71727221e+52,  1.83452842e+53,
           1.23240245e+53, -1.15044086e+53]],

        [[-3.84241309e+53, -1.01713931e+53,  2.13204410e+53,
           5.15745429e+53, -7.85254100e+51,  6.59192398e+53,
           1.27937895e+53,  2.85446255e+53],
         [-1.01482443e+53,  5.29134683e+53,  9.39382428e+52,
           1.35100185e+53,  2.63703742e+52, -1.45901415e+52,
          -5.59076951e+52, -7.75010126e+5