In [1]:
import torch
from pyqcu.ascend import dslash
from pyqcu.ascend.include import *
# --- Run configuration -------------------------------------------------------
# Lattice extents used to build the operators and the right-hand side below.
# Other sizes that have been tried:
#   (4, 4, 4, 4), (8, 8, 8, 8), (4, 8, 8, 8), (8, 8, 8, 4), (16, 8, 8, 8),
#   (8, 8, 16, 16), (16, 16, 16, 32), (32, 32, 16, 16), (32, 32, 32, 32),
#   (32, 32, 32, 64)
latt_size = (16, 16, 16, 16)

# Bare mass entering the hopping parameter below.
# Other values that have been tried: -3.5, -0.8, -0.5, 0.0, 0.05
# (Previously 0.05 and 0.0 were assigned and immediately overwritten; only the
# last assignment ever took effect, so just that one is kept.)
mass = -0.05

# Hopping parameter derived from the mass: kappa = 1 / (2 * mass + 8).
# Fixed values that have been tried instead: 0.4, 0.125, 0.5
kappa = 1 / (2 * mass + 8)

# Complex precision of all fields (torch.complex64 is the single-precision
# alternative).
dtype = torch.complex128

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Initialize lattice gauge theory: the Wilson and clover helpers are built with
# identical lattice size, kappa, precision and device.
wilson = dslash.wilson(
    latt_size=latt_size,
    kappa=kappa,
    dtype=dtype,
    device=device,
    verbose=False
)
clover = dslash.clover(
    latt_size=latt_size,
    kappa=kappa,
    dtype=dtype,
    device=device,
    verbose=False
)
# Gauge field generated with a fixed seed so every run uses the same U.
U = wilson.generate_gauge_field(sigma=0.1, seed=42)

# NOTE(review): presumably validates that U is SU(3); its return value (if any)
# is not used here - confirm against pyqcu.
wilson.check_su3(U)
clover_term = clover.make_clover(U=U)

# Right-hand side: a random complex field of shape
# (4, 3, latt_size[3], latt_size[2], latt_size[1], latt_size[0]).
# It is drawn from a dedicated, seeded generator so that b (and every residual
# or norm derived from it) is reproducible after Restart & Run All; previously
# only U was seeded. A private generator is used so the global torch RNG state
# is left untouched, and its seed differs from the gauge seed in case both
# draws would otherwise start from the same random stream.
b_generator = torch.Generator(device=device)
b_generator.manual_seed(1234)
b = torch.randn(4, 3, latt_size[3], latt_size[2], latt_size[1], latt_size[0],
                generator=b_generator, dtype=dtype, device=device)
# Verbosity flag forwarded to the solvers in the cells below.
verbose = True


    @@@@@@######QCU NOTES START######@@@@@@@
    Guide:
    0. Required: MPI(e.g. 4.1.2), CUDA(e.g. 12.4), CMAKE(e.g. 3.22.1), GCC(e.g. 11.4.0), HDF5-MPI(e.g. 1.10.7,'apt install libhdf5-mpi-dev && export HDF5_MPI="ON" && pip install --no-binary=h5py h5py').
    1. The libqcu.so was compiled when pyqcu setup in download_path/PyQCU/lib, please add this path to your LD_LIBRARY_PATH.
    2. The QCU(PyQCU) splite grid by x->y->z->t, lattice by x->y->z->t->p->d->c->c or x->y->z->t->c->s(->p) and x->y->z->t->c->s->c->s(->p).
    3. The QUDA(PyQUDA) splite grid by t->z->y->x, lattice by c->c->x->y->z->t->p->d or c->s->x->y->z->t(->p) and c->s->c->s->x->y->z->t(->p).
    4. The QCU input params in numpy array(dtype=np.int32), argv in  numpy array(dtype=np.float32 or float64) array, set_ptrs in numpy array(dtype=np.int64), other in cupy array(dtype=cp.complex64 or complex128).
    5. The smallest lattice size is (wilson:x=4,y=4,z=4,t=4;clover:x=8,y=8,z=8,t=8) that QCU support (when '#define _B

In [2]:
from pyqcu.ascend import inverse
# Build the multigrid solver object for right-hand side b. The clover term is
# passed through clover.add_I(...) before being handed to the solver.
mg = inverse.mg(
    b=b,
    wilson=wilson,
    U=U,
    clover=clover,
    clover_term=clover.add_I(clover_term=clover_term),
    verbose=verbose,
    max_iter=20,
    max_restarts=5,
    max_levels=1,
)

self.dof_list:[12, 12, 12, 12, 8, 8, 4, 12, 12, 12, 8, 4, 2, 4, 4, 24, 12, 12, 12, 4, 4, 4, 4, 4]
Building grid list:
  Level 0: 32x32x16x16
self.grid_list:[[16, 16, 32, 32]]


In [34]:
%%time

def matvec(src: torch.Tensor, U: torch.Tensor = U, clover_term: torch.Tensor = clover_term) -> torch.Tensor:
    """Apply the explicitly assembled operator to ``src``.

    The result is the sum of ``wilson.give_wilson(src, U)`` and
    ``clover.give_clover(clover_term=clover_term, src=src)``.

    Args:
        src: Field the operator is applied to.
        U: Gauge field; defaults to the notebook-level ``U`` as it was when
            this ``def`` was executed (Python binds defaults at definition).
        clover_term: Clover term; defaults to the notebook-level
            ``clover_term`` captured the same way.

    Returns:
        The Wilson contribution plus the clover contribution.
    """
    wilson_part = wilson.give_wilson(src, U)
    clover_part = clover.give_clover(clover_term=clover_term, src=src)
    return wilson_part + clover_part


def _matvec(src: torch.Tensor) -> torch.Tensor:
    """Apply the first operator stored in the multigrid object to ``src``.

    Delegates to ``mg.op_list[0].matvec`` so its result can be compared with
    the explicitly assembled ``matvec``.
    """
    first_level_op = mg.op_list[0]
    return first_level_op.matvec(src=src)

%time Ab = matvec(b)
%time _Ab = _matvec(b)
print(torch.norm(U).item())
print(torch.norm(clover_term).item())
print(torch.norm(Ab).item())
print(torch.norm(_Ab).item())
print(torch.norm(Ab-_Ab).item()/torch.norm(_Ab).item())

CPU times: user 5.14 ms, sys: 0 ns, total: 5.14 ms
Wall time: 4.07 ms
CPU times: user 3.03 ms, sys: 0 ns, total: 3.03 ms
Wall time: 2.79 ms
1773.6200269505302
85.35483652791633
1989.3860972653238
1989.3860972653238
1.5372166177825714e-16
CPU times: user 3.36 s, sys: 0 ns, total: 3.36 s
Wall time: 3.35 s


In [33]:
%time Ab = matvec(b)
%time _Ab = _matvec(b)

CPU times: user 3.99 ms, sys: 0 ns, total: 3.99 ms
Wall time: 3.25 ms
CPU times: user 2.56 ms, sys: 0 ns, total: 2.56 ms
Wall time: 2.41 ms


In [5]:
# b0 = mg.b_list[0]
# b1 = inverse.restrict(
#     local_ortho_null_vecs=mg.lonv_list[0], fine_vec=b0)
# _b0 = inverse.prolong(local_ortho_null_vecs=mg.lonv_list[0], coarse_vec=b1)
# _b1 = inverse.restrict(
#     local_ortho_null_vecs=mg.lonv_list[0], fine_vec=_b0)
# print(_b1.flatten()[:100]/b1.flatten()[:100])

In [6]:
# Solve with BiCGStab using the explicitly assembled operator; the result _x is
# compared against the multigrid result further down.
# Alternatives that have been tried here:
#   _x = inverse.cg(b=b, matvec=matvec, verbose=verbose)
#   _x = inverse.bicgstab(b=b, matvec=mg.op_list[0].matvec, verbose=verbose)
_x = inverse.bicgstab(
    b=b,
    matvec=matvec,
    verbose=verbose,
)

Norm of b:887.1781621292079
Norm of r:1332.8839838216686
Norm of x0:886.2189203208331
BICGSTAB-Iteration 0: Residual = 2.884216e+02, Time = 0.429446 s
BICGSTAB-Iteration 1: Residual = 1.241411e+02, Time = 0.428128 s
BICGSTAB-Iteration 2: Residual = 1.252450e+02, Time = 0.426886 s
BICGSTAB-Iteration 3: Residual = 8.451741e+01, Time = 0.427186 s
BICGSTAB-Iteration 4: Residual = 5.338513e+01, Time = 0.426374 s
BICGSTAB-Iteration 5: Residual = 5.041762e+01, Time = 0.427881 s
BICGSTAB-Iteration 6: Residual = 3.189633e+01, Time = 0.429024 s
BICGSTAB-Iteration 7: Residual = 2.583369e+01, Time = 0.431202 s
BICGSTAB-Iteration 8: Residual = 2.210261e+01, Time = 0.429054 s
BICGSTAB-Iteration 9: Residual = 2.146068e+02, Time = 0.414853 s
BICGSTAB-Iteration 10: Residual = 9.271055e+01, Time = 0.430533 s
BICGSTAB-Iteration 11: Residual = 1.636268e+02, Time = 0.430858 s
BICGSTAB-Iteration 12: Residual = 2.552070e+01, Time = 0.429450 s
BICGSTAB-Iteration 13: Residual = 1.510602e+01, Time = 0.431134 s


KeyboardInterrupt: 

In [None]:
# Run the multigrid solver built earlier and keep its result in x.
x = mg.solve()
# Plot produced by the solver object itself.
# NOTE(review): presumably the convergence history - confirm against pyqcu.
mg.plot()

In [None]:
# Level to probe in the per-level lists of the multigrid object
# (-1 selects the last entry of each list).
index = -1
# Solve on that level with an all-ones right-hand side shaped like the level's
# stored b, and write the result into the matching u_list slot. The level is
# taken from `index` everywhere so that changing it in one place is enough
# (previously `index` was assigned but -1 was hard-coded in all three uses).
mg.u_list[index] = inverse.bicgstab(
    b=torch.ones_like(mg.b_list[index]),
    matvec=mg.op_list[index].matvec,
    verbose=verbose,
    max_iter=10000,
)

In [None]:
# Relative difference between the multigrid result x and the BiCGStab result
# _x, measured as ||x - _x|| / ||_x||. Requires both solve cells to have run.
print(torch.norm(x-_x).item()/torch.norm(_x).item())

In [None]:
# Display the first 100 flattened entries of the last right-hand side stored
# in the multigrid object (a preview, rather than dumping the whole tensor).
mg.b_list[-1].flatten()[:100]