Skip to content

Commit

Permalink
Merge pull request QMCPACK#4790 from kgasperich/offload-ao-vgl
Browse files Browse the repository at this point in the history
Offload AO evaluateVGL and evaluateV routines
  • Loading branch information
prckent committed Oct 30, 2023
2 parents 65eea1d + 2d5f88d commit 8b7ef30
Show file tree
Hide file tree
Showing 10 changed files with 995 additions and 63 deletions.
112 changes: 93 additions & 19 deletions src/Numerics/SoaCartesianTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct SoaCartesianTensor
using ggg_type = TinyVector<Tensor<T, 3>, 3>;
using OffloadArray2D = Array<T, 2, OffloadPinnedAllocator<T>>;
using OffloadArray3D = Array<T, 3, OffloadPinnedAllocator<T>>;
using OffloadArray4D = Array<T, 4, OffloadPinnedAllocator<T>>;

///maximum angular momentum
int Lmax;
Expand All @@ -62,6 +63,17 @@ struct SoaCartesianTensor
///compute Ylm
static void evaluate_bare(T x, T y, T z, T* XYZ, int lmax);

static void evaluateVGL_impl(T x,
T y,
T z,
T* restrict XYZ,
T* restrict gr0,
T* restrict gr1,
T* restrict gr2,
T* restrict lap,
int lmax,
const T* normfactor,
int n_normfac);
///compute Ylm
inline void evaluateV(T x, T y, T z, T* XYZ) const
{
Expand Down Expand Up @@ -107,14 +119,65 @@ struct SoaCartesianTensor
PRAGMA_OFFLOAD("omp target teams distribute parallel for \
map(always, to:NormFactor_ptr[:Nlm]) \
map(to:xyz_ptr[:3*nR], XYZ_ptr[:Nlm*nR])")
for (size_t ir = 0; ir < nR; ir++)
for (uint32_t ir = 0; ir < nR; ir++)
{
evaluate_bare(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], XYZ_ptr + (ir * Nlm), Lmax);
for (int i = 0; i < Nlm; i++)
for (uint32_t i = 0; i < Nlm; i++)
XYZ_ptr[ir * Nlm + i] *= NormFactor_ptr[i];
}
}

/**
* @brief evaluate VGL for multiple electrons and multiple pbc images
*
* when offload is enabled, xyz is assumed to be up to date on the device before entering the function
* XYZ_vgl will be up to date on the device (but not host) when this function exits
*
* @param [in] xyz electron positions [Nelec, Npbc, 3(x,y,z)]
* @param [out] XYZ_vgl Cartesian tensor elements [5(v, gx, gy, gz, lapl), Nelec, Npbc, Nlm]
*/
inline void batched_evaluateVGL(const OffloadArray3D& xyz, OffloadArray4D& XYZ_vgl) const
{
const size_t nElec = xyz.size(0);
const size_t Npbc = xyz.size(1); // number of PBC images
assert(xyz.size(2) == 3);

assert(XYZ_vgl.size(0) == 5);
assert(XYZ_vgl.size(1) == nElec);
assert(XYZ_vgl.size(2) == Npbc);
const size_t Nlm = XYZ_vgl.size(3);
assert(NormFactor.size() == Nlm);

size_t nR = nElec * Npbc; // total number of positions to evaluate
size_t offset = Nlm * nR; // stride for v/gx/gy/gz/l

auto* xyz_ptr = xyz.data();
auto* XYZ_vgl_ptr = XYZ_vgl.data();
auto* NormFactor_ptr = NormFactor.data();
// TODO: make separate ptrs to start of v/gx/gy/gz/l?
// might be more readable?
// or just pass one ptr to evaluateVGL and apply stride/offset inside

// FIXME: remove "always" after fixing MW mem to only transfer once ahead of time
PRAGMA_OFFLOAD("omp target teams distribute parallel for \
map(always, to:NormFactor_ptr[:Nlm]) \
map(to: xyz_ptr[:3*nR], XYZ_vgl_ptr[:5*nR*Nlm])")
for (uint32_t ir = 0; ir < nR; ir++)
{
constexpr T czero(0);
for (uint32_t i = 0; i < Nlm; i++)
{
XYZ_vgl_ptr[ir * Nlm + offset * 1 + i] = czero;
XYZ_vgl_ptr[ir * Nlm + offset * 2 + i] = czero;
XYZ_vgl_ptr[ir * Nlm + offset * 3 + i] = czero;
XYZ_vgl_ptr[ir * Nlm + offset * 4 + i] = czero;
}
evaluateVGL_impl(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], XYZ_vgl_ptr + (ir * Nlm),
XYZ_vgl_ptr + (ir * Nlm + offset * 1), XYZ_vgl_ptr + (ir * Nlm + offset * 2),
XYZ_vgl_ptr + (ir * Nlm + offset * 3), XYZ_vgl_ptr + (ir * Nlm + offset * 4), Lmax,
NormFactor_ptr, Nlm);
}
}

///makes a table of \f$ r^l S_l^m \f$ and their gradients up to Lmax.
void evaluateVGL(T x, T y, T z);
Expand Down Expand Up @@ -179,6 +242,14 @@ SoaCartesianTensor<T>::SoaCartesianTensor(const int l_max, bool addsign) : Lmax(
NormFactor.updateTo();
}

template<class T>
void SoaCartesianTensor<T>::evaluateVGL(T x, T y, T z)
{
constexpr T czero(0);
cXYZ = czero;
evaluateVGL_impl(x, y, z, cXYZ.data(0), cXYZ.data(1), cXYZ.data(2), cXYZ.data(3), cXYZ.data(4), Lmax,
NormFactor.data(), NormFactor.size());
}

PRAGMA_OFFLOAD("omp declare target")
template<class T>
Expand Down Expand Up @@ -286,23 +357,26 @@ void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ, int lm
PRAGMA_OFFLOAD("omp end declare target")


PRAGMA_OFFLOAD("omp declare target")
template<class T>
void SoaCartesianTensor<T>::evaluateVGL(T x, T y, T z)
void SoaCartesianTensor<T>::evaluateVGL_impl(T x,
T y,
T z,
T* restrict XYZ,
T* restrict gr0,
T* restrict gr1,
T* restrict gr2,
T* restrict lap,
int lmax,
const T* normfactor,
int n_normfac)
{
constexpr T czero(0);
cXYZ = czero;

const T x2 = x * x, y2 = y * y, z2 = z * z;
const T x3 = x2 * x, y3 = y2 * y, z3 = z2 * z;
const T x4 = x3 * x, y4 = y3 * y, z4 = z3 * z;
const T x5 = x4 * x, y5 = y4 * y, z5 = z4 * z;
T* restrict XYZ = cXYZ.data(0);
T* restrict gr0 = cXYZ.data(1);
T* restrict gr1 = cXYZ.data(2);
T* restrict gr2 = cXYZ.data(3);
T* restrict lap = cXYZ.data(4);

switch (Lmax)
switch (lmax)
{
case 6:
XYZ[83] = x2 * y2 * z2; // X2Y2Z2
Expand Down Expand Up @@ -641,16 +715,16 @@ void SoaCartesianTensor<T>::evaluateVGL(T x, T y, T z)
XYZ[0] = 1; // S
}

const size_t ntot = NormFactor.size();
for (size_t i = 0; i < ntot; i++)
for (size_t i = 0; i < n_normfac; i++)
{
XYZ[i] *= NormFactor[i];
gr0[i] *= NormFactor[i];
gr1[i] *= NormFactor[i];
gr2[i] *= NormFactor[i];
lap[i] *= NormFactor[i];
XYZ[i] *= normfactor[i];
gr0[i] *= normfactor[i];
gr1[i] *= normfactor[i];
gr2[i] *= normfactor[i];
lap[i] *= normfactor[i];
}
}
PRAGMA_OFFLOAD("omp end declare target")


template<class T>
Expand Down
113 changes: 97 additions & 16 deletions src/Numerics/SoaSphericalTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ struct SoaSphericalTensor
{
using OffloadArray2D = Array<T, 2, OffloadPinnedAllocator<T>>;
using OffloadArray3D = Array<T, 3, OffloadPinnedAllocator<T>>;
using OffloadArray4D = Array<T, 4, OffloadPinnedAllocator<T>>;
///maximum angular momentum for the center
int Lmax;
/// Normalization factors
Expand All @@ -58,8 +59,19 @@ struct SoaSphericalTensor

SoaSphericalTensor(const SoaSphericalTensor& rhs) = default;

///compute Ylm
///compute Ylm for single position
static void evaluate_bare(T x, T y, T z, T* Ylm, int lmax, const T* factorL, const T* factorLM);
///compute Ylm_vgl for single position
static void evaluateVGL_impl(const T x,
const T y,
const T z,
T* restrict Ylm_vgl,
int lmax,
const T* factorL,
const T* factorLM,
const T* factor2L,
const T* normfactor,
size_t offset);

///compute Ylm
inline void evaluateV(T x, T y, T z, T* Ylm) const
Expand Down Expand Up @@ -97,7 +109,7 @@ struct SoaSphericalTensor
PRAGMA_OFFLOAD("omp target teams distribute parallel for \
map(always, to:FactorLM_ptr[:Nlm], FactorL_ptr[:Lmax+1], NormFactor_ptr[:Nlm]) \
map(to: xyz_ptr[:3*nR], Ylm_ptr[:Nlm*nR])")
for (size_t ir = 0; ir < nR; ir++)
for (uint32_t ir = 0; ir < nR; ir++)
{
evaluate_bare(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], Ylm_ptr + (ir * Nlm), Lmax,
FactorL_ptr, FactorLM_ptr);
Expand All @@ -106,6 +118,46 @@ struct SoaSphericalTensor
}
}

/**
* @brief evaluate VGL for multiple electrons and multiple pbc images
*
* when offload is enabled, xyz is assumed to be up to date on the device before entering the function
* Ylm_vgl will be up to date on the device (but not host) when this function exits
*
* @param [in] xyz electron positions [Nelec, Npbc, 3(x,y,z)]
* @param [out] Ylm_vgl Spherical tensor elements [5(v, gx, gy, gz, lapl), Nelec, Npbc, Nlm]
*/
inline void batched_evaluateVGL(const OffloadArray3D& xyz, OffloadArray4D& Ylm_vgl) const
{
const size_t nElec = xyz.size(0);
const size_t Npbc = xyz.size(1); // number of PBC images
assert(xyz.size(2) == 3);

assert(Ylm_vgl.size(0) == 5);
assert(Ylm_vgl.size(1) == nElec);
assert(Ylm_vgl.size(2) == Npbc);
const size_t Nlm = Ylm_vgl.size(3);
assert(NormFactor.size() == Nlm);

const size_t nR = nElec * Npbc; // total number of positions to evaluate
const size_t offset = Nlm * nR; // stride for v/gx/gy/gz/l

auto* xyz_ptr = xyz.data();
auto* Ylm_vgl_ptr = Ylm_vgl.data();
auto* FactorLM_ptr = FactorLM.data();
auto* FactorL_ptr = FactorL.data();
auto* Factor2L_ptr = Factor2L.data();
auto* NormFactor_ptr = NormFactor.data();

// FIXME: remove "always" after fixing MW mem to only transfer once ahead of time
PRAGMA_OFFLOAD("omp target teams distribute parallel for \
map(always, to:FactorLM_ptr[:Nlm], FactorL_ptr[:Lmax+1], NormFactor_ptr[:Nlm], Factor2L_ptr[:Lmax+1]) \
map(to: xyz_ptr[:nR*3], Ylm_vgl_ptr[:5*nR*Nlm])")
for (uint32_t ir = 0; ir < nR; ir++)
evaluateVGL_impl(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], Ylm_vgl_ptr + (ir * Nlm), Lmax,
FactorL_ptr, FactorLM_ptr, Factor2L_ptr, NormFactor_ptr, offset);
}

///compute Ylm
inline void evaluateV(T x, T y, T z)
{
Expand Down Expand Up @@ -167,7 +219,6 @@ inline SoaSphericalTensor<T>::SoaSphericalTensor(const int l_max, bool addsign)
constexpr T cone(1);
const int ntot = (Lmax + 1) * (Lmax + 1);
cYlm.resize(ntot);
cYlm = czero;
NormFactor.resize(ntot, cone);
const T sqrt2 = std::sqrt(2.0);
if (addsign)
Expand Down Expand Up @@ -317,23 +368,42 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x,
}
PRAGMA_OFFLOAD("omp end declare target")


PRAGMA_OFFLOAD("omp declare target")
template<typename T>
inline void SoaSphericalTensor<T>::evaluateVGL(T x, T y, T z)
inline void SoaSphericalTensor<T>::evaluateVGL_impl(const T x,
const T y,
const T z,
T* restrict Ylm_vgl,
int lmax,
const T* factorL,
const T* factorLM,
const T* factor2L,
const T* normfactor,
size_t offset)
{
T* restrict Ylm = cYlm.data(0);
evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());
T* restrict Ylm = Ylm_vgl;
// T* restrict Ylm = cYlm.data(0);
evaluate_bare(x, y, z, Ylm, lmax, factorL, factorLM);
const size_t Nlm = (lmax + 1) * (lmax + 1);

constexpr T czero(0);
constexpr T ahalf(0.5);
T* restrict gYlmX = cYlm.data(1);
T* restrict gYlmY = cYlm.data(2);
T* restrict gYlmZ = cYlm.data(3);
T* restrict gYlmX = Ylm_vgl + offset * 1;
T* restrict gYlmY = Ylm_vgl + offset * 2;
T* restrict gYlmZ = Ylm_vgl + offset * 3;
T* restrict lYlm = Ylm_vgl + offset * 4; // just need to set to zero

gYlmX[0] = czero;
gYlmY[0] = czero;
gYlmZ[0] = czero;
lYlm[0] = czero;

// Calculating Gradient now//
for (int l = 1; l <= Lmax; l++)
for (int l = 1; l <= lmax; l++)
{
//T fac = ((T) (2*l+1))/(2*l-1);
T fac = Factor2L[l];
T fac = factor2L[l];
for (int m = -l; m <= l; m++)
{
int lm = index(l - 1, 0);
Expand Down Expand Up @@ -390,9 +460,9 @@ inline void SoaSphericalTensor<T>::evaluateVGL(T x, T y, T z)
lm = index(l, m);
if (ma)
{
gYlmX[lm] = NormFactor[lm] * gx;
gYlmY[lm] = NormFactor[lm] * gy;
gYlmZ[lm] = NormFactor[lm] * gz;
gYlmX[lm] = normfactor[lm] * gx;
gYlmY[lm] = normfactor[lm] * gy;
gYlmZ[lm] = normfactor[lm] * gz;
}
else
{
Expand All @@ -402,10 +472,21 @@ inline void SoaSphericalTensor<T>::evaluateVGL(T x, T y, T z)
}
}
}
for (int i = 0; i < cYlm.size(); i++)
Ylm[i] *= NormFactor[i];
for (int i = 0; i < Nlm; i++)
{
Ylm[i] *= normfactor[i];
lYlm[i] = 0;
}
//for (int i=0; i<Ylm.size(); i++) gradYlm[i]*= NormFactor[i];
}
PRAGMA_OFFLOAD("omp end declare target")

template<typename T>
inline void SoaSphericalTensor<T>::evaluateVGL(T x, T y, T z)
{
evaluateVGL_impl(x, y, z, cYlm.data(), Lmax, FactorL.data(), FactorLM.data(), Factor2L.data(), NormFactor.data(),
cYlm.capacity());
}

template<typename T>
inline void SoaSphericalTensor<T>::evaluateVGH(T x, T y, T z)
Expand Down
10 changes: 8 additions & 2 deletions src/QMCWaveFunctions/BasisSetBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,15 @@ struct SoaBasisSetBase
//Evaluates value, gradient, and laplacian for electron "iat". Parks them into a temporary data structure "vgl".
virtual void evaluateVGL(const ParticleSet& P, int iat, vgl_type& vgl) = 0;
//Evaluates value, gradient, and laplacian for electron "iat". places them in a offload array for batched code.
virtual void mw_evaluateVGL(const RefVectorWithLeader<ParticleSet>& P_list, int iat, OffloadMWVGLArray& vgl) = 0;
virtual void mw_evaluateVGL(const RefVectorWithLeader<SoaBasisSetBase<T>>& basis_list,
const RefVectorWithLeader<ParticleSet>& P_list,
int iat,
OffloadMWVGLArray& vgl) = 0;
//Evaluates value for electron "iat". places it in a offload array for batched code.
virtual void mw_evaluateValue(const RefVectorWithLeader<ParticleSet>& P_list, int iat, OffloadMWVArray& v) = 0;
virtual void mw_evaluateValue(const RefVectorWithLeader<SoaBasisSetBase<T>>& basis_list,
const RefVectorWithLeader<ParticleSet>& P_list,
int iat,
OffloadMWVArray& v) = 0;
//Evaluates value for all the electrons of the virtual particles. places it in a offload array for batched code.
virtual void mw_evaluateValueVPs(const RefVectorWithLeader<SoaBasisSetBase<T>>& basis_list,
const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
Expand Down
8 changes: 6 additions & 2 deletions src/QMCWaveFunctions/LCAO/LCAOrbitalSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,9 @@ void LCAOrbitalSet::mw_evaluateVGLImplGEMM(const RefVectorWithLeader<SPOSet>& sp

{
ScopedTimer local(basis_timer_);
myBasisSet->mw_evaluateVGL(P_list, iat, basis_vgl_mw);
auto basis_list = spo_leader.extractBasisRefList(spo_list);
myBasisSet->mw_evaluateVGL(basis_list, P_list, iat, basis_vgl_mw);
basis_vgl_mw.updateFrom(); // TODO: remove this when gemm is implemented
}

if (Identity)
Expand Down Expand Up @@ -547,7 +549,9 @@ void LCAOrbitalSet::mw_evaluateValueImplGEMM(const RefVectorWithLeader<SPOSet>&
auto& basis_v_mw = spo_leader.mw_mem_handle_.getResource().basis_v_mw;
basis_v_mw.resize(nw, BasisSetSize);

myBasisSet->mw_evaluateValue(P_list, iat, basis_v_mw);
auto basis_list = spo_leader.extractBasisRefList(spo_list);
myBasisSet->mw_evaluateValue(basis_list, P_list, iat, basis_v_mw);
basis_v_mw.updateFrom(); // TODO: remove this when gemm is implemented

if (Identity)
{
Expand Down
Loading

0 comments on commit 8b7ef30

Please sign in to comment.