Skip to content

Commit

Permalink
Merge pull request QMCPACK#4780 from kgasperich/offload-vp-ao
Browse files Browse the repository at this point in the history
Offload AO evaluateV and VP related valueonly rourtines
  • Loading branch information
prckent committed Oct 27, 2023
2 parents 896ac84 + 7d8696b commit 4c38e3f
Show file tree
Hide file tree
Showing 15 changed files with 1,156 additions and 92 deletions.
60 changes: 51 additions & 9 deletions src/Numerics/SoaCartesianTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

#include <stdexcept>
#include "OhmmsSoA/VectorSoaContainer.h"
#include "OhmmsPETE/OhmmsArray.h"
#include "OMPTarget/OffloadAlignedAllocators.hpp"

namespace qmcplusplus
{
Expand All @@ -37,13 +39,15 @@ namespace qmcplusplus
template<class T>
struct SoaCartesianTensor
{
using value_type = T;
using ggg_type = TinyVector<Tensor<T, 3>, 3>;
using value_type = T;
using ggg_type = TinyVector<Tensor<T, 3>, 3>;
using OffloadArray2D = Array<T, 2, OffloadPinnedAllocator<T>>;
using OffloadArray3D = Array<T, 3, OffloadPinnedAllocator<T>>;

///maximum angular momentum
size_t Lmax;
int Lmax;
///normalization factor
aligned_vector<T> NormFactor;
Vector<T, OffloadPinnedAllocator<T>> NormFactor;
///composite V,Gx,Gy,Gz,[L | H00, H01, H02, H11, H12, H12]
// {GH000, GH001, GH002, GH011, GH012, GH022, GH111, GH112, GH122, GH222
VectorSoaContainer<T, 20> cXYZ;
Expand All @@ -56,27 +60,62 @@ struct SoaCartesianTensor
explicit SoaCartesianTensor(const int l_max, bool addsign = false);

///compute Ylm
void evaluate_bare(T x, T y, T z, T* XYZ) const;
static void evaluate_bare(T x, T y, T z, T* XYZ, int lmax);

///compute Ylm
inline void evaluateV(T x, T y, T z, T* XYZ) const
{
evaluate_bare(x, y, z, XYZ);
evaluate_bare(x, y, z, XYZ, Lmax);
for (size_t i = 0, nl = cXYZ.size(); i < nl; i++)
XYZ[i] *= NormFactor[i];
}

///compute Ylm
inline void evaluateV(T x, T y, T z, T* XYZ)
{
evaluate_bare(x, y, z, XYZ);
evaluate_bare(x, y, z, XYZ, Lmax);
for (size_t i = 0, nl = cXYZ.size(); i < nl; i++)
XYZ[i] *= NormFactor[i];
}

///compute Ylm
inline void evaluateV(T x, T y, T z) { evaluateV(x, y, z, cXYZ.data(0)); }

/**
* @brief evaluate for multiple electrons and multiple pbc images
*
* @param [in] xyz electron positions [Nelec, Npbc, 3(x,y,z)]
* @param [out] XYZ Cartesian tensor elements [Nelec, Npbc, Nlm]
*/
inline void batched_evaluateV(const OffloadArray3D& xyz, OffloadArray3D& XYZ) const
{
const size_t nElec = xyz.size(0);
const size_t Npbc = xyz.size(1); // number of PBC images
assert(xyz.size(2) == 3); // x,y,z

assert(XYZ.size(0) == nElec);
assert(XYZ.size(1) == Npbc);
const size_t Nlm = XYZ.size(2);

size_t nR = nElec * Npbc; // total number of positions to evaluate

auto* xyz_ptr = xyz.data();
auto* XYZ_ptr = XYZ.data();
auto* NormFactor_ptr = NormFactor.data();

// FIXME: remove "always" after fixing MW mem to only transfer once ahead of time
PRAGMA_OFFLOAD("omp target teams distribute parallel for \
map(always, to:NormFactor_ptr[:Nlm]) \
map(to:xyz_ptr[:3*nR], XYZ_ptr[:Nlm*nR])")
for (size_t ir = 0; ir < nR; ir++)
{
evaluate_bare(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], XYZ_ptr + (ir * Nlm), Lmax);
for (int i = 0; i < Nlm; i++)
XYZ_ptr[ir * Nlm + i] *= NormFactor_ptr[i];
}
}


///makes a table of \f$ r^l S_l^m \f$ and their gradients up to Lmax.
void evaluateVGL(T x, T y, T z);

Expand Down Expand Up @@ -137,17 +176,19 @@ SoaCartesianTensor<T>::SoaCartesianTensor(const int l_max, bool addsign) : Lmax(
NormL);
}
}
NormFactor.updateTo();
}


PRAGMA_OFFLOAD("omp declare target")
template<class T>
void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ) const
void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ, int lmax)
{
const T x2 = x * x, y2 = y * y, z2 = z * z;
const T x3 = x2 * x, y3 = y2 * y, z3 = z2 * z;
const T x4 = x3 * x, y4 = y3 * y, z4 = z3 * z;
const T x5 = x4 * x, y5 = y4 * y, z5 = z4 * z;
switch (Lmax)
switch (lmax)
{
case 6:
XYZ[83] = x2 * y2 * z2; // X2Y2Z2
Expand Down Expand Up @@ -242,6 +283,7 @@ void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ) const
XYZ[0] = 1; // S
}
}
PRAGMA_OFFLOAD("omp end declare target")


template<class T>
Expand Down
87 changes: 70 additions & 17 deletions src/Numerics/SoaSphericalTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <limits>
#include "OhmmsSoA/VectorSoaContainer.h"
#include "OhmmsPETE/Tensor.h"
#include "OhmmsPETE/OhmmsArray.h"
#include "OMPTarget/OffloadAlignedAllocators.hpp"

namespace qmcplusplus
{
Expand All @@ -37,16 +39,18 @@ namespace qmcplusplus
template<typename T>
struct SoaSphericalTensor
{
using OffloadArray2D = Array<T, 2, OffloadPinnedAllocator<T>>;
using OffloadArray3D = Array<T, 3, OffloadPinnedAllocator<T>>;
///maximum angular momentum for the center
int Lmax;
/// Normalization factors
aligned_vector<T> NormFactor;
Vector<T, OffloadPinnedAllocator<T>> NormFactor;
///pre-evaluated factor \f$1/\sqrt{(l+m)\times(l+1-m)}\f$
aligned_vector<T> FactorLM;
Vector<T, OffloadPinnedAllocator<T>> FactorLM;
///pre-evaluated factor \f$\sqrt{(2l+1)/(4\pi)}\f$
aligned_vector<T> FactorL;
Vector<T, OffloadPinnedAllocator<T>> FactorL;
///pre-evaluated factor \f$(2l+1)/(2l-1)\f$
aligned_vector<T> Factor2L;
Vector<T, OffloadPinnedAllocator<T>> Factor2L;
///composite
VectorSoaContainer<T, 5> cYlm;

Expand All @@ -55,21 +59,58 @@ struct SoaSphericalTensor
SoaSphericalTensor(const SoaSphericalTensor& rhs) = default;

///compute Ylm
void evaluate_bare(T x, T y, T z, T* Ylm) const;
static void evaluate_bare(T x, T y, T z, T* Ylm, int lmax, const T* factorL, const T* factorLM);

///compute Ylm
inline void evaluateV(T x, T y, T z, T* Ylm) const
{
evaluate_bare(x, y, z, Ylm);
evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());
for (int i = 0, nl = cYlm.size(); i < nl; i++)
Ylm[i] *= NormFactor[i];
}

/**
* @brief evaluate V for multiple electrons and multiple pbc images
*
* @param [in] xyz electron positions [Nelec, Npbc, 3(x,y,z)]
* @param [out] Ylm Spherical tensor elements [Nelec, Npbc, Nlm]
*/
inline void batched_evaluateV(const OffloadArray3D& xyz, OffloadArray3D& Ylm) const
{
const size_t nElec = xyz.size(0);
const size_t Npbc = xyz.size(1); // number of PBC images
assert(xyz.size(2) == 3);

assert(Ylm.size(0) == nElec);
assert(Ylm.size(1) == Npbc);
const size_t Nlm = Ylm.size(2);

size_t nR = nElec * Npbc; // total number of positions to evaluate

auto* xyz_ptr = xyz.data();
auto* Ylm_ptr = Ylm.data();
auto* FactorLM_ptr = FactorLM.data();
auto* FactorL_ptr = FactorL.data();
auto* NormFactor_ptr = NormFactor.data();

// FIXME: remove "always" after fixing MW mem to only transfer once ahead of time
PRAGMA_OFFLOAD("omp target teams distribute parallel for \
map(always, to:FactorLM_ptr[:Nlm], FactorL_ptr[:Lmax+1], NormFactor_ptr[:Nlm]) \
map(to: xyz_ptr[:3*nR], Ylm_ptr[:Nlm*nR])")
for (size_t ir = 0; ir < nR; ir++)
{
evaluate_bare(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], Ylm_ptr + (ir * Nlm), Lmax,
FactorL_ptr, FactorLM_ptr);
for (int i = 0; i < Nlm; i++)
Ylm_ptr[ir * Nlm + i] *= NormFactor_ptr[i];
}
}

///compute Ylm
inline void evaluateV(T x, T y, T z)
{
T* restrict Ylm = cYlm.data(0);
evaluate_bare(x, y, z, Ylm);
evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());
for (int i = 0, nl = cYlm.size(); i < nl; i++)
Ylm[i] *= NormFactor[i];
}
Expand All @@ -84,7 +125,7 @@ struct SoaSphericalTensor
void evaluateVGHGH(T x, T y, T z);

///returns the index/locator for (\f$l,m\f$) combo, \f$ l(l+1)+m \f$
inline int index(int l, int m) const { return (l * (l + 1)) + m; }
static inline int index(int l, int m) { return (l * (l + 1)) + m; }

/** return the starting address of the component
*
Expand Down Expand Up @@ -167,10 +208,21 @@ inline SoaSphericalTensor<T>::SoaSphericalTensor(const int l_max, bool addsign)
FactorLM[index(l, m)] = fac2;
FactorLM[index(l, -m)] = fac2;
}
NormFactor.updateTo();
FactorLM.updateTo();
FactorL.updateTo();
Factor2L.updateTo();
}

PRAGMA_OFFLOAD("omp declare target")
template<typename T>
inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm) const
inline void SoaSphericalTensor<T>::evaluate_bare(T x,
T y,
T z,
T* restrict Ylm,
int lmax,
const T* factorL,
const T* factorLM)
{
constexpr T czero(0);
constexpr T cone(1);
Expand Down Expand Up @@ -210,7 +262,7 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
// calculate P_ll and P_l,l-1
T fac = cone;
int j = -1;
for (int l = 1; l <= Lmax; l++)
for (int l = 1; l <= lmax; l++)
{
j += 2;
fac *= -j * stheta;
Expand All @@ -221,10 +273,10 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
Ylm[l1] = j * ctheta * Ylm[l2];
}
// Use recurence to get other plm's //
for (int m = 0; m < Lmax - 1; m++)
for (int m = 0; m < lmax - 1; m++)
{
int j = 2 * m + 1;
for (int l = m + 2; l <= Lmax; l++)
for (int l = m + 2; l <= lmax; l++)
{
j += 2;
int lm = index(l, m);
Expand All @@ -237,12 +289,12 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
T sphim, cphim, temp;
Ylm[0] = omega; //1.0/sqrt(pi4);
T rpow = 1.0;
for (int l = 1; l <= Lmax; l++)
for (int l = 1; l <= lmax; l++)
{
rpow *= r;
//fac = rpow*sqrt(static_cast<T>(2*l+1))*omega;//rpow*sqrt((2*l+1)/pi4);
//FactorL[l] = sqrt(2*l+1)/sqrt(4*pi)
fac = rpow * FactorL[l];
//factorL[l] = sqrt(2*l+1)/sqrt(4*pi)
fac = rpow * factorL[l];
int l0 = index(l, 0);
Ylm[l0] *= fac;
cphim = cone;
Expand All @@ -253,7 +305,7 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
sphim = sphim * cphi + cphim * sphi;
cphim = temp;
int lm = index(l, m);
fac *= FactorLM[lm];
fac *= factorLM[lm];
temp = fac * Ylm[lm];
Ylm[lm] = temp * cphim;
lm = index(l, -m);
Expand All @@ -263,12 +315,13 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
//for (int i=0; i<Ylm.size(); i++)
// Ylm[i]*= NormFactor[i];
}
PRAGMA_OFFLOAD("omp end declare target")

template<typename T>
inline void SoaSphericalTensor<T>::evaluateVGL(T x, T y, T z)
{
T* restrict Ylm = cYlm.data(0);
evaluate_bare(x, y, z, Ylm);
evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());

constexpr T czero(0);
constexpr T ahalf(0.5);
Expand Down
28 changes: 28 additions & 0 deletions src/Particle/VirtualParticleSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,34 @@ void VirtualParticleSet::releaseResource(ResourceCollection& collection,
ParticleSet::releaseResource(collection, p_list);
}


const RefVectorWithLeader<const DistanceTableAB> VirtualParticleSet::extractDTRefList(
const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
int id)
{
RefVectorWithLeader<const DistanceTableAB> dt_list(vp_list.getLeader().getDistTableAB(id));
dt_list.reserve(vp_list.size());
for (const VirtualParticleSet& vp : vp_list)
{
const auto& d_table = vp.getDistTableAB(id);
dt_list.push_back(d_table);
}
return dt_list;
}


const std::vector<QMCTraits::PosType> VirtualParticleSet::extractVPCoords(
const RefVectorWithLeader<const VirtualParticleSet>& vp_list)
{
std::vector<QMCTraits::PosType> coords_list;
for (const VirtualParticleSet& vp : vp_list)
for (int iat = 0; iat < vp.getTotalNum(); iat++)
coords_list.push_back(vp.R[iat]);

return coords_list;
}


/// move virtual particles to new postions and update distance tables
void VirtualParticleSet::makeMoves(const ParticleSet& refp,
int jel,
Expand Down
10 changes: 10 additions & 0 deletions src/Particle/VirtualParticleSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ class VirtualParticleSet : public ParticleSet
bool sphere = false,
int iat = -1);

inline size_t getTotalNum() const { return TotalNum; }
/**Extract list of Distance Tables
*/
static const RefVectorWithLeader<const DistanceTableAB> extractDTRefList(
const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
int id);
/**Extract list of VP coordinates, flattened over all walkers
*/
static const std::vector<QMCTraits::PosType> extractVPCoords(
const RefVectorWithLeader<const VirtualParticleSet>& vp_list);
/** move virtual particles to new postions and update distance tables
* @param refp reference particle set
* @param jel reference particle that all the VP moves from
Expand Down
3 changes: 2 additions & 1 deletion src/QMCWaveFunctions/BasisSetBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ struct SoaBasisSetBase
//Evaluates value for electron "iat". places it in a offload array for batched code.
virtual void mw_evaluateValue(const RefVectorWithLeader<ParticleSet>& P_list, int iat, OffloadMWVArray& v) = 0;
//Evaluates value for all the electrons of the virtual particles. places it in a offload array for batched code.
virtual void mw_evaluateValueVPs(const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
virtual void mw_evaluateValueVPs(const RefVectorWithLeader<SoaBasisSetBase<T>>& basis_list,
const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
OffloadMWVArray& v) = 0;
//Evaluates value, gradient, and Hessian for electron "iat". Parks them into a temporary data structure "vgh".
virtual void evaluateVGH(const ParticleSet& P, int iat, vgh_type& vgh) = 0;
Expand Down
Loading

0 comments on commit 4c38e3f

Please sign in to comment.