Merge pull request QMCPACK#4780 from kgasperich/offload-vp-ao

Offload AO evaluateV and VP related valueonly rourtines
amandadumi · Oct 27, 2023 · 4c38e3f · 4c38e3f
2 parents 896ac84 + 7d8696b
commit 4c38e3f
Show file tree

Hide file tree

Showing 15 changed files with 1,156 additions and 92 deletions.
diff --git a/src/Numerics/SoaCartesianTensor.h b/src/Numerics/SoaCartesianTensor.h
@@ -23,6 +23,8 @@
 
 #include <stdexcept>
 #include "OhmmsSoA/VectorSoaContainer.h"
+#include "OhmmsPETE/OhmmsArray.h"
+#include "OMPTarget/OffloadAlignedAllocators.hpp"
 
 namespace qmcplusplus
 {
@@ -37,13 +39,15 @@ namespace qmcplusplus
 template<class T>
 struct SoaCartesianTensor
 {
-  using value_type = T;
-  using ggg_type   = TinyVector<Tensor<T, 3>, 3>;
+  using value_type     = T;
+  using ggg_type       = TinyVector<Tensor<T, 3>, 3>;
+  using OffloadArray2D = Array<T, 2, OffloadPinnedAllocator<T>>;
+  using OffloadArray3D = Array<T, 3, OffloadPinnedAllocator<T>>;
 
   ///maximum angular momentum
-  size_t Lmax;
+  int Lmax;
   ///normalization factor
-  aligned_vector<T> NormFactor;
+  Vector<T, OffloadPinnedAllocator<T>> NormFactor;
   ///composite V,Gx,Gy,Gz,[L | H00, H01, H02, H11, H12, H12]
   //   {GH000, GH001, GH002, GH011, GH012, GH022, GH111, GH112, GH122, GH222
   VectorSoaContainer<T, 20> cXYZ;
@@ -56,27 +60,62 @@ struct SoaCartesianTensor
   explicit SoaCartesianTensor(const int l_max, bool addsign = false);
 
   ///compute Ylm
-  void evaluate_bare(T x, T y, T z, T* XYZ) const;
+  static void evaluate_bare(T x, T y, T z, T* XYZ, int lmax);
 
   ///compute Ylm
   inline void evaluateV(T x, T y, T z, T* XYZ) const
   {
-    evaluate_bare(x, y, z, XYZ);
+    evaluate_bare(x, y, z, XYZ, Lmax);
     for (size_t i = 0, nl = cXYZ.size(); i < nl; i++)
       XYZ[i] *= NormFactor[i];
   }
 
   ///compute Ylm
   inline void evaluateV(T x, T y, T z, T* XYZ)
   {
-    evaluate_bare(x, y, z, XYZ);
+    evaluate_bare(x, y, z, XYZ, Lmax);
     for (size_t i = 0, nl = cXYZ.size(); i < nl; i++)
       XYZ[i] *= NormFactor[i];
   }
 
   ///compute Ylm
   inline void evaluateV(T x, T y, T z) { evaluateV(x, y, z, cXYZ.data(0)); }
 
+  /**
+   * @brief evaluate for multiple electrons and multiple pbc images
+   * 
+   * @param [in] xyz electron positions [Nelec, Npbc, 3(x,y,z)]
+   * @param [out] XYZ Cartesian tensor elements [Nelec, Npbc, Nlm]
+  */
+  inline void batched_evaluateV(const OffloadArray3D& xyz, OffloadArray3D& XYZ) const
+  {
+    const size_t nElec = xyz.size(0);
+    const size_t Npbc  = xyz.size(1); // number of PBC images
+    assert(xyz.size(2) == 3);         // x,y,z
+
+    assert(XYZ.size(0) == nElec);
+    assert(XYZ.size(1) == Npbc);
+    const size_t Nlm = XYZ.size(2);
+
+    size_t nR = nElec * Npbc; // total number of positions to evaluate
+
+    auto* xyz_ptr        = xyz.data();
+    auto* XYZ_ptr        = XYZ.data();
+    auto* NormFactor_ptr = NormFactor.data();
+
+    // FIXME: remove "always" after fixing MW mem to only transfer once ahead of time
+    PRAGMA_OFFLOAD("omp target teams distribute parallel for \
+                    map(always, to:NormFactor_ptr[:Nlm]) \
+                    map(to:xyz_ptr[:3*nR], XYZ_ptr[:Nlm*nR])")
+    for (size_t ir = 0; ir < nR; ir++)
+    {
+      evaluate_bare(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], XYZ_ptr + (ir * Nlm), Lmax);
+      for (int i = 0; i < Nlm; i++)
+        XYZ_ptr[ir * Nlm + i] *= NormFactor_ptr[i];
+    }
+  }
+
+
   ///makes a table of \f$ r^l S_l^m \f$ and their gradients up to Lmax.
   void evaluateVGL(T x, T y, T z);
 
@@ -137,17 +176,19 @@ SoaCartesianTensor<T>::SoaCartesianTensor(const int l_max, bool addsign) : Lmax(
           NormL);
     }
   }
+  NormFactor.updateTo();
 }
 
 
+PRAGMA_OFFLOAD("omp declare target")
 template<class T>
-void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ) const
+void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ, int lmax)
 {
   const T x2 = x * x, y2 = y * y, z2 = z * z;
   const T x3 = x2 * x, y3 = y2 * y, z3 = z2 * z;
   const T x4 = x3 * x, y4 = y3 * y, z4 = z3 * z;
   const T x5 = x4 * x, y5 = y4 * y, z5 = z4 * z;
-  switch (Lmax)
+  switch (lmax)
   {
   case 6:
     XYZ[83] = x2 * y2 * z2; // X2Y2Z2
@@ -242,6 +283,7 @@ void SoaCartesianTensor<T>::evaluate_bare(T x, T y, T z, T* restrict XYZ) const
     XYZ[0] = 1; // S
   }
 }
+PRAGMA_OFFLOAD("omp end declare target")
 
 
 template<class T>

diff --git a/src/Numerics/SoaSphericalTensor.h b/src/Numerics/SoaSphericalTensor.h
@@ -17,6 +17,8 @@
 #include <limits>
 #include "OhmmsSoA/VectorSoaContainer.h"
 #include "OhmmsPETE/Tensor.h"
+#include "OhmmsPETE/OhmmsArray.h"
+#include "OMPTarget/OffloadAlignedAllocators.hpp"
 
 namespace qmcplusplus
 {
@@ -37,16 +39,18 @@ namespace qmcplusplus
 template<typename T>
 struct SoaSphericalTensor
 {
+  using OffloadArray2D = Array<T, 2, OffloadPinnedAllocator<T>>;
+  using OffloadArray3D = Array<T, 3, OffloadPinnedAllocator<T>>;
   ///maximum angular momentum for the center
   int Lmax;
   /// Normalization factors
-  aligned_vector<T> NormFactor;
+  Vector<T, OffloadPinnedAllocator<T>> NormFactor;
   ///pre-evaluated factor \f$1/\sqrt{(l+m)\times(l+1-m)}\f$
-  aligned_vector<T> FactorLM;
+  Vector<T, OffloadPinnedAllocator<T>> FactorLM;
   ///pre-evaluated factor \f$\sqrt{(2l+1)/(4\pi)}\f$
-  aligned_vector<T> FactorL;
+  Vector<T, OffloadPinnedAllocator<T>> FactorL;
   ///pre-evaluated factor \f$(2l+1)/(2l-1)\f$
-  aligned_vector<T> Factor2L;
+  Vector<T, OffloadPinnedAllocator<T>> Factor2L;
   ///composite
   VectorSoaContainer<T, 5> cYlm;
 
@@ -55,21 +59,58 @@ struct SoaSphericalTensor
   SoaSphericalTensor(const SoaSphericalTensor& rhs) = default;
 
   ///compute Ylm
-  void evaluate_bare(T x, T y, T z, T* Ylm) const;
+  static void evaluate_bare(T x, T y, T z, T* Ylm, int lmax, const T* factorL, const T* factorLM);
 
   ///compute Ylm
   inline void evaluateV(T x, T y, T z, T* Ylm) const
   {
-    evaluate_bare(x, y, z, Ylm);
+    evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());
     for (int i = 0, nl = cYlm.size(); i < nl; i++)
       Ylm[i] *= NormFactor[i];
   }
 
+  /**
+   * @brief evaluate V for multiple electrons and multiple pbc images
+   * 
+   * @param [in] xyz electron positions [Nelec, Npbc, 3(x,y,z)]
+   * @param [out] Ylm Spherical tensor elements [Nelec, Npbc, Nlm]
+  */
+  inline void batched_evaluateV(const OffloadArray3D& xyz, OffloadArray3D& Ylm) const
+  {
+    const size_t nElec = xyz.size(0);
+    const size_t Npbc  = xyz.size(1); // number of PBC images
+    assert(xyz.size(2) == 3);
+
+    assert(Ylm.size(0) == nElec);
+    assert(Ylm.size(1) == Npbc);
+    const size_t Nlm = Ylm.size(2);
+
+    size_t nR = nElec * Npbc; // total number of positions to evaluate
+
+    auto* xyz_ptr        = xyz.data();
+    auto* Ylm_ptr        = Ylm.data();
+    auto* FactorLM_ptr   = FactorLM.data();
+    auto* FactorL_ptr    = FactorL.data();
+    auto* NormFactor_ptr = NormFactor.data();
+
+    // FIXME: remove "always" after fixing MW mem to only transfer once ahead of time
+    PRAGMA_OFFLOAD("omp target teams distribute parallel for \
+                    map(always, to:FactorLM_ptr[:Nlm], FactorL_ptr[:Lmax+1], NormFactor_ptr[:Nlm]) \
+                    map(to: xyz_ptr[:3*nR], Ylm_ptr[:Nlm*nR])")
+    for (size_t ir = 0; ir < nR; ir++)
+    {
+      evaluate_bare(xyz_ptr[0 + 3 * ir], xyz_ptr[1 + 3 * ir], xyz_ptr[2 + 3 * ir], Ylm_ptr + (ir * Nlm), Lmax,
+                    FactorL_ptr, FactorLM_ptr);
+      for (int i = 0; i < Nlm; i++)
+        Ylm_ptr[ir * Nlm + i] *= NormFactor_ptr[i];
+    }
+  }
+
   ///compute Ylm
   inline void evaluateV(T x, T y, T z)
   {
     T* restrict Ylm = cYlm.data(0);
-    evaluate_bare(x, y, z, Ylm);
+    evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());
     for (int i = 0, nl = cYlm.size(); i < nl; i++)
       Ylm[i] *= NormFactor[i];
   }
@@ -84,7 +125,7 @@ struct SoaSphericalTensor
   void evaluateVGHGH(T x, T y, T z);
 
   ///returns the index/locator for (\f$l,m\f$) combo, \f$ l(l+1)+m \f$
-  inline int index(int l, int m) const { return (l * (l + 1)) + m; }
+  static inline int index(int l, int m) { return (l * (l + 1)) + m; }
 
   /** return the starting address of the component
    *
@@ -167,10 +208,21 @@ inline SoaSphericalTensor<T>::SoaSphericalTensor(const int l_max, bool addsign)
       FactorLM[index(l, m)]  = fac2;
       FactorLM[index(l, -m)] = fac2;
     }
+  NormFactor.updateTo();
+  FactorLM.updateTo();
+  FactorL.updateTo();
+  Factor2L.updateTo();
 }
 
+PRAGMA_OFFLOAD("omp declare target")
 template<typename T>
-inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm) const
+inline void SoaSphericalTensor<T>::evaluate_bare(T x,
+                                                 T y,
+                                                 T z,
+                                                 T* restrict Ylm,
+                                                 int lmax,
+                                                 const T* factorL,
+                                                 const T* factorLM)
 {
   constexpr T czero(0);
   constexpr T cone(1);
@@ -210,7 +262,7 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
   // calculate P_ll and P_l,l-1
   T fac = cone;
   int j = -1;
-  for (int l = 1; l <= Lmax; l++)
+  for (int l = 1; l <= lmax; l++)
   {
     j += 2;
     fac *= -j * stheta;
@@ -221,10 +273,10 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
     Ylm[l1] = j * ctheta * Ylm[l2];
   }
   // Use recurence to get other plm's //
-  for (int m = 0; m < Lmax - 1; m++)
+  for (int m = 0; m < lmax - 1; m++)
   {
     int j = 2 * m + 1;
-    for (int l = m + 2; l <= Lmax; l++)
+    for (int l = m + 2; l <= lmax; l++)
     {
       j += 2;
       int lm  = index(l, m);
@@ -237,12 +289,12 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
   T sphim, cphim, temp;
   Ylm[0] = omega; //1.0/sqrt(pi4);
   T rpow = 1.0;
-  for (int l = 1; l <= Lmax; l++)
+  for (int l = 1; l <= lmax; l++)
   {
     rpow *= r;
     //fac = rpow*sqrt(static_cast<T>(2*l+1))*omega;//rpow*sqrt((2*l+1)/pi4);
-    //FactorL[l] = sqrt(2*l+1)/sqrt(4*pi)
-    fac    = rpow * FactorL[l];
+    //factorL[l] = sqrt(2*l+1)/sqrt(4*pi)
+    fac    = rpow * factorL[l];
     int l0 = index(l, 0);
     Ylm[l0] *= fac;
     cphim = cone;
@@ -253,7 +305,7 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
       sphim  = sphim * cphi + cphim * sphi;
       cphim  = temp;
       int lm = index(l, m);
-      fac *= FactorLM[lm];
+      fac *= factorLM[lm];
       temp    = fac * Ylm[lm];
       Ylm[lm] = temp * cphim;
       lm      = index(l, -m);
@@ -263,12 +315,13 @@ inline void SoaSphericalTensor<T>::evaluate_bare(T x, T y, T z, T* restrict Ylm)
   //for (int i=0; i<Ylm.size(); i++)
   //  Ylm[i]*= NormFactor[i];
 }
+PRAGMA_OFFLOAD("omp end declare target")
 
 template<typename T>
 inline void SoaSphericalTensor<T>::evaluateVGL(T x, T y, T z)
 {
   T* restrict Ylm = cYlm.data(0);
-  evaluate_bare(x, y, z, Ylm);
+  evaluate_bare(x, y, z, Ylm, Lmax, FactorL.data(), FactorLM.data());
 
   constexpr T czero(0);
   constexpr T ahalf(0.5);

diff --git a/src/Particle/VirtualParticleSet.cpp b/src/Particle/VirtualParticleSet.cpp
@@ -97,6 +97,34 @@ void VirtualParticleSet::releaseResource(ResourceCollection& collection,
   ParticleSet::releaseResource(collection, p_list);
 }
 
+
+const RefVectorWithLeader<const DistanceTableAB> VirtualParticleSet::extractDTRefList(
+    const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
+    int id)
+{
+  RefVectorWithLeader<const DistanceTableAB> dt_list(vp_list.getLeader().getDistTableAB(id));
+  dt_list.reserve(vp_list.size());
+  for (const VirtualParticleSet& vp : vp_list)
+  {
+    const auto& d_table = vp.getDistTableAB(id);
+    dt_list.push_back(d_table);
+  }
+  return dt_list;
+}
+
+
+const std::vector<QMCTraits::PosType> VirtualParticleSet::extractVPCoords(
+    const RefVectorWithLeader<const VirtualParticleSet>& vp_list)
+{
+  std::vector<QMCTraits::PosType> coords_list;
+  for (const VirtualParticleSet& vp : vp_list)
+    for (int iat = 0; iat < vp.getTotalNum(); iat++)
+      coords_list.push_back(vp.R[iat]);
+
+  return coords_list;
+}
+
+
 /// move virtual particles to new postions and update distance tables
 void VirtualParticleSet::makeMoves(const ParticleSet& refp,
                                    int jel,

diff --git a/src/Particle/VirtualParticleSet.h b/src/Particle/VirtualParticleSet.h
@@ -95,6 +95,16 @@ class VirtualParticleSet : public ParticleSet
                  bool sphere = false,
                  int iat     = -1);
 
+  inline size_t getTotalNum() const { return TotalNum; }
+  /**Extract list of Distance Tables
+    */
+  static const RefVectorWithLeader<const DistanceTableAB> extractDTRefList(
+      const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
+      int id);
+  /**Extract list of VP coordinates, flattened over all walkers
+    */
+  static const std::vector<QMCTraits::PosType> extractVPCoords(
+      const RefVectorWithLeader<const VirtualParticleSet>& vp_list);
   /** move virtual particles to new postions and update distance tables
      * @param refp reference particle set
      * @param jel reference particle that all the VP moves from

diff --git a/src/QMCWaveFunctions/BasisSetBase.h b/src/QMCWaveFunctions/BasisSetBase.h
@@ -155,7 +155,8 @@ struct SoaBasisSetBase
   //Evaluates value for electron "iat".  places it in a offload array for batched code.
   virtual void mw_evaluateValue(const RefVectorWithLeader<ParticleSet>& P_list, int iat, OffloadMWVArray& v) = 0;
   //Evaluates value for all the electrons of the virtual particles. places it in a offload array for batched code.
-  virtual void mw_evaluateValueVPs(const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
+  virtual void mw_evaluateValueVPs(const RefVectorWithLeader<SoaBasisSetBase<T>>& basis_list,
+                                   const RefVectorWithLeader<const VirtualParticleSet>& vp_list,
                                    OffloadMWVArray& v) = 0;
   //Evaluates value, gradient, and Hessian for electron "iat".  Parks them into a temporary data structure "vgh".
   virtual void evaluateVGH(const ParticleSet& P, int iat, vgh_type& vgh) = 0;