diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H index c52407ee4c8..a8002f2c211 100644 --- a/Source/Particles/Deposition/CurrentDeposition.H +++ b/Source/Particles/Deposition/CurrentDeposition.H @@ -461,8 +461,7 @@ void doDepositionSharedShapeN (const GetParticlePosition& GetPosition, sample_tbox_y.grow(depos_order); sample_tbox_z.grow(depos_order); - //Note that the number of points in any of the sample boxes is the same for all - const auto npts = sample_tbox_x.numPts()+sample_tbox_y.numPts()+sample_tbox_z.numPts(); + const auto npts = amrex::max(sample_tbox_x.numPts(), sample_tbox_y.numPts(), sample_tbox_z.numPts()); //needed for gpu kernel const int nblocks = a_bins.numBins(); @@ -544,18 +543,18 @@ void doDepositionSharedShapeN (const GetParticlePosition& GetPosition, //mem amrex::Array4 const jx_buff(shared, amrex::begin(tbox_x), amrex::end(tbox_x), 1); - amrex::Array4 const jy_buff(shared + tbox_x.numPts(), + amrex::Array4 const jy_buff(shared, amrex::begin(tbox_y), amrex::end(tbox_y), 1); - amrex::Array4 const jz_buff(shared + tbox_x.numPts() + tbox_y.numPts(), + amrex::Array4 const jz_buff(shared, amrex::begin(tbox_z), amrex::end(tbox_z), 1); // Zero-initialize the temporary array in shared memory volatile amrex::Real* vs = shared; - for (int i = threadIdx.x; i < tbox_x.numPts()+tbox_y.numPts()+tbox_z.numPts(); i += blockDim.x){ + for (int i = threadIdx.x; i < npts; i += blockDim.x){ vs[i] = 0.0; } __syncthreads(); #else - amrex::Array4 const& jx_buff = jx_arr; + amrex::Array4 const& jx_buff = jx_arr; amrex::Array4 const& jy_buff = jy_arr; amrex::Array4 const& jz_buff = jz_arr; #endif @@ -565,9 +564,9 @@ void doDepositionSharedShapeN (const GetParticlePosition& GetPosition, for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig 0._rt) { + costheta = xpmid/rpmid; + sintheta = ypmid/rpmid; + } else { + costheta = 1._rt; + sintheta = 0._rt; + } + const Complex xy0 = Complex{costheta, sintheta}; + const amrex::Real wqx = wq*invvol*(+vx*costheta + vy*sintheta); + const amrex::Real wqy = wq*invvol*(-vx*sintheta + vy*costheta); +#else + const amrex::Real wqx = wq*invvol*vx; + const amrex::Real wqy = wq*invvol*vy; +#endif + const amrex::Real wqz = wq*invvol*vz; + + // --- Compute shape factors + Compute_shape_factor< depos_order > const compute_shape_factor; +#if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) || defined(WARPX_DIM_3D) + + // x direction + // Get particle position after 1/2 push back in position +#if defined(WARPX_DIM_RZ) + // Keep these double to avoid bug in single precision + const double xmid = (rpmid - xmin)*dxi; +#else + const double xmid = ((xp - xmin) + relative_time*vx)*dxi; +#endif + // j_j[xyz] leftmost grid point in x that the particle touches for the centering of each current + // sx_j[xyz] shape factor along x for the centering of each current + // There are only two possible centerings, node or cell centered, so at most only two shape factor + // arrays will be needed. + // Keep these double to avoid bug in single precision + double sx_node[depos_order + 1] = {0.}; + double sx_cell[depos_order + 1] = {0.}; + int j_node = 0; + int j_cell = 0; + if (jx_type[0] == NODE || jy_type[0] == NODE || jz_type[0] == NODE) { + j_node = compute_shape_factor(sx_node, xmid); + } + if (jx_type[0] == CELL || jy_type[0] == CELL || jz_type[0] == CELL) { + j_cell = compute_shape_factor(sx_cell, xmid - 0.5); + } + + amrex::Real sx_jx[depos_order + 1] = {0._rt}; + amrex::Real sx_jy[depos_order + 1] = {0._rt}; + amrex::Real sx_jz[depos_order + 1] = {0._rt}; + for (int ix=0; ix<=depos_order; ix++) + { + sx_jx[ix] = ((jx_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix])); + sx_jy[ix] = ((jy_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix])); + sx_jz[ix] = ((jz_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix])); + } + + int const j_jx = ((jx_type[0] == NODE) ? j_node : j_cell); + int const j_jy = ((jy_type[0] == NODE) ? j_node : j_cell); + int const j_jz = ((jz_type[0] == NODE) ? j_node : j_cell); +#endif //AMREX_SPACEDIM >= 2 + +#if defined(WARPX_DIM_3D) + // y direction + // Keep these double to avoid bug in single precision + const double ymid = ((yp - ymin) + relative_time*vy)*dyi; + double sy_node[depos_order + 1] = {0.}; + double sy_cell[depos_order + 1] = {0.}; + int k_node = 0; + int k_cell = 0; + if (jx_type[1] == NODE || jy_type[1] == NODE || jz_type[1] == NODE) { + k_node = compute_shape_factor(sy_node, ymid); + } + if (jx_type[1] == CELL || jy_type[1] == CELL || jz_type[1] == CELL) { + k_cell = compute_shape_factor(sy_cell, ymid - 0.5); + } + amrex::Real sy_jx[depos_order + 1] = {0._rt}; + amrex::Real sy_jy[depos_order + 1] = {0._rt}; + amrex::Real sy_jz[depos_order + 1] = {0._rt}; + for (int iy=0; iy<=depos_order; iy++) + { + sy_jx[iy] = ((jx_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy])); + sy_jy[iy] = ((jy_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy])); + sy_jz[iy] = ((jz_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy])); + } + int const k_jx = ((jx_type[1] == NODE) ? k_node : k_cell); + int const k_jy = ((jy_type[1] == NODE) ? k_node : k_cell); + int const k_jz = ((jz_type[1] == NODE) ? k_node : k_cell); +#endif + + // z direction + // Keep these double to avoid bug in single precision + const double zmid = ((zp - zmin) + relative_time*vz)*dzi; + double sz_node[depos_order + 1] = {0.}; + double sz_cell[depos_order + 1] = {0.}; + int l_node = 0; + int l_cell = 0; + if (jx_type[zdir] == NODE || jy_type[zdir] == NODE || jz_type[zdir] == NODE) { + l_node = compute_shape_factor(sz_node, zmid); + } + if (jx_type[zdir] == CELL || jy_type[zdir] == CELL || jz_type[zdir] == CELL) { + l_cell = compute_shape_factor(sz_cell, zmid - 0.5); + } + amrex::Real sz_jx[depos_order + 1] = {0._rt}; + amrex::Real sz_jy[depos_order + 1] = {0._rt}; + amrex::Real sz_jz[depos_order + 1] = {0._rt}; + for (int iz=0; iz<=depos_order; iz++) + { + sz_jx[iz] = ((jx_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz])); + sz_jy[iz] = ((jy_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz])); + sz_jz[iz] = ((jz_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz])); + } + int const l_jx = ((jx_type[zdir] == NODE) ? l_node : l_cell); + int const l_jy = ((jy_type[zdir] == NODE) ? l_node : l_cell); + int const l_jz = ((jz_type[zdir] == NODE) ? l_node : l_cell); + + // Deposit current into jx_arr, jy_arr and jz_arr +#if defined(WARPX_DIM_1D_Z) + for (int iz=0; iz<=depos_order; iz++){ + amrex::Gpu::Atomic::AddNoRet( + &jy_buff(lo.x+l_jy+iz, 0, 0, 0), + sz_jy[iz]*wqy); + } +#endif +#if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) + for (int iz=0; iz<=depos_order; iz++){ + for (int ix=0; ix<=depos_order; ix++){ + amrex::Gpu::Atomic::AddNoRet( + &jy_buff(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 0), + sx_jy[ix]*sz_jy[iz]*wqy); +#if defined(WARPX_DIM_RZ) + Complex xy = xy0; // Note that xy is equal to e^{i m theta} + for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) { + // The factor 2 on the weighting comes from the normalization of the modes + amrex::Gpu::Atomic::AddNoRet( &jy_buff(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode-1), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.real()); + amrex::Gpu::Atomic::AddNoRet( &jy_buff(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode ), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.imag()); + xy = xy*xy0; + } +#endif + } + } +#elif defined(WARPX_DIM_3D) + for (int iz=0; iz<=depos_order; iz++){ + for (int iy=0; iy<=depos_order; iy++){ + for (int ix=0; ix<=depos_order; ix++){ amrex::Gpu::Atomic::AddNoRet( &jy_buff(lo.x+j_jy+ix, lo.y+k_jy+iy, lo.z+l_jy+iz), sx_jy[ix]*sy_jy[iy]*sz_jy[iz]*wqy); + } + } + } +#endif + } + __syncthreads(); + addLocalToGlobal(tbox_y, jy_arr, jy_buff); + for (int i = threadIdx.x; i < npts; i += blockDim.x){ + vs[i] = 0.0; + } + __syncthreads(); + +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig 0._rt) { + costheta = xpmid/rpmid; + sintheta = ypmid/rpmid; + } else { + costheta = 1._rt; + sintheta = 0._rt; + } + const Complex xy0 = Complex{costheta, sintheta}; + const amrex::Real wqx = wq*invvol*(+vx*costheta + vy*sintheta); + const amrex::Real wqy = wq*invvol*(-vx*sintheta + vy*costheta); +#else + const amrex::Real wqx = wq*invvol*vx; + const amrex::Real wqy = wq*invvol*vy; +#endif + const amrex::Real wqz = wq*invvol*vz; + + // --- Compute shape factors + Compute_shape_factor< depos_order > const compute_shape_factor; +#if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) || defined(WARPX_DIM_3D) + + // x direction + // Get particle position after 1/2 push back in position +#if defined(WARPX_DIM_RZ) + // Keep these double to avoid bug in single precision + const double xmid = (rpmid - xmin)*dxi; +#else + const double xmid = ((xp - xmin) + relative_time*vx)*dxi; +#endif + // j_j[xyz] leftmost grid point in x that the particle touches for the centering of each current + // sx_j[xyz] shape factor along x for the centering of each current + // There are only two possible centerings, node or cell centered, so at most only two shape factor + // arrays will be needed. + // Keep these double to avoid bug in single precision + double sx_node[depos_order + 1] = {0.}; + double sx_cell[depos_order + 1] = {0.}; + int j_node = 0; + int j_cell = 0; + if (jx_type[0] == NODE || jy_type[0] == NODE || jz_type[0] == NODE) { + j_node = compute_shape_factor(sx_node, xmid); + } + if (jx_type[0] == CELL || jy_type[0] == CELL || jz_type[0] == CELL) { + j_cell = compute_shape_factor(sx_cell, xmid - 0.5); + } + + amrex::Real sx_jx[depos_order + 1] = {0._rt}; + amrex::Real sx_jy[depos_order + 1] = {0._rt}; + amrex::Real sx_jz[depos_order + 1] = {0._rt}; + for (int ix=0; ix<=depos_order; ix++) + { + sx_jx[ix] = ((jx_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix])); + sx_jy[ix] = ((jy_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix])); + sx_jz[ix] = ((jz_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix])); + } + + int const j_jx = ((jx_type[0] == NODE) ? j_node : j_cell); + int const j_jy = ((jy_type[0] == NODE) ? j_node : j_cell); + int const j_jz = ((jz_type[0] == NODE) ? j_node : j_cell); +#endif //AMREX_SPACEDIM >= 2 + +#if defined(WARPX_DIM_3D) + // y direction + // Keep these double to avoid bug in single precision + const double ymid = ((yp - ymin) + relative_time*vy)*dyi; + double sy_node[depos_order + 1] = {0.}; + double sy_cell[depos_order + 1] = {0.}; + int k_node = 0; + int k_cell = 0; + if (jx_type[1] == NODE || jy_type[1] == NODE || jz_type[1] == NODE) { + k_node = compute_shape_factor(sy_node, ymid); + } + if (jx_type[1] == CELL || jy_type[1] == CELL || jz_type[1] == CELL) { + k_cell = compute_shape_factor(sy_cell, ymid - 0.5); + } + amrex::Real sy_jx[depos_order + 1] = {0._rt}; + amrex::Real sy_jy[depos_order + 1] = {0._rt}; + amrex::Real sy_jz[depos_order + 1] = {0._rt}; + for (int iy=0; iy<=depos_order; iy++) + { + sy_jx[iy] = ((jx_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy])); + sy_jy[iy] = ((jy_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy])); + sy_jz[iy] = ((jz_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy])); + } + int const k_jx = ((jx_type[1] == NODE) ? k_node : k_cell); + int const k_jy = ((jy_type[1] == NODE) ? k_node : k_cell); + int const k_jz = ((jz_type[1] == NODE) ? k_node : k_cell); +#endif + + // z direction + // Keep these double to avoid bug in single precision + const double zmid = ((zp - zmin) + relative_time*vz)*dzi; + double sz_node[depos_order + 1] = {0.}; + double sz_cell[depos_order + 1] = {0.}; + int l_node = 0; + int l_cell = 0; + if (jx_type[zdir] == NODE || jy_type[zdir] == NODE || jz_type[zdir] == NODE) { + l_node = compute_shape_factor(sz_node, zmid); + } + if (jx_type[zdir] == CELL || jy_type[zdir] == CELL || jz_type[zdir] == CELL) { + l_cell = compute_shape_factor(sz_cell, zmid - 0.5); + } + amrex::Real sz_jx[depos_order + 1] = {0._rt}; + amrex::Real sz_jy[depos_order + 1] = {0._rt}; + amrex::Real sz_jz[depos_order + 1] = {0._rt}; + for (int iz=0; iz<=depos_order; iz++) + { + sz_jx[iz] = ((jx_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz])); + sz_jy[iz] = ((jy_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz])); + sz_jz[iz] = ((jz_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz])); + } + int const l_jx = ((jx_type[zdir] == NODE) ? l_node : l_cell); + int const l_jy = ((jy_type[zdir] == NODE) ? l_node : l_cell); + int const l_jz = ((jz_type[zdir] == NODE) ? l_node : l_cell); + + // Deposit current into jx_arr, jy_arr and jz_arr +#if defined(WARPX_DIM_1D_Z) + for (int iz=0; iz<=depos_order; iz++){ + amrex::Gpu::Atomic::AddNoRet( + &jz_buff(lo.x+l_jz+iz, 0, 0, 0), + sz_jz[iz]*wqz); + } +#endif +#if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ) + for (int iz=0; iz<=depos_order; iz++){ + for (int ix=0; ix<=depos_order; ix++){ + amrex::Gpu::Atomic::AddNoRet( + &jz_buff(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 0), + sx_jz[ix]*sz_jz[iz]*wqz); +#if defined(WARPX_DIM_RZ) + Complex xy = xy0; // Note that xy is equal to e^{i m theta} + for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) { + // The factor 2 on the weighting comes from the normalization of the modes + amrex::Gpu::Atomic::AddNoRet( &jz_buff(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode-1), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.real()); + amrex::Gpu::Atomic::AddNoRet( &jz_buff(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode ), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.imag()); + xy = xy*xy0; + } +#endif + } + } +#elif defined(WARPX_DIM_3D) + for (int iz=0; iz<=depos_order; iz++){ + for (int iy=0; iy<=depos_order; iy++){ + for (int ix=0; ix<=depos_order; ix++){ amrex::Gpu::Atomic::AddNoRet( &jz_buff(lo.x+j_jz+ix, lo.y+k_jz+iy, lo.z+l_jz+iz), sx_jz[ix]*sy_jz[iy]*sz_jz[iz]*wqz); @@ -772,14 +1141,10 @@ void doDepositionSharedShapeN (const GetParticlePosition& GetPosition, } #endif } -#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) __syncthreads(); - addLocalToGlobal(tbox_x, jx_arr, jx_buff); - addLocalToGlobal(tbox_y, jy_arr, jy_buff); addLocalToGlobal(tbox_z, jz_arr, jz_buff); -#endif - } - ); + } + ); #if defined(WARPX_USE_GPUCLOCK) if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) { amrex::Gpu::streamSynchronize();