Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add Accessor, an interface for accessing buffers in kernels #1249

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
167 changes: 45 additions & 122 deletions example/bufferCopy/src/bufferCopy.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner
/* Copyright 2019-2021 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner, Bernhard Manfred Gruber
BenjaminW3 marked this conversation as resolved.
Show resolved Hide resolved
*
* This file exemplifies usage of alpaka.
*
Expand Down Expand Up @@ -32,72 +32,58 @@ ALPAKA_FN_ACC size_t linIdxToPitchedIdx(size_t const globalIdx, size_t const pit
//! Prints all elements of the buffer.
struct PrintBufferKernel
{
template<typename TAcc, typename TData, typename TExtent>
template<typename TAcc, typename TData>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
TData const* const buffer,
TExtent const& extents,
size_t const pitch) const -> void
alpaka::experimental::BufferAccessor<TAcc, TData, 3, alpaka::experimental::ReadAccess> const data) const
-> void
{
auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridSize = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);

for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
{
// NOTE: hard-coded for unsigned int
printf("%u:%u ", static_cast<uint32_t>(i), static_cast<uint32_t>(buffer[linIdxToPitchedIdx<2>(i, pitch)]));
}
for(size_t z = idx[0]; z < data.extents[0]; z += gridSize[0])
for(size_t y = idx[1]; y < data.extents[1]; y += gridSize[1])
for(size_t x = idx[2]; x < data.extents[2]; x += gridSize[2])
printf("%zu,%zu,%zu:%u ", z, y, x, static_cast<uint32_t>(data[{z, y, x}]));
}
};


//! Tests if the value of the buffer on index i is equal to i.
struct TestBufferKernel
{
template<typename TAcc, typename TData, typename TExtent>
template<typename TAcc, typename TData>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
TData const* const
#ifndef NDEBUG
data
#endif
,
TExtent const& extents,
size_t const
#ifndef NDEBUG
pitch
#endif
) const -> void
alpaka::experimental::BufferAccessor<TAcc, TData, 3, alpaka::experimental::ReadAccess> const data) const
-> void
{
auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);

for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
{
ALPAKA_ASSERT_OFFLOAD(data[linIdxToPitchedIdx<2>(i, pitch)] == i);
}
auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridSize = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

for(size_t z = idx[0]; z < data.extents[0]; z += gridSize[0])
for(size_t y = idx[1]; y < data.extents[1]; y += gridSize[1])
for(size_t x = idx[2]; x < data.extents[2]; x += gridSize[2])
ALPAKA_ASSERT_OFFLOAD(
data(z, y, x) == alpaka::mapIdx<1u>(decltype(data.extents){z, y, x}, data.extents)[0]);
}
};

//! Fills values of buffer with increasing elements starting from 0
struct FillBufferKernel
{
template<typename TAcc, typename TData, typename TExtent>
ALPAKA_FN_ACC auto operator()(TAcc const& acc, TData* const data, TExtent const& extents) const -> void
template<typename TAcc, typename TData>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
alpaka::experimental::BufferAccessor<TAcc, TData, 3, alpaka::experimental::WriteAccess> const data) const
-> void
{
auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const gridSize = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
{
data[i] = static_cast<TData>(i);
}
for(size_t z = idx[0]; z < data.extents[0]; z += gridSize[0])
for(size_t y = idx[1]; y < data.extents[1]; y += gridSize[1])
for(size_t x = idx[2]; x < data.extents[2]; x += gridSize[2])
data(z, y, x) = alpaka::mapIdx<1u>(idx, data.extents)[0];
}
};

Expand Down Expand Up @@ -209,35 +195,25 @@ auto main() -> int

// Init host buffer
//
// You can not access the inner
// elements of a buffer directly, but
// you can get the pointer to the memory
// (getPtrNative).
Data* const pHostBuffer = alpaka::getPtrNative(hostBuffer);
// You can not access the inner elements of a buffer directly, but you can get the pointer to the memory via
// getPtrNative() or a read/write accessor using access().
auto hostBufferAccessor = alpaka::experimental::access(hostBuffer);

// This pointer can be used to directly write
// some values into the buffer memory.
// Mind, that only a host can write on host memory.
// The same holds true for device memory.
for(Idx i(0); i < extents.prod(); ++i)
{
pHostBuffer[i] = static_cast<Data>(i);
}
for(size_t z = 0; z < extents[0]; z++)
for(size_t y = 0; y < extents[1]; y++)
for(size_t x = 0; x < extents[2]; x++)
hostBufferAccessor(z, y, x) = static_cast<Data>(alpaka::mapIdx<1u>(Vec{z, y, x}, extents)[0]);

// Memory views and buffers can also be initialized by executing a kernel.
// To pass a buffer into a kernel, you can pass the
// native pointer into the kernel invocation.
Data* const pHostViewPlainPtr = alpaka::getPtrNative(hostViewPlainPtr);

FillBufferKernel fillBufferKernel;

alpaka::exec<Host>(
hostQueue,
hostWorkDiv,
fillBufferKernel,
pHostViewPlainPtr, // 1st kernel argument
extents); // 2nd kernel argument

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel, alpaka::experimental::writeAccess(hostViewPlainPtr));

// Copy host to device Buffer
//
Expand All @@ -253,40 +229,15 @@ auto main() -> int
alpaka::memcpy(devQueue, deviceBuffer1, hostViewPlainPtr, extents);
alpaka::memcpy(devQueue, deviceBuffer2, hostBuffer, extents);

// Depending on the accelerator, the allocation function may introduce
// padding between rows/planes of multidimensional memory allocations.
// Therefore the pitch (distance between consecutive rows/planes) may be
// greater than the space required for the data.
Idx const deviceBuffer1Pitch(alpaka::getPitchBytes<2u>(deviceBuffer1) / sizeof(Data));
Idx const deviceBuffer2Pitch(alpaka::getPitchBytes<2u>(deviceBuffer2) / sizeof(Data));
Idx const hostBuffer1Pitch(alpaka::getPitchBytes<2u>(hostBuffer) / sizeof(Data));
Idx const hostViewPlainPtrPitch(alpaka::getPitchBytes<2u>(hostViewPlainPtr) / sizeof(Data));

// Test device Buffer
//
// This kernel tests if the copy operations
// were successful. In the case something
// went wrong an assert will fail.
Data const* const pDeviceBuffer1 = alpaka::getPtrNative(deviceBuffer1);
Data const* const pDeviceBuffer2 = alpaka::getPtrNative(deviceBuffer2);

TestBufferKernel testBufferKernel;
alpaka::exec<Acc>(
devQueue,
devWorkDiv,
testBufferKernel,
pDeviceBuffer1, // 1st kernel argument
extents, // 2nd kernel argument
deviceBuffer1Pitch); // 3rd kernel argument

alpaka::exec<Acc>(
devQueue,
devWorkDiv,
testBufferKernel,
pDeviceBuffer2, // 1st kernel argument
extents, // 2nd kernel argument
deviceBuffer2Pitch); // 3rd kernel argument

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, alpaka::experimental::readAccess(deviceBuffer1));
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, alpaka::experimental::readAccess(deviceBuffer2));

// Print device Buffer
//
Expand All @@ -299,43 +250,15 @@ auto main() -> int
// completely distorted.

PrintBufferKernel printBufferKernel;
alpaka::exec<Acc>(
devQueue,
devWorkDiv,
printBufferKernel,
pDeviceBuffer1, // 1st kernel argument
extents, // 2nd kernel argument
deviceBuffer1Pitch); // 3rd kernel argument
alpaka::wait(devQueue);
std::cout << std::endl;

alpaka::exec<Acc>(
devQueue,
devWorkDiv,
printBufferKernel,
pDeviceBuffer2, // 1st kernel argument
extents, // 2nd kernel argument
deviceBuffer2Pitch); // 3rd kernel argument
alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, alpaka::experimental::readAccess(deviceBuffer1));
alpaka::wait(devQueue);
std::cout << std::endl;

alpaka::exec<Host>(
hostQueue,
hostWorkDiv,
printBufferKernel,
pHostBuffer, // 1st kernel argument
extents, // 2nd kernel argument
hostBuffer1Pitch); // 3rd kernel argument
alpaka::exec<Host>(hostQueue, hostWorkDiv, printBufferKernel, alpaka::experimental::readAccess(hostBuffer));
alpaka::wait(hostQueue);
std::cout << std::endl;

alpaka::exec<Host>(
hostQueue,
hostWorkDiv,
printBufferKernel,
pHostViewPlainPtr, // 1st kernel argument
extents, // 2nd kernel argument
hostViewPlainPtrPitch); // 3rd kernel argument
alpaka::exec<Host>(hostQueue, hostWorkDiv, printBufferKernel, alpaka::experimental::readAccess(hostViewPlainPtr));
alpaka::wait(hostQueue);
std::cout << std::endl;

Expand Down
30 changes: 18 additions & 12 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/* Copyright 2020 Benjamin Worpitz, Matthias Werner, Jakob Krude,
* Sergei Bastrakov
/* Copyright 2020-2021 Benjamin Worpitz, Matthias Werner, Jakob Krude, Sergei Bastrakov, Bernhard Manfred Gruber
*
* This file exemplifies usage of alpaka.
*
Expand Down Expand Up @@ -40,19 +39,20 @@

struct HeatEquationKernel
{
template<typename TAcc>
template<typename TAcc, typename TMemoryHandle, typename TIdx>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
double const* const uCurrBuf,
double* const uNextBuf,
uint32_t const extent,
alpaka::experimental::Accessor<TMemoryHandle, double, TIdx, 1, alpaka::experimental::ReadAccess> const
uCurrBuf,
alpaka::experimental::Accessor<TMemoryHandle, double, TIdx, 1, alpaka::experimental::WriteAccess> const
uNextBuf,
double const dx,
double const dt) const -> void
{
// Each kernel executes one element
double const r = dt / (dx * dx);
int idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
if(idx > 0 && idx < extent - 1u)
if(idx > 0 && idx < uNextBuf.extents[0] - 1u)
{
uNextBuf[idx] = uCurrBuf[idx] * (1.0 - 2.0 * r) + uCurrBuf[idx - 1] * r + uCurrBuf[idx + 1] * r;
}
Expand Down Expand Up @@ -146,9 +146,6 @@ auto main() -> int
auto uNextBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};
auto uCurrBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};

double* pCurrAcc = alpaka::getPtrNative(uCurrBufAcc);
double* pNextAcc = alpaka::getPtrNative(uNextBufAcc);

// Apply initial conditions for the test problem
for(uint32_t i = 0; i < numNodesX; i++)
{
Expand All @@ -163,15 +160,24 @@ auto main() -> int
alpaka::memcpy(queue, uNextBufAcc, uCurrBufAcc, extent);
alpaka::wait(queue);

auto* uCurrBufAccPtr = &uCurrBufAcc;
BenjaminW3 marked this conversation as resolved.
Show resolved Hide resolved
auto* uNextBufAccPtr = &uNextBufAcc;
for(uint32_t step = 0; step < numTimeSteps; step++)
{
// Compute next values
alpaka::exec<Acc>(queue, workdiv, kernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
alpaka::exec<Acc>(
queue,
workdiv,
kernel,
alpaka::experimental::readAccess(*uCurrBufAccPtr),
alpaka::experimental::writeAccess(*uNextBufAccPtr),
dx,
dt);

// We assume the boundary conditions are constant and so these values
// do not need to be updated.
// So we just swap next to curr (shallow copy)
std::swap(pCurrAcc, pNextAcc);
std::swap(uCurrBufAccPtr, uNextBufAccPtr);
}

// Copy device -> host
Expand Down
17 changes: 8 additions & 9 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2020 Benjamin Worpitz, Sergei Bastrakov, Jakob Krude
/* Copyright 2020-2021 Benjamin Worpitz, Sergei Bastrakov, Jakob Krude, Bernhard Manfred Gruber
*
* This file exemplifies usage of alpaka.
*
Expand Down Expand Up @@ -47,11 +47,12 @@ struct Kernel
//! \param numPoints The total number of points to be calculated.
//! \param globalCounter The sum of all local results.
//! \param functor The function for which the integral is to be computed.
template<typename TAcc, typename TFunctor>
template<typename TAcc, typename TMemoryHandle, typename TIdx, typename TFunctor>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
size_t const numPoints,
uint32_t* const globalCounter,
alpaka::experimental::Accessor<TMemoryHandle, uint32_t, TIdx, 1, alpaka::experimental::ReadWriteAccess>
globalCounter,
TFunctor functor) const -> void
{
// Get the global linearized thread idx.
Expand Down Expand Up @@ -81,7 +82,7 @@ struct Kernel
}

// Add the local result to the sum of the other results.
alpaka::atomicAdd(acc, globalCounter, localCount, alpaka::hierarchy::Blocks{});
alpaka::atomicAdd(acc, &globalCounter[0], localCount, alpaka::hierarchy::Blocks{});
}
};

Expand Down Expand Up @@ -117,21 +118,19 @@ auto main() -> int

// Setup buffer.
BufHost bufHost{alpaka::allocBuf<uint32_t, Idx>(devHost, extent)};
uint32_t* const ptrBufHost{alpaka::getPtrNative(bufHost)};
BufAcc bufAcc{alpaka::allocBuf<uint32_t, Idx>(devAcc, extent)};
uint32_t* const ptrBufAcc{alpaka::getPtrNative(bufAcc)};

// Initialize the global count to 0.
ptrBufHost[0] = 0.0f;
alpaka::experimental::access(bufHost)[0] = 0.0f;
alpaka::memcpy(queue, bufAcc, bufHost, extent);

Kernel kernel;
alpaka::exec<Acc>(queue, workdiv, kernel, numPoints, ptrBufAcc, Function{});
alpaka::exec<Acc>(queue, workdiv, kernel, numPoints, alpaka::experimental::access(bufAcc), Function{});
alpaka::memcpy(queue, bufHost, bufAcc, extent);
alpaka::wait(queue);

// Check the result.
uint32_t globalCount = *ptrBufHost;
uint32_t globalCount = alpaka::experimental::access(bufHost)[0];

// Final result.
float finalResult = globalCount / static_cast<float>(numPoints);
Expand Down