Skip to content

Commit

Permalink
Add event callback support
Browse files Browse the repository at this point in the history
  • Loading branch information
bmerry committed Nov 13, 2012
1 parent af7de89 commit b40cf8e
Show file tree
Hide file tree
Showing 11 changed files with 191 additions and 6 deletions.
4 changes: 4 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
1.1.0
-----
* Add setEventCallback methods to Scan and Radixsort

1.0.3
-----
* Some minor tweaks to allow building on Windows with MSVC
Expand Down
2 changes: 1 addition & 1 deletion Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -1432,7 +1432,7 @@ INCLUDE_FILE_PATTERNS =
# undefined via #undef or recursively expanded use the := operator
# instead of the = operator.

PREDEFINED = UNIT_TESTS DOXYGEN_FAKE_CODE CLOGS_API CLOGS_LOCAL
PREDEFINED = UNIT_TESTS DOXYGEN_FAKE_CODE CLOGS_API CLOGS_LOCAL CL_CALLBACK=

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
# this tag can be used to specify a list of macro names that should be expanded.
Expand Down
17 changes: 17 additions & 0 deletions doc/clogs-user.xml
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,23 @@ sorter.enqueue(queue, keys, values, numElements, 20, &wait, &event);
need to be passed by reference.
</para>
</section>
<section id="using.profile">
<title>Profiling</title>
<para>
The event returned by the various <function>enqueue</function>
commands is suitable for event ordering, but it does not work
well with the OpenCL event profiling functions for determining how
much time is spent on the GPU, because a single
<function>enqueue</function> call issues several OpenCL commands.
For this purpose, one should call
<function>setEventCallback</function> on the
<type>clogs::Radixsort</type> or <type>clogs::Scan</type>
object. The registered callback will be called once for each
OpenCL command enqueued, and is passed the associated event. Note
that the callback is called during the <function>enqueue</function>
call, rather than when the event completes; it is up to you to
defer querying the profiling information until the event is
complete.
</para>
</section>
</chapter>

<chapter id="performance">
Expand Down
16 changes: 16 additions & 0 deletions include/clogs/radixsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,22 @@ class CLOGS_API Radixsort

~Radixsort(); ///< Destructor

/**
* Set a callback function that will receive each underlying event in turn.
* The callback will be called multiple times during each enqueue, because
* the implementation uses multiple commands. This allows profiling information
* to be extracted from the events once they complete.
*
* The callback may also be set to @c NULL to disable it.
*
* @note This is not an event completion callback: it is called during
* @c enqueue, generally before the events complete.
*
* @param callback The callback function.
* @param userData Arbitrary data to be passed to the callback.
*/
void setEventCallback(void (CL_CALLBACK *callback)(const cl::Event &, void *), void *userData);

/**
* Enqueue a sort operation on a command queue.
*
Expand Down
16 changes: 16 additions & 0 deletions include/clogs/scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,22 @@ class CLOGS_API Scan

~Scan(); ///< Destructor

/**
* Set a callback function that will receive each underlying event in turn.
* The callback will be called multiple times during each enqueue, because
* the implementation uses multiple commands. This allows profiling information
* to be extracted from the events once they complete.
*
* The callback may also be set to @c NULL to disable it.
*
* @note This is not an event completion callback: it is called during
* @c enqueue, generally before the events complete.
*
* @param callback The callback function.
* @param userData Arbitrary data to be passed to the callback.
*/
void setEventCallback(void (CL_CALLBACK *callback)(const cl::Event &, void *), void *userData);

/**
* Enqueue a scan operation on a command queue.
*
Expand Down
42 changes: 39 additions & 3 deletions src/radixsort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,15 @@ void Radixsort::enqueueReduce(
reduceKernel.setArg(3, (cl_uint) elements);
reduceKernel.setArg(4, (cl_uint) firstBit);
cl_uint blocks = getBlocks(elements, len);
cl::Event reduceEvent;
queue.enqueueNDRangeKernel(reduceKernel,
cl::NullRange,
cl::NDRange(reduceWorkGroupSize * blocks),
cl::NDRange(reduceWorkGroupSize),
events, event);
events, &reduceEvent);
doEventCallback(reduceEvent);
if (event != NULL)
*event = reduceEvent;
}

void Radixsort::enqueueScan(
Expand All @@ -82,11 +86,15 @@ void Radixsort::enqueueScan(
{
scanKernel.setArg(0, histogram);
scanKernel.setArg(1, (cl_uint) blocks);
cl::Event scanEvent;
queue.enqueueNDRangeKernel(scanKernel,
cl::NullRange,
cl::NDRange(scanWorkGroupSize),
cl::NDRange(scanWorkGroupSize),
events, event);
events, &scanEvent);
doEventCallback(scanEvent);
if (event != NULL)
*event = scanEvent;
}

void Radixsort::enqueueScatter(
Expand All @@ -110,11 +118,29 @@ void Radixsort::enqueueScatter(
const ::size_t slicesPerWorkGroup = scatterWorkGroupSize / scatterSlice;
assert(blocks % slicesPerWorkGroup == 0);
const ::size_t workGroups = blocks / slicesPerWorkGroup;
cl::Event scatterEvent;
queue.enqueueNDRangeKernel(scatterKernel,
cl::NullRange,
cl::NDRange(scatterWorkGroupSize * workGroups),
cl::NDRange(scatterWorkGroupSize),
events, event);
events, &scatterEvent);
doEventCallback(scatterEvent);
if (event != NULL)
*event = scatterEvent;
}

/**
 * Pass @a event to the registered event callback, together with the
 * user data that was stored alongside it. Does nothing when no callback
 * is registered.
 */
void Radixsort::doEventCallback(const cl::Event &event)
{
    if (eventCallback == NULL)
        return;
    eventCallback(event, eventCallbackUserData);
}

/**
 * Remember the callback and its user data for later use by
 * doEventCallback. A @c NULL callback disables the notification.
 */
void Radixsort::setEventCallback(
    void (CL_CALLBACK *callback)(const cl::Event &event, void *),
    void *userData)
{
    this->eventCallback = callback;
    this->eventCallbackUserData = userData;
}

void Radixsort::enqueue(
Expand Down Expand Up @@ -198,10 +224,12 @@ void Radixsort::enqueue(
* management.
*/
queue.enqueueCopyBuffer(*curKeys, *nextKeys, 0, 0, elements * keySize, waitFor, &next);
doEventCallback(next);
prev[0] = next; waitFor = &prev;
if (valueSize != 0)
{
queue.enqueueCopyBuffer(*curValues, *nextValues, 0, 0, elements * valueSize, waitFor, &next);
doEventCallback(next);
prev[0] = next; waitFor = &prev;
}
}
Expand All @@ -218,6 +246,7 @@ void Radixsort::setTemporaryBuffers(const cl::Buffer &keys, const cl::Buffer &va
Radixsort::Radixsort(
const cl::Context &context, const cl::Device &device,
const Type &keyType, const Type &valueType)
: eventCallback(NULL), eventCallbackUserData(NULL)
{
if (!keyType.isIntegral() || keyType.isSigned() || keyType.getLength() != 1
|| !keyType.isComputable(device) || !keyType.isStorable(device))
Expand Down Expand Up @@ -327,6 +356,13 @@ Radixsort::Radixsort(
detail_ = new detail::Radixsort(context, device, keyType, valueType);
}

/* Public entry point: hands the callback and user data straight to the
 * implementation object. */
void Radixsort::setEventCallback(void (CL_CALLBACK *callback)(const cl::Event &event, void *),
                                 void *userData)
{
    detail_->setEventCallback(callback, userData);
}

void Radixsort::enqueue(
const cl::CommandQueue &commandQueue,
const cl::Buffer &keys, const cl::Buffer &values,
Expand Down
14 changes: 14 additions & 0 deletions src/radixsort_detail.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ class CLOGS_LOCAL Radixsort
cl::Buffer tmpKeys; ///< User-provided buffer to hold temporary keys
cl::Buffer tmpValues; ///< User-provided buffer to hold temporary values

void (CL_CALLBACK *eventCallback)(const cl::Event &event, void *);
void *eventCallbackUserData;

::size_t getBlocks(::size_t elements, ::size_t len);

/**
Expand Down Expand Up @@ -122,6 +125,11 @@ class CLOGS_LOCAL Radixsort
::size_t len, ::size_t elements, unsigned int firstBit,
const VECTOR_CLASS<cl::Event> *events, cl::Event *event);

/**
* Call the event callback, if there is one.
*/
void doEventCallback(const cl::Event &event);

/* Prevent copying */
Radixsort(const Radixsort &);
Radixsort &operator =(const Radixsort &);
Expand All @@ -133,6 +141,12 @@ class CLOGS_LOCAL Radixsort
*/
Radixsort(const cl::Context &context, const cl::Device &device, const Type &keyType, const Type &valueType = Type());

/**
* Set a callback to be notified of enqueued commands.
* @see @ref clogs::Radixsort::setEventCallback
*/
void setEventCallback(void (CL_CALLBACK *callback)(const cl::Event &, void *), void *userData);

/**
* Enqueue a sort operation on a command queue.
* @see @ref clogs::Radixsort::enqueue.
Expand Down
40 changes: 39 additions & 1 deletion src/scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ class CLOGS_LOCAL Scan
cl::Kernel scanKernel; ///< Final scan kernel
cl::Buffer sums; ///< Reductions of the blocks for middle phase

void (CL_CALLBACK *eventCallback)(const cl::Event &event, void *);
void *eventCallbackUserData;

/**
* Implementation of @ref enqueueInternal, supporting both offsetting and
* non-offsetting. If @a offsetBuffer is not @c NULL, we are doing offsetting.
Expand All @@ -83,6 +86,11 @@ class CLOGS_LOCAL Scan
const VECTOR_CLASS<cl::Event> *events,
cl::Event *event);

/**
* Call the event callback, if there is one.
*/
void doEventCallback(const cl::Event &event);

/* Prevent copying */
Scan(const Scan &);
Scan &operator=(const Scan &);
Expand All @@ -94,6 +102,12 @@ class CLOGS_LOCAL Scan
*/
Scan(const cl::Context &context, const cl::Device &device, const Type &type);

/**
* Set a callback to be notified of enqueued commands.
* @see @ref clogs::Scan::setEventCallback
*/
void setEventCallback(void (CL_CALLBACK *callback)(const cl::Event &, void *), void *userData);

/**
* Enqueue a scan operation on a command queue, with a CPU offset.
* @see @ref clogs::Scan::enqueue.
Expand All @@ -119,6 +133,7 @@ class CLOGS_LOCAL Scan
};

Scan::Scan(const cl::Context &context, const cl::Device &device, const Type &type)
: eventCallback(NULL), eventCallbackUserData(NULL)
{
if (!type.isIntegral() || !type.isComputable(device) || !type.isStorable(device))
throw std::invalid_argument("type is not a supported integral format on this device");
Expand Down Expand Up @@ -185,6 +200,18 @@ Scan::Scan(const cl::Context &context, const cl::Device &device, const Type &typ
}
}

/**
 * Pass @a event to the registered event callback, together with the
 * user data that was stored alongside it. Does nothing when no callback
 * is registered.
 */
void Scan::doEventCallback(const cl::Event &event)
{
    if (eventCallback == NULL)
        return;
    eventCallback(event, eventCallbackUserData);
}

/**
 * Remember the callback and its user data for later use by
 * doEventCallback. A @c NULL callback disables the notification.
 */
void Scan::setEventCallback(
    void (CL_CALLBACK *callback)(const cl::Event &, void *),
    void *userData)
{
    this->eventCallback = callback;
    this->eventCallbackUserData = userData;
}

void Scan::enqueueInternal(const cl::CommandQueue &commandQueue,
const cl::Buffer &buffer,
::size_t elements,
Expand Down Expand Up @@ -254,6 +281,7 @@ void Scan::enqueueInternal(const cl::CommandQueue &commandQueue,

std::vector<cl::Event> reduceEvents(1);
std::vector<cl::Event> scanSmallEvents(1);
cl::Event scanEvent;
const std::vector<cl::Event> *waitFor = events;
if (allBlocks > 1)
{
Expand All @@ -263,17 +291,22 @@ void Scan::enqueueInternal(const cl::CommandQueue &commandQueue,
cl::NDRange(reduceWorkGroupSize),
events, &reduceEvents[0]);
waitFor = &reduceEvents;
doEventCallback(reduceEvents[0]);
}
commandQueue.enqueueNDRangeKernel(smallKernel,
cl::NullRange,
cl::NDRange(maxBlocks / 2),
cl::NDRange(maxBlocks / 2),
waitFor, &scanSmallEvents[0]);
doEventCallback(scanSmallEvents[0]);
commandQueue.enqueueNDRangeKernel(scanKernel,
cl::NullRange,
cl::NDRange(scanWorkGroupSize * allBlocks),
cl::NDRange(scanWorkGroupSize),
&scanSmallEvents, event);
&scanSmallEvents, &scanEvent);
doEventCallback(scanEvent);
if (event != NULL)
*event = scanEvent;
}

void Scan::enqueue(const cl::CommandQueue &commandQueue,
Expand Down Expand Up @@ -309,6 +342,11 @@ Scan::~Scan()
delete detail_;
}

/* Public entry point: hands the callback and user data straight to the
 * implementation object. */
void Scan::setEventCallback(
    void (CL_CALLBACK *callback)(const cl::Event &, void *),
    void *userData)
{
    detail_->setEventCallback(callback, userData);
}

void Scan::enqueue(const cl::CommandQueue &commandQueue,
const cl::Buffer &buffer,
::size_t elements,
Expand Down
22 changes: 22 additions & 0 deletions test/test_radixsort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class TestRadixsort : public clogs::Test::TestFixture
CPPUNIT_TEST(testTmpKeys);
CPPUNIT_TEST(testTmpValues);
CPPUNIT_TEST(testTmpSmall);
CPPUNIT_TEST(testEventCallback);

CPPUNIT_TEST_SUITE_END();

Expand Down Expand Up @@ -139,6 +140,9 @@ class TestRadixsort : public clogs::Test::TestFixture
/// Tests using temporary buffers that are too small
void testTmpSmall();

/// Test that the event callback is called at least once
void testEventCallback();

virtual void setUp();
};
CPPUNIT_TEST_SUITE_REGISTRATION(TestRadixsort);
Expand Down Expand Up @@ -626,6 +630,24 @@ void TestRadixsort::testTmpSmall()
testSort<clogs::Test::TypeTag<clogs::TYPE_UINT>, clogs::Test::TypeTag<clogs::TYPE_FLOAT, 4> >(128, 0, 127, 127);
}

/**
 * Event callback used by the test: checks that a real event and a
 * non-NULL user pointer were supplied, then increments the @c int
 * counter that @a eventCount points to.
 */
static void CL_CALLBACK eventCallback(const cl::Event &event, void *eventCount)
{
    CPPUNIT_ASSERT(event() != NULL);
    CPPUNIT_ASSERT(eventCount != NULL);
    int *counter = static_cast<int *>(eventCount);
    ++*counter;
}

void TestRadixsort::testEventCallback()
{
int events = 0;
clogs::Radixsort sort(context, device, clogs::TYPE_UINT);
cl::Buffer buffer(context, CL_MEM_READ_WRITE, 16);
sort.setEventCallback(eventCallback, &events);
sort.enqueue(queue, buffer, NULL, 4);
queue.finish();
CPPUNIT_ASSERT(events > 0);
}

/*******************************************************/

#include "../tools/timer.h"
Expand Down
Loading

0 comments on commit b40cf8e

Please sign in to comment.