Skip to content

Commit

Permalink
Improve transport of single track on CPU (#1235)
Browse files Browse the repository at this point in the history
  • Loading branch information
esseivaju committed May 15, 2024
1 parent c72f59d commit d6b41af
Show file tree
Hide file tree
Showing 16 changed files with 66 additions and 40 deletions.
2 changes: 1 addition & 1 deletion app/celer-g4/GlobalSetup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ void GlobalSetup::ReadInput(std::string const& filename)
options_->sd.enabled = input_.sd_type != SensitiveDetectorType::none;
options_->cuda_stack_size = input_.cuda_stack_size;
options_->cuda_heap_size = input_.cuda_heap_size;
options_->sync = input_.sync;
options_->action_times = input_.action_times;
options_->default_stream = input_.default_stream;
options_->auto_flush = input_.auto_flush;
}
Expand Down
2 changes: 1 addition & 1 deletion app/celer-g4/RunInput.hh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ struct RunInput
size_type initializer_capacity{};
real_type secondary_stack_factor{};
size_type auto_flush{}; //!< Defaults to num_track_slots
bool sync{false};
bool action_times{false};
bool default_stream{false}; //!< Launch all kernels on the default stream

// Physics setup options
Expand Down
7 changes: 5 additions & 2 deletions app/celer-g4/RunInputIO.json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ void from_json(nlohmann::json const& j, RunInput& v)
{
#define RI_LOAD_OPTION(NAME) CELER_JSON_LOAD_OPTION(j, v, NAME)
#define RI_LOAD_REQUIRED(NAME) CELER_JSON_LOAD_REQUIRED(j, v, NAME)
#define RI_LOAD_DEPRECATED(OLD, NEW) CELER_JSON_LOAD_DEPRECATED(j, v, OLD, NEW)

// Check version (if available)
check_format(j, "celer-g4");
Expand All @@ -66,11 +67,13 @@ void from_json(nlohmann::json const& j, RunInput& v)

RI_LOAD_OPTION(primary_options);

RI_LOAD_DEPRECATED(sync, action_times);

RI_LOAD_OPTION(num_track_slots);
RI_LOAD_OPTION(max_steps);
RI_LOAD_OPTION(initializer_capacity);
RI_LOAD_OPTION(secondary_stack_factor);
RI_LOAD_OPTION(sync);
RI_LOAD_OPTION(action_times);
RI_LOAD_OPTION(default_stream);
if (auto iter = j.find("auto_flush"); iter != j.end())
{
Expand Down Expand Up @@ -168,7 +171,7 @@ void to_json(nlohmann::json& j, RunInput const& v)
RI_SAVE(secondary_stack_factor);
RI_SAVE_OPTION(cuda_stack_size);
RI_SAVE_OPTION(cuda_heap_size);
RI_SAVE(sync);
RI_SAVE(action_times);
RI_SAVE(default_stream);
RI_SAVE(auto_flush);

Expand Down
2 changes: 1 addition & 1 deletion app/celer-sim/Runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ void Runner::build_transporter_input(RunnerInput const& inp)
transporter_input_->max_steps = inp.max_steps;
transporter_input_->store_track_counts = inp.write_track_counts;
transporter_input_->store_step_times = inp.write_step_times;
transporter_input_->sync = inp.sync;
transporter_input_->action_times = inp.action_times;
transporter_input_->params = core_params_;
}

Expand Down
2 changes: 1 addition & 1 deletion app/celer-sim/RunnerInput.hh
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ struct RunnerInput
size_type initializer_capacity{}; //!< Divided among streams
real_type secondary_stack_factor{};
bool use_device{};
bool sync{};
bool action_times{};
bool merge_events{false}; //!< Run all events at once on a single stream
bool default_stream{false}; //!< Launch all kernels on the default stream
bool warm_up{CELER_USE_DEVICE}; //!< Run a nullop step first
Expand Down
5 changes: 3 additions & 2 deletions app/celer-sim/RunnerInputIO.json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,15 @@ void from_json(nlohmann::json const& j, RunnerInput& v)
LDIO_LOAD_OPTION(write_step_times);

LDIO_LOAD_DEPRECATED(max_num_tracks, num_track_slots);
LDIO_LOAD_DEPRECATED(sync, action_times);

LDIO_LOAD_OPTION(seed);
LDIO_LOAD_OPTION(num_track_slots);
LDIO_LOAD_OPTION(max_steps);
LDIO_LOAD_REQUIRED(initializer_capacity);
LDIO_LOAD_REQUIRED(secondary_stack_factor);
LDIO_LOAD_REQUIRED(use_device);
LDIO_LOAD_OPTION(sync);
LDIO_LOAD_OPTION(action_times);
LDIO_LOAD_OPTION(merge_events);
LDIO_LOAD_OPTION(default_stream);
LDIO_LOAD_OPTION(warm_up);
Expand Down Expand Up @@ -174,7 +175,7 @@ void to_json(nlohmann::json& j, RunnerInput const& v)
LDIO_SAVE(initializer_capacity);
LDIO_SAVE(secondary_stack_factor);
LDIO_SAVE(use_device);
LDIO_SAVE(sync);
LDIO_SAVE(action_times);
LDIO_SAVE(merge_events);
LDIO_SAVE(default_stream);
LDIO_SAVE(warm_up);
Expand Down
8 changes: 4 additions & 4 deletions app/celer-sim/Transporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Transporter<M>::Transporter(TransporterInput inp)
step_input.params = inp.params;
step_input.num_track_slots = inp.num_track_slots;
step_input.stream_id = inp.stream_id;
step_input.sync = inp.sync;
step_input.action_times = inp.action_times;
stepper_ = std::make_shared<Stepper<M>>(std::move(step_input));
}

Expand Down Expand Up @@ -157,11 +157,11 @@ auto Transporter<M>::operator()(SpanConstPrimary primaries)
template<MemSpace M>
void Transporter<M>::accum_action_times(MapStrDouble* result) const
{
// Get kernel timing if running with a single stream and if either on the
// device with synchronization enabled or on the host
// Get per-action timing if running with a single stream and if
// action timing is enabled
auto const& step = *stepper_;
auto const& action_seq = step.actions();
if (M == MemSpace::host || action_seq.sync())
if (action_seq.action_times())
{
auto const& action_ptrs = action_seq.actions();
auto const& times = action_seq.accum_time();
Expand Down
3 changes: 2 additions & 1 deletion app/celer-sim/Transporter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ struct TransporterInput
// Stepper input
std::shared_ptr<CoreParams const> params;
size_type num_track_slots{}; //!< AKA max_num_tracks
bool sync{false}; //!< Whether to synchronize device between actions
bool action_times{false}; //!< Whether to synchronize device between
//!< actions for timing

// Loop control
size_type max_steps{};
Expand Down
2 changes: 1 addition & 1 deletion app/celer-sim/simple-driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def strtobool(text):
'step_diagnostic_bins': 200,
'write_step_times': use_device,
'simple_calo': simple_calo,
'sync': True,
'action_times': True,
'merge_events': False,
'default_stream': False,
'brem_combined': True,
Expand Down
18 changes: 10 additions & 8 deletions src/accel/LocalTransporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

#include "SetupOptions.hh"
#include "SharedParams.hh"

#include "detail/HitManager.hh"
#include "detail/OffloadWriter.hh"

Expand Down Expand Up @@ -101,7 +102,7 @@ LocalTransporter::LocalTransporter(SetupOptions const& options,
inp.params = params.Params();
inp.stream_id = StreamId{static_cast<size_type>(thread_id)};
inp.num_track_slots = options.max_num_tracks;
inp.sync = options.sync;
inp.action_times = options.action_times;

if (celeritas::device())
{
Expand Down Expand Up @@ -190,10 +191,12 @@ void LocalTransporter::Flush()
{
return;
}

CELER_LOG_LOCAL(info) << "Transporting " << buffer_.size()
<< " tracks from event " << event_id_.unchecked_get()
<< " with Celeritas";
if (celeritas::device())
{
CELER_LOG_LOCAL(info)
<< "Transporting " << buffer_.size() << " tracks from event "
<< event_id_.unchecked_get() << " with Celeritas";
}

if (dump_primaries_)
{
Expand Down Expand Up @@ -254,10 +257,9 @@ auto LocalTransporter::GetActionTime() const -> MapStrReal

MapStrReal result;
auto const& action_seq = step_->actions();
if (action_seq.sync() || !celeritas::device())
if (action_seq.action_times())
{
// Save kernel timing if either on the device with synchronization
// enabled or on the host
// Save per-action timing if action timing is enabled
auto const& action_ptrs = action_seq.actions();
auto const& time = action_seq.accum_time();

Expand Down
2 changes: 1 addition & 1 deletion src/accel/SetupOptions.hh
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ struct SetupOptions
size_type cuda_stack_size{};
size_type cuda_heap_size{};
//! Sync the GPU at every kernel for timing
bool sync{false};
bool action_times{false};
//! Launch all kernels on the default stream
bool default_stream{false};
//!@}
Expand Down
5 changes: 3 additions & 2 deletions src/accel/SetupOptionsMessenger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,9 @@ SetupOptionsMessenger::SetupOptionsMessenger(SetupOptions* options)
add_cmd(&options->cuda_heap_size,
"heapSize",
"Set the CUDA per-thread heap size for VecGeom");
add_cmd(
&options->sync, "sync", "Sync the GPU at every kernel for timing");
add_cmd(&options->action_times,
"actionTimes",
"Add timers around every action (may reduce performance)");
add_cmd(&options->default_stream,
"defaultStream",
"Launch all kernels on the default stream");
Expand Down
2 changes: 1 addition & 1 deletion src/celeritas/global/Stepper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Stepper<M>::Stepper(Input input)
// Create action sequence
actions_ = [&] {
ActionSequence::Options opts;
opts.sync = input.sync;
opts.action_times = input.action_times;
return std::make_shared<ActionSequence>(*params_->action_reg(), opts);
}();

Expand Down
4 changes: 2 additions & 2 deletions src/celeritas/global/Stepper.hh
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ class ActionSequence;
* - \c params : Problem definition
* - \c num_track_slots : Maximum number of threads to run in parallel on GPU
* - \c stream_id : Unique (thread/task) ID for this process
* - \c sync : Whether to synchronize device between actions
* - \c action_times : Whether to synchronize device between actions for timing
*/
struct StepperInput
{
std::shared_ptr<CoreParams const> params;
StreamId stream_id{};
size_type num_track_slots{};
bool sync{false};
bool action_times{false};

//! True if defined
explicit operator bool() const
Expand Down
38 changes: 28 additions & 10 deletions src/celeritas/global/detail/ActionSequence.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,33 +103,51 @@ template<MemSpace M>
void ActionSequence<Params>::execute(Params const& params, State<M>& state)
{
[[maybe_unused]] Stream::StreamT stream = nullptr;
if (M == MemSpace::device && options_.sync)
if (M == MemSpace::device && options_.action_times)
{
stream = celeritas::device().stream(state.stream_id()).get();
}

if ((M == MemSpace::host || options_.sync) && !state.warming_up())
// Running a single track slot on host:
// Skip inapplicable post-step action
auto const skip_post_action = [&](auto const& action) {
if constexpr (M != MemSpace::host)
{
return false;
}
return state.size() == 1 && action.order() == ActionOrder::post
&& action.action_id()
!= state.ref().sim.post_step_action[TrackSlotId{0}];
};

if (options_.action_times && !state.warming_up())
{
// Execute all actions and record the time elapsed
for (auto i : range(actions_.size()))
{
ScopedProfiling profile_this{actions_[i]->label()};
Stopwatch get_time;
actions_[i]->execute(params, state);
if (M == MemSpace::device)
if (auto const& action = *actions_[i]; !skip_post_action(action))
{
CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));
ScopedProfiling profile_this{action.label()};
Stopwatch get_time;
action.execute(params, state);
if constexpr (M == MemSpace::device)
{
CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));
}
accum_time_[i] += get_time();
}
accum_time_[i] += get_time();
}
}
else
{
// Just loop over the actions
for (auto const& sp_action : actions_)
{
ScopedProfiling profile_this{sp_action->label()};
sp_action->execute(params, state);
if (auto const& action = *sp_action; !skip_post_action(action))
{
ScopedProfiling profile_this{action.label()};
action.execute(params, state);
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/celeritas/global/detail/ActionSequence.hh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class ActionSequence
//! Construction/execution options
struct Options
{
bool sync{false}; //!< Call DeviceSynchronize and add timer
bool action_times{false}; //!< Time each action (synchronizing the stream on device)
};

public:
Expand All @@ -80,7 +80,7 @@ class ActionSequence
//// ACCESSORS ////

//! Whether per-action timing (with stream synchronization on device) is enabled
bool sync() const { return options_.sync; }
bool action_times() const { return options_.action_times; }

//! Get the set of beginning-of-run actions
VecBeginAction const& begin_run_actions() const { return begin_run_; }
Expand Down

0 comments on commit d6b41af

Please sign in to comment.