/
papi.cc
300 lines (275 loc) · 11.5 KB
/
papi.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <papi.h>
#include <tvm/runtime/contrib/papi.h>
#include <string>
#include <vector>
namespace tvm {
namespace runtime {
namespace profiling {
#define PAPI_CALL(func) \
{ \
int e = (func); \
if (e != PAPI_OK) { \
LOG(FATAL) << "PAPIError: in function " #func " " << e << " " \
<< std::string(PAPI_strerror(e)); \
} \
}
static const std::unordered_map<DLDeviceType, std::vector<std::string>> default_metric_names = {
{kDLCPU,
{"perf::CYCLES", "perf::STALLED-CYCLES-FRONTEND", "perf::STALLED-CYCLES-BACKEND",
"perf::INSTRUCTIONS", "perf::CACHE-MISSES"}},
{kDLCUDA, {"cuda:::event:elapsed_cycles_sm:device=0"}}};
/*! \brief Object that holds the values of counters at the start of a function call. */
struct PAPIEventSetNode : public Object {
/*! \brief The starting values of counters for all metrics of a specific device. */
std::vector<long_long> start_values;
/*! \brief The device these counters are for. */
Device dev;
explicit PAPIEventSetNode(std::vector<long_long> start_values, Device dev)
: start_values(start_values), dev(dev) {}
static constexpr const char* _type_key = "PAPIEventSetNode";
TVM_DECLARE_FINAL_OBJECT_INFO(PAPIEventSetNode, Object);
};
/* Get the PAPI component id for the given device.
* \param dev The device to get the component for.
* \returns PAPI component id for the device. Returns -1 if the device is not
* supported by PAPI.
*/
int component_for_device(Device dev) {
std::string component_name;
switch (dev.device_type) {
case kDLCPU:
component_name = "perf_event";
break;
case kDLCUDA:
component_name = "cuda";
break;
case kDLROCM:
component_name = "rocm";
break;
default:
LOG(WARNING) << "PAPI does not support device " << DLDeviceType2Str(dev.device_type);
return -1;
}
int cidx = PAPI_get_component_index(component_name.c_str());
if (cidx < 0) {
LOG(FATAL) << "Cannot find PAPI component \"" << component_name
<< "\". Maybe you need to build PAPI with support for this component (use "
"`./configure --with-components="
<< component_name << "`).";
}
return cidx;
}
/*! \brief MetricCollectorNode for PAPI metrics.
*
* PAPI (Performance Application Programming Interface) collects metrics on a
* variety of platforms including cpu, cuda and rocm.
*
* PAPI is avaliable at https://github.com/icl-utk-edu/papi.
*/
struct PAPIMetricCollectorNode final : public MetricCollectorNode {
/*! \brief Construct a metric collector that collects a specific set of metrics.
*
* \param metrics A mapping from a device type to the metrics that should be
* collected on that device. You can find the names of available metrics by
* running `papi_native_avail`.
*/
explicit PAPIMetricCollectorNode(Map<DeviceWrapper, Array<String>> metrics) {
for (auto& p : metrics) {
papi_metric_names[p.first->device] = {};
for (auto& metric : p.second) {
papi_metric_names[p.first->device].push_back(metric);
}
}
}
explicit PAPIMetricCollectorNode() {}
/*! \brief Initialization call.
* \param devices The devices this collector will be running on
*/
void Init(Array<DeviceWrapper> devices) {
if (!PAPI_is_initialized()) {
if (sizeof(long_long) > sizeof(int64_t)) {
LOG(WARNING) << "PAPI's long_long is larger than int64_t. Overflow may occur when "
"reporting metrics.";
}
CHECK_EQ(PAPI_library_init(PAPI_VER_CURRENT), PAPI_VER_CURRENT)
<< "Error while initializing PAPI";
}
// If no metrics were provided we use the default set. The names were not
// initialized in the constructor because we did not know which devices we
// were running on.
if (papi_metric_names.size() == 0) {
for (auto wrapped_device : devices) {
Device device = wrapped_device->device;
auto it = default_metric_names.find(device.device_type);
if (it != default_metric_names.end()) {
papi_metric_names[device] = it->second;
}
}
}
// create event sets for each device
for (auto wrapped_device : devices) {
Device device = wrapped_device->device;
int cidx = component_for_device(device);
// unknown device, skipping
if (cidx < 0) {
continue;
}
auto it = papi_metric_names.find(device);
// skip devices with no metrics defined
if (it == papi_metric_names.end() || it->second.size() == 0) {
continue;
}
auto& metric_names = it->second;
const PAPI_component_info_t* component = PAPI_get_component_info(cidx);
if (component->disabled) {
std::string help_message = "";
switch (device.device_type) {
case kDLCPU:
help_message =
"Try setting `sudo sh -c 'echo 1 >/proc/sys/kernel/perf_event_paranoid'`";
break;
case kDLCUDA:
help_message =
"Try enabling gpu profiling with `modprobe nvidia "
"NVreg_RestrictProfilingToAdminUsers=0`. If that does not work, try adding "
"`options nvidia \"NVreg_RestrictProfilingToAdminUsers=0\"` to "
"`/etc/modprobe.d/nvidia-kernel-common.conf`.";
break;
default:
break;
}
LOG(WARNING) << "PAPI could not initialize counters for "
<< DLDeviceType2Str(device.device_type) << ": " << component->disabled_reason
<< "\n"
<< help_message;
continue;
}
int event_set = PAPI_NULL;
PAPI_CALL(PAPI_create_eventset(&event_set));
PAPI_CALL(PAPI_assign_eventset_component(event_set, cidx));
if (device.device_type == kDLCPU) {
// we set PAPI_INHERIT to make it so threads created after this inherit the event_set.
PAPI_option_t opt;
memset(&opt, 0x0, sizeof(PAPI_option_t));
opt.inherit.inherit = PAPI_INHERIT_ALL;
opt.inherit.eventset = event_set;
PAPI_CALL(PAPI_set_opt(PAPI_INHERIT, &opt));
}
if (static_cast<int>(metric_names.size()) > PAPI_num_cmp_hwctrs(cidx)) {
PAPI_CALL(PAPI_set_multiplex(event_set));
}
// add all the metrics
for (auto metric : metric_names) {
int e = PAPI_add_named_event(event_set, metric.c_str());
if (e != PAPI_OK) {
LOG(FATAL) << "PAPIError: " << e << " " << std::string(PAPI_strerror(e)) << ": " << metric
<< ".";
}
}
// Because we may have multiple calls in flight at the same time, we
// start all the timers when we initialize. Then we calculate the metrics
// counts for a call by comparing counter values at the start vs end of
// the call.
PAPI_CALL(PAPI_start(event_set));
event_sets[device] = event_set;
}
}
/*! \brief Called right before a function call. Reads starting values of the
* measured metrics.
*
* \param dev The device the function will be run on.
* \returns A `PAPIEventSetNode` containing values for the counters at the
* start of the call. Passed to a corresponding `Stop` call.
*/
ObjectRef Start(Device dev) final {
// Record counter values at the start of the call, so we can calculate the
// metrics for the call by comparing the values at the end of the call.
auto it = event_sets.find(dev);
if (it != event_sets.end()) {
int event_set = it->second;
std::vector<long_long> values(papi_metric_names[dev].size());
PAPI_CALL(PAPI_read(event_set, values.data()));
return ObjectRef(make_object<PAPIEventSetNode>(values, dev));
} else {
return ObjectRef(nullptr);
}
}
/*! \brief Called right after a function call. Reads ending values of the
* measured metrics. Computes the change in each metric from the
* corresponding `Start` call.
*
* \param obj `PAPIEventSetNode` created by a call to `Start`.
* \returns A mapping from metric name to value.
*/
Map<String, ObjectRef> Stop(ObjectRef obj) final {
const PAPIEventSetNode* event_set_node = obj.as<PAPIEventSetNode>();
std::vector<long_long> end_values(papi_metric_names[event_set_node->dev].size());
PAPI_CALL(PAPI_read(event_sets[event_set_node->dev], end_values.data()));
std::unordered_map<String, ObjectRef> reported_metrics;
for (size_t i = 0; i < end_values.size(); i++) {
if (end_values[i] < event_set_node->start_values[i]) {
LOG(WARNING) << "Detected overflow when reading performance counter, setting value to -1.";
reported_metrics[papi_metric_names[event_set_node->dev][i]] =
ObjectRef(make_object<CountNode>(-1));
} else {
reported_metrics[papi_metric_names[event_set_node->dev][i]] =
ObjectRef(make_object<CountNode>(end_values[i] - event_set_node->start_values[i]));
}
}
return reported_metrics;
}
~PAPIMetricCollectorNode() final {
for (auto p : event_sets) {
PAPI_CALL(PAPI_stop(p.second, NULL));
PAPI_CALL(PAPI_cleanup_eventset(p.second));
PAPI_CALL(PAPI_destroy_eventset(&p.second));
}
}
/*! \brief Device-specific event sets. Contains the running counters (the int values) for that
* device. */
std::unordered_map<Device, int> event_sets;
/*! \brief Device-specific metric names. Order of names matches the order in the corresponding
* `event_set`. */
std::unordered_map<Device, std::vector<std::string>> papi_metric_names;
static constexpr const char* _type_key = "runtime.profiling.PAPIMetricCollector";
TVM_DECLARE_FINAL_OBJECT_INFO(PAPIMetricCollectorNode, MetricCollectorNode);
};
/*! \brief Wrapper for `PAPIMetricCollectorNode`. */
class PAPIMetricCollector : public MetricCollector {
public:
explicit PAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics) {
data_ = make_object<PAPIMetricCollectorNode>(metrics);
}
TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PAPIMetricCollector, MetricCollector,
PAPIMetricCollectorNode);
};
MetricCollector CreatePAPIMetricCollector(Map<DeviceWrapper, Array<String>> metrics) {
return PAPIMetricCollector(metrics);
}
TVM_REGISTER_OBJECT_TYPE(PAPIEventSetNode);
TVM_REGISTER_OBJECT_TYPE(PAPIMetricCollectorNode);
TVM_REGISTER_GLOBAL("runtime.profiling.PAPIMetricCollector")
.set_body_typed([](Map<DeviceWrapper, Array<String>> metrics) {
return PAPIMetricCollector(metrics);
});
} // namespace profiling
} // namespace runtime
} // namespace tvm