Introduce Weighted UVM Caching Stats Report (pytorch#1623)
Summary:
Pull Request resolved: pytorch#1623

The previous cache stats report (e.g., cache miss rate, unique rate) didn't take the number of requested indices of each TBE op into consideration, which could easily yield a higher cache miss rate than the real one, especially with unevenly requested TBEs. Now we introduce the number of requested indices of each TBE op into the UVMInfo and calculate the weighted unique rate, cache miss rate, and conflict miss rate.

Differential Revision: D43729139

fbshipit-source-id: c4089b7fb5fb93586d389239bc694a346efa5222
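
A minimal standalone sketch (not part of this commit; all names and numbers are hypothetical) of why weighting by the number of requested indices matters: with two TBE ops of very different request sizes, averaging the per-op per-mille miss rates overstates the overall miss rate, while pooling the raw counters first, as the weighted report does, tracks the real ratio.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Hypothetical per-TBE counters: {num_req_indices, num_unique_misses}.
  std::vector<std::pair<int64_t, int64_t>> tbes = {{1000000, 10000}, {1000, 900}};

  double unweighted = 0.0;  // mean of per-TBE per-mille miss rates
  int64_t total_req = 0;
  int64_t total_miss = 0;
  for (const auto& [req, miss] : tbes) {
    unweighted += static_cast<double>(miss) * 1000 / req;
    total_req += req;
    total_miss += miss;
  }
  unweighted /= tbes.size();
  // Weighted rate: pool the counters across TBE ops before dividing.
  double weighted = static_cast<double>(total_miss) * 1000 / total_req;

  std::cout << "unweighted per-mille miss rate: " << unweighted << "\n";  // 455
  std::cout << "weighted per-mille miss rate: " << weighted << "\n";      // ~10.9
  return 0;
}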
YuzeDaiMeta authored and facebook-github-bot committed Mar 7, 2023
1 parent 936ec59 commit e36afd1
Showing 1 changed file with 73 additions and 16 deletions.
89 changes: 73 additions & 16 deletions fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp
@@ -9,6 +9,7 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/library.h>
#include <algorithm>
#include "c10/core/ScalarType.h"
#ifdef FBCODE_CAFFE2
#include "common/stats/Stats.h"
@@ -73,27 +74,64 @@ void process_uvm_cache_stats(
const int64_t total_cache_hash_size,
const int64_t call_count,
const bool gather_uvm_stats,
const Tensor& uvm_cache_stats) {
const Tensor& uvm_cache_stats,
const bool populate_uvm_stats) {
if (gather_uvm_stats) {
static std::mutex cache_mutex;

// uvm_cache_stats_counters is a vector of size 4, storing the cumulated
// cache stats. Each element represents a different counter:
// uvm_cache_stats_counters[0]: num_req_indices
// uvm_cache_stats_counters[1]: num_unique_indices
// uvm_cache_stats_counters[2]: num_unique_misses
// uvm_cache_stats_counters[3]: num_unique_conflict_misses
// They should be zeroed out after the calculated rates are populated into
// the cache counters.
static std::vector<int64_t> uvm_cache_stats_counters(4);

// Export cache stats.
auto uvm_cache_stats_cpu = uvm_cache_stats.cpu();
auto* uvm_cache_stats_ptr = uvm_cache_stats_cpu.data_ptr<int32_t>();
if (uvm_cache_stats_ptr[1] > 0) {
// Report cache stats in per-mille.
double num_requested_indices =
static_cast<double>(uvm_cache_stats_ptr[1]);
double unique_rate = static_cast<double>(uvm_cache_stats_ptr[2] * 1000) /
num_requested_indices;
double unique_miss_rate =
static_cast<double>(uvm_cache_stats_ptr[3] * 1000) /
num_requested_indices;
double unique_conflict_miss_rate =
static_cast<double>(uvm_cache_stats_ptr[4] * 1000) /
num_requested_indices;
STATS_tbe_uvm_cache_unique_rate.addValue(unique_rate);
STATS_tbe_uvm_cache_unique_miss_rate.addValue(unique_miss_rate);
STATS_tbe_uvm_cache_conflict_unique_miss_rate.addValue(
unique_conflict_miss_rate);
{
// Add cache stats values into the cumulated variables.
std::lock_guard<std::mutex> guard(cache_mutex);
std::transform(
uvm_cache_stats_counters.begin(),
uvm_cache_stats_counters.end(),
uvm_cache_stats_ptr + 1,
uvm_cache_stats_counters.begin(),
std::plus<int>());

// Calculate cache related ratios based on the cumulated numbers and
// push them into the counter pools.
if (populate_uvm_stats) {
double unique_rate = (uvm_cache_stats_counters[0] != 0)
? static_cast<double>(uvm_cache_stats_counters[1]) /
uvm_cache_stats_counters[0] * 1000
: 0;
double unique_miss_rate = (uvm_cache_stats_counters[0] != 0)
? static_cast<double>(uvm_cache_stats_counters[2]) /
uvm_cache_stats_counters[0] * 1000
: 0;
double unique_conflict_miss_rate = (uvm_cache_stats_counters[0] != 0)
? static_cast<double>(uvm_cache_stats_counters[3]) /
uvm_cache_stats_counters[0] * 1000
: 0;
STATS_tbe_uvm_cache_unique_rate.addValue(unique_rate);
STATS_tbe_uvm_cache_unique_miss_rate.addValue(unique_miss_rate);
STATS_tbe_uvm_cache_conflict_unique_miss_rate.addValue(
unique_conflict_miss_rate);

// Fill all the elements of the vector uvm_cache_stats_counters with 0
// to zero out the cumulated counters.
std::fill(
uvm_cache_stats_counters.begin(),
uvm_cache_stats_counters.end(),
0);
}
}
}
if (call_count % FLAGS_tbe_uvm_cache_stats_print_out_period == 0) {
LOG(INFO) << "$Stats [" << signature << "] "
@@ -358,6 +396,13 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function(
cache_hash_size_cumsum.value(), indices, offsets);

bool gather_uvm_stats = false;
// populate_uvm_stats indicates whether to calculate cache-related ratios
// from the cumulated counters and populate them into the cache stats pools
// to get the percentile stats. We want to calculate the weighted cache
// ratios, taking the # req indices of each TBE as the weight, so we only
// populate stats when we think the current lookup is the last TBE call of
// the same round.
bool populate_uvm_stats = true;
Tensor uvm_cache_stats =
at::empty({0}, lxu_cache_weights.value().options().dtype(at::kInt));
#ifdef FBCODE_CAFFE2
@@ -370,6 +415,17 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function(
}
tbe_call_count[signature]++;
call_count = tbe_call_count[signature];

// populate_uvm_stats is used as an indicator of whether to push the
// cache-related ratios calculated from the cumulative counters into the
// cache stats pools. We want to wait until all the known TBE ops' data has
// been included to get the weighted ratios.
for (const auto& [sig, count] : tbe_call_count) {
if (count < call_count) {
populate_uvm_stats = false;
break;
}
}
}

if (FLAGS_tbe_uvm_cache_stat_report > 0 &&
@@ -413,7 +469,8 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function(
total_cache_hash_size.value(),
call_count,
gather_uvm_stats,
uvm_cache_stats);
uvm_cache_stats,
populate_uvm_stats);
#endif
}

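As a reading aid (not part of the commit; the standalone flow below is an assumption for illustration), a self-contained sketch of the round-based gating added above: each lookup bumps a per-signature call count, and the cumulated ratios are only pushed into the stat pools when no other known TBE op is lagging behind, i.e. when the current lookup closes out the round.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Hypothetical per-signature call counts, mirroring tbe_call_count above.
std::map<std::string, int64_t> tbe_call_count;

bool should_populate(const std::string& signature) {
  const int64_t call_count = ++tbe_call_count[signature];
  // Populate only if every known TBE op has reached this call count,
  // i.e. the current lookup is the last TBE call of the round.
  for (const auto& [sig, count] : tbe_call_count) {
    if (count < call_count) {
      return false;
    }
  }
  return true;
}

int main() {
  // Two TBE ops per round: stats are populated only when both have reported.
  std::cout << should_populate("tbe_a") << "\n";  // 1: only one op known so far
  std::cout << should_populate("tbe_b") << "\n";  // 1: both ops now at count 1
  std::cout << should_populate("tbe_a") << "\n";  // 0: tbe_b still behind
  std::cout << should_populate("tbe_b") << "\n";  // 1: round complete again
  return 0;
}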
