Skip to content

Commit

Permalink
Add per-cpu aggregations for min/max (#3226)
Browse files Browse the repository at this point in the history
Similar to what was done for `count` and `sum`,
allow `min` and `max` to be used in expressions
and properly aggregated in map `for` loops.

Co-authored-by: Jordan Rome <linux@jordanrome.com>
  • Loading branch information
jordalgo and Jordan Rome committed Jun 10, 2024
1 parent 32e75a5 commit 12ee626
Show file tree
Hide file tree
Showing 17 changed files with 1,196 additions and 49 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ and this project adheres to
- [#3158](https://github.com/bpftrace/bpftrace/pull/3158)
- Add ability to attach uprobes to inlined functions
- [#3095](https://github.com/bpftrace/bpftrace/pull/3095)
- Enable count/sum map reads in kernel space (implicit casting)
- Enable count, sum, min, and max map reads in kernel space (implicit casting)
- [#3189](https://github.com/bpftrace/bpftrace/pull/3189)
- [#3226](https://github.com/bpftrace/bpftrace/pull/3226)
#### Changed
- Better error message for args in mixed probes
- [#3047](https://github.com/bpftrace/bpftrace/pull/3047)
Expand Down
2 changes: 2 additions & 0 deletions man/adoc/bpftrace.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -2545,6 +2545,7 @@ Prints:
* `max(int64 n)`

Update the map with `n` if `n` is bigger than the current value held.
Similar to `count`, this uses a PER_CPU map (fast writes, slow reads).

[#map-functions-min]
=== min
Expand All @@ -2553,6 +2554,7 @@ Update the map with `n` if `n` is bigger than the current value held.
* `min(int64 n)`

Update the map with `n` if `n` is smaller than the current value held.
Similar to `count`, this uses a PER_CPU map (fast writes, slow reads).

[#map-functions-stats]
=== stats
Expand Down
74 changes: 61 additions & 13 deletions src/ast/irbuilderbpf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -480,14 +480,15 @@ Value *IRBuilderBPF::CreateMapLookupElem(Value *ctx,
return ret;
}

Value *IRBuilderBPF::CreatePerCpuMapSumElems(Value *ctx,
Value *IRBuilderBPF::CreatePerCpuMapAggElems(Value *ctx,
Map &map,
Value *key,
const SizedType &type,
const location &loc,
bool is_aot)
{
/*
* int sum = 0;
* int ret = 0;
* int i = 0;
* while (i < nr_cpus) {
* int * cpu_value = map_lookup_percpu_elem(map, key, i);
Expand All @@ -498,25 +499,25 @@ Value *IRBuilderBPF::CreatePerCpuMapSumElems(Value *ctx,
* debug("No cpu found for cpu id: %lu", i) // Mostly for AOT
* break;
* }
* sum += *cpu_value;
* // Get the sum, min, or max value
* i++;
* }
* return sum;
* return ret;
*/

assert(ctx && ctx->getType() == GET_PTR_TY());

const std::string &map_name = map.ident;

AllocaInst *sum = CreateAllocaBPF(getInt64Ty(), "sum");
AllocaInst *ret = CreateAllocaBPF(getInt64Ty(), "ret");
AllocaInst *i = CreateAllocaBPF(getInt32Ty(), "i");

// Set a large upper bound if we don't know the number of cpus
// when generating the instructions
int nr_cpus = is_aot ? 1024 : bpftrace_.get_num_possible_cpus();

CreateStore(getInt32(0), i);
CreateStore(getInt64(0), sum);
CreateStore(getInt64(0), ret);

Function *parent = GetInsertBlock()->getParent();
BasicBlock *while_cond = BasicBlock::Create(module_.getContext(),
Expand Down Expand Up @@ -560,10 +561,16 @@ Value *IRBuilderBPF::CreatePerCpuMapSumElems(Value *ctx,
SetInsertPoint(lookup_success_block);
// createMapLookup returns an u8*
auto *cast = CreatePointerCast(call, getInt64Ty()->getPointerTo(), "cast");
// sum += cpu_value;
CreateStore(CreateAdd(CreateLoad(getInt64Ty(), cast),
CreateLoad(getInt64Ty(), sum)),
sum);

if (type.IsSumTy() || type.IsCountTy()) {
createPerCpuSum(ret, cast);
} else if (type.IsMaxTy()) {
createPerCpuMinMax(ret, cast, true);
} else if (type.IsMinTy()) {
createPerCpuMinMax(ret, cast, false);
} else {
LOG(BUG) << "Unsupported map aggregation type: " << type;
}

// ++i;
CreateStore(CreateAdd(CreateLoad(getInt32Ty(), i), getInt32(1)), i);
Expand Down Expand Up @@ -603,9 +610,50 @@ Value *IRBuilderBPF::CreatePerCpuMapSumElems(Value *ctx,
SetInsertPoint(while_end);

CreateLifetimeEnd(i);
Value *ret = CreateLoad(getInt64Ty(), sum);
CreateLifetimeEnd(sum);
return ret;
Value *ret_reg = CreateLoad(getInt64Ty(), ret);
CreateLifetimeEnd(ret);
return ret_reg;
}

// Emits IR equivalent to `*ret += *cpu_value`, folding one per-CPU slot into
// the running total held in `ret`.
//
// ret:       i64 stack slot holding the accumulator; it is read and then
//            overwritten with the new total.
// cpu_value: pointer to the i64 value of the CPU currently being visited.
//
// The two loads are emitted as separate statements on purpose: C++ does not
// specify the evaluation order of function arguments, so nesting both
// CreateLoad() calls inside CreateAdd() lets the compiler decide which load
// instruction is emitted first. The expected IR in
// tests/codegen/llvm/count_cast.ll loads the accumulator first and then the
// per-CPU value, so that order is pinned here.
void IRBuilderBPF::createPerCpuSum(AllocaInst *ret, Value *cpu_value)
{
  Value *running_total = CreateLoad(getInt64Ty(), ret);
  Value *slot_value = CreateLoad(getInt64Ty(), cpu_value);

  // ret = cpu_value + ret;
  CreateStore(CreateAdd(slot_value, running_total), ret);
}

// Emits IR that replaces `*ret` with `*cpu_value` when the per-CPU value is
// strictly greater (is_max) or strictly smaller (!is_max) than the value
// currently held in `ret`. Both values are compared as signed 64-bit integers.
//
// ret:       i64 stack slot holding the running min/max; conditionally
//            overwritten.
// cpu_value: pointer to the i64 value of the CPU currently being visited.
// is_max:    true selects the "max" comparison, false selects "min".
//
// Two basic blocks ("min_max_success" and "min_max_merge") are appended to the
// current function; on return the insert point is the merge block.
//
// NOTE(review): CreatePerCpuMapAggElems stores 0 into `ret` before looping, so
// a min over strictly positive slots (or a max over strictly negative ones)
// looks like it would stay at 0 unless the stored representation compensates
// for that. Confirm against the callers and the map encoding.
void IRBuilderBPF::createPerCpuMinMax(AllocaInst *ret,
                                      Value *cpu_value,
                                      bool is_max)
{
  Function *parent = GetInsertBlock()->getParent();
  BasicBlock *success_block = BasicBlock::Create(module_.getContext(),
                                                 "min_max_success",
                                                 parent);
  BasicBlock *merge_block = BasicBlock::Create(module_.getContext(),
                                               "min_max_merge",
                                               parent);

  // Emit the loads as separate statements: argument evaluation order is
  // unspecified in C++, so nesting them inside the compare would make the
  // order of the emitted load instructions depend on the compiler. The
  // accumulator is loaded first, mirroring the order the sum path shows in
  // tests/codegen/llvm/count_cast.ll.
  // NOTE(review): confirm this order against the min/max golden IR files.
  Value *current = CreateLoad(getInt64Ty(), ret);
  Value *candidate = CreateLoad(getInt64Ty(), cpu_value);

  Value *condition = is_max
                         ? CreateICmpSGT(candidate, current, "max_cond")
                         : CreateICmpSLT(candidate, current, "min_cond");
  CreateCondBr(condition, success_block, merge_block);

  SetInsertPoint(success_block);

  // ret = cpu_value;
  // The slot is loaded again here (instead of reusing `candidate`) so the set
  // of emitted instructions stays the same as before.
  CreateStore(CreateLoad(getInt64Ty(), cpu_value), ret);

  CreateBr(merge_block);

  SetInsertPoint(merge_block);
}

void IRBuilderBPF::CreateMapUpdateElem(Value *ctx,
Expand Down
7 changes: 5 additions & 2 deletions src/ast/irbuilderbpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ class IRBuilderBPF : public IRBuilder<> {
Value *key,
SizedType &type,
const location &loc);

Value *CreatePerCpuMapSumElems(Value *ctx,
Value *CreatePerCpuMapAggElems(Value *ctx,
Map &map,
Value *key,
const SizedType &type,
const location &loc,
bool is_aot);
void CreateMapUpdateElem(Value *ctx,
Expand Down Expand Up @@ -338,6 +338,9 @@ class IRBuilderBPF : public IRBuilder<> {
size_t size,
const location *loc = nullptr);

void createPerCpuSum(AllocaInst *ret, Value *cpu_value);
void createPerCpuMinMax(AllocaInst *ret, Value *cpu_value, bool is_max);

std::map<std::string, StructType *> structs_;
};

Expand Down
8 changes: 5 additions & 3 deletions src/ast/passes/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1425,7 +1425,8 @@ void CodegenLLVM::visit(Map &map)
const auto &val_type = map_info->second.value_type;
Value *value;
if (canAggPerCpuMapElems(val_type, map_info->second.key)) {
value = b_.CreatePerCpuMapSumElems(ctx_, map, key, map.loc, is_aot_);
value = b_.CreatePerCpuMapAggElems(
ctx_, map, key, val_type, map.loc, is_aot_);
} else {
value = b_.CreateMapLookupElem(ctx_, map, key, map.loc);
}
Expand Down Expand Up @@ -4217,13 +4218,14 @@ Function *CodegenLLVM::createForEachMapCallback(
auto &val_type = decl.type.GetField(1).type;
Value *val = callback->getArg(2);

auto map_val_type = map_info->second.value_type;
const auto &map_val_type = map_info->second.value_type;
if (canAggPerCpuMapElems(map_val_type, map_info->second.key)) {
AllocaInst *key_ptr = b_.CreateAllocaBPF(b_.GetType(key_type),
"lookup_key");
b_.CreateStore(key, key_ptr);

val = b_.CreatePerCpuMapSumElems(ctx_, map, key_ptr, map.loc, is_aot_);
val = b_.CreatePerCpuMapAggElems(
ctx_, map, key_ptr, map_val_type, map.loc, is_aot_);
} else if (!onStack(val_type)) {
val = b_.CreateLoad(b_.GetType(val_type), val, "val");
}
Expand Down
3 changes: 2 additions & 1 deletion src/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,8 @@ class SizedType {
}
bool IsCastableMapTy() const
{
return type_ == Type::count || type_ == Type::sum;
return type_ == Type::count || type_ == Type::sum || type_ == Type::max ||
type_ == Type::min;
}

friend std::ostream &operator<<(std::ostream &, const SizedType &);
Expand Down
14 changes: 7 additions & 7 deletions tests/codegen/llvm/count_cast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ entry:
%key = alloca i32, align 4
%print_integer_8_t = alloca %print_integer_8_t, align 8
%i = alloca i32, align 4
%sum = alloca i64, align 8
%ret = alloca i64, align 8
%"@x_key1" = alloca i64, align 8
%initial_value = alloca i64, align 8
%lookup_elem_val = alloca i64, align 8
Expand Down Expand Up @@ -59,12 +59,12 @@ lookup_merge: ; preds = %lookup_failure, %lo
%9 = bitcast i64* %"@x_key1" to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %9)
store i64 0, i64* %"@x_key1", align 8
%10 = bitcast i64* %sum to i8*
%10 = bitcast i64* %ret to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %10)
%11 = bitcast i32* %i to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %11)
store i32 0, i32* %i, align 4
store i64 0, i64* %sum, align 8
store i64 0, i64* %ret, align 8
br label %while_cond

if_body: ; preds = %while_end
Expand Down Expand Up @@ -100,8 +100,8 @@ while_body: ; preds = %while_cond
while_end: ; preds = %error_failure, %error_success, %while_cond
%20 = bitcast i32* %i to i8*
call void @llvm.lifetime.end.p0i8(i64 -1, i8* %20)
%21 = load i64, i64* %sum, align 8
%22 = bitcast i64* %sum to i8*
%21 = load i64, i64* %ret, align 8
%22 = bitcast i64* %ret to i8*
call void @llvm.lifetime.end.p0i8(i64 -1, i8* %22)
%23 = bitcast i64* %"@x_key1" to i8*
call void @llvm.lifetime.end.p0i8(i64 -1, i8* %23)
Expand All @@ -112,10 +112,10 @@ while_end: ; preds = %error_failure, %err

lookup_success2: ; preds = %while_body
%cast5 = bitcast i8* %lookup_percpu_elem to i64*
%26 = load i64, i64* %sum, align 8
%26 = load i64, i64* %ret, align 8
%27 = load i64, i64* %cast5, align 8
%28 = add i64 %27, %26
store i64 %28, i64* %sum, align 8
store i64 %28, i64* %ret, align 8
%29 = load i32, i32* %i, align 4
%30 = add i32 %29, 1
store i32 %30, i32* %i, align 4
Expand Down
14 changes: 7 additions & 7 deletions tests/codegen/llvm/count_cast_loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,18 @@ define internal i64 @map_for_each_cb(i8* %0, i8* %1, i8* %2, i8* %3) section ".t
%tuple = alloca %"unsigned int64_count__tuple_t", align 8
%"$kv" = alloca %"unsigned int64_count__tuple_t", align 8
%i = alloca i32, align 4
%sum = alloca i64, align 8
%ret = alloca i64, align 8
%lookup_key = alloca i64, align 8
%key = load i64, i8* %1, align 8
%5 = bitcast i64* %lookup_key to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %5)
store i64 %key, i64* %lookup_key, align 8
%6 = bitcast i64* %sum to i8*
%6 = bitcast i64* %ret to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %6)
%7 = bitcast i32* %i to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %7)
store i32 0, i32* %i, align 4
store i64 0, i64* %sum, align 8
store i64 0, i64* %ret, align 8
br label %while_cond

while_cond: ; preds = %lookup_success, %4
Expand All @@ -96,8 +96,8 @@ while_body: ; preds = %while_cond
while_end: ; preds = %error_failure, %error_success, %while_cond
%10 = bitcast i32* %i to i8*
call void @llvm.lifetime.end.p0i8(i64 -1, i8* %10)
%11 = load i64, i64* %sum, align 8
%12 = bitcast i64* %sum to i8*
%11 = load i64, i64* %ret, align 8
%12 = bitcast i64* %ret to i8*
call void @llvm.lifetime.end.p0i8(i64 -1, i8* %12)
%13 = bitcast %"unsigned int64_count__tuple_t"* %"$kv" to i8*
call void @llvm.lifetime.start.p0i8(i64 -1, i8* %13)
Expand Down Expand Up @@ -137,10 +137,10 @@ while_end: ; preds = %error_failure, %err

lookup_success: ; preds = %while_body
%cast = bitcast i8* %lookup_percpu_elem to i64*
%32 = load i64, i64* %sum, align 8
%32 = load i64, i64* %ret, align 8
%33 = load i64, i64* %cast, align 8
%34 = add i64 %33, %32
store i64 %34, i64* %sum, align 8
store i64 %34, i64* %ret, align 8
%35 = load i32, i32* %i, align 4
%36 = add i32 %35, 1
store i32 %36, i32* %i, align 4
Expand Down
Loading

0 comments on commit 12ee626

Please sign in to comment.