Skip to content

Commit

Permalink
feature(proctree): start the process tree data structure
Browse files Browse the repository at this point in the history
- Implement a "process tree" based on processes and threads hashes.
- Make sure entries are limited (per LRU map hash:Process).
- Update tree from fork, exec and exit signal events.
- Feed the process tree from procfs on initialization.
- Display the process tree periodically for debugging purposes.
- Add timestamps to process signal events.
- Functions capable of transforming jiffies and ticks into ns and Time.
- Adjust hash to userland timestamp precision.
  • Loading branch information
rafaeldtinoco committed Sep 13, 2023
1 parent 8bef843 commit 8f06bbe
Show file tree
Hide file tree
Showing 22 changed files with 2,001 additions and 222 deletions.
66 changes: 47 additions & 19 deletions pkg/changelog/changelog.go
Expand Up @@ -3,43 +3,65 @@ package changelog
import "time"

type item[T any] struct {
stamp time.Time // timestamp of the change
value T // value of the change
timestamp time.Time // timestamp of the change
value T // value of the change
}

// The changelog package provides a changelog data structure. It is a list of changes, each with a
// timestamp. The changelog can be queried for the value at a given time.

// ATTENTION: You should use Changelog within a struct and provide methods to access it,
// coordinating access through your struct mutexes. DO NOT EXPOSE the changelog object directly to
// the outside world as it is not thread-safe.

type Changelog[T any] struct {
changes []item[T] // list of changes
}

// Getters

// GetCurrent: Observation on single element changelog.
//
// If there's one element in the changelog, after the loop, left would be set to 1 if the single
// timestamp is before the targetTime, and 0 if it's equal or after.
//
// BEFORE: If the single timestamp is before the targetTime, when we return
// clv.changes[left-1].value, returns clv.changes[0].value, which is the expected behavior.
//
// AFTER: If the single timestamp is equal to, or after the targetTime, the current logic would
// return a "zero" value because of the condition if left == 0.
//
// We need to find the last change that occurred before or exactly at the targetTime. The binary
// search loop finds the position where a new entry with the targetTime timestamp would be inserted
// to maintain chronological order:
//
// This position is stored in "left".
//
// So, to get the last entry that occurred before the targetTime, we need to access the previous
// position, which is left-1.
//
// GetCurrent returns the latest value of the changelog.
func (clv *Changelog[T]) GetCurrent() T {
if len(clv.changes) == 0 {
var zero T
return zero
return returnZero[T]()
}

return clv.changes[len(clv.changes)-1].value
}

// Get returns the value of the changelog at the given time.
func (clv *Changelog[T]) Get(targetTime time.Time) T {
left, right := 0, len(clv.changes)

for left < right {
middle := (left + right) / 2
if clv.changes[middle].stamp.Before(targetTime) {
left = middle + 1
} else {
right = middle
}
if len(clv.changes) == 0 {
return returnZero[T]()
}
if left == 0 {

idx := clv.findIndex(targetTime)
if idx == 0 {
var zero T
return zero
}

return clv.changes[left-1].value
return clv.changes[idx-1].value
}

// GetAll returns all the values of the changelog.
Expand Down Expand Up @@ -68,11 +90,11 @@ func (clv *Changelog[T]) Set(value T, targetTime time.Time) {
// setAt sets the value of the changelog at the given time.
func (clv *Changelog[T]) setAt(value T, targetTime time.Time) {
entry := item[T]{
stamp: targetTime,
value: value,
timestamp: targetTime,
value: value,
}

idx := clv.findIndex(entry.stamp)
idx := clv.findIndex(entry.timestamp)
clv.changes = append(clv.changes, item[T]{})
copy(clv.changes[idx+1:], clv.changes[idx:])
clv.changes[idx] = entry
Expand All @@ -84,7 +106,7 @@ func (clv *Changelog[T]) findIndex(target time.Time) int {

for left < right {
middle := (left + right) / 2
if clv.changes[middle].stamp.Before(target) {
if clv.changes[middle].timestamp.Before(target) {
left = middle + 1
} else {
right = middle
Expand All @@ -93,3 +115,9 @@ func (clv *Changelog[T]) findIndex(target time.Time) int {

return left
}

// returnZero returns the zero value of the type T.
func returnZero[T any]() T {
var zero T
return zero
}
12 changes: 12 additions & 0 deletions pkg/ebpf/c/common/hash.h
Expand Up @@ -94,4 +94,16 @@ u32 hash_u32_and_u64(u32 arg1, u64 arg2)
return murmur32(buffer, 4 + 8); // 4 + 8 = sizeof(u32) + sizeof(u64)
}

// hash_task_id is a wrapper, around HashU32AndU64, that rounds up the timestamp argument to the
// precision userland will obtain from the procfs (since start_time is measured in clock ticks).
// This is needed so the process tree can be updated by procfs readings as well. The userland
// precision is defined by USER_HZ, which is 100HZ in almost all cases (untrue for embedded systems

u32 hash_task_id(u32 arg1, u64 arg2)
{
u64 round = arg2 / 10000000LL; // (1000000000 / USER_HZ) = 10000000
round *= 10000000LL;
return hash_u32_and_u64(arg1, round);
}

#endif
5 changes: 5 additions & 0 deletions pkg/ebpf/c/common/task.h
Expand Up @@ -164,6 +164,11 @@ statfunc struct task_struct *get_parent_task(struct task_struct *task)
return BPF_CORE_READ(task, real_parent);
}

statfunc struct task_struct *get_leader_task(struct task_struct *task)
{
return BPF_CORE_READ(task, group_leader);
}

statfunc u32 get_task_exit_code(struct task_struct *task)
{
return BPF_CORE_READ(task, exit_code);
Expand Down
167 changes: 99 additions & 68 deletions pkg/ebpf/c/tracee.bpf.c
Expand Up @@ -6082,46 +6082,86 @@ int sched_process_fork_signal(struct bpf_raw_tracepoint_args *ctx)
if (unlikely(signal == NULL))
return 0;

struct task_struct *parent = (struct task_struct *) ctx->args[0];
struct task_struct *leader = BPF_CORE_READ(parent, group_leader);

// Pick the leader (and not LWPs) as parent to construct the process tree:

u64 parent_starttime = get_task_start_time(leader);
int parent_pid = get_task_host_pid(leader);
int parent_tgid = get_task_host_tgid(leader);
int parent_ns_pid = get_task_ns_pid(leader);
int parent_ns_tgid = get_task_ns_tgid(leader);

struct task_struct *child = (struct task_struct *) ctx->args[1];
struct task_struct *leader = get_leader_task(child);
struct task_struct *parent = get_leader_task(get_parent_task(leader));

u64 child_starttime = get_task_start_time(child);
int child_pid = get_task_host_pid(child);
int child_tgid = get_task_host_tgid(child);
int child_ns_pid = get_task_ns_pid(child);
int child_ns_tgid = get_task_ns_tgid(child);
// In the Linux kernel:
//
// Every task (a process or a thread) is represented by a `task_struct`:
//
// - `pid`: Inside the `task_struct`, there's a field called `pid`. This is a unique identifier
// for every task, which can be thought of as the thread ID (TID) from a user space
// perspective. Every task, whether it's the main thread of a process or an additional thread,
// has a unique `pid`.
//
// - `tgid` (Thread Group ID): This field in the `task_struct` is used to group threads from the
// same process. For the main thread of a process, the `tgid` is the same as its `pid`. For
// other threads created by that process, the `tgid` matches the `pid` of the main thread.
//
// In userspace:
//
// - `getpid()` returns the TGID, effectively the traditional process ID.
// - `gettid()` returns the PID (from the `task_struct`), effectively the thread ID.
//
// This design in the Linux kernel leads to a unified handling of processes and threads. In the
// kernel's view, every thread is a task with potentially shared resources, but each has a
// unique PID. In user space, the distinction is made where processes have a unique PID, and
// threads within those processes have unique TIDs.

// Summary:
// userland pid = kernel tgid
// userland tgid = kernel pid

u64 parent_starttime = get_task_start_time(parent);
int parent_pid = get_task_host_tgid(parent);
int parent_tid = get_task_host_pid(parent);
int parent_ns_pid = get_task_ns_tgid(parent);
int parent_ns_tid = get_task_ns_pid(parent);

u64 leader_starttime = get_task_start_time(leader);
int leader_pid = get_task_host_tgid(leader);
int leader_tid = get_task_host_pid(leader);
int leader_ns_pid = get_task_ns_tgid(leader);
int leader_ns_tid = get_task_ns_pid(leader);

// Hashes
u64 child_starttime = get_task_start_time(child);
int child_pid = get_task_host_tgid(child);
int child_tid = get_task_host_pid(child);
int child_ns_pid = get_task_ns_tgid(child);
int child_ns_tid = get_task_ns_pid(child);

u32 task_hash = hash_u32_and_u64(get_task_host_tgid(child), child_starttime);
u32 parent_hash = hash_u32_and_u64(get_task_host_tgid(parent), parent_starttime);
save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 0);
save_to_submit_buf(&signal->args_buf, (void *) &parent_hash, sizeof(u32), 1);
// The event timestamp, so process tree info can be changelog'ed.
u64 timestamp = bpf_ktime_get_ns();
save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0);

// Fork logic
// The hash is always calculated with "task_struct->pid + start_time".
u32 task_hash = hash_task_id(child_tid, child_starttime);
u32 parent_hash = hash_task_id(parent_tid, parent_starttime);
u32 leader_hash = hash_task_id(leader_tid, leader_starttime);

// hashes
save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 1);
save_to_submit_buf(&signal->args_buf, (void *) &parent_hash, sizeof(u32), 2);
save_to_submit_buf(&signal->args_buf, (void *) &leader_hash, sizeof(u32), 3);
// parent
save_to_submit_buf(&signal->args_buf, (void *) &parent_tgid, sizeof(int), 2);
save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_tgid, sizeof(int), 3);
save_to_submit_buf(&signal->args_buf, (void *) &parent_pid, sizeof(int), 4);
save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_pid, sizeof(int), 5);
save_to_submit_buf(&signal->args_buf, (void *) &parent_starttime, sizeof(u64), 6);
save_to_submit_buf(&signal->args_buf, (void *) &parent_tid, sizeof(int), 4);
save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_tid, sizeof(int), 5);
save_to_submit_buf(&signal->args_buf, (void *) &parent_pid, sizeof(int), 6);
save_to_submit_buf(&signal->args_buf, (void *) &parent_ns_pid, sizeof(int), 7);
save_to_submit_buf(&signal->args_buf, (void *) &parent_starttime, sizeof(u64), 8);
// leader
save_to_submit_buf(&signal->args_buf, (void *) &leader_tid, sizeof(int), 9);
save_to_submit_buf(&signal->args_buf, (void *) &leader_ns_tid, sizeof(int), 10);
save_to_submit_buf(&signal->args_buf, (void *) &leader_pid, sizeof(int), 11);
save_to_submit_buf(&signal->args_buf, (void *) &leader_ns_pid, sizeof(int), 12);
save_to_submit_buf(&signal->args_buf, (void *) &leader_starttime, sizeof(u64), 13);
// child
save_to_submit_buf(&signal->args_buf, (void *) &child_tgid, sizeof(int), 7);
save_to_submit_buf(&signal->args_buf, (void *) &child_ns_tgid, sizeof(int), 8);
save_to_submit_buf(&signal->args_buf, (void *) &child_pid, sizeof(int), 9);
save_to_submit_buf(&signal->args_buf, (void *) &child_ns_pid, sizeof(int), 10);
save_to_submit_buf(&signal->args_buf, (void *) &child_starttime, sizeof(u64), 11);
save_to_submit_buf(&signal->args_buf, (void *) &child_tid, sizeof(int), 14);
save_to_submit_buf(&signal->args_buf, (void *) &child_ns_tid, sizeof(int), 15);
save_to_submit_buf(&signal->args_buf, (void *) &child_pid, sizeof(int), 16);
save_to_submit_buf(&signal->args_buf, (void *) &child_ns_pid, sizeof(int), 17);
save_to_submit_buf(&signal->args_buf, (void *) &child_starttime, sizeof(u64), 18);

signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_FORK);

Expand All @@ -6146,16 +6186,14 @@ int sched_process_exec_signal(struct bpf_raw_tracepoint_args *ctx)
struct task_struct *task = (struct task_struct *) ctx->args[0];
if (task == NULL)
return -1;
struct task_struct *parent = get_parent_task(task);
if (parent == NULL)
return -1;
struct task_struct *leader = BPF_CORE_READ(parent, group_leader);
if (leader == NULL)
return -1;

// Hash is always calculated with TID + START_TIME, for processes PID == TID
u32 task_hash = hash_u32_and_u64(get_task_host_tgid(task), get_task_start_time(task));
save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 0);
// The event timestamp, so process tree info can be changelog'ed.
u64 timestamp = bpf_ktime_get_ns();
save_to_submit_buf(&signal->args_buf, &timestamp, sizeof(u64), 0);

// The hash is always calculated with "task_struct->pid + start_time".
u32 task_hash = hash_task_id(get_task_host_tgid(task), get_task_start_time(task));
save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 1);

// Exec logic

Expand All @@ -6177,18 +6215,18 @@ int sched_process_exec_signal(struct bpf_raw_tracepoint_args *ctx)
u64 ctime = get_ctime_nanosec_from_file(file);
umode_t inode_mode = get_inode_mode_from_file(file);

save_str_to_buf(&signal->args_buf, (void *) filename, 1); // cmdpath
save_str_to_buf(&signal->args_buf, file_path, 2); // pathname
save_to_submit_buf(&signal->args_buf, &s_dev, sizeof(dev_t), 3); // dev
save_to_submit_buf(&signal->args_buf, &inode_nr, sizeof(unsigned long), 4); // inode
save_to_submit_buf(&signal->args_buf, &ctime, sizeof(u64), 5); // ctime
save_to_submit_buf(&signal->args_buf, &inode_mode, sizeof(umode_t), 6); // inode_mode
save_str_to_buf(&signal->args_buf, (void *) filename, 2); // cmdpath
save_str_to_buf(&signal->args_buf, file_path, 3); // pathname
save_to_submit_buf(&signal->args_buf, &s_dev, sizeof(dev_t), 4); // dev
save_to_submit_buf(&signal->args_buf, &inode_nr, sizeof(unsigned long), 5); // inode
save_to_submit_buf(&signal->args_buf, &ctime, sizeof(u64), 6); // ctime
save_to_submit_buf(&signal->args_buf, &inode_mode, sizeof(umode_t), 7); // inode_mode

// The proc_info interpreter field is set by "load_elf_phdrs" kprobe program.
save_str_to_buf(&signal->args_buf, &proc_info->interpreter.pathname, 7); // interpreter_pathname
save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 8); // interpreter_dev
save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.inode, sizeof(unsigned long), 9); // interpreter_inode
save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.ctime, sizeof(u64), 10); // interpreter_ctime
save_str_to_buf(&signal->args_buf, &proc_info->interpreter.pathname, 8); // interpreter_pathname
save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.device, sizeof(dev_t), 9); // interpreter_dev
save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.inode, sizeof(unsigned long), 10); // interpreter_inode
save_to_submit_buf(&signal->args_buf, &proc_info->interpreter.id.ctime, sizeof(u64), 11); // interpreter_ctime

struct mm_struct *mm = get_mm_from_task(task); // bprm->mm is null here, but task->mm is not

Expand All @@ -6206,11 +6244,11 @@ int sched_process_exec_signal(struct bpf_raw_tracepoint_args *ctx)
if (get_task_parent_flags(task) & PF_KTHREAD)
invoked_from_kernel = 1;

save_args_str_arr_to_buf(&signal->args_buf, (void *) arg_start, (void *) arg_end, argc, 11); // argv
save_str_to_buf(&signal->args_buf, (void *) interp, 12); // interp
save_to_submit_buf(&signal->args_buf, &stdin_type, sizeof(unsigned short), 13); // stdin_type
save_str_to_buf(&signal->args_buf, stdin_path, 14); // stdin_path
save_to_submit_buf(&signal->args_buf, &invoked_from_kernel, sizeof(int), 15); // invoked_from_kernel
save_args_str_arr_to_buf(&signal->args_buf, (void *) arg_start, (void *) arg_end, argc, 12); // argv
save_str_to_buf(&signal->args_buf, (void *) interp, 13); // interp
save_to_submit_buf(&signal->args_buf, &stdin_type, sizeof(unsigned short), 14); // stdin_type
save_str_to_buf(&signal->args_buf, stdin_path, 15); // stdin_path
save_to_submit_buf(&signal->args_buf, &invoked_from_kernel, sizeof(int), 16); // invoked_from_kernel

signal_perf_submit(ctx, signal, SIGNAL_SCHED_PROCESS_EXEC);

Expand All @@ -6228,23 +6266,16 @@ int sched_process_exit_signal(struct bpf_raw_tracepoint_args *ctx)

struct task_struct *task = (struct task_struct *) bpf_get_current_task();

// hashes

u64 id = bpf_get_current_pid_tgid();
u32 host_tid = id;
u32 host_pid = id >> 32;
u64 start_time = get_task_start_time(task);

struct task_struct *parent = get_parent_task(task);
if (parent == NULL)
return -1;
struct task_struct *leader = BPF_CORE_READ(parent, group_leader);
if (leader == NULL)
return -1;

u32 task_hash = hash_u32_and_u64(get_task_host_tgid(task), get_task_start_time(task));
save_to_submit_buf(&signal->args_buf, (void *) &task_hash, sizeof(u32), 0);
// The hash is always calculated with "task_struct->pid + start_time".
u32 hash = hash_task_id(host_tid, start_time);
save_to_submit_buf(&signal->args_buf, (void *) &hash, sizeof(u32), 0);

// exit logic
// Exit logic.

bool group_dead = false;
struct signal_struct *s = BPF_CORE_READ(task, signal);
Expand Down
17 changes: 17 additions & 0 deletions pkg/ebpf/controlplane/controller.go
Expand Up @@ -60,6 +60,23 @@ func (ctrl *Controller) Start() {

func (ctrl *Controller) Run(ctx context.Context) {
ctrl.ctx = ctx

// DEBUG: uncomment to print process tree periodically (for debugging purposes)
// go func() {
// for {
// time.Sleep(5 * time.Second)
// fmt.Printf("%s", ctrl.processTree)
// }
// }()

// TODO: Should tracee run the FeedFromProcFS periodically?
go func() {
err := ctrl.processTree.FeedFromProcFS()
if err != nil {
logger.Debugw("error feeding process tree from procfs", "error", err)
}
}()

for {
select {
case signalData := <-ctrl.signalChan:
Expand Down

0 comments on commit 8f06bbe

Please sign in to comment.