-
Notifications
You must be signed in to change notification settings - Fork 394
/
sorting.go
302 lines (288 loc) · 11.6 KB
/
sorting.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
// Package sorting is responsible for sorting incoming events from the BPF programs chronologically.
//
// There are 3 known sources to events sorting issues:
// 1. In the perf buffer, events are read in round-robin order from the CPU buffers (and not according to invocation time).
// 2. Syscall events are invoked after internal events of the syscall (though the syscall happened before the
// internal events).
// 3. Virtual CPUs might enter sleep mode by host machine scheduler and send events after some delay.
//
// To address the perf buffer issue, the events are divided into queues according to their source CPU. This way
// the events are almost ordered (except for syscalls). The syscall events are inserted manually into their correct
// chronological place.
// This way, all events which occurred before the last event of the most delaying CPU could be sent forward with
// guaranteed order.
// To make sure syscall events are not missed when sending, a small delay is needed.
// Lastly, to address the vCPU sleep issue (which might cause up to 2 events received in a delay), the events need to be
// sent after a delay which is bigger than max possible vCPU sleep time (which is just an increase of the syscall events
// delay sending).
//
// To summarize the algorithm main logic, here is textual simulation of the operation (assume that 2 scheduler ticks
// are larger than max possible vCPU sleep time):
//
// -------------------------------------------------------------------
//
// Tn = Timestamp (n == TOD)
// #m = Event's Source CPU
//
// ### Initial State
//
// [ CPU 0 ] [ CPU 1 ] [ CPU 2 ]
// HEAD T1 T2 T4
// T3 T5
// T6
// TAIL T8
//
// ### Scheduler Tick #1
//
// Incoming events: T9#1, T11#2, T13#1, T10#2, T12#2
//
// Queues state after insert:
//
// [ CPU 0 ] [ CPU 1 ] [ CPU 2 ]
// HEAD T1 T2 T4
// T3 T5 T10 +
// T6 T9 + T11 +
// TAIL T8 T13 + T12 +
//
// - No event sent.
// - Oldest timestamp = T1.
// - T8 is oldest timestamp in most recent timestamps.
// - In 2 ticks from now: send all events up to T8.
// - Bigger timestamps than T8 (+) will be sent in future scheduling.
//
// ### Scheduler Tick #2
//
// Incoming events: T7#0, T22#1, T23#2, T20#0, T25#1, T24#2, T21#0
//
// Queues state after insert:
//
// [ CPU 0 ] [ CPU 1 ] [ CPU 2 ]
// HEAD T1 ^ T2 ^ T4 ^
// T3 ^ T5 ^ T10
// T6 ^ T9 T11
// T7 +^ T13 T12
// T8 ^ T22 + T23 +
// T20 + T25 + T24 +
// TAIL T21 +
//
// - No event sent.
// - Oldest timestamp = T1.
// - T21 is oldest timestamp in most recent timestamps.
// - In 2 ticks from now: send all events up to T21.
// - T8 is previous oldest timestamp in most recent timestamps.
// - Next tick: send all events up to T8.
// - Bigger timestamps than T21 (+) will be sent in future scheduling.
//
// ### Scheduler Tick #3
//
// Incoming events: T30#0, T34#1, T35#2, T31#0, T36#2, T32#0, T37#2, T33#0, T38#2, T50#1, T51#1
//
// Queues state after insert:
//
// [ CPU 0 ] [ CPU 1 ] [ CPU 2 ]
// HEAD T20 ^ T9 ^ T10 ^
// T21 ^ T13 ^ T11 ^
// T30 + T22 T12 ^
// T31 + T23 T24
// T32 + T25 T35 +
// T33 + T34 + T36 +
// T50 + T37 +
// TAIL T51 + T38 +
//
// - Max sent timestamp = T8.
// - Oldest timestamp = T9.
// - T33 is oldest timestamp in most recent timestamps.
// - In 2 ticks from now: send all events up to T33.
// - T21 is previous oldest timestamp in most recent timestamps.
// - Next tick: send all events up to T21.
// - Bigger timestamps than T33 (+) will be sent in future scheduling.
// -------------------------------------------------------------------
package sorting
import (
gocontext "context"
"math"
"sync"
"time"
"github.com/aquasecurity/tracee/pkg/errfmt"
"github.com/aquasecurity/tracee/pkg/logger"
"github.com/aquasecurity/tracee/pkg/utils/environment"
"github.com/aquasecurity/tracee/types/trace"
)
// The minimum time of delay before sending events forward.
// It should resolve disorders originated from the way syscalls timestamps are taken (about 1ms disorder) and potential
// vCPU sleep (up to 98ms) [source - https://kinvolk.io/blog/2018/02/timing-issues-when-using-bpf-with-virtual-cpus/]
const minDelay = 100 * time.Millisecond

// How often the scheduler tick fires to pass buffered events forward.
const eventsPassingInterval = 50 * time.Millisecond

// How many passing intervals must elapse before buffered events may be extracted,
// so the total buffering delay is at least minDelay (100ms / 50ms = 2 intervals).
const intervalsAmountThresholdForDelay = int(minDelay / eventsPassingInterval)
// EventsChronologicalSorter is an object responsible for sorting arriving events from perf buffer according to their
// chronological order - the time they were invoked in the kernel.
type EventsChronologicalSorter struct {
	cpuEventsQueues                  []cpuEventsQueue // Each CPU has its own events queue because events per CPU arrive in almost chronological order
	outputChanMutex                  sync.Mutex       // Serializes sends to the output channel (sendEvents may run concurrently as goroutines)
	extractionSavedTimestamps        []int            // Buffer to store timestamps of events for delayed extraction; the head is the next extraction limit
	errorChan                        chan<- error     // Destination for non-fatal sorting errors
	eventsPassingInterval            time.Duration    // Period of the scheduler tick that drives event extraction
	intervalsAmountThresholdForDelay int              // Number of saved timestamps to accumulate before extraction starts (covers minDelay)
}
// InitEventSorter creates a new EventsChronologicalSorter with one events queue
// per logical CPU and the default scheduling parameters.
func InitEventSorter() (*EventsChronologicalSorter, error) {
	cpus, err := environment.GetCPUAmount()
	if err != nil {
		return nil, errfmt.WrapError(err)
	}
	return &EventsChronologicalSorter{
		cpuEventsQueues:                  make([]cpuEventsQueue, cpus),
		eventsPassingInterval:            eventsPassingInterval,
		intervalsAmountThresholdForDelay: intervalsAmountThresholdForDelay,
	}, nil
}
// StartPipeline runs the sorter as a pipeline stage: it launches the sorting loop
// on a new goroutine and returns the ordered-events channel and the error channel.
func (sorter *EventsChronologicalSorter) StartPipeline(ctx gocontext.Context, in <-chan *trace.Event) (
	chan *trace.Event, chan error) {
	sortedEvents := make(chan *trace.Event, 10000)
	errc := make(chan error, 1)
	go sorter.Start(in, sortedEvents, ctx, errc)
	return sortedEvents, errc
}
// Start is the main function of the EventsChronologicalSorter class, which orders input events from events channels
// and pass forward all ordered events to the output channel after each interval.
// When exits (input channel closed or context cancelled), the sorter will send forward all buffered events in
// ordered matter.
func (sorter *EventsChronologicalSorter) Start(in <-chan *trace.Event, out chan<- *trace.Event,
	ctx gocontext.Context, errc chan error) {
	sorter.errorChan = errc
	defer close(out)
	defer close(errc)
	ticker := time.NewTicker(sorter.eventsPassingInterval)
	defer ticker.Stop() // release the ticker's resources when the loop exits (was previously leaked)
	for {
		select {
		case newEvent := <-in:
			if newEvent == nil { // nil event means the input channel was closed - flush everything and quit
				sorter.sendEvents(out, math.MaxInt64)
				return
			}
			sorter.addEvent(newEvent)
		case <-ticker.C:
			sorter.updateSavedTimestamps()
			// Only extract after enough intervals accumulated to cover minDelay, so
			// delayed syscall events and vCPU-sleep stragglers have time to arrive.
			if len(sorter.extractionSavedTimestamps) > sorter.intervalsAmountThresholdForDelay {
				extractionTimestamp := sorter.extractionSavedTimestamps[0]
				sorter.extractionSavedTimestamps = sorter.extractionSavedTimestamps[1:]
				// Send asynchronously so a slow output consumer does not stall event
				// intake; outputChanMutex inside sendEvents serializes the actual sends.
				// NOTE(review): a goroutine spawned here may still be pending when the
				// loop returns, and close(out) is not synchronized with it - confirm
				// shutdown ordering cannot race a late send.
				go sorter.sendEvents(out, extractionTimestamp)
			}
		case <-ctx.Done():
			sorter.sendEvents(out, math.MaxInt64)
			return
		}
	}
}
// addEvent inserts a new event into its source CPU's queue at the position matching
// its timestamp, and marks that queue as updated for the current interval.
func (sorter *EventsChronologicalSorter) addEvent(newEvent *trace.Event) {
	queue := &sorter.cpuEventsQueues[newEvent.ProcessorID]
	if insertErr := queue.InsertByTimestamp(newEvent); insertErr != nil {
		sorter.errorChan <- insertErr
	}
	queue.IsUpdated = true
}
// sendEvents send to output channel all events up to given timestamp, in chronological
// order across all CPU queues.
// The mutex serializes concurrent invocations (the ticker path runs this in a goroutine).
func (sorter *EventsChronologicalSorter) sendEvents(outputChan chan<- *trace.Event, extractionMaxTimestamp int) {
	sorter.outputChanMutex.Lock()
	defer sorter.outputChanMutex.Unlock()
	for {
		// Pick the queue whose head is the oldest event overall; stop when all queues
		// are empty or the oldest remaining event is newer than the extraction limit.
		mostDelayingQueue, eventTimestamp, err := sorter.getMostDelayingEventCPUQueue()
		if err != nil || eventTimestamp > extractionMaxTimestamp {
			break
		}
		extractionEvent, err := mostDelayingQueue.Get()
		if err != nil {
			sorter.errorChan <- err
			// A nil event alongside an error leaves nothing to recover from this
			// queue - drop its contents and continue with the remaining queues.
			if extractionEvent == nil {
				mostDelayingQueue.Empty()
				continue
			}
		}
		// If the popped event's timestamp differs from the peeked one, the queue
		// changed between peek and pop; re-insert the event in its proper place.
		if extractionEvent.Timestamp != eventTimestamp {
			logger.Warnw("Event queue changed while extracting events")
			err := mostDelayingQueue.InsertByTimestamp(extractionEvent)
			if err != nil {
				sorter.errorChan <- err
			}
		} else {
			outputChan <- extractionEvent
		}
	}
}
// updateSavedTimestamps appends the current most delaying tail timestamp to the saved
// list, to be used later as an extraction limit.
func (sorter *EventsChronologicalSorter) updateSavedTimestamps() {
	timestamp, err := sorter.getUpdatedMostDelayedLastCPUEventTimestamp()
	if err != nil {
		// No queue was updated since the last interval, which means every CPU is
		// fully caught up - all events cached so far may be sent, so fall back to
		// the most recent timestamp seen across all queues.
		timestamp, err = sorter.getMostRecentEventTimestamp()
		if err != nil {
			// All queues are empty; repeat the last saved timestamp, or 0 if none.
			if saved := len(sorter.extractionSavedTimestamps); saved > 0 {
				timestamp = sorter.extractionSavedTimestamps[saved-1]
			} else {
				timestamp = 0
			}
		}
	}
	sorter.extractionSavedTimestamps = append(sorter.extractionSavedTimestamps, timestamp)
}
// getMostDelayingEventCPUQueue returns the CPU queue holding the globally oldest
// buffered event, along with that head event's timestamp (used by callers to detect
// concurrent queue changes).
// Returns a nil queue and timestamp 0 with an error when every queue is empty.
func (sorter *EventsChronologicalSorter) getMostDelayingEventCPUQueue() (*cpuEventsQueue, int, error) {
	var oldestQueue *cpuEventsQueue
	oldestTimestamp := 0
	for i := range sorter.cpuEventsQueues {
		queue := &sorter.cpuEventsQueues[i]
		head := queue.PeekHead()
		if head == nil {
			continue
		}
		if oldestQueue == nil || head.Timestamp < oldestTimestamp {
			oldestQueue = queue
			oldestTimestamp = head.Timestamp
		}
	}
	if oldestQueue == nil {
		return nil, 0, errfmt.Errorf("no queue with events found")
	}
	return oldestQueue, oldestTimestamp, nil
}
// getUpdatedMostDelayedLastCPUEventTimestamp returns the smallest tail (last inserted)
// event timestamp among the queues that were updated since the previous call, clearing
// every queue's updated flag along the way.
// Queues not updated since the last check are ignored to prevent events starvation
// when a CPU is inactive.
func (sorter *EventsChronologicalSorter) getUpdatedMostDelayedLastCPUEventTimestamp() (int, error) {
	minTailTimestamp := 0
	found := false
	for i := range sorter.cpuEventsQueues {
		queue := &sorter.cpuEventsQueues[i]
		tail := queue.PeekTail()
		if tail != nil && queue.IsUpdated &&
			(!found || tail.Timestamp < minTailTimestamp) {
			minTailTimestamp = tail.Timestamp
			found = true
		}
		queue.IsUpdated = false // consume the per-interval update mark
	}
	if !found {
		return 0, errfmt.Errorf("no valid CPU events queue was updated since last interval")
	}
	return minTailTimestamp, nil
}
// getMostRecentEventTimestamp returns the newest (largest) tail timestamp across all
// CPU queues, or an error when every queue is empty.
func (sorter *EventsChronologicalSorter) getMostRecentEventTimestamp() (int, error) {
	newestTimestamp := 0
	for i := range sorter.cpuEventsQueues {
		if tail := sorter.cpuEventsQueues[i].PeekTail(); tail != nil && tail.Timestamp > newestTimestamp {
			newestTimestamp = tail.Timestamp
		}
	}
	if newestTimestamp == 0 {
		return 0, errfmt.Errorf("all CPU queues are empty")
	}
	return newestTimestamp, nil
}