Skip to content

Commit 4bad3d6

Browse files
committed
linux: restore V1 monitoring-thread teardown protocol
A previous refactor (b388b1c) symmetrised the cleanup handler's lock/unlock for tsan-lockset cleanliness, which moved the kq_mtx unlock point ahead of monitoring_thread_cleanup clearing monitoring_tid. That opened a window where a racing kqueue() could observe monitoring_tid still set after the loop had broken out and F_SETOWN_EX its close-detect signal to a TID about to die; the kernel discards thread-directed RT signals when the target exits, so the kq stranded in kq_list and libkqueue_drain_pending_close spun until hitting its retry cap (test_kqueue_pipe_peer_close_uaf). A follow-up (67aea58) then dropped pthread_detach in the SELF_CANCEL path because libkqueue_free's tid-read could observe the joinable-to-detached transition — but that was a symptom of the same unlock-too-early move; V1 held kq_mtx continuously through pthread_detach, making the read atomic with respect to the detach. Restore V1: reintroduce thread_exit_state {SELF_CANCEL, CANCEL_LOCKED, CANCEL_UNLOCKED} and the state-driven cleanup; the loop's break path holds kq_mtx through testcancel + cleanup + pthread_detach + final unlock; libkqueue_free reads monitoring_tid under kq_mtx and skips cancel/join when the thread has already self-detached. The linux_kqueue_interrupt and KQ_WAKE udata work from b62d6f8 / 4e5c1a0 is preserved untouched. The tools/tsan.supp 'race:monitoring_thread_cleanup' suppression (6465a87) handles the tsan limitation that motivated b388b1c.
1 parent abb653e commit 4bad3d6

1 file changed

Lines changed: 95 additions & 117 deletions

File tree

src/linux/platform.c

Lines changed: 95 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,19 @@ static __thread struct epoll_event epoll_events[MAX_KEVENT];
3535
*/
3636
static pthread_t monitoring_thread;
3737
static pid_t monitoring_tid; /* Monitoring thread */
38-
static atomic_bool monitoring_thread_created; /* set after successful pthread_create;
39-
distinguishes "never started" from
40-
"started, then exited" (which clears
41-
monitoring_tid). Cleared in the
42-
fork hook so the child can spin up
43-
its own monitoring thread. */
44-
4538
static pthread_mutex_t monitoring_thread_mtx = PTHREAD_MUTEX_INITIALIZER;
4639
static pthread_cond_t monitoring_thread_cond = PTHREAD_COND_INITIALIZER;
4740

41+
enum thread_exit_state {
42+
THREAD_EXIT_STATE_SELF_CANCEL = 0,
43+
THREAD_EXIT_STATE_CANCEL_LOCKED,
44+
THREAD_EXIT_STATE_CANCEL_UNLOCKED,
45+
};
46+
4847
/*
49-
* No more THREAD_EXIT_STATE_* enum: cleanup always acquires
50-
* kq_mtx in the thread context that runs it, with no
51-
* "inherited from cancellor" trick. See monitoring_thread_cleanup
52-
* and linux_libkqueue_free below for the protocol.
48+
* Monitoring thread is exiting because the process is terminating
5349
*/
50+
static enum thread_exit_state monitoring_thread_state;
5451

5552
/*
5653
* Map for kqueue pipes where index is the read side (for which signals are received)
@@ -119,84 +116,86 @@ fd_map_get(int fd)
119116
* fields and on kq_list with M0 in main's lockset but not T1's,
120117
* even though both threads acquire the same mutex.
121118
*
122-
* Cleanup always acquires kq_mtx in the thread context that
123-
* runs it (no "lock inherited from cancellor" trick). Both
124-
* monitoring_thread_loop and linux_libkqueue_free are responsible
125-
* for releasing kq_mtx before the cancellation point that triggers
126-
* this handler so the lock is genuinely free when we get here.
127-
*
128-
* TSAN cannot validate the synchronisation pattern: its
129-
* instrumentation does not propagate happens-before edges across
130-
* pthread_cancel into the cleanup-handler invocation, and warnings
131-
* fire on every memory access made by the chain (kqueue_free,
132-
* filter_unregister_all, knote_delete_all, ...). TSAN_IGNORE on
133-
* this function alone wouldn't help because the attribute doesn't
134-
* propagate to callees. The suppressions file at tools/tsan.supp
135-
* (`race:monitoring_thread_cleanup`) catches every warning under
136-
* this stack pattern, applied in CI via TSAN_OPTIONS=suppressions=.
119+
* The protection is real (lock is acquired before any access)
120+
* but the instrumentation can't see it. The suppressions file
121+
* at tools/tsan.supp (`race:monitoring_thread_cleanup`) catches
122+
* every warning under this stack pattern, applied in CI via
123+
* TSAN_OPTIONS=suppressions=.
137124
*/
138125
static void
139126
monitoring_thread_cleanup(UNUSED void *arg)
140127
{
141128
struct kqueue *kq, *kq_tmp;
142129

143-
tracing_mutex_lock(&kq_mtx);
130+
if ((monitoring_thread_state == THREAD_EXIT_STATE_CANCEL_LOCKED) ||
131+
(monitoring_thread_state == THREAD_EXIT_STATE_CANCEL_UNLOCKED)) {
144132

145-
/*
146-
* If the entire process is exiting, then free all
147-
* the kqueues.
148-
*
149-
* We do this because we don't reliably receive all the
150-
* close MONITORING_THREAD_SIGNALs before the process
151-
* exits, and this avoids ASAN or valgrind raising
152-
* spurious memory leaks.
153-
*
154-
* If the user _hasn't_ closed a KQ fd, then we don't
155-
* free the underlying memory, and it'll be correctly
156-
* reported as a memory leak.
157-
*
158-
* On the SELF_CANCEL path (kq_cnt hit zero in the loop)
159-
* kq_list is already empty so this walk is a no-op.
160-
*/
161-
LIST_FOREACH_SAFE(kq, &kq_list, kq_entry, kq_tmp) {
162133
/*
163-
* We only cleanup kqueues where their file descriptor
164-
* has been closed. Other kqueues may be in the middle
165-
* of operations with kq->kq_mtx held, and attempting to
166-
* clean them up would cause a deadlock. Those are
167-
* legitimate leaks the user application should have
168-
* freed before exit.
134+
* Keep the assertion in kqueue_free happy
169135
*/
170-
dbg_printf("kq=%p - fd=%i explicitly checking for closure", kq, kq->kq_id);
171-
if (fcntl(kq->kq_id, F_GETFD) < 0) {
172-
dbg_printf("kq=%p - fd=%i forcefully cleaning up, current use_count=%u: %s",
173-
kq, kq->kq_id, fd_use_cnt[kq->kq_id],
174-
errno == EBADF ? "File descriptor already closed" : strerror(errno));
175-
fd_use_cnt[kq->kq_id] = 0;
176-
} else {
136+
if (monitoring_thread_state == THREAD_EXIT_STATE_CANCEL_UNLOCKED)
137+
tracing_mutex_lock(&kq_mtx);
138+
139+
/*
140+
* If the entire process is exiting, then free all
141+
* the kqueues.
142+
*
143+
* We do this because we don't reliably receive all the
144+
* close MONITORING_THREAD_SIGNALs before the process
145+
* exits, and this avoids ASAN or valgrind raising
146+
* spurious memory leaks.
147+
*
148+
* If the user _hasn't_ closed a KQ fd, then we don't
149+
* free the underlying memory, and it'll be correctly
150+
* reported as a memory leak.
151+
*/
152+
LIST_FOREACH_SAFE(kq, &kq_list, kq_entry, kq_tmp) {
177153
/*
178-
* User never closed kqfd. We still treat this as a
179-
* leak (sanity check below) but interrupt any threads
180-
* parked in epoll_wait first so they exit kevent()
181-
* with EBADF instead of staying stuck forever. The
182-
* kq itself is still leaked per the existing contract
183-
* - the user is responsible for matching kqueue() with
184-
* close().
154+
* We only cleanup kqueues where their file descriptor
155+
* has been closed. Other kqueues may be in the middle
156+
* of operations with kq->kq_mtx held, and attempting to
157+
* clean them up would cause a deadlock. Those are
158+
* legitimate leaks the user application should have
159+
* freed before exit.
185160
*/
186-
assert(fd_use_cnt[kq->kq_id] > 0);
187-
linux_kqueue_interrupt(kq);
188-
}
161+
dbg_printf("kq=%p - fd=%i explicitly checking for closure", kq, kq->kq_id);
162+
if (fcntl(kq->kq_id, F_GETFD) < 0) {
163+
dbg_printf("kq=%p - fd=%i forcefully cleaning up, current use_count=%u: %s",
164+
kq, kq->kq_id, fd_use_cnt[kq->kq_id],
165+
errno == EBADF ? "File descriptor already closed" : strerror(errno));
166+
fd_use_cnt[kq->kq_id] = 0;
167+
} else {
168+
/*
169+
* User never closed kqfd. We still treat this as a
170+
* leak (sanity check below) but interrupt any threads
171+
* parked in epoll_wait first so they exit kevent()
172+
* with EBADF instead of staying stuck forever. The
173+
* kq itself is still leaked per the existing contract
174+
* - the user is responsible for matching kqueue() with
175+
* close().
176+
*/
177+
assert(fd_use_cnt[kq->kq_id] > 0);
178+
linux_kqueue_interrupt(kq);
179+
}
189180

190-
if (fd_use_cnt[kq->kq_id] == 0) {
191-
dbg_printf("kq=%p - fd=%i use_count=%u cleaning up...", kq, kq->kq_id, fd_use_cnt[kq->kq_id]);
192-
kqueue_free(kq);
193-
} else {
194-
dbg_printf("kq=%p - fd=%i is alive use_count=%u. Skipping, this is likely a leak...",
195-
kq, kq->kq_id, fd_use_cnt[kq->kq_id]);
181+
if (fd_use_cnt[kq->kq_id] == 0) {
182+
dbg_printf("kq=%p - fd=%i use_count=%u cleaning up...", kq, kq->kq_id, fd_use_cnt[kq->kq_id]);
183+
kqueue_free(kq);
184+
} else {
185+
dbg_printf("kq=%p - fd=%i is alive use_count=%u. Skipping, this is likely a leak...",
186+
kq, kq->kq_id, fd_use_cnt[kq->kq_id]);
187+
}
196188
}
189+
190+
if (monitoring_thread_state == THREAD_EXIT_STATE_CANCEL_UNLOCKED)
191+
tracing_mutex_unlock(&kq_mtx);
197192
}
198193

199-
dbg_printf("tid=%u - monitoring thread exiting", monitoring_tid);
194+
dbg_printf("tid=%u - monitoring thread exiting (%s)",
195+
monitoring_tid,
196+
monitoring_thread_state == THREAD_EXIT_STATE_SELF_CANCEL ?
197+
"no kqueues" : "process term");
198+
200199
/* Free thread resources */
201200
free(fd_map);
202201
fd_map = NULL;
@@ -206,7 +205,8 @@ monitoring_thread_cleanup(UNUSED void *arg)
206205
/* Reset so that thread can be restarted */
207206
monitoring_tid = 0;
208207

209-
tracing_mutex_unlock(&kq_mtx);
208+
if (monitoring_thread_state == THREAD_EXIT_STATE_CANCEL_LOCKED)
209+
tracing_mutex_unlock(&kq_mtx);
210210
}
211211

212212

@@ -332,6 +332,7 @@ monitoring_thread_loop(UNUSED void *arg)
332332
pthread_cond_signal(&monitoring_thread_cond);
333333
(void) pthread_mutex_unlock(&monitoring_thread_mtx);
334334

335+
monitoring_thread_state = THREAD_EXIT_STATE_CANCEL_UNLOCKED;
335336
pthread_cleanup_push(monitoring_thread_cleanup, NULL)
336337
while (true) {
337338
/*
@@ -368,30 +369,19 @@ monitoring_thread_loop(UNUSED void *arg)
368369
}
369370

370371
/*
371-
* Drop kq_mtx before the cancellation point so that, if a
372-
* pthread_cancel from main was already pending, the cleanup
373-
* handler runs in a clean lockset and reacquires kq_mtx
374-
* itself. Without this release, the handler would inherit a
375-
* lock owned by this thread but not visible to TSAN's
376-
* lockset-based race detection.
372+
* Ensure that any cancellation requests are acted on
377373
*/
378-
tracing_mutex_unlock(&kq_mtx);
374+
monitoring_thread_state = THREAD_EXIT_STATE_CANCEL_LOCKED;
379375
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
380376
pthread_testcancel();
381377

378+
monitoring_thread_state = THREAD_EXIT_STATE_SELF_CANCEL;
382379
pthread_cleanup_pop(true); /* Executes the cleanup function (monitoring_thread_cleanup) */
380+
res = pthread_detach(pthread_self());
381+
if (res != 0)
382+
dbg_printf("pthread_detach(3): %s", strerror(res));
383+
tracing_mutex_unlock(&kq_mtx);
383384

384-
/*
385-
* No pthread_detach here. Main thread reads monitoring_tid
386-
* under kq_mtx in linux_libkqueue_free; if the thread has
387-
* already cleaned up (tid == 0 from cleanup) main skips
388-
* pthread_join, otherwise main joins. Detaching from this
389-
* side races: the thread can detach between main's tid-read
390-
* and main's pthread_join, leaving join to fail with "joining
391-
* already joined thread". Without detach, the thread is
392-
* always joinable; the leak window is at most up to process
393-
* exit, which is when libkqueue_free runs anyway.
394-
*/
395385
return NULL;
396386
}
397387

@@ -405,7 +395,6 @@ linux_kqueue_start_thread(void)
405395

406396
return (-1);
407397
}
408-
atomic_store(&monitoring_thread_created, true);
409398
/*
410399
* Wait for thread creation to publish monitoring_tid. Loop on
411400
* the predicate (monitoring_tid != 0) to be safe against
@@ -433,14 +422,13 @@ linux_libkqueue_fork(void)
433422
* which called it.
434423
*
435424
* Ensure we don't try and cancel it on exit by
436-
* clearing the thread id and the created flag.
425+
* clearing the thread id.
437426
*
438427
* We don't need to lock the kq_mtx here as we're in
439428
* the child, and none of our threads will have been
440429
* duplicated.
441430
*/
442431
monitoring_tid = 0;
443-
atomic_store(&monitoring_thread_created, false);
444432

445433
/*
446434
* If the process has been forked, then we need
@@ -489,34 +477,23 @@ linux_libkqueue_fork(void)
489477
static void
490478
linux_libkqueue_free(void)
491479
{
492-
UNUSED pid_t tid;
480+
pid_t tid;
493481

494-
/*
495-
* Use monitoring_thread_created (set after pthread_create) to
496-
* decide whether the thread exists at all - monitoring_tid is
497-
* cleared by the cleanup handler when the thread exits, so it
498-
* can't distinguish "never started" from "started, then
499-
* exited". Always pthread_join when created so the thread is
500-
* reaped (no thread-leak warning from TSAN). pthread_join on
501-
* an exited-but-joinable thread returns immediately with the
502-
* exit status; pthread_cancel on the same is a benign ESRCH.
503-
*
504-
* No kq_mtx around the read - monitoring_thread_created is an
505-
* atomic; main has already finished its kq_mtx-protected work
506-
* by the time atexit fires.
507-
*/
508-
if (atomic_load(&monitoring_thread_created)) {
482+
tracing_mutex_lock(&kq_mtx);
483+
tid = monitoring_tid; /* Gets trashed when the thread exits */
484+
if (tid) {
509485
void *retval;
510486
int ret;
511487

512-
tracing_mutex_lock(&kq_mtx);
513-
tid = monitoring_tid; /* may be 0 if cleanup already ran */
514-
tracing_mutex_unlock(&kq_mtx);
515-
516488
dbg_printf("tid=%u - cancelling", tid);
517489
ret = pthread_cancel(monitoring_thread);
518490
if (ret != 0)
519491
dbg_printf("tid=%u - cancel failed: %s", tid, strerror(ret));
492+
/*
493+
* We unlock here to allow the monitoring thread
494+
* to continue if it was processing a cleanup.
495+
*/
496+
tracing_mutex_unlock(&kq_mtx);
520497

521498
ret = pthread_join(monitoring_thread, &retval);
522499
if (ret == 0) {
@@ -528,7 +505,8 @@ linux_libkqueue_free(void)
528505
} else {
529506
dbg_printf("tid=%u - join failed: %s", tid, strerror(ret));
530507
}
531-
atomic_store(&monitoring_thread_created, false);
508+
} else {
509+
tracing_mutex_unlock(&kq_mtx);
532510
}
533511
}
534512

0 commit comments

Comments
 (0)