From f8024f1f36a30a082b0457d5779c8847cea57f57 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Oct 2023 18:14:08 -0600 Subject: [PATCH 1/7] io_uring/kbuf: don't allow registered buffer rings on highmem pages syzbot reports that registering a mapped buffer ring on arm32 can trigger an OOPS. Registered buffer rings have two modes, one of them is the application passing in the memory that the buffer ring should reside in. Once those pages are mapped, we use page_address() to get a virtual address. This will obviously fail on highmem pages, which aren't mapped. Add a check if we have any highmem pages after mapping, and fail the attempt to register a provided buffer ring if we do. This will return the same error as kernels that don't support provided buffer rings to begin with. Link: https://lore.kernel.org/io-uring/000000000000af635c0606bcb889@google.com/ Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring") Cc: stable@vger.kernel.org Reported-by: syzbot+2113e61b8848fa7951d8@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 556f4df25b0fa0..9123138aa9f48b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -477,7 +477,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, { struct io_uring_buf_ring *br; struct page **pages; - int nr_pages; + int i, nr_pages; pages = io_pin_pages(reg->ring_addr, flex_array_size(br, bufs, reg->ring_entries), @@ -485,6 +485,17 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * Apparently some 32-bit boxes (ARM) will return highmem pages, + * which then need to be mapped. We could support that, but it'd + * complicate the code and slowdown the common cases quite a bit. + * So just error out, returning -EINVAL just like we did on kernels + * that didn't support mapped buffer rings. + */ + for (i = 0; i < nr_pages; i++) + if (PageHighMem(pages[i])) + goto error_unpin; + br = page_address(pages[0]); #ifdef SHM_COLOUR /* @@ -496,13 +507,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, * should use IOU_PBUF_RING_MMAP instead, and liburing will handle * this transparently. */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { - int i; - - for (i = 0; i < nr_pages; i++) - unpin_user_page(pages[i]); - return -EINVAL; - } + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) + goto error_unpin; #endif bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; @@ -510,6 +516,11 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->is_mapped = 1; bl->is_mmap = 0; return 0; +error_unpin: + for (i = 0; i < nr_pages; i++) + unpin_user_page(pages[i]); + kvfree(pages); + return -EINVAL; } static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, From 1658633c04653578429ff5dfc62fdc159203a8f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Oct 2023 19:51:38 -0600 Subject: [PATCH 2/7] io_uring: ensure io_lockdep_assert_cq_locked() handles disabled rings io_lockdep_assert_cq_locked() checks that locking is correctly done when a CQE is posted. If the ring is setup in a disabled state with IORING_SETUP_R_DISABLED, then ctx->submitter_task isn't assigned until the ring is later enabled. We generally don't post CQEs in this state, as no SQEs can be submitted. However it is possible to generate a CQE if tagged resources are being updated. If this happens and PROVE_LOCKING is enabled, then the locking check helper will dereference ctx->submitter_task, which hasn't been set yet. Fixup io_lockdep_assert_cq_locked() to handle this case correctly. While at it, convert it to a static inline as well, so that generated line offsets will actually reflect which condition failed, rather than just the line offset for io_lockdep_assert_cq_locked() itself. Reported-and-tested-by: syzbot+efc45d4e7ba6ab4ef1eb@syzkaller.appspotmail.com Fixes: f26cc9593581 ("io_uring: lockdep annotate CQ locking") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 547c30582fb887..0bc145614a6e66 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -86,20 +86,33 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -#define io_lockdep_assert_cq_locked(ctx) \ - do { \ - lockdep_assert(in_task()); \ - \ - if (ctx->flags & IORING_SETUP_IOPOLL) { \ - lockdep_assert_held(&ctx->uring_lock); \ - } else if (!ctx->task_complete) { \ - lockdep_assert_held(&ctx->completion_lock); \ - } else if (ctx->submitter_task->flags & PF_EXITING) { \ - lockdep_assert(current_work()); \ - } else { \ - lockdep_assert(current == ctx->submitter_task); \ - } \ - } while (0) +#if defined(CONFIG_PROVE_LOCKING) +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) +{ + lockdep_assert(in_task()); + + if (ctx->flags & IORING_SETUP_IOPOLL) { + lockdep_assert_held(&ctx->uring_lock); + } else if (!ctx->task_complete) { + lockdep_assert_held(&ctx->completion_lock); + } else if (ctx->submitter_task) { + /* + * ->submitter_task may be NULL and we can still post a CQE, + * if the ring has been setup with IORING_SETUP_R_DISABLED. + * Not from an SQE, as those cannot be submitted, but via + * updating tagged resources. + */ + if (ctx->submitter_task->flags & PF_EXITING) + lockdep_assert(current_work()); + else + lockdep_assert(current == ctx->submitter_task); + } +} +#else +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) +{ +} +#endif static inline void io_req_task_work_add(struct io_kiocb *req) { From 2fd7b0f6d5ad655b1d947d3acdd82f687c31465e Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 2 Oct 2023 14:32:29 -0400 Subject: [PATCH 3/7] md/raid5: release batch_last before waiting for another stripe_head When raid5_get_active_stripe is called with a ctx containing a stripe_head in its batch_last pointer, it can cause a deadlock if the task sleeps waiting on another stripe_head to become available. The stripe_head held by batch_last can be blocking the advancement of other stripe_heads, leading to no stripe_heads being released so raid5_get_active_stripe waits forever. Like with the quiesce state handling earlier in the function, batch_last needs to be released by raid5_get_active_stripe before it waits for another stripe_head. Fixes: 3312e6c887fe ("md/raid5: Keep a reference to last stripe_head for batch") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: David Jeffery Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231002183422.13047-1-djeffery@redhat.com --- drivers/md/raid5.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4cb9c608ee1919..284cd71bcc685b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -854,6 +854,13 @@ struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); r5l_wake_reclaim(conf->log, 0); + + /* release batch_last before wait to avoid risk of deadlock */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + wait_event_lock_irq(conf->wait_for_stripe, is_inactive_blocked(conf, hash), *(conf->hash_locks + hash)); From 223ef474316466e9f61f6e0064f3a6fe4923a2c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Oct 2023 09:59:58 -0600 Subject: [PATCH 4/7] io_uring: don't allow IORING_SETUP_NO_MMAP rings on highmem pages On at least arm32, but presumably any arch with highmem, if the application passes in memory that resides in highmem for the rings, then we should fail that ring creation. We fail it with -EINVAL, which is what kernels that don't support IORING_SETUP_NO_MMAP will do as well. Cc: stable@vger.kernel.org Fixes: 03d89a2de25b ("io_uring: support for user allocated memory for rings/sqes") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 783ed0fff71b58..d839a80a6751c1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2686,7 +2686,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, { struct page **page_array; unsigned int nr_pages; - int ret; + int ret, i; *npages = 0; @@ -2716,6 +2716,20 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, */ if (page_array[0] != page_array[ret - 1]) goto err; + + /* + * Can't support mapping user allocated ring memory on 32-bit archs + * where it could potentially reside in highmem. Just fail those with + * -EINVAL, just like we did on kernels that didn't support this + * feature. + */ + for (i = 0; i < nr_pages; i++) { + if (PageHighMem(page_array[i])) { + ret = -EINVAL; + goto err; + } + } + *pages = page_array; *npages = nr_pages; return page_to_virt(page_array[0]); From 07a1141ff170ff5d4f9c4fbb0453727ab48096e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Oct 2023 17:31:06 +0200 Subject: [PATCH 5/7] nbd: don't call blk_mark_disk_dead nbd_clear_sock_ioctl blk_mark_disk_dead is the proper interface to shut down a block device, but it also makes the disk unusable forever. nbd_clear_sock_ioctl on the other hand wants to shut down the file system, but allow the block device to be used again when when connected to another socket. Switch nbd to use disk_force_media_change and nbd_bdev_reset to go back to a behavior of the old __invalidate_device call, with the added benefit of incrementing the device generation as there is no guarantee the old content comes back when the device is reconnected. Reported-by: Samuel Holland Reported-by: Shinichiro Kawasaki Fixes: 0c1c9a27ce90 ("nbd: call blk_mark_disk_dead in nbd_clear_sock_ioctl") Signed-off-by: Christoph Hellwig Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20231003153106.1331363-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index df1cd0f718b81c..800f131222fc8f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1436,8 +1436,9 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd) static void nbd_clear_sock_ioctl(struct nbd_device *nbd) { - blk_mark_disk_dead(nbd->disk); nbd_clear_sock(nbd); + disk_force_media_change(nbd->disk); + nbd_bdev_reset(nbd); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); From b21f18ef964b2c71aa0b451df6d17b7bcad8280d Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 4 Oct 2023 10:31:15 +0530 Subject: [PATCH 6/7] PM: hibernate: Fix copying the zero bitmap to safe pages The following crash is observed 100% of the time during resume from the hibernation on a x86 QEMU system. [ 12.931887] ? __die_body+0x1a/0x60 [ 12.932324] ? page_fault_oops+0x156/0x420 [ 12.932824] ? search_exception_tables+0x37/0x50 [ 12.933389] ? fixup_exception+0x21/0x300 [ 12.933889] ? exc_page_fault+0x69/0x150 [ 12.934371] ? asm_exc_page_fault+0x26/0x30 [ 12.934869] ? get_buffer.constprop.0+0xac/0x100 [ 12.935428] snapshot_write_next+0x7c/0x9f0 [ 12.935929] ? submit_bio_noacct_nocheck+0x2c2/0x370 [ 12.936530] ? submit_bio_noacct+0x44/0x2c0 [ 12.937035] ? hib_submit_io+0xa5/0x110 [ 12.937501] load_image+0x83/0x1a0 [ 12.937919] swsusp_read+0x17f/0x1d0 [ 12.938355] ? create_basic_memory_bitmaps+0x1b7/0x240 [ 12.938967] load_image_and_restore+0x45/0xc0 [ 12.939494] software_resume+0x13c/0x180 [ 12.939994] resume_store+0xa3/0x1d0 The commit being fixed introduced a bug in copying the zero bitmap to safe pages. A temporary bitmap is allocated with PG_ANY flag in prepare_image() to make a copy of zero bitmap after the unsafe pages are marked. Freeing this temporary bitmap with PG_UNSAFE_KEEP later results in an inconsistent state of unsafe pages. Since free bit is left as is for this temporary bitmap after free, these pages are treated as unsafe pages when they are allocated again. This results in incorrect calculation of the number of pages pre-allocated for the image. nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; The allocate_unsafe_pages is estimated to be higher than the actual which results in running short of pages in safe_pages_list. Hence the crash is observed in get_buffer() due to NULL pointer access of safe_pages_list. Fix this issue by creating the temporary zero bitmap from safe pages (free bit not set) so that the corresponding free bits can be cleared while freeing this bitmap. Fixes: 005e8dddd497 ("PM: hibernate: don't store zero pages in the image file") Suggested-by:: Brian Geffon Signed-off-by: Pavankumar Kondeti Reviewed-by: Brian Geffon Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 87e9f7e2bdc057..0f12e0a97e432e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2647,7 +2647,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, memory_bm_free(bm, PG_UNSAFE_KEEP); /* Make a copy of zero_bm so it can be created in safe pages */ - error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY); + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE); if (error) goto Free; @@ -2660,7 +2660,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, goto Free; duplicate_memory_bitmap(zero_bm, &tmp); - memory_bm_free(&tmp, PG_UNSAFE_KEEP); + memory_bm_free(&tmp, PG_UNSAFE_CLEAR); /* At this point zero_bm is in safe pages and it can be used for restoring. */ if (nr_highmem > 0) { From 0f8baa3c9802fbfe313c901e1598397b61b91ada Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 5 Oct 2023 13:55:31 -0400 Subject: [PATCH 7/7] io-wq: fully initialize wqe before calling cpuhp_state_add_instance_nocalls() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I received a bug report with the following signature: [ 1759.937637] BUG: unable to handle page fault for address: ffffffffffffffe8 [ 1759.944564] #PF: supervisor read access in kernel mode [ 1759.949732] #PF: error_code(0x0000) - not-present page [ 1759.954901] PGD 7ab615067 P4D 7ab615067 PUD 7ab617067 PMD 0 [ 1759.960596] Oops: 0000 1 PREEMPT SMP PTI [ 1759.964804] CPU: 15 PID: 109 Comm: cpuhp/15 Kdump: loaded Tainted: G X ------- — 5.14.0-362.3.1.el9_3.x86_64 #1 [ 1759.976609] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 06/20/2018 [ 1759.985181] RIP: 0010:io_wq_for_each_worker.isra.0+0x24/0xa0 [ 1759.990877] Code: 90 90 90 90 90 90 0f 1f 44 00 00 41 56 41 55 41 54 55 48 8d 6f 78 53 48 8b 47 78 48 39 c5 74 4f 49 89 f5 49 89 d4 48 8d 58 e8 <8b> 13 85 d2 74 32 8d 4a 01 89 d0 f0 0f b1 0b 75 5c 09 ca 78 3d 48 [ 1760.009758] RSP: 0000:ffffb6f403603e20 EFLAGS: 00010286 [ 1760.015013] RAX: 0000000000000000 RBX: ffffffffffffffe8 RCX: 0000000000000000 [ 1760.022188] RDX: ffffb6f403603e50 RSI: ffffffffb11e95b0 RDI: ffff9f73b09e9400 [ 1760.029362] RBP: ffff9f73b09e9478 R08: 000000000000000f R09: 0000000000000000 [ 1760.036536] R10: ffffffffffffff00 R11: ffffb6f403603d80 R12: ffffb6f403603e50 [ 1760.043712] R13: ffffffffb11e95b0 R14: ffffffffb28531e8 R15: ffff9f7a6fbdf548 [ 1760.050887] FS: 0000000000000000(0000) GS:ffff9f7a6fbc0000(0000) knlGS:0000000000000000 [ 1760.059025] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1760.064801] CR2: ffffffffffffffe8 CR3: 00000007ab610002 CR4: 00000000007706e0 [ 1760.071976] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1760.079150] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1760.086325] PKRU: 55555554 [ 1760.089044] Call Trace: [ 1760.091501] [ 1760.093612] ? show_trace_log_lvl+0x1c4/0x2df [ 1760.097995] ? show_trace_log_lvl+0x1c4/0x2df [ 1760.102377] ? __io_wq_cpu_online+0x54/0xb0 [ 1760.106584] ? __die_body.cold+0x8/0xd [ 1760.110356] ? page_fault_oops+0x134/0x170 [ 1760.114479] ? kernelmode_fixup_or_oops+0x84/0x110 [ 1760.119298] ? exc_page_fault+0xa8/0x150 [ 1760.123247] ? asm_exc_page_fault+0x22/0x30 [ 1760.127458] ? __pfx_io_wq_worker_affinity+0x10/0x10 [ 1760.132453] ? __pfx_io_wq_worker_affinity+0x10/0x10 [ 1760.137446] ? io_wq_for_each_worker.isra.0+0x24/0xa0 [ 1760.142527] __io_wq_cpu_online+0x54/0xb0 [ 1760.146558] cpuhp_invoke_callback+0x109/0x460 [ 1760.151029] ? __pfx_io_wq_cpu_offline+0x10/0x10 [ 1760.155673] ? __pfx_smpboot_thread_fn+0x10/0x10 [ 1760.160320] cpuhp_thread_fun+0x8d/0x140 [ 1760.164266] smpboot_thread_fn+0xd3/0x1a0 [ 1760.168297] kthread+0xdd/0x100 [ 1760.171457] ? __pfx_kthread+0x10/0x10 [ 1760.175225] ret_from_fork+0x29/0x50 [ 1760.178826] [ 1760.181022] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs rfkill sunrpc vfat fat dm_multipath intel_rapl_msr intel_rapl_common isst_if_common ipmi_ssif nfit libnvdimm mgag200 i2c_algo_bit ioatdma drm_shmem_helper drm_kms_helper acpi_ipmi syscopyarea x86_pkg_temp_thermal sysfillrect ipmi_si intel_powerclamp sysimgblt ipmi_devintf coretemp acpi_power_meter ipmi_msghandler rapl pcspkr dca intel_pch_thermal intel_cstate ses lpc_ich intel_uncore enclosure hpilo mei_me mei acpi_tad fuse drm xfs sd_mod sg bnx2x nvme nvme_core crct10dif_pclmul crc32_pclmul nvme_common ghash_clmulni_intel smartpqi tg3 t10_pi mdio uas libcrc32c crc32c_intel scsi_transport_sas usb_storage hpwdt wmi dm_mirror dm_region_hash dm_log dm_mod [ 1760.248623] CR2: ffffffffffffffe8 A cpu hotplug callback was issued before wq->all_list was initialized. This results in a null pointer dereference. The fix is to fully setup the io_wq before calling cpuhp_state_add_instance_nocalls(). Signed-off-by: Jeff Moyer Link: https://lore.kernel.org/r/x49y1ghnecs.fsf@segfault.boston.devel.redhat.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 1ecc8c7487682c..522196dfb0ff5a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1151,9 +1151,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); - ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) - goto err_wq; refcount_inc(&data->hash->refs); wq->hash = data->hash; @@ -1186,13 +1183,14 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); + ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); + if (ret) + goto err; + return wq; err: io_wq_put_hash(data->hash); - cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - free_cpumask_var(wq->cpu_mask); -err_wq: kfree(wq); return ERR_PTR(ret); }