From 14d24d2696dde62d5eb9e68a001e934e88c92d0d Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 26 Jan 2023 10:23:13 -0700 Subject: [PATCH] Verifying race exists for mmap on master After doing some tracing on mmap using the O_DIRECT code, I decided to test out the mmap test case just on master to see if there existed a race condition that is outside of O_DIRECT. There does exist a race condition which is shown in the trace below. We are hitting the same ASSERT that the page should be up to date in mappedread(). The only difference in the test case that is run on the O_DIRECT branch is it use `direct=1` for fio. However, I changed this to `direct=0` to make sure we are always going down the buffered path. The reason I did this my O_DIRECT stack traces were showing we only hit this in the first write of O_DIRECT while growing the file (that is always sent to the ARC). [12039.760554] VERIFY(PageUptodate(pp)) failed [12039.764745] PANIC at zfs_vnops_os.c:298:mappedread() [12039.769712] Showing stack for process 1125719 [12039.774071] CPU: 19 PID: 1125719 Comm: fio Kdump: loaded Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12039.785193] Hardware name: GIGABYTE R272-Z32-00/MZ32-AR0-00, BIOS R21 10/08/2020 [12039.792577] Call Trace: [12039.795024] dump_stack+0x41/0x60 [12039.798343] spl_panic+0xd0/0xe8 [spl] [12039.802103] ? _cond_resched+0x15/0x30 [12039.805854] ? mutex_lock+0xe/0x30 [12039.809253] ? arc_cksum_compute+0xcb/0x180 [zfs] [12039.814129] ? __raw_spin_unlock+0x5/0x10 [zfs] [12039.818819] ? dmu_read_uio_dnode+0xf1/0x130 [zfs] [12039.823802] ? kfree+0xd3/0x250 [12039.826948] ? xas_load+0x8/0x80 [12039.830182] ? find_get_entry+0xd6/0x1c0 [12039.834106] ? _cond_resched+0x15/0x30 [12039.837852] spl_assert+0x17/0x20 [zfs] [12039.841885] mappedread+0x136/0x140 [zfs] [12039.846077] zfs_read+0x165/0x2e0 [zfs] [12039.850100] zpl_iter_read+0xa8/0x110 [zfs] [12039.854456] ? __handle_mm_fault+0x44f/0x6c0 [12039.858729] new_sync_read+0x10f/0x150 [12039.862484] vfs_read+0x91/0x140 [12039.865716] ksys_read+0x4f/0xb0 [12039.868946] do_syscall_64+0x5b/0x1a0 [12039.872614] entry_SYSCALL_64_after_hwframe+0x65/0xca [12039.877668] RIP: 0033:0x7fa34855cab4 [12039.881246] Code: c3 0f 1f 44 00 00 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 7b fc ff ff 4c 89 e2 48 89 ee 89 df 41 89 c0 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 38 44 89 c7 48 89 44 24 08 e8 b7 fc ff ff 48 [12039.899990] RSP: 002b:00007ffc7a17caf0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [12039.907550] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fa34855cab4 [12039.914680] RDX: 0000000000100000 RSI: 00007fa2ddc99010 RDI: 0000000000000005 [12039.921804] RBP: 00007fa2ddc99010 R08: 0000000000000000 R09: 0000000000000000 [12039.928928] R10: 0000000020f2cfd0 R11: 0000000000000246 R12: 0000000000100000 [12039.936052] R13: 000055f6f3432ec0 R14: 0000000000100000 R15: 000055f6f3432ee8 [12166.129858] INFO: task fio:1125717 blocked for more than 120 seconds. [12166.136307] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12166.144043] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12166.151870] task:fio state:D stack: 0 pid:1125717 ppid:1125444 flags:0x00000080 [12166.160478] Call Trace: [12166.162930] __schedule+0x2d1/0x830 [12166.166433] schedule+0x35/0xa0 [12166.169585] cv_wait_common+0x153/0x240 [spl] [12166.173954] ? finish_wait+0x80/0x80 [12166.177539] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12166.183054] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12166.188461] zfs_write+0x615/0xd70 [zfs] [12166.192570] zpl_iter_write+0xe0/0x120 [zfs] [12166.197032] ? __handle_mm_fault+0x44f/0x6c0 [12166.201303] new_sync_write+0x112/0x160 [12166.205146] vfs_write+0xa5/0x1a0 [12166.208473] ksys_write+0x4f/0xb0 [12166.211793] do_syscall_64+0x5b/0x1a0 [12166.215465] entry_SYSCALL_64_after_hwframe+0x65/0xca [12166.220520] RIP: 0033:0x7fcb60d50a17 [12166.224103] Code: Unable to access opcode bytes at RIP 0x7fcb60d509ed. [12166.230633] RSP: 002b:00007ffce0bbd250 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [12166.238199] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fcb60d50a17 [12166.245331] RDX: 0000000000100000 RSI: 00007fcaf648d010 RDI: 0000000000000005 [12166.252463] RBP: 00007fcaf648d010 R08: 0000000000000000 R09: 0000000000000000 [12166.259598] R10: 000055aeb180bf80 R11: 0000000000000293 R12: 0000000000100000 [12166.266729] R13: 000055aeb1803ec0 R14: 0000000000100000 R15: 000055aeb1803ee8 [12166.273873] INFO: task fio:1125719 blocked for more than 120 seconds. [12166.280317] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12166.288057] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12166.295882] task:fio state:D stack: 0 pid:1125719 ppid:1125447 flags:0x80004080 [12166.304489] Call Trace: [12166.306942] __schedule+0x2d1/0x830 [12166.310435] schedule+0x35/0xa0 [12166.313580] spl_panic+0xe6/0xe8 [spl] [12166.317341] ? _cond_resched+0x15/0x30 [12166.321094] ? mutex_lock+0xe/0x30 [12166.324501] ? arc_cksum_compute+0xcb/0x180 [zfs] [12166.329379] ? __raw_spin_unlock+0x5/0x10 [zfs] [12166.334078] ? dmu_read_uio_dnode+0xf1/0x130 [zfs] [12166.339042] ? kfree+0xd3/0x250 [12166.342187] ? xas_load+0x8/0x80 [12166.345421] ? find_get_entry+0xd6/0x1c0 [12166.349346] ? _cond_resched+0x15/0x30 [12166.353102] spl_assert+0x17/0x20 [zfs] [12166.357130] mappedread+0x136/0x140 [zfs] [12166.361324] zfs_read+0x165/0x2e0 [zfs] [12166.365347] zpl_iter_read+0xa8/0x110 [zfs] [12166.369706] ? __handle_mm_fault+0x44f/0x6c0 [12166.373978] new_sync_read+0x10f/0x150 [12166.377731] vfs_read+0x91/0x140 [12166.380964] ksys_read+0x4f/0xb0 [12166.384196] do_syscall_64+0x5b/0x1a0 [12166.387863] entry_SYSCALL_64_after_hwframe+0x65/0xca [12166.392913] RIP: 0033:0x7fa34855cab4 [12166.396494] Code: Unable to access opcode bytes at RIP 0x7fa34855ca8a. [12166.403018] RSP: 002b:00007ffc7a17caf0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [12166.410586] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fa34855cab4 [12166.417717] RDX: 0000000000100000 RSI: 00007fa2ddc99010 RDI: 0000000000000005 [12166.424852] RBP: 00007fa2ddc99010 R08: 0000000000000000 R09: 0000000000000000 [12166.431985] R10: 0000000020f2cfd0 R11: 0000000000000246 R12: 0000000000100000 [12166.439117] R13: 000055f6f3432ec0 R14: 0000000000100000 R15: 000055f6f3432ee8 [12289.008759] INFO: task kworker/u128:1:32743 blocked for more than 120 seconds. [12289.015983] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12289.023722] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12289.031550] task:kworker/u128:1 state:D stack: 0 pid:32743 ppid: 2 flags:0x80004080 [12289.039905] Workqueue: writeback wb_workfn (flush-zfs-915) [12289.045397] Call Trace: [12289.047850] __schedule+0x2d1/0x830 [12289.051343] schedule+0x35/0xa0 [12289.054491] cv_wait_common+0x153/0x240 [spl] [12289.058859] ? finish_wait+0x80/0x80 [12289.062435] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12289.067949] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12289.073357] zfs_putpage+0x13b/0x5b0 [zfs] [12289.077640] ? pmdp_collapse_flush+0x10/0x10 [12289.081909] ? rmap_walk_file+0x116/0x290 [12289.085922] ? __mod_memcg_lruvec_state+0x5d/0x160 [12289.090715] zpl_putpage+0x67/0xd0 [zfs] [12289.094817] write_cache_pages+0x197/0x420 [12289.098916] ? zpl_readpage_filler+0x10/0x10 [zfs] [12289.103890] zpl_writepages+0x98/0x130 [zfs] [12289.108335] do_writepages+0xc2/0x1c0 [12289.112000] ? __wb_calc_thresh+0x3a/0x120 [12289.116100] __writeback_single_inode+0x39/0x2f0 [12289.120719] writeback_sb_inodes+0x1e6/0x450 [12289.124993] __writeback_inodes_wb+0x5f/0xc0 [12289.129263] wb_writeback+0x247/0x2e0 [12289.132931] wb_workfn+0x346/0x4d0 [12289.136337] ? __switch_to_asm+0x35/0x70 [12289.140262] ? __switch_to_asm+0x41/0x70 [12289.144189] ? __switch_to_asm+0x35/0x70 [12289.148116] ? __switch_to_asm+0x41/0x70 [12289.152042] ? __switch_to_asm+0x35/0x70 [12289.155964] ? __switch_to_asm+0x41/0x70 [12289.159893] ? __switch_to_asm+0x35/0x70 [12289.163819] ? __switch_to_asm+0x41/0x70 [12289.167748] process_one_work+0x1a7/0x360 [12289.171766] ? create_worker+0x1a0/0x1a0 [12289.175690] worker_thread+0x30/0x390 [12289.179357] ? create_worker+0x1a0/0x1a0 [12289.183284] kthread+0x10a/0x120 [12289.186514] ? set_kthread_struct+0x40/0x40 [12289.190702] ret_from_fork+0x35/0x40 [12289.194323] INFO: task fio:1125717 blocked for more than 120 seconds. [12289.200766] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12289.208504] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12289.216331] task:fio state:D stack: 0 pid:1125717 ppid:1125444 flags:0x00000080 [12289.224935] Call Trace: [12289.227390] __schedule+0x2d1/0x830 [12289.230882] schedule+0x35/0xa0 [12289.234026] cv_wait_common+0x153/0x240 [spl] [12289.238396] ? finish_wait+0x80/0x80 [12289.241975] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12289.247471] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12289.252874] zfs_write+0x615/0xd70 [zfs] [12289.256979] zpl_iter_write+0xe0/0x120 [zfs] [12289.261429] ? __handle_mm_fault+0x44f/0x6c0 [12289.265708] new_sync_write+0x112/0x160 [12289.269553] vfs_write+0xa5/0x1a0 [12289.272869] ksys_write+0x4f/0xb0 [12289.276191] do_syscall_64+0x5b/0x1a0 [12289.279856] entry_SYSCALL_64_after_hwframe+0x65/0xca [12289.284908] RIP: 0033:0x7fcb60d50a17 [12289.288493] Code: Unable to access opcode bytes at RIP 0x7fcb60d509ed. [12289.295022] RSP: 002b:00007ffce0bbd250 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [12289.302590] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fcb60d50a17 [12289.309723] RDX: 0000000000100000 RSI: 00007fcaf648d010 RDI: 0000000000000005 [12289.316853] RBP: 00007fcaf648d010 R08: 0000000000000000 R09: 0000000000000000 [12289.323987] R10: 000055aeb180bf80 R11: 0000000000000293 R12: 0000000000100000 [12289.331121] R13: 000055aeb1803ec0 R14: 0000000000100000 R15: 000055aeb1803ee8 [12289.338253] INFO: task fio:1125719 blocked for more than 120 seconds. [12289.344693] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12289.352431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12289.360256] task:fio state:D stack: 0 pid:1125719 ppid:1125447 flags:0x80004080 [12289.368864] Call Trace: [12289.371316] __schedule+0x2d1/0x830 [12289.374808] schedule+0x35/0xa0 [12289.377953] spl_panic+0xe6/0xe8 [spl] [12289.381717] ? _cond_resched+0x15/0x30 [12289.385467] ? mutex_lock+0xe/0x30 [12289.388873] ? arc_cksum_compute+0xcb/0x180 [zfs] [12289.393756] ? __raw_spin_unlock+0x5/0x10 [zfs] [12289.398451] ? dmu_read_uio_dnode+0xf1/0x130 [zfs] [12289.403423] ? kfree+0xd3/0x250 [12289.406571] ? xas_load+0x8/0x80 [12289.409806] ? find_get_entry+0xd6/0x1c0 [12289.413732] ? _cond_resched+0x15/0x30 [12289.417484] spl_assert+0x17/0x20 [zfs] [12289.421511] mappedread+0x136/0x140 [zfs] [12289.425709] zfs_read+0x165/0x2e0 [zfs] [12289.429731] zpl_iter_read+0xa8/0x110 [zfs] [12289.434097] ? __handle_mm_fault+0x44f/0x6c0 [12289.438369] new_sync_read+0x10f/0x150 [12289.442121] vfs_read+0x91/0x140 [12289.445353] ksys_read+0x4f/0xb0 [12289.448588] do_syscall_64+0x5b/0x1a0 [12289.452254] entry_SYSCALL_64_after_hwframe+0x65/0xca [12289.457307] RIP: 0033:0x7fa34855cab4 [12289.460885] Code: Unable to access opcode bytes at RIP 0x7fa34855ca8a. [12289.467411] RSP: 002b:00007ffc7a17caf0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [12289.474977] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fa34855cab4 [12289.482110] RDX: 0000000000100000 RSI: 00007fa2ddc99010 RDI: 0000000000000005 [12289.489241] RBP: 00007fa2ddc99010 R08: 0000000000000000 R09: 0000000000000000 [12289.496376] R10: 0000000020f2cfd0 R11: 0000000000000246 R12: 0000000000100000 [12289.503506] R13: 000055f6f3432ec0 R14: 0000000000100000 R15: 000055f6f3432ee8 [12411.887657] INFO: task kworker/u128:1:32743 blocked for more than 120 seconds. [12411.894885] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12411.902627] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12411.910450] task:kworker/u128:1 state:D stack: 0 pid:32743 ppid: 2 flags:0x80004080 [12411.918798] Workqueue: writeback wb_workfn (flush-zfs-915) [12411.924292] Call Trace: [12411.926747] __schedule+0x2d1/0x830 [12411.930247] schedule+0x35/0xa0 [12411.933394] cv_wait_common+0x153/0x240 [spl] [12411.937758] ? finish_wait+0x80/0x80 [12411.941339] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12411.946852] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12411.952260] zfs_putpage+0x13b/0x5b0 [zfs] [12411.956541] ? pmdp_collapse_flush+0x10/0x10 [12411.960821] ? rmap_walk_file+0x116/0x290 [12411.964833] ? __mod_memcg_lruvec_state+0x5d/0x160 [12411.969627] zpl_putpage+0x67/0xd0 [zfs] [12411.973734] write_cache_pages+0x197/0x420 [12411.977835] ? zpl_readpage_filler+0x10/0x10 [zfs] [12411.982802] zpl_writepages+0x98/0x130 [zfs] [12411.987246] do_writepages+0xc2/0x1c0 [12411.990910] ? __wb_calc_thresh+0x3a/0x120 [12411.995013] __writeback_single_inode+0x39/0x2f0 [12411.999632] writeback_sb_inodes+0x1e6/0x450 [12412.003904] __writeback_inodes_wb+0x5f/0xc0 [12412.008176] wb_writeback+0x247/0x2e0 [12412.011841] wb_workfn+0x346/0x4d0 [12412.015248] ? __switch_to_asm+0x35/0x70 [12412.019175] ? __switch_to_asm+0x41/0x70 [12412.023098] ? __switch_to_asm+0x35/0x70 [12412.027027] ? __switch_to_asm+0x41/0x70 [12412.030952] ? __switch_to_asm+0x35/0x70 [12412.034879] ? __switch_to_asm+0x41/0x70 [12412.038805] ? __switch_to_asm+0x35/0x70 [12412.042731] ? __switch_to_asm+0x41/0x70 [12412.046656] process_one_work+0x1a7/0x360 [12412.050671] ? create_worker+0x1a0/0x1a0 [12412.054594] worker_thread+0x30/0x390 [12412.058260] ? create_worker+0x1a0/0x1a0 [12412.062186] kthread+0x10a/0x120 [12412.065421] ? set_kthread_struct+0x40/0x40 [12412.069607] ret_from_fork+0x35/0x40 [12412.073227] INFO: task fio:1125717 blocked for more than 120 seconds. [12412.079667] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12412.087406] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12412.095233] task:fio state:D stack: 0 pid:1125717 ppid:1125444 flags:0x00000080 [12412.103838] Call Trace: [12412.106291] __schedule+0x2d1/0x830 [12412.109785] schedule+0x35/0xa0 [12412.112930] cv_wait_common+0x153/0x240 [spl] [12412.117299] ? finish_wait+0x80/0x80 [12412.120877] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12412.126373] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12412.131778] zfs_write+0x615/0xd70 [zfs] [12412.135881] zpl_iter_write+0xe0/0x120 [zfs] [12412.140336] ? __handle_mm_fault+0x44f/0x6c0 [12412.144607] new_sync_write+0x112/0x160 [12412.148446] vfs_write+0xa5/0x1a0 [12412.151765] ksys_write+0x4f/0xb0 [12412.155085] do_syscall_64+0x5b/0x1a0 [12412.158753] entry_SYSCALL_64_after_hwframe+0x65/0xca [12412.163811] RIP: 0033:0x7fcb60d50a17 [12412.167395] Code: Unable to access opcode bytes at RIP 0x7fcb60d509ed. [12412.173916] RSP: 002b:00007ffce0bbd250 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [12412.181481] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fcb60d50a17 [12412.188614] RDX: 0000000000100000 RSI: 00007fcaf648d010 RDI: 0000000000000005 [12412.195749] RBP: 00007fcaf648d010 R08: 0000000000000000 R09: 0000000000000000 [12412.202882] R10: 000055aeb180bf80 R11: 0000000000000293 R12: 0000000000100000 [12412.210014] R13: 000055aeb1803ec0 R14: 0000000000100000 R15: 000055aeb1803ee8 [12412.217146] INFO: task fio:1125719 blocked for more than 120 seconds. [12412.223587] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12412.231325] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12412.239150] task:fio state:D stack: 0 pid:1125719 ppid:1125447 flags:0x80004080 [12412.247755] Call Trace: [12412.250213] __schedule+0x2d1/0x830 [12412.253713] schedule+0x35/0xa0 [12412.256867] spl_panic+0xe6/0xe8 [spl] [12412.260639] ? _cond_resched+0x15/0x30 [12412.264398] ? mutex_lock+0xe/0x30 [12412.267803] ? arc_cksum_compute+0xcb/0x180 [zfs] [12412.272704] ? __raw_spin_unlock+0x5/0x10 [zfs] [12412.277417] ? dmu_read_uio_dnode+0xf1/0x130 [zfs] [12412.282396] ? kfree+0xd3/0x250 [12412.285543] ? xas_load+0x8/0x80 [12412.288785] ? find_get_entry+0xd6/0x1c0 [12412.292711] ? _cond_resched+0x15/0x30 [12412.296465] spl_assert+0x17/0x20 [zfs] [12412.300494] mappedread+0x136/0x140 [zfs] [12412.304690] zfs_read+0x165/0x2e0 [zfs] [12412.308728] zpl_iter_read+0xa8/0x110 [zfs] [12412.313105] ? __handle_mm_fault+0x44f/0x6c0 [12412.317379] new_sync_read+0x10f/0x150 [12412.321138] vfs_read+0x91/0x140 [12412.324372] ksys_read+0x4f/0xb0 [12412.327613] do_syscall_64+0x5b/0x1a0 [12412.331279] entry_SYSCALL_64_after_hwframe+0x65/0xca [12412.336334] RIP: 0033:0x7fa34855cab4 [12412.339912] Code: Unable to access opcode bytes at RIP 0x7fa34855ca8a. [12412.346437] RSP: 002b:00007ffc7a17caf0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [12412.354002] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fa34855cab4 [12412.361135] RDX: 0000000000100000 RSI: 00007fa2ddc99010 RDI: 0000000000000005 [12412.368268] RBP: 00007fa2ddc99010 R08: 0000000000000000 R09: 0000000000000000 [12412.375402] R10: 0000000020f2cfd0 R11: 0000000000000246 R12: 0000000000100000 [12412.382534] R13: 000055f6f3432ec0 R14: 0000000000100000 R15: 000055f6f3432ee8 [12534.766588] INFO: task kworker/u128:1:32743 blocked for more than 120 seconds. [12534.773814] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12534.781556] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12534.789382] task:kworker/u128:1 state:D stack: 0 pid:32743 ppid: 2 flags:0x80004080 [12534.797737] Workqueue: writeback wb_workfn (flush-zfs-915) [12534.803228] Call Trace: [12534.805684] __schedule+0x2d1/0x830 [12534.809183] schedule+0x35/0xa0 [12534.812330] cv_wait_common+0x153/0x240 [spl] [12534.816698] ? finish_wait+0x80/0x80 [12534.820287] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12534.825806] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12534.831216] zfs_putpage+0x13b/0x5b0 [zfs] [12534.835498] ? pmdp_collapse_flush+0x10/0x10 [12534.839777] ? rmap_walk_file+0x116/0x290 [12534.843790] ? __mod_memcg_lruvec_state+0x5d/0x160 [12534.848581] zpl_putpage+0x67/0xd0 [zfs] [12534.852692] write_cache_pages+0x197/0x420 [12534.856798] ? zpl_readpage_filler+0x10/0x10 [zfs] [12534.861764] zpl_writepages+0x98/0x130 [zfs] [12534.866210] do_writepages+0xc2/0x1c0 [12534.869878] ? __wb_calc_thresh+0x3a/0x120 [12534.873975] __writeback_single_inode+0x39/0x2f0 [12534.878594] writeback_sb_inodes+0x1e6/0x450 [12534.882866] __writeback_inodes_wb+0x5f/0xc0 [12534.887137] wb_writeback+0x247/0x2e0 [12534.890806] wb_workfn+0x346/0x4d0 [12534.894213] ? __switch_to_asm+0x35/0x70 [12534.898136] ? __switch_to_asm+0x41/0x70 [12534.902064] ? __switch_to_asm+0x35/0x70 [12534.905990] ? __switch_to_asm+0x41/0x70 [12534.909915] ? __switch_to_asm+0x35/0x70 [12534.913840] ? __switch_to_asm+0x41/0x70 [12534.917768] ? __switch_to_asm+0x35/0x70 [12534.921696] ? __switch_to_asm+0x41/0x70 [12534.925620] process_one_work+0x1a7/0x360 [12534.929635] ? create_worker+0x1a0/0x1a0 [12534.933559] worker_thread+0x30/0x390 [12534.937226] ? create_worker+0x1a0/0x1a0 [12534.941152] kthread+0x10a/0x120 [12534.944392] ? set_kthread_struct+0x40/0x40 [12534.948576] ret_from_fork+0x35/0x40 [12534.952215] INFO: task fio:1125717 blocked for more than 120 seconds. [12534.958656] Tainted: P OE --------- - - 4.18.0-408.el8.x86_64 #1 [12534.966395] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [12534.974219] task:fio state:D stack: 0 pid:1125717 ppid:1125444 flags:0x00000080 [12534.982828] Call Trace: [12534.985282] __schedule+0x2d1/0x830 [12534.988770] schedule+0x35/0xa0 [12534.991920] cv_wait_common+0x153/0x240 [spl] [12534.996287] ? finish_wait+0x80/0x80 [12534.999867] zfs_rangelock_enter_writer+0x46/0x1b0 [zfs] [12535.005359] zfs_rangelock_enter_impl+0x110/0x170 [zfs] [12535.010757] zfs_write+0x615/0xd70 [zfs] [12535.014861] zpl_iter_write+0xe0/0x120 [zfs] [12535.019307] ? __handle_mm_fault+0x44f/0x6c0 [12535.023576] new_sync_write+0x112/0x160 [12535.027418] vfs_write+0xa5/0x1a0 [12535.030735] ksys_write+0x4f/0xb0 [12535.034054] do_syscall_64+0x5b/0x1a0 [12535.037722] entry_SYSCALL_64_after_hwframe+0x65/0xca [12535.042773] RIP: 0033:0x7fcb60d50a17 [12535.046359] Code: Unable to access opcode bytes at RIP 0x7fcb60d509ed. [12535.052889] RSP: 002b:00007ffce0bbd250 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [12535.060454] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007fcb60d50a17 [12535.067585] RDX: 0000000000100000 RSI: 00007fcaf648d010 RDI: 0000000000000005 [12535.074718] RBP: 00007fcaf648d010 R08: 0000000000000000 R09: 0000000000000000 [12535.081851] R10: 000055aeb180bf80 R11: 0000000000000293 R12: 0000000000100000 [12535.088987] R13: 000055aeb1803ec0 R14: 0000000000100000 R15: 000055aeb1803ee8 Signed-off-by: Brian Atkinson --- tests/runfiles/common.run | 4 + tests/zfs-tests/tests/Makefile.am | 5 + .../tests/functional/direct_mmap/cleanup.ksh | 31 ++ .../tests/functional/direct_mmap/dio.cfg | 26 ++ .../tests/functional/direct_mmap/dio.kshlib | 302 ++++++++++++++++++ .../tests/functional/direct_mmap/dio_mmap.ksh | 88 +++++ .../tests/functional/direct_mmap/setup.ksh | 32 ++ 7 files changed, 488 insertions(+) create mode 100755 tests/zfs-tests/tests/functional/direct_mmap/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/direct_mmap/dio.cfg create mode 100644 tests/zfs-tests/tests/functional/direct_mmap/dio.kshlib create mode 100755 tests/zfs-tests/tests/functional/direct_mmap/dio_mmap.ksh create mode 100755 tests/zfs-tests/tests/functional/direct_mmap/setup.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 005c539fc89d..2b4497133d1c 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -606,6 +606,10 @@ pre = post = tags = ['functional', 'deadman'] +[tests/functional/direct_mmap] +tests = ['dio_mmap'] +tags = ['functional', 'direct_mmap'] + [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbe94f9177ae..b0fee2708b77 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -259,6 +259,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ + functional/direct_mmap/dio.cfg \ + functional/direct_mmap/dio.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ @@ -1359,6 +1361,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ + functional/direct_mmap/cleanup.ksh \ + functional/direct_mmap/dio_mmap.ksh \ + functional/direct_mmap/setup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/direct_mmap/cleanup.ksh b/tests/zfs-tests/tests/functional/direct_mmap/cleanup.ksh new file mode 100755 index 000000000000..382e9b1734b0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct_mmap/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/direct_mmap/dio.cfg b/tests/zfs-tests/tests/functional/direct_mmap/dio.cfg new file mode 100644 index 000000000000..6472610d7b41 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct_mmap/dio.cfg @@ -0,0 +1,26 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +DIO_VDEV1=$TEST_BASE_DIR/file1 +DIO_VDEV2=$TEST_BASE_DIR/file2 +DIO_VDEV3=$TEST_BASE_DIR/file3 +DIO_VDEVS="$DIO_VDEV1 $DIO_VDEV2 $DIO_VDEV3" + +DIO_FILESIZE=4M +DIO_BS=128K diff --git a/tests/zfs-tests/tests/functional/direct_mmap/dio.kshlib b/tests/zfs-tests/tests/functional/direct_mmap/dio.kshlib new file mode 100644 index 000000000000..caf4b782f660 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct_mmap/dio.kshlib @@ -0,0 +1,302 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct_mmap/dio.cfg + +function dio_cleanup +{ + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + rm -f $DIO_VDEVS +} + +# +# Generate an IO workload using fio and then verify the resulting data. +# +function dio_and_verify # mode file-size block-size directory ioengine extra-args +{ + typeset mode=$1 + typeset size=$2 + typeset bs=$3 + typeset mntpnt=$4 + typeset ioengine=$5 + typeset extra_args=$6 + + # Invoke an fio workload via direct IO and verify with direct IO. + log_must fio --directory=$mntpnt --name=direct-$mode \ + --rw=$mode --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --verify=sha1 --ioengine=$ioengine --fallocate=none \ + --group_reporting --minimal --do_verify=1 $extra_args + + # Now just read back the file without direct IO into the ARC as an additional + # verfication step. + log_must fio --directory=$mntpnt --name=direct-$mode \ + --rw=read --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=$ioengine --group_reporting --minimal + + log_must rm -f "$mntpnt/direct-*" +} + +# +# Get zpool status -d checksum verify failures +# +function get_zpool_status_chksum_verify_failures # pool_name vdev_type +{ + typeset pool=$1 + typeset vdev_type=$2 + + if [[ "$vdev_type" == "stripe" ]]; then + val=$(zpool status -dp $pool | \ + awk '{s+=$6} END {print s}' ) + elif [[ "$vdev_type" == "mirror" || "$vdev_type" == "raidz" || + "$vdev_type" == "draid" ]]; then + val=$(zpool status -dp $pool | \ + awk -v d="$vdev_type" '$0 ~ d {print $6}' ) + else + log_fail "Unsupported VDEV type in \ + get_zpool_status_chksum_verify_failures(): $vdev_type" + fi + echo "$val" +} + +# +# Get ZED dio_verify events +# +function get_zed_dio_verify_events # pool +{ + typeset pool=$1 + + val=$(zpool events $pool | grep -c dio_verify) + + echo "$val" +} + +# +# Checking for checksum verify write failures with: +# zpool status -d +# zpool events +# After getting that counts will clear the out the ZPool errors and events +# +function check_dio_write_chksum_verify_failures # pool vdev_type expect_errors +{ + typeset pool=$1 + typeset vdev_type=$2 + typeset expect_errors=$3 + typeset note_str="expecting none" + + if [[ $expect_errors -ne 0 ]]; then + note_str="expecting some" + fi + + log_note "Checking for Direct I/O write checksum verify errors \ + $note_str on ZPool: $pool" + + status_failures=$(get_zpool_status_chksum_verify_failures $pool $vdev_type) + zed_dio_verify_events=$(get_zed_dio_verify_events $pool) + + if [[ $expect_errors -ne 0 ]]; then + if [[ $status_failures -eq 0 || + $zed_dio_verify_events -eq 0 ]]; then + zpool status -dp $pool + zpool events $pool + log_fail "Checksum verifies in zpool status -d \ + $status_failures. ZED dio_verify events \ + $zed_dio_verify_events. Neither should be 0." + fi + else + if [[ $status_failures -ne 0 || + $zed_dio_verify_events -ne 0 ]]; then + zpool status -dp $pool + zpool events $pool + log_fail "Checksum verifies in zpool status -d \ + $status_failures. ZED dio_verify events \ + $zed_dio_verify_events. Both should be zero." + fi + fi + + log_must zpool clear $pool + log_must zpool events -c + +} + +# +# Get the value of a counter from +# Linux: /proc/spl/kstat/zfs/$pool/iostats file. +# FreeBSD: kstat.zfs.$pool.msic.iostats.$stat +# +function get_iostats_stat # pool stat +{ + typeset pool=$1 + typeset stat=$2 + + if is_linux; then + iostats_file=/proc/spl/kstat/zfs/$pool/iostats + val=$(grep -m1 "$stat" $iostats_file | awk '{ print $3 }') + else + val=$(sysctl -n kstat.zfs.$pool.misc.iostats.$stat) + fi + if [[ -z "$val" ]]; then + log_fail "Unable to read $stat counter" + fi + + echo "$val" +} + +# +# Evict any buffered blocks by overwritting them using an O_DIRECT request. +# +function evict_blocks +{ + typeset pool=$1 + typeset file=$2 + typeset size=$3 + + log_must stride_dd -i /dev/urandom -o $file -b $size -c 1 -D +} + +# +# Perform FIO direct IO writes to a file with the given arguments. +# Then verify thae minimum expected number of blocks were written as +# direct IO. +# +function verify_dio_write_count #pool bs size mnpnt +{ + typeset pool=$1 + typeset bs=$2 + typeset size=$3 + typeset mntpnt=$4 + typeset dio_wr_expected=$(((size / bs) -1)) + + log_note "Checking for $dio_wr_expected Direct IO writes" + + prev_dio_wr=$(get_iostats_stat $pool direct_write_count) + dio_and_verify write $size $bs $mntpnt "sync" + curr_dio_wr=$(get_iostats_stat $pool direct_write_count) + dio_wr_actual=$((curr_dio_wr - prev_dio_wr)) + + if [[ $dio_wr_actual -lt $dio_wr_expected ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct writes $dio_wr_actual of $dio_wr_expected" + fi +} + +# +# Perform a stride_dd write command to the file with the given arguments. +# Then verify the minimum expected number of blocks were written as either +# buffered IO (by the ARC), or direct IO to the application (dd). +# +function check_write # pool file bs count seek flags buf_wr dio_wr +{ + typeset pool=$1 + typeset file=$2 + typeset bs=$3 + typeset count=$4 + typeset seek=$5 + typeset flags=$6 + typeset buf_wr_expect=$7 + typeset dio_wr_expect=$8 + + log_note "Checking $count * $bs write(s) at offset $seek, $flags" + + prev_buf_wr=$(get_iostats_stat $pool arc_write_count) + prev_dio_wr=$(get_iostats_stat $pool direct_write_count) + + log_must stride_dd -i /dev/urandom -o $file -b $bs -c $count \ + -k $seek $flags + + curr_buf_wr=$(get_iostats_stat $pool arc_write_count) + buf_wr_actual=$((curr_buf_wr - prev_buf_wr)) + + curr_dio_wr=$(get_iostats_stat $pool direct_write_count) + dio_wr_actual=$((curr_dio_wr - prev_dio_wr)) + + if [[ $buf_wr_actual -lt $buf_wr_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Buffered writes $buf_wr_actual of $buf_wr_expect" + fi + + if [[ $dio_wr_actual -lt $dio_wr_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct writes $dio_wr_actual of $dio_wr_expect" + fi +} + +# +# Perform a stride_dd read command to the file with the given arguments. +# Then verify the minimum expected number of blocks were read as either +# buffered IO (by the ARC), or direct IO to the application (dd). +# +function check_read # pool file bs count skip flags buf_rd dio_rd +{ + typeset pool=$1 + typeset file=$2 + typeset bs=$3 + typeset count=$4 + typeset skip=$5 + typeset flags=$6 + typeset buf_rd_expect=$7 + typeset dio_rd_expect=$8 + + log_note "Checking $count * $bs read(s) at offset $skip, $flags" + + prev_buf_rd=$(get_iostats_stat $pool arc_read_count) + prev_dio_rd=$(get_iostats_stat $pool direct_read_count) + + log_must stride_dd -i $file -o /dev/null -b $bs -c $count \ + -p $skip $flags + + curr_buf_rd=$(get_iostats_stat $pool arc_read_count) + buf_rd_actual=$((curr_buf_rd - prev_buf_rd)) + + curr_dio_rd=$(get_iostats_stat $pool direct_read_count) + dio_rd_actual=$((curr_dio_rd - prev_dio_rd)) + + if [[ $buf_rd_actual -lt $buf_rd_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Buffered reads $buf_rd_actual of $buf_rd_expect" + fi + + if [[ $dio_rd_actual -lt $dio_rd_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct reads $dio_rd_actual of $dio_rd_expect" + fi +} diff --git a/tests/zfs-tests/tests/functional/direct_mmap/dio_mmap.ksh b/tests/zfs-tests/tests/functional/direct_mmap/dio_mmap.ksh new file mode 100755 index 000000000000..139d0ae16b14 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct_mmap/dio_mmap.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct_mmap/dio.cfg +. $STF_SUITE/tests/functional/direct_mmap/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed direct IO and mmap IO. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a background fio randomly direct writing to the file. +# 3. Start a background fio randomly mmap writing to the file. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify mixed direct IO and mmap IO" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((1024 * 1024)) +blocks=32 +size=$((bs * blocks)) +runtime=10 + +log_must dd if=/dev/zero of=$tmp_file bs=$bs count=$blocks + +# Direct IO writes +log_must eval "fio --filename=$tmp_file --name=direct-write \ + --rw=write --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +# Direct IO reads +log_must eval "fio --filename=$tmp_file --name=direct-read \ + --rw=read --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +# mmap IO writes +log_must eval "fio --filename=$tmp_file --name=mmap-write \ + --rw=write --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +# mmap IO reads +log_must eval "fio --filename=$tmp_file --name=mmap-read \ + --rw=read --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +wait + +log_pass "Verfied mixed direct IO and mmap IO" diff --git a/tests/zfs-tests/tests/functional/direct_mmap/setup.ksh b/tests/zfs-tests/tests/functional/direct_mmap/setup.ksh new file mode 100755 index 000000000000..5ce95dddf401 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct_mmap/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +verify_runnable "global" + +default_raidz_setup_noexit "$DISKS" +log_must zfs set compression=off $TESTPOOL/$TESTFS +log_pass