From 5d476a3ad5c42b615b4408a401371ea48a0104a4 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 16 Feb 2022 15:30:45 +1100 Subject: [PATCH 001/334] coredump: also dump first pages of non-executable ELF libraries When I rewrote the VMA dumping logic for coredumps, I changed it to recognize ELF library mappings based on the file being executable instead of the mapping having an ELF header. But turns out, distros ship many ELF libraries as non-executable, so the heuristic goes wrong... Restore the old behavior where FILTER(ELF_HEADERS) dumps the first page of any offset-0 readable mapping that starts with the ELF magic. This fix is technically layer-breaking a bit, because it checks for something ELF-specific in fs/coredump.c; but since we probably want to share this between standard ELF and FDPIC ELF anyway, I guess it's fine? And this also keeps the change small for backporting. Link: https://lkml.kernel.org/r/20220126025739.2014888-1-jannh@google.com Fixes: 429a22e776a2 ("coredump: rework elf/elf_fdpic vma_dump_size() into common helper") Signed-off-by: Jann Horn Reported-by: Bill Messmer Cc: "Eric W . Biederman" Cc: Al Viro Cc: Randy Dunlap Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/coredump.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 1c060c0a2d72f..b73817712dd25 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -980,6 +981,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) return false; } +#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 + /* * Decide how much of @vma's contents should be included in a core dump. */ @@ -1039,9 +1042,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * dump the first page to aid in determining what was mapped here. 
*/ if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && - (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) - return PAGE_SIZE; + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + + /* + * ELF libraries aren't always executable. + * We'll want to check whether the mapping starts with the ELF + * magic, but not now - we're holding the mmap lock, + * so copy_from_user() doesn't work here. + * Use a placeholder instead, and fix it up later in + * dump_vma_snapshot(). + */ + return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; + } #undef FILTER @@ -1116,8 +1130,6 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); - - vma_data_size += m->dump_size; } mmap_write_unlock(mm); @@ -1127,6 +1139,23 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, return -EFAULT; } + for (i = 0; i < *vma_count; i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { + char elfmag[SELFMAG]; + + if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || + memcmp(elfmag, ELFMAG, SELFMAG) != 0) { + m->dump_size = 0; + } else { + m->dump_size = PAGE_SIZE; + } + } + + vma_data_size += m->dump_size; + } + *vma_data_size_ptr = vma_data_size; return 0; } From d4147151423107a46b884ae36a859481f0fc8e62 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Wed, 16 Feb 2022 15:30:46 +1100 Subject: [PATCH 002/334] mm: fix panic in __alloc_pages There is a kernel panic caused by pcpu_alloc_pages() passing offlined and uninitialized node to alloc_pages_node() leading to panic by NULL dereferencing uninitialized NODE_DATA(nid). 
CPU2 has been hot-added BUG: unable to handle page fault for address: 0000000000001608 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 1 Comm: systemd Tainted: G E 5.15.0-rc7+ #11 Hardware name: VMware, Inc. VMware7,1/440BX Desktop Reference Platform, BIOS VMW RIP: 0010:__alloc_pages+0x127/0x290 Code: 4c 89 f0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 44 89 e0 48 8b 55 b8 c1 e8 0c 83 e0 01 88 45 d0 4c 89 c8 48 85 d2 0f 85 1a 01 00 00 <45> 3b 41 08 0f 82 10 01 00 00 48 89 45 c0 48 8b 00 44 89 e2 81 e2 RSP: 0018:ffffc900006f3bc8 EFLAGS: 00010246 RAX: 0000000000001600 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000cc2 RBP: ffffc900006f3c18 R08: 0000000000000001 R09: 0000000000001600 R10: ffffc900006f3a40 R11: ffff88813c9fffe8 R12: 0000000000000cc2 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000cc2 FS: 00007f27ead70500(0000) GS:ffff88807ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000001608 CR3: 000000000582c003 CR4: 00000000001706b0 Call Trace: pcpu_alloc_pages.constprop.0+0xe4/0x1c0 pcpu_populate_chunk+0x33/0xb0 pcpu_alloc+0x4d3/0x6f0 __alloc_percpu_gfp+0xd/0x10 alloc_mem_cgroup_per_node_info+0x54/0xb0 mem_cgroup_alloc+0xed/0x2f0 mem_cgroup_css_alloc+0x33/0x2f0 css_create+0x3a/0x1f0 cgroup_apply_control_enable+0x12b/0x150 cgroup_mkdir+0xdd/0x110 kernfs_iop_mkdir+0x4f/0x80 vfs_mkdir+0x178/0x230 do_mkdirat+0xfd/0x120 __x64_sys_mkdir+0x47/0x70 ? syscall_exit_to_user_mode+0x21/0x50 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Panic can be easily reproduced by disabling udev rule for automatic onlining hot added CPU followed by CPU with memoryless node (NUMA node with CPU only) hot add. Hot adding CPU and memoryless node does not bring the node to online state. Memoryless node will be onlined only during the onlining its CPU. 
A node can be in one of the following states: 1. not present (nid == NUMA_NO_NODE) 2. present, but offline (nid > NUMA_NO_NODE, node_online(nid) == 0, NODE_DATA(nid) == NULL) 3. present and online (nid > NUMA_NO_NODE, node_online(nid) > 0, NODE_DATA(nid) != NULL) Percpu code is doing allocations for all possible CPUs. The issue happens when it serves a hot-added but not yet onlined CPU whose node is in the 2nd state. Such a node is not ready for use, so fall back to numa_mem_id(). Link: https://lkml.kernel.org/r/20211108202325.20304-1-amakhalov@vmware.com Signed-off-by: Alexey Makhalov Reviewed-by: David Hildenbrand Acked-by: Dennis Zhou Cc: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Cc: Tejun Heo Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/percpu-vm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 2054c9213c433..f58d73c927892 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -84,15 +84,19 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, gfp_t gfp) { unsigned int cpu, tcpu; - int i; + int i, nid; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + if (nid == NUMA_NO_NODE || !node_online(nid)) + nid = numa_mem_id(); + for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; - *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + *pagep = alloc_pages_node(nid, gfp, 0); if (!*pagep) goto err; } From bccf1afab76c88c538618d5c30cfda8131c6c671 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 16 Feb 2022 15:30:46 +1100 Subject: [PATCH 003/334] selftests/vm: cleanup hugetlb file after mremap test The hugepage-mremap test will create a file in a hugetlb filesystem. In a default 'run_vmtests' run, the file will contain all the hugetlb pages. After the test, the file remains and there are no free hugetlb pages for subsequent tests. This causes those hugetlb tests to fail.
Change hugepage-mremap to take the name of the hugetlb file as an argument. Unlink the file within the test, and just to be sure remove the file in the run_vmtests script. Link: https://lkml.kernel.org/r/20220201033459.156944-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Shuah Khan Acked-by: Yosry Ahmed Reviewed-by: Muchun Song Cc: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/hugepage-mremap.c | 26 ++++++++++++++------ tools/testing/selftests/vm/run_vmtests.sh | 3 ++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 2a7c33631a298..1d689084a54ba 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -3,9 +3,10 @@ * hugepage-mremap: * * Example of remapping huge page memory in a user application using the - * mremap system call. Code assumes a hugetlbfs filesystem is mounted - * at './huge'. The amount of memory used by this test is decided by a command - * line argument in MBs. If missing, the default amount is 10MB. + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. If no memory + * amount is passed, the default amount is 10MB. * * To make sure the test triggers pmd sharing and goes through the 'unshare' * path in the mremap code use 1GB (1024) or more. 
@@ -25,7 +26,6 @@ #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) -#define FILE_NAME "huge/hugepagefile" #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) #define FLAGS (MAP_SHARED | MAP_ANONYMOUS) @@ -107,17 +107,26 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { + size_t length = 0; + + if (argc != 2 && argc != 3) { + printf("Usage: %s [length_in_MB] <hugetlb_file>\n", argv[0]); + exit(1); + } + /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. Any additional args are ignored. + * the default length. */ - size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL; + if (argc == 3) + length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL; length = length > 0 ? length : DEFAULT_LENGTH_MB; length = MB_TO_BYTES(length); int ret = 0; - int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); + /* last arg is the hugetlb file name */ + int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755); if (fd < 0) { perror("Open failed"); @@ -169,5 +178,8 @@ int main(int argc, char *argv[]) munmap(addr, length); + close(fd); + unlink(argv[argc-1]); + return ret; } diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 75d4017413944..71d2dc198fc17 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -111,13 +111,14 @@ fi echo "-----------------------" echo "running hugepage-mremap" echo "-----------------------" -./hugepage-mremap 256 +./hugepage-mremap $mnt/huge_mremap if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 else echo "[PASS]" fi +rm -f $mnt/huge_mremap echo "NOTE: The above hugetlb tests provide minimal coverage.
Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" From 050f689e13aa67aec9c848f4b1d795324374b5e9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 16 Feb 2022 15:30:46 +1100 Subject: [PATCH 004/334] mm/hugetlb: fix kernel crash with hugetlb mremap This fixes the below crash: kernel BUG at include/linux/mm.h:2373! cpu 0x5d: Vector: 700 (Program Check) at [c00000003c6e76e0] pc: c000000000581a54: pmd_to_page+0x54/0x80 lr: c00000000058d184: move_hugetlb_page_tables+0x4e4/0x5b0 sp: c00000003c6e7980 msr: 9000000000029033 current = 0xc00000003bd8d980 paca = 0xc000200fff610100 irqmask: 0x03 irq_happened: 0x01 pid = 9349, comm = hugepage-mremap kernel BUG at include/linux/mm.h:2373! [link register ] c00000000058d184 move_hugetlb_page_tables+0x4e4/0x5b0 [c00000003c6e7980] c00000000058cecc move_hugetlb_page_tables+0x22c/0x5b0 (unreliable) [c00000003c6e7a90] c00000000053b78c move_page_tables+0xdbc/0x1010 [c00000003c6e7bd0] c00000000053bc34 move_vma+0x254/0x5f0 [c00000003c6e7c90] c00000000053c790 sys_mremap+0x7c0/0x900 [c00000003c6e7db0] c00000000002c450 system_call_exception+0x160/0x2c0 The kernel can't use huge_pte_offset() before it sets the pte entry, because a page table lookup checks for the huge PTE bit in the page table to differentiate between a huge pte entry and a pointer to a pte page. huge_pte_alloc() won't mark the page table entry huge, and hence the kernel should not use huge_pte_offset() after a huge_pte_alloc().
Link: https://lkml.kernel.org/r/20220211063221.99293-1-aneesh.kumar@linux.ibm.com Fixes: 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Aneesh Kumar K.V Reviewed-by: Mike Kravetz Reviewed-by: Mina Almasry Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d098..e57650a9404f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4851,14 +4851,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pte_t *src_pte) + unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; - pte_t *dst_pte, pte; spinlock_t *src_ptl, *dst_ptl; + pte_t pte; - dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h)); dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); @@ -4917,7 +4916,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (!dst_pte) break; - move_huge_pte(vma, old_addr, new_addr, src_pte); + move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); } flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); From 91bc23179ce07d91cf9312ed26a5ef243020de74 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:47 +1100 Subject: [PATCH 005/334] kasan: test: prevent cache merging in kmem_cache_double_destroy With HW_TAGS KASAN and kasan.stacktrace=off, the cache created in the kmem_cache_double_destroy() test might get merged with an existing one. Thus, the first kmem_cache_destroy() call won't actually destroy it but will only decrease the refcount. This causes the test to fail. Provide an empty constructor for the created cache to prevent the cache from getting merged. 
Link: https://lkml.kernel.org/r/b597bd434c49591d8af00ee3993a42c609dc9a59.1644346040.git.andreyknvl@google.com Fixes: f98f966cd750 ("kasan: test: add test case for double-kmem_cache_destroy()") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 26a5c9007653a..3b413f8c8a715 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -869,11 +869,14 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void empty_cache_ctor(void *object) { } + static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; - cache = kmem_cache_create("test_cache", 200, 0, 0, NULL); + /* Provide a constructor to prevent cache merging. */ + cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); kmem_cache_destroy(cache); KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); From 97dc7acd104b2c0519634135fbbf510fab5b4886 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 16 Feb 2022 15:30:47 +1100 Subject: [PATCH 006/334] mm: fix use-after-free when anon vma name is used after vma is freed When adjacent vmas are being merged it can result in the vma that was originally passed to madvise_update_vma being destroyed. In the current implementation, the name parameter passed to madvise_update_vma points directly to vma->anon_name->name and it is used after the call to vma_merge. 
In the cases when vma_merge merges the original vma and destroys it, this will result in use-after-free bug as shown below: madvise_vma_behavior << passes vma->anon_name->name as name param madvise_update_vma(name) vma_merge __vma_adjust vm_area_free <-- frees the vma replace_vma_anon_name(name) <-- UAF Fix this by raising the name refcount and stabilizing it. Introduce vma_anon_name_{get/put} API for this purpose. Link: https://lkml.kernel.org/r/20220211013032.623763-1-surenb@google.com Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory") Signed-off-by: Suren Baghdasaryan Reported-by: syzbot+aa7b3d4b35f9dc46a366@syzkaller.appspotmail.com Cc: Colin Cross Cc: Sumit Semwal Cc: Michal Hocko Cc: Dave Hansen Cc: Kees Cook Cc: Matthew Wilcox Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Johannes Weiner Cc: "Eric W. Biederman" Cc: Christian Brauner Cc: Alexey Gladkov Cc: Sasha Levin Cc: Chris Hyser Cc: Davidlohr Bueso Cc: Peter Collingbourne Cc: Xiaofeng Cao Cc: David Hildenbrand Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm_inline.h | 13 ++++++++ mm/madvise.c | 67 +++++++++++++++++++++++++++++---------- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe715..2ad9b28499b13 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -145,6 +145,11 @@ static __always_inline void del_page_from_lru_list(struct page *page, */ extern const char *vma_anon_name(struct vm_area_struct *vma); +/* mmap_lock should be read-locked */ +extern struct anon_vma_name *vma_anon_name_get(struct vm_area_struct *vma); + +extern void vma_anon_name_put(struct anon_vma_name *anon_name); + /* * mmap_lock should be read-locked for orig_vma->vm_mm. 
* mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be @@ -176,6 +181,14 @@ static inline const char *vma_anon_name(struct vm_area_struct *vma) { return NULL; } + +static inline +struct anon_vma_name *vma_anon_name_get(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline void vma_anon_name_put(struct anon_vma_name *anon_name) {} static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_vma_anon_name(struct vm_area_struct *vma) {} diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df4646..1807778a5f70e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -70,6 +70,9 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name) struct anon_vma_name *anon_name; size_t count; + if (!name) + return NULL; + /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); @@ -103,6 +106,23 @@ const char *vma_anon_name(struct vm_area_struct *vma) return vma->anon_name->name; } +struct anon_vma_name *vma_anon_name_get(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + kref_get(&vma->anon_name->kref); + return vma->anon_name; +} + +void vma_anon_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, vma_anon_name_free); +} + void dup_vma_anon_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { @@ -126,33 +146,34 @@ void free_vma_anon_name(struct vm_area_struct *vma) } /* mmap_lock should be write-locked */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_vma_anon_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - const char *anon_name; + const char *orig_name; - if (!name) { + if (!anon_name) { free_vma_anon_name(vma); return 0; } - anon_name = vma_anon_name(vma); - if (anon_name) { 
+ orig_name = vma_anon_name(vma); + if (orig_name) { /* Same name, nothing to do here */ - if (!strcmp(name, anon_name)) + if (!strcmp(anon_name->name, orig_name)) return 0; free_vma_anon_name(vma); } - vma->anon_name = anon_vma_name_alloc(name); - if (!vma->anon_name) - return -ENOMEM; + kref_get(&anon_name->kref); + vma->anon_name = anon_name; return 0; } #else /* CONFIG_ANON_VMA_NAME */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_vma_anon_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - if (name) + if (anon_name) return -EINVAL; return 0; @@ -161,12 +182,15 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; + * Caller should ensure anon_name stability by raising its refcount even when + * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, - const char *name) + struct anon_vma_name *anon_name) { + const char *name = anon_name ? 
anon_name->name : NULL; struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; @@ -209,7 +233,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, */ vma->vm_flags = new_flags; if (!vma->vm_file) { - error = replace_vma_anon_name(vma, name); + error = replace_vma_anon_name(vma, anon_name); if (error) return error; } @@ -976,6 +1000,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, { int error; unsigned long new_flags = vma->vm_flags; + struct anon_vma_name *anon_name; switch (behavior) { case MADV_REMOVE: @@ -1040,8 +1065,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } + anon_name = vma_anon_name_get(vma); error = madvise_update_vma(vma, prev, start, end, new_flags, - vma_anon_name(vma)); + anon_name); + vma_anon_name_put(anon_name); out: /* @@ -1225,7 +1252,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long name) + unsigned long anon_name) { int error; @@ -1234,7 +1261,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (const char *)name); + (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1248,8 +1275,10 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, const char *name) { + struct anon_vma_name *anon_name; unsigned long end; unsigned long len; + int ret; if (start & ~PAGE_MASK) return -EINVAL; @@ -1266,8 +1295,12 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)name, + anon_name = anon_vma_name_alloc(name); + ret = madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, 
madvise_vma_anon_name); + vma_anon_name_put(anon_name); + + return ret; } #endif /* CONFIG_ANON_VMA_NAME */ /* From 83e40f32ab09c40f22067952edbfe843c0913776 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 16 Feb 2022 15:30:47 +1100 Subject: [PATCH 007/334] mm: fix use-after-free bug when mm->mmap is reused after being freed After exit_mmap frees all vmas in the mm, mm->mmap needs to be reset, otherwise it points to a vma that was freed and when reused leads to a use-after-free bug. Link: https://lore.kernel.org/all/00000000000072ef2c05d7f81950@google.com/ Link: https://lkml.kernel.org/r/20220215201922.1908156-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reported-by: syzbot+2ccf63a4bd07cf39cab0@syzkaller.appspotmail.com Suggested-by: Michal Hocko Reviewed-by: Rik van Riel Cc: Michal Hocko Cc: Yang Shi Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Rik van Riel Cc: Minchan Kim Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Andy Lutomirski Cc: Christian Brauner Cc: Florian Weimer Cc: Jan Engelhardt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 1e8fdb0b51edd..d445c1b9d6065 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3186,6 +3186,7 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); cond_resched(); } + mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } From 0dfc61feb0f43c455d041940fb20931c0c65d08c Mon Sep 17 00:00:00 2001 From: Liu Yuntao Date: Wed, 16 Feb 2022 15:30:47 +1100 Subject: [PATCH 008/334] hugetlbfs: fix a truncation issue in hugepages parameter When we specify a large number for node in hugepages parameter, it may be parsed to another number due to truncation in this statement: node = tmp; For example, add following parameter in command 
line: hugepagesz=1G hugepages=4294967297:5 and the kernel will allocate 5 hugepages for node 1 instead of ignoring it. Move the validation check earlier to fix this issue, and slightly simplify the condition. Link: https://lkml.kernel.org/r/20220209134018.8242-1-liuyuntao10@huawei.com Fixes: b5389086ad7be0 ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Signed-off-by: Liu Yuntao Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e57650a9404f7..f294db835f4bc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4159,10 +4159,10 @@ static int __init hugepages_setup(char *s) pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 0; } + if (tmp >= nr_online_nodes) + goto invalid; node = tmp; p += count + 1; - if (node < 0 || node >= nr_online_nodes) - goto invalid; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; From 031a9eda77ccbaf50c8d0b52e33af8490989222f Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 16 Feb 2022 15:30:48 +1100 Subject: [PATCH 009/334] fs/file_table: fix adding missing kmemleak_not_leak() Commit b42bc9a3c511 ("Fix regression due to "fs: move binfmt_misc sysctl to its own file") fixed a regression, however it failed to add a kmemleak_not_leak().
Link: https://lkml.kernel.org/r/20220215020828.4180911-1-mcgrof@kernel.org Fixes: b42bc9a3c511 ("Fix regression due to "fs: move binfmt_misc sysctl to its own file") Signed-off-by: Luis Chamberlain Reported-by: Tong Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/file_table.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 4969021fa6764..7d2e692b66a94 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -119,8 +120,11 @@ static struct ctl_table fs_stat_sysctls[] = { static int __init init_fs_stat_sysctls(void) { register_sysctl_init("fs", fs_stat_sysctls); - if (IS_ENABLED(CONFIG_BINFMT_MISC)) - register_sysctl_mount_point("fs/binfmt_misc"); + if (IS_ENABLED(CONFIG_BINFMT_MISC)) { + struct ctl_table_header *hdr; + hdr = register_sysctl_mount_point("fs/binfmt_misc"); + kmemleak_not_leak(hdr); + } return 0; } fs_initcall(init_fs_stat_sysctls); From 4ad5a477404ef7efecd3dfcf008d9b8bd8f40800 Mon Sep 17 00:00:00 2001 From: Toshiki Fukasawa Date: Wed, 16 Feb 2022 15:30:48 +1100 Subject: [PATCH 010/334] /proc/kpageflags: prevent an integer overflow in stable_page_flags() stable_page_flags() returns kpageflags info in u64, but it uses "1 << KPF_*" internally which is considered as int. This type mismatch causes no visible problem now, but it will if you set bit 32 or more as done in a subsequent patch. So use BIT_ULL in order to avoid future overflow issues. 
Link: http://lkml.kernel.org/r/20190725023100.31141-2-t-fukasawa@vx.jp.nec.com Signed-off-by: Toshiki Fukasawa Cc: Michal Hocko Cc: Dan Williams Cc: Alexey Dobriyan Cc: Christoph Hellwig Cc: Naoya Horiguchi Cc: Junichi Nomura Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde1..265f4fca15e29 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,7 @@ u64 stable_page_flags(struct page *page) * it differentiates a memory hole from a page with no flags */ if (!page) - return 1 << KPF_NOPAGE; + return BIT_ULL(KPF_NOPAGE); k = page->flags; u = 0; @@ -127,22 +127,22 @@ u64 stable_page_flags(struct page *page) * simple test in page_mapped() is not enough. */ if (!PageSlab(page) && page_mapped(page)) - u |= 1 << KPF_MMAP; + u |= BIT_ULL(KPF_MMAP); if (PageAnon(page)) - u |= 1 << KPF_ANON; + u |= BIT_ULL(KPF_ANON); if (PageKsm(page)) - u |= 1 << KPF_KSM; + u |= BIT_ULL(KPF_KSM); /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; + u |= BIT_ULL(KPF_COMPOUND_HEAD); if (PageTail(page)) - u |= 1 << KPF_COMPOUND_TAIL; + u |= BIT_ULL(KPF_COMPOUND_TAIL); if (PageHuge(page)) - u |= 1 << KPF_HUGE; + u |= BIT_ULL(KPF_HUGE); /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it @@ -153,14 +153,13 @@ u64 stable_page_flags(struct page *page) struct page *head = compound_head(page); if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_THP); else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_ZERO_PAGE); + u |= BIT_ULL(KPF_THP); } } else if (is_zero_pfn(page_to_pfn(page))) - u |= 1 << KPF_ZERO_PAGE; - + u |= 
BIT_ULL(KPF_ZERO_PAGE); /* * Caveats on high order pages: page->_refcount will only be set @@ -168,23 +167,23 @@ u64 stable_page_flags(struct page *page) * SLOB won't set PG_slab at all on compound pages. */ if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); else if (page_count(page) == 0 && is_free_buddy_page(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); if (PageOffline(page)) - u |= 1 << KPF_OFFLINE; + u |= BIT_ULL(KPF_OFFLINE); if (PageTable(page)) - u |= 1 << KPF_PGTABLE; + u |= BIT_ULL(KPF_PGTABLE); if (page_is_idle(page)) - u |= 1 << KPF_IDLE; + u |= BIT_ULL(KPF_IDLE); u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; + u |= BIT_ULL(KPF_SLAB); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); @@ -197,7 +196,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); if (PageSwapCache(page)) - u |= 1 << KPF_SWAPCACHE; + u |= BIT_ULL(KPF_SWAPCACHE); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); From 259d0bfeec7eeb71a223b07614d12bfd85917f44 Mon Sep 17 00:00:00 2001 From: Toshiki Fukasawa Date: Wed, 16 Feb 2022 15:30:48 +1100 Subject: [PATCH 011/334] /proc/kpageflags: do not use uninitialized struct pages A kernel panic was observed during reading /proc/kpageflags for first few pfns allocated by pmem namespace: BUG: unable to handle page fault for address: fffffffffffffffe [ 114.495280] #PF: supervisor read access in kernel mode [ 114.495738] #PF: error_code(0x0000) - not-present page [ 114.496203] PGD 17120e067 P4D 17120e067 PUD 171210067 PMD 0 [ 114.496713] Oops: 0000 [#1] SMP PTI [ 114.497037] CPU: 9 PID: 1202 Comm: page-types Not tainted 5.3.0-rc1 #1 [ 114.497621] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014 [ 
114.498706] RIP: 0010:stable_page_flags+0x27/0x3f0 [ 114.499142] Code: 82 66 90 66 66 66 66 90 48 85 ff 0f 84 d1 03 00 00 41 54 55 48 89 fd 53 48 8b 57 08 48 8b 1f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 02 0f 84 57 03 00 00 45 31 e4 48 8b 55 08 48 89 ef [ 114.500788] RSP: 0018:ffffa5e601a0fe60 EFLAGS: 00010202 [ 114.501373] RAX: fffffffffffffffe RBX: ffffffffffffffff RCX: 0000000000000000 [ 114.502009] RDX: 0000000000000001 RSI: 00007ffca13a7310 RDI: ffffd07489000000 [ 114.502637] RBP: ffffd07489000000 R08: 0000000000000001 R09: 0000000000000000 [ 114.503270] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000240000 [ 114.503896] R13: 0000000000080000 R14: 00007ffca13a7310 R15: ffffa5e601a0ff08 [ 114.504530] FS: 00007f0266c7f540(0000) GS:ffff962dbbac0000(0000) knlGS:0000000000000000 [ 114.505245] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 114.505754] CR2: fffffffffffffffe CR3: 000000023a204000 CR4: 00000000000006e0 [ 114.506401] Call Trace: [ 114.506660] kpageflags_read+0xb1/0x130 [ 114.507051] proc_reg_read+0x39/0x60 [ 114.507387] vfs_read+0x8a/0x140 [ 114.507686] ksys_pread64+0x61/0xa0 [ 114.508021] do_syscall_64+0x5f/0x1a0 [ 114.508372] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 114.508844] RIP: 0033:0x7f0266ba426b The reason for the panic is that stable_page_flags() which parses the page flags uses uninitialized struct pages reserved by the ZONE_DEVICE driver. Earlier approach to fix this was discussed here: https://marc.info/?l=linux-mm&m=152964770000672&w=2 This is another approach. To avoid using the uninitialized struct page, immediately return with KPF_RESERVED at the beginning of stable_page_flags() if the page is reserved by ZONE_DEVICE driver. Dan said: : The nvdimm implementation uses vmem_altmap to arrange for the 'struct : page' array to be allocated from a reservation of a pmem namespace. 
A : namespace in this mode contains an info-block that consumes the first : 8K of the namespace capacity, capacity designated for page mapping, : capacity for padding the start of data to optionally 4K, 2MB, or 1GB : (on x86), and then the namespace data itself. The implementation : specifies a section aligned (now sub-section aligned) address to : arch_add_memory() to establish the linear mapping to map the metadata, : and then vmem_altmap indicates to memmap_init_zone() which pfns : represent data. The implementation only specifies enough 'struct page' : capacity for pfn_to_page() to operate on the data space, not the : namespace metadata space. : : The proposal to validate ZONE_DEVICE pfns against the altmap seems the : right approach to me. Link: http://lkml.kernel.org/r/20190725023100.31141-3-t-fukasawa@vx.jp.nec.com Signed-off-by: Toshiki Fukasawa Cc: Alexey Dobriyan Cc: Christoph Hellwig Cc: Dan Williams Cc: Junichi Nomura Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 3 +++ include/linux/memremap.h | 6 ++++++ mm/memremap.c | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/proc/page.c b/fs/proc/page.c index 265f4fca15e29..4dcbcd506cb6e 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -117,6 +117,9 @@ u64 stable_page_flags(struct page *page) if (!page) return BIT_ULL(KPF_NOPAGE); + if (pfn_zone_device_reserved(page_to_pfn(page))) + return BIT_ULL(KPF_RESERVED); + k = page->flags; u = 0; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1fafcc38acbad..eea1b5cf25716 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -130,6 +130,7 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct 
device *dev, struct dev_pagemap *pgmap); @@ -142,6 +143,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else +static inline bool pfn_zone_device_reserved(unsigned long pfn) +{ + return false; +} + static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { diff --git a/mm/memremap.c b/mm/memremap.c index 6aa5f0c2d11fd..d2a72cf2ff831 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -115,6 +115,26 @@ static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } +/* + * This returns true if the page is reserved by ZONE_DEVICE driver. + */ +bool pfn_zone_device_reserved(unsigned long pfn) +{ + struct dev_pagemap *pgmap; + struct vmem_altmap *altmap; + bool ret = false; + + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ret; + altmap = pgmap_altmap(pgmap); + if (altmap && pfn < (altmap->base_pfn + altmap->reserve)) + ret = true; + put_dev_pagemap(pgmap); + + return ret; +} + #define for_each_device_pfn(pfn, map, i) \ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ pfn = pfn_next(map, pfn)) From cdbf603cdd777c02022f9cd45238b61855bf1894 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 16 Feb 2022 15:30:48 +1100 Subject: [PATCH 012/334] procfs: prevent unprivileged processes accessing fdinfo dir The file permissions on the fdinfo dir from were changed from S_IRUSR|S_IXUSR to S_IRUGO|S_IXUGO, and a PTRACE_MODE_READ check was added for opening the fdinfo files [1]. However, the ptrace permission check was not added to the directory, allowing anyone to get the open FD numbers by reading the fdinfo directory. Add the missing ptrace permission check for opening the fdinfo directory. 
[1] https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com Link: https://lkml.kernel.org/r/20210713162008.1056986-1-kaleshsingh@google.com Fixes: 7bc3fa0172a4 ("procfs: allow reading fdinfo with PTRACE_MODE_READ") Signed-off-by: Kalesh Singh Cc: Kees Cook Cc: Eric W. Biederman Cc: Christian Brauner Cc: Suren Baghdasaryan Cc: Hridya Valsaraju Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/fd.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b312..913bef0d2a36c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ static int seq_show(struct seq_file *m, void *v) return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, From 46620a2790baedf5bb1f277a527336c07f76ffbb Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 16 Feb 2022 15:30:49 +1100 Subject: [PATCH 013/334] ntfs: 
add sanity check on allocation size ntfs_read_inode_mount invokes ntfs_malloc_nofs with zero allocation size. It triggers one BUG in the __ntfs_malloc function. Fix this by adding a sanity check on ni->attr_list_size. Link: https://lkml.kernel.org/r/20220120094914.47736-1-dzm91@hust.edu.cn Reported-by: syzbot+3c765c5248797356edaa@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ntfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4474adb393ca8..517b71c73aa96 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " From 2bf0b92ff8aa15a8d6f411eb3060d4bd7283ea1e Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 16 Feb 2022 15:30:49 +1100 Subject: [PATCH 014/334] ocfs2: cleanup some return variables Simply return directly instead of assigning the return value to another variable.
Link: https://lkml.kernel.org/r/20220114021641.13927-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reported-by: Zeal Robot Cc: Minghao Chi Cc: CGEL ZTE Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/file.c | 9 +++------ fs/ocfs2/stack_user.c | 18 ++++++------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fc5f780fa2355..24321c44cd42e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int ret; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); - ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, - clusters_to_add, mark_unwritten, - data_ac, meta_ac, reason_ret); - - return ret; + return ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); } static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 85a47621e0c07..a75e2b7d67f56 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - int ret; - if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + sizeof(struct dlm_lksb); - ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, - flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, lksb, - fsdlm_blocking_ast_wrapper); - return ret; + return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); } static int user_dlm_unlock(struct ocfs2_cluster_connection 
*conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - int ret; - - ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, lksb); - return ret; + return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); } static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) From 9c0ba9d6dfddbcc610c3fe3191ea3ac7089b15a5 Mon Sep 17 00:00:00 2001 From: hongnanli Date: Wed, 16 Feb 2022 15:30:49 +1100 Subject: [PATCH 015/334] fs/ocfs2: fix comments mentioning i_mutex inode->i_mutex has been replaced with inode->i_rwsem long ago. Fix comments still mentioning i_mutex. Link: https://lkml.kernel.org/r/20220214031314.100094-1-hongnan.li@linux.alibaba.com Signed-off-by: hongnanli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/ocfs2/cluster/nodemanager.c | 2 +- fs/ocfs2/dir.c | 4 ++-- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/inode.c | 2 +- fs/ocfs2/localalloc.c | 6 +++--- fs/ocfs2/namei.c | 2 +- fs/ocfs2/ocfs2.h | 4 ++-- fs/ocfs2/quota_global.c | 2 +- fs/ocfs2/xattr.c | 2 +- 11 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bf9357123bc58..49f41074baadd 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5981,7 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, return status; } -/* Expects you to already be holding tl_inode->i_mutex */ +/* Expects you to already be holding tl_inode->i_rwsem */ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 498da317580a7..c23e1c243cc61 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2311,7 +2311,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, down_write(&oi->ip_alloc_sem); - /* Delete orphan before acquire i_mutex. 
*/ + /* Delete orphan before acquire i_rwsem. */ if (dwc->dw_orphaned) { BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 625c925214169..27fee68f860a6 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -689,7 +689,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - /* this runs under the parent dir's i_mutex; there can be only + /* this runs under the parent dir's i_rwsem; there can be only * one caller in here at a time */ if (o2nm_single_cluster) return ERR_PTR(-ENOSPC); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f2cc1ff29e6de..81c3d65d68fec 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1957,7 +1957,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) } /* - * NOTE: this should always be called with parent dir i_mutex taken. + * NOTE: this should always be called with parent dir i_rwsem taken. */ int ocfs2_find_files_on_disk(const char *name, int namelen, @@ -2003,7 +2003,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name * - * Callers should have i_mutex + a cluster lock on dir + * Callers should have i_rwsem + a cluster lock on dir */ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 24321c44cd42e..01b7407a8893f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -270,7 +270,7 @@ int ocfs2_update_inode_atime(struct inode *inode, /* * Don't use ocfs2_mark_inode_dirty() here as we don't always - * have i_mutex to guard against concurrent changes to other + * have i_rwsem to guard against concurrent changes to other * inode fields. 
*/ inode->i_atime = current_time(inode); @@ -1065,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode, /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on - * i_mutex to block other extend/truncate calls while we're + * i_rwsem to block other extend/truncate calls while we're * here. We even have to hold it for sparse files because there * might be some tail zeroing. */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 6c2411c2afcf1..5739dc3015698 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -713,7 +713,7 @@ static int ocfs2_remove_inode(struct inode *inode, /* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir - * i_mutex held, we'll deadlock here. Instead we detect this + * i_rwsem held, we'll deadlock here. Instead we detect this * and exit early - recovery will wipe this inode for us. */ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5f6bacbeef6b8..c4426d12a2adb 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -606,7 +606,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, /* * make sure we've got at least bits_wanted contiguous bits in the - * local alloc. You lose them when you drop i_mutex. + * local alloc. You lose them when you drop i_rwsem. * * We will add ourselves to the transaction passed in, but may start * our own in order to shift windows. @@ -636,7 +636,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * We must double check state and allocator bits because - * another process may have changed them while holding i_mutex. + * another process may have changed them while holding i_rwsem. 
*/ spin_lock(&osb->osb_lock); if (!ocfs2_la_state_enabled(osb) || @@ -1029,7 +1029,7 @@ enum ocfs2_la_event { /* * Given an event, calculate the size of our next local alloc window. * - * This should always be called under i_mutex of the local alloc inode + * This should always be called under i_rwsem of the local alloc inode * so that local alloc disabling doesn't race with processes trying to * use the allocator. * diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2c46ff6ba4ea2..c75fd54b91854 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -476,7 +476,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, ocfs2_free_alloc_context(meta_ac); /* - * We should call iput after the i_mutex of the bitmap been + * We should call iput after the i_rwsem of the bitmap been * unlocked in ocfs2_free_alloc_context, or the * ocfs2_delete_inode will mutex_lock again. */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index bb62cc2e0211b..3375275714612 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -355,7 +355,7 @@ struct ocfs2_super struct delayed_work la_enable_wq; /* - * Must hold local alloc i_mutex and osb->osb_lock to change + * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; @@ -430,7 +430,7 @@ struct ocfs2_super atomic_t osb_tl_disable; /* * How many clusters in our truncate log. - * It must be protected by osb_tl_inode->i_mutex. + * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index f033de733adb3..273f65e0aabac 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -36,7 +36,7 @@ * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected * by dqio_sem or dquot->dq_lock. 
- * - any modification of global quota file holds inode cluster lock, i_mutex, + * - any modification of global quota file holds inode cluster lock, i_rwsem, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. * - an allocation of new blocks for local quota file is protected by diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd784eb0cd7c4..95d0611c5fc7d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7205,7 +7205,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, * Used for reflink a non-preserve-security file. * * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. + * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, From 31902d6a40957ce46a2f764c88b48494183ac9fb Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 16 Feb 2022 15:30:50 +1100 Subject: [PATCH 016/334] ocfs2: reflink deadlock when clone file to the same directory simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory probably triggers a deadlock issue. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system to /mnt/shared, and run the reflink command from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then, reflink command process will be hung on each node, and you can't list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? 
ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? 
do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by multiple acquiring the destination directory inode dlm lock in ocfs2_reflink function, we should acquire this directory inode dlm lock at the beginning, and hold this dlm lock until end of the function. Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c75fd54b91854..e3dd30dd3547f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long 
long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git 
a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7f6355cbb5875..a9a0c7c37e8ed 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7d..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_rwsem. 
*/ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From b786c778d34d5544537fb37925b689a71fdad111 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Wed, 16 Feb 2022 15:30:50 +1100 Subject: [PATCH 017/334] ocfs2: clear links count in ocfs2_mknod() if an error occurs In this condition, the inode can not be wiped when error happened. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error accrue, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So, we need clear links before the transaction committed. 
Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e3dd30dd3547f..ea27e63ec278f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2027,8 +2033,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From 7810f2fce4a79d661533bba514cff42f858b27c5 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Wed, 16 Feb 2022 15:30:50 +1100 Subject: [PATCH 018/334] ocfs2: fix ocfs2 corrupt when iputting an inode In this condition, it will cause an bug on error. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized. 
->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happened, returned value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process execute mkdir success in this place, and it occupied the start_bit x in bg_blkno which has been freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So, we need not to reclaim the inode when the inode->ip_inode_lockres has been initialized. It will be freed in iput(). Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ea27e63ec278f..7d7f2b8f0554e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -640,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 392621d77da177a40b5c58bdee5034862c7ab9e5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:50 +1100 Subject: [PATCH 019/334] mm/fs: remove inode_congested() Patch series "Remove remaining parts of 
congestions tracking code". Congestion hasn't been reliably tracked for quite some time. Most MM uses of it for guiding writeback decisions were removed in 5.16. Some other uses were removed in 5.17-rc1. This series removes the remaining places that test for congestion, and the few places which still set it. This patch (of 9): inode_congested() reports if the backing-device for the inode is congested. Few bdi report congestion any more, only ceph, fuse, and nfs. Having support just for those is unlikely to be useful. The places which test inode_congested() or its variants like inode_write_congested(), avoid initiating IO if congestion is present. We now have to rely on other places in the stack to back off, or abort requests - we already do for everything except these 3 filesystems. So remove inode_congested() and related functions, and remove the call sites, assuming that inode_congested() always returns 'false'. Link: https://lkml.kernel.org/r/164325106958.29787.4865219843242892726.stgit@noble.brown Link: https://lkml.kernel.org/r/164325158954.29787.7856652136298668100.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J.
Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fs-writeback.c | 37 ------------------------------------- include/linux/backing-dev.h | 22 ---------------------- mm/fadvise.c | 5 ++--- mm/readahead.c | 6 ------ mm/vmscan.c | 17 +---------------- 5 files changed, 3 insertions(+), 84 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f8d7fe6db989e..42a3dfad40b80 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -893,43 +893,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); -/** - * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion (may be NULL) - * @cong_bits: mask of WB_[a]sync_congested bits to test - * - * Tests whether @inode is congested. @cong_bits is the mask of congestion - * bits to test and the return value is the mask of set bits. - * - * If cgroup writeback is enabled for @inode, the congestion state is - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg - * associated with @inode is congested; otherwise, the root wb's congestion - * state is used. - * - * @inode is allowed to be NULL as this function is often called on - * mapping->host which is NULL for the swapper space. - */ -int inode_congested(struct inode *inode, int cong_bits) -{ - /* - * Once set, ->i_wb never becomes NULL while the inode is alive. - * Start transaction iff ->i_wb is visible. 
- */ - if (inode && inode_to_wb_is_valid(inode)) { - struct bdi_writeback *wb; - struct wb_lock_cookie lock_cookie = {}; - bool congested; - - wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); - congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, &lock_cookie); - return congested; - } - - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} -EXPORT_SYMBOL_GPL(inode_congested); - /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 483979c1b9f43..860b675c29295 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -162,7 +162,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); -int inode_congested(struct inode *inode, int cong_bits); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -390,29 +389,8 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline int inode_congested(struct inode *inode, int cong_bits) -{ - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} - #endif /* CONFIG_CGROUP_WRITEBACK */ -static inline int inode_read_congested(struct inode *inode) -{ - return inode_congested(inode, 1 << WB_sync_congested); -} - -static inline int inode_write_congested(struct inode *inode) -{ - return inode_congested(inode, 1 << WB_async_congested); -} - -static inline int inode_rw_congested(struct inode *inode) -{ - return inode_congested(inode, (1 << WB_sync_congested) | - (1 << WB_async_congested)); -} - static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) { return wb_congested(&bdi->wb, cong_bits); diff --git a/mm/fadvise.c b/mm/fadvise.c index d6baa4f451c5f..338f160220129 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -109,9 +109,8 @@ int 
generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!inode_write_congested(mapping->host)) - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/readahead.c b/mm/readahead.c index cf0dcf89eb69b..feda2b1702f1b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -595,12 +595,6 @@ void page_cache_async_ra(struct readahead_control *ractl, folio_clear_readahead(folio); - /* - * Defer asynchronous read-ahead on IO congestion. - */ - if (inode_read_congested(ractl->mapping->host)) - return; - if (blk_cgroup_congested()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 59b14e0d696c9..e38de6456cdcc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -989,17 +989,6 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } -static int may_write_to_inode(struct inode *inode) -{ - if (current->flags & PF_SWAPWRITE) - return 1; - if (!inode_write_congested(inode)) - return 1; - if (inode_to_bdi(inode) == current->backing_dev_info) - return 1; - return 0; -} - /* * We detected a synchronous write error writing a page out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent @@ -1201,8 +1190,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host)) - return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { int res; @@ -1578,9 +1565,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * end of the LRU a second time. 
*/ mapping = page_mapping(page); - if (((dirty || writeback) && mapping && - inode_write_congested(mapping->host)) || - (writeback && PageReclaim(page))) + if (writeback && PageReclaim(page)) stat->nr_congested++; /* From 107939281577ba484cd08915fe6bbfb4c94eeeeb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:51 +1100 Subject: [PATCH 020/334] mm/fs: remove bdi_congested() and wb_congested() and related functions These functions are no longer useful as the only bdis that report congestion are in ceph, fuse, and nfs. None of those bdis can be the target of the calls in drbd, ext2, nilfs2, or xfs. Removing the test on bdi_write_contested() in current_may_throttle() could cause a small change in behaviour, but only when PF_LOCAL_THROTTLE is set. So replace the calls by 'false' and simplify the code - and remove the functions. Link: https://lkml.kernel.org/r/164325158955.29787.4769373293473421057.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/block/drbd/drbd_int.h | 3 --- drivers/block/drbd/drbd_req.c | 3 +-- fs/ext2/ialloc.c | 2 -- fs/nilfs2/segbuf.c | 11 ----------- fs/xfs/xfs_buf.c | 3 --- include/linux/backing-dev.h | 26 -------------------------- mm/vmscan.c | 4 +--- 7 files changed, 2 insertions(+), 50 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f27d5b0f9a0bb..f804b1bfb3e6d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -638,9 +638,6 @@ enum { STATE_SENT, /* Do not change state/UUIDs while this is set */ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) * pending, from drbd worker context. 
- * If set, bdi_write_congested() returns true, - * so shrink_page_list() would not recurse into, - * and potentially deadlock on, this drbd worker. */ DISCONNECT_SENT, diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3235532ae0778..2e5fb7e442e3d 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -909,8 +909,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - return bdi_read_congested( - device->ldev->backing_bdev->bd_disk->bdi); + return 0; case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index df14e750e9fe3..d632764da2403 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -173,8 +173,6 @@ static void ext2_preread_inode(struct inode *inode) struct backing_dev_info *bdi; bdi = inode_to_bdi(inode); - if (bdi_rw_congested(bdi)) - return; block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 43287b0d3e9b6..d1ebc9da71308 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,17 +343,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, struct bio *bio = wi->bio; int err; - if (segbuf->sb_nbio > 0 && - bdi_write_congested(segbuf->sb_super->s_bdi)) { - wait_for_completion(&segbuf->sb_bio_event); - segbuf->sb_nbio--; - if (unlikely(atomic_read(&segbuf->sb_err))) { - bio_put(bio); - err = -EIO; - goto failed; - } - } - bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; bio_set_op_attrs(bio, mode, mode_flags); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b45e0d50a4052..b7ebcfe6b8d3f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -843,9 +843,6 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if 
(bdi_read_congested(target->bt_bdev->bd_disk->bdi)) - return; - xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 860b675c29295..2d764566280c8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -135,11 +135,6 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb) struct backing_dev_info *inode_to_bdi(struct inode *inode); -static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) -{ - return wb->congested & cong_bits; -} - long congestion_wait(int sync, long timeout); static inline bool mapping_can_writeback(struct address_space *mapping) @@ -391,27 +386,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) #endif /* CONFIG_CGROUP_WRITEBACK */ -static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) -{ - return wb_congested(&bdi->wb, cong_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} - -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); -} - const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e38de6456cdcc..5e1469887afa8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2364,9 +2364,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, */ static int current_may_throttle(void) { - return !(current->flags & PF_LOCAL_THROTTLE) || - current->backing_dev_info == NULL || - bdi_write_congested(current->backing_dev_info); + return !(current->flags & PF_LOCAL_THROTTLE); } /* From 3dc99f97078e22def54e8179f967d79703d3bbbe Mon Sep 17 00:00:00 2001 From: 
Andrew Morton Date: Wed, 16 Feb 2022 15:30:51 +1100 Subject: [PATCH 021/334] remove-bdi_congested-and-wb_congested-and-related-functions-fix fix build fs/nilfs2/segbuf.c: In function 'nilfs_segbuf_submit_bio': fs/nilfs2/segbuf.c:358:2: error: label 'failed' defined but not used [-Werror=unused-label] Cc: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/nilfs2/segbuf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index d1ebc9da71308..7c43d654a33f2 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -355,7 +355,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, wi->start = wi->end; return 0; - failed: wi->bio = NULL; return err; } From d9723475888a317ab16aeedc76ad315e308110ab Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 16 Feb 2022 15:30:51 +1100 Subject: [PATCH 022/334] ext2: remove unused pointer bdi The call to bdi_congested has been removed and so the bdi pointer is no longer required. Remove it. 
Link: https://lkml.kernel.org/r/20220207134039.337197-1-colin.i.king@gmail.com Fixes: 9bbab3a63d49 ("mm/fs: remove bdi_congested() and wb_congested() and related functions") Signed-off-by: Colin Ian King Cc: NeilBrown Cc: Jan Kara Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ext2/ialloc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index d632764da2403..998dd2ac80089 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,9 +170,6 @@ static void ext2_preread_inode(struct inode *inode) unsigned long offset; unsigned long block; struct ext2_group_desc * gdp; - struct backing_dev_info *bdi; - - bdi = inode_to_bdi(inode); block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); From 476f0247d64ff93a80d848e7a104b286af463b36 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:51 +1100 Subject: [PATCH 023/334] f2fs: change retry waiting for f2fs_write_single_data_page() f2fs_write_single_data_page() can return -EAGAIN if it cannot get the cp_rwsem lock - it holds a page lock and so cannot wait for it. Some code which calls f2fs_write_single_data_page() uses congestion_wait() and then tries again. congestion_wait() doesn't do anything useful as congestion is no longer tracked. So this is just a simple sleep. A better approach is to wait until the cp_rwsem lock can be taken - then try again. There is certainly no point trying again *before* the lock can be taken. Link: https://lkml.kernel.org/r/164325158956.29787.7016948342209980097.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J.
Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/f2fs/compress.c | 6 +++--- fs/f2fs/data.c | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d0c3aeba59454..58ff7f4b296c9 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1505,9 +1505,9 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + /* Wait until we can get the lock, then try again. */ + f2fs_lock_op(F2FS_I_SB(cc->inode)); + f2fs_unlock_op(F2FS_I_SB(cc->inode)); goto retry_write; } return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c417864c66ae..1d2341163e2cf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3047,9 +3047,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + /* Wait until we can get the + * lock, then try again. + */ + f2fs_lock_op(F2FS_I_SB(mapping->host)); + f2fs_unlock_op(F2FS_I_SB(mapping->host)); + goto retry_write; } goto next; From 25dffd2354aaacc72835eddd23ee0aa7b79d3804 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:51 +1100 Subject: [PATCH 024/334] f2fs: replace some congestion_wait() calls with io_schedule_timeout() As congestion is no longer tracked, congestion_wait() is effectively equivalent to io_schedule_timeout(). It isn't clear to me what these congestion_wait() calls are waiting for, so I cannot change them to wait for some particular event. So simply change them to io_schedule_timeout(), which will have exactly the same behaviour.
Link: https://lkml.kernel.org/r/164325158957.29787.2116312603613564596.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/f2fs/segment.c | 14 ++++++++------ fs/f2fs/super.c | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1dabc8244083d..78e3fbc24e77c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -313,8 +313,8 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); if (gc_failure) { if (++looped >= count) return; @@ -802,9 +802,10 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); - if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + if (ret) { + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } } while (ret && --count); if (ret) { @@ -3133,7 +3134,8 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index baefd398ec1a3..4977bc06ec35c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2135,8 +2135,8 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { 
sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) @@ -2504,8 +2504,8 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); From 5c8cb9e5a91eab0915f911bd4525fb4a4a0a6011 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:52 +1100 Subject: [PATCH 025/334] cephfs: don't set/clear bdi_congestion The bdi congestion framework is no-longer used - writeback uses other mechanisms to manage throughput. So remove calls to set_bdi_congested() and clear_bdi_congested(), and remove the writeback_count which is used only to guide the setting and clearing. The congestion_kb mount option is no longer meaningful, but as it is visible to user-space, removing it needs more consideration. Link: https://lkml.kernel.org/r/164325158958.29787.8840004338500709466.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/ceph/addr.c | 27 --------------------------- fs/ceph/super.c | 2 -- fs/ceph/super.h | 2 -- 3 files changed, 31 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c98e5238a1b6a..9147667f8cd55 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -57,11 +57,6 @@ * accounting is preserved. 
*/ -#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) -#define CONGESTION_OFF_THRESH(congestion_kb) \ - (CONGESTION_ON_THRESH(congestion_kb) - \ - (CONGESTION_ON_THRESH(congestion_kb) >> 2)) - static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, struct folio *folio, void **_fsdata); @@ -561,10 +556,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", inode, page, page->index, page_off, len, snapc, snapc->seq); - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, @@ -621,10 +612,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ - if (atomic_long_dec_return(&fsc->writeback_count) < - CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - return err; } @@ -704,12 +691,6 @@ static void writepages_finish(struct ceph_osd_request *req) BUG_ON(!page); WARN_ON(!PageUptodate(page)); - if (atomic_long_dec_return(&fsc->writeback_count) < - CONGESTION_OFF_THRESH( - fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); dout("unlocking %p\n", page); @@ -952,14 +933,6 @@ static int ceph_writepages_start(struct address_space *mapping, dout("%p will write page %p idx %lu\n", inode, page, page->index); - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) 
{ - set_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - } - - pages[locked_pages++] = page; pvec.pages[i] = NULL; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf79f369aec68..b2f38af9fca83 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -801,8 +801,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->filp_gen = 1; fsc->have_copy_from2 = true; - atomic_long_set(&fsc->writeback_count, 0); - err = -ENOMEM; /* * The number of concurrent works can be high but they don't need diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 67f145e1ae7a3..fc58adf1d36ae 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -120,8 +120,6 @@ struct ceph_fs_client { struct ceph_mds_client *mdsc; - atomic_long_t writeback_count; - struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; From 8d2da2d7298f1a7c0f33d6a4857f1f1fe3774cc8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:52 +1100 Subject: [PATCH 026/334] fuse: don't set/clear bdi_congested The bdo congestion framework is no longer used to manage writeout etc, so drop updating it in fuse. Link: https://lkml.kernel.org/r/164325158958.29787.9472805850412952920.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fuse/control.c | 17 ----------------- fs/fuse/dev.c | 8 -------- 2 files changed, 25 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 000d2e5627e99..7cede9a3bc962 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; - struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - - /* - * Get any fuse_mount belonging to this fuse_conn; s_bdi is - * shared between all of them - */ - - if (!list_empty(&fc->mounts)) { - fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); - if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } else { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } - } spin_unlock(&fc->bg_lock); up_read(&fc->killsb); fuse_conn_put(fc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cd54a529460da..e1b4a846c90d1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fm->sb) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } fc->num_background--; fc->active_background--; flush_bg_queue(fc); @@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req 
*req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fm->sb) { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; From a3f8b865bb800e315e4e680a2faca49a06dc2f03 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:52 +1100 Subject: [PATCH 027/334] NFS: remove congestion control Linux no longer uses the bdi congestion tracking framework. So remove code from bdi which tries to support it. Also remove the "nfs_congestion_kb" sysctl. This is a user-visible change, but unlikely to be a problematic one. Link: https://lkml.kernel.org/r/164325158959.29787.14903007819591774556.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/nfs/sysctl.c | 7 ------ fs/nfs/write.c | 53 +-------------------------------------- include/linux/nfs_fs.h | 1 - include/linux/nfs_fs_sb.h | 1 - 4 files changed, 1 insertion(+), 61 deletions(-) diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 7aea195ddb353..18f3ff77fd0c2 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -22,13 +22,6 @@ static struct ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "nfs_congestion_kb", - .data = &nfs_congestion_kb, - .maxlen = sizeof(nfs_congestion_kb), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 987a187bd39aa..1c22ea6f23c3a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -397,33 +397,8 @@ static int wb_priority(struct 
writeback_control *wbc) return ret; } -/* - * NFS congestion control - */ - -int nfs_congestion_kb; - -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) - -static void nfs_set_page_writeback(struct page *page) -{ - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - int ret = test_set_page_writeback(page); - - WARN_ON_ONCE(ret != 0); - - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); -} - static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page_file_mapping(req->wb_page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); bool is_done; is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); @@ -432,8 +407,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) return; end_page_writeback(req->wb_page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } /* @@ -617,7 +590,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (IS_ERR(req)) goto out; - nfs_set_page_writeback(page); + set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ @@ -1850,7 +1823,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; - struct nfs_server *nfss; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1891,9 +1863,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Latency breaker */ cond_resched(); } - nfss = NFS_SERVER(data->inode); - if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(data->inode), 
BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); @@ -2162,26 +2131,6 @@ int __init nfs_init_writepagecache(void) if (nfs_commit_mempool == NULL) goto out_destroy_commit_cache; - /* - * NFS congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); - if (nfs_congestion_kb > 256*1024) - nfs_congestion_kb = 256*1024; - return 0; out_destroy_commit_cache: diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 68f81d8d36def..4829e6869f2ad 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -570,7 +570,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); /* * linux/fs/nfs/write.c */ -extern int nfs_congestion_kb; extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ca0959e51e817..3444ebbc63b6c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -137,7 +137,6 @@ struct nfs_server { struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ - atomic_long_t writeback; /* number of writeback pages */ unsigned int flags; /* various flags */ /* The following are for internal use only. 
Also see uapi/linux/nfs_mount.h */ From 30bcf1d8aba368c2032507aab47a9a0805ceb681 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:52 +1100 Subject: [PATCH 028/334] block/bfq-iosched.c: use "false" rather than "BLK_RW_ASYNC" bfq_get_queue() expects a "bool" for the third arg, so pass "false" rather than "BLK_RW_ASYNC" which will soon be removed. Link: https://lkml.kernel.org/r/164325158959.29787.11286416793279041497.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0c612a9116967..4e645ae1e0665 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5448,7 +5448,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); bic_set_bfqq(bic, bfqq, false); } From bf6de015a8bf21591fa5a7b464173e410b12fc87 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:30:52 +1100 Subject: [PATCH 029/334] mm: remove congestion tracking framework This framework is no longer used - so discard it. Link: https://lkml.kernel.org/r/164325158960.29787.17588717894058708630.stgit@noble.brown Signed-off-by: NeilBrown Cc: Jaegeuk Kim Cc: Chao Yu Cc: Jeff Layton Cc: Ilya Dryomov Cc: Miklos Szeredi Cc: Trond Myklebust Cc: Anna Schumaker Cc: Ryusuke Konishi Cc: Darrick J. 
Wong Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Paolo Valente Cc: Jens Axboe Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/backing-dev-defs.h | 8 ----- include/linux/backing-dev.h | 2 -- include/trace/events/writeback.h | 28 ---------------- mm/backing-dev.c | 57 -------------------------------- 4 files changed, 95 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 993c5628a7263..e863c88df95f9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -207,14 +207,6 @@ struct backing_dev_info { #endif }; -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); - struct wb_lock_cookie { bool locked; unsigned long flags; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2d764566280c8..87ce24d238f34 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -135,8 +135,6 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb) struct backing_dev_info *inode_to_bdi(struct inode *inode); -long congestion_wait(int sync, long timeout); - static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a345b1e12daf3..86b2a82da546a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -735,34 +735,6 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ) ); -DECLARE_EVENT_CLASS(writeback_congest_waited_template, - - TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), - - TP_ARGS(usec_timeout, usec_delayed), - - TP_STRUCT__entry( - __field( unsigned int, usec_timeout ) - __field( unsigned int, usec_delayed ) - ), - - TP_fast_assign( - 
__entry->usec_timeout = usec_timeout; - __entry->usec_delayed = usec_delayed; - ), - - TP_printk("usec_timeout=%u usec_delayed=%u", - __entry->usec_timeout, - __entry->usec_delayed) -); - -DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, - - TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), - - TP_ARGS(usec_timeout, usec_delayed) -); - DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eae96dfe0261c..7176af65b103a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1005,60 +1005,3 @@ const char *bdi_dev_name(struct backing_dev_info *bdi) return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); - -static wait_queue_head_t congestion_wqh[2] = { - __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), - __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) - }; -static atomic_t nr_wb_congested[2]; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) -{ - wait_queue_head_t *wqh = &congestion_wqh[sync]; - enum wb_congested_state bit; - - bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &bdi->wb.congested)) - atomic_dec(&nr_wb_congested[sync]); - smp_mb__after_atomic(); - if (waitqueue_active(wqh)) - wake_up(wqh); -} -EXPORT_SYMBOL(clear_bdi_congested); - -void set_bdi_congested(struct backing_dev_info *bdi, int sync) -{ - enum wb_congested_state bit; - - bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &bdi->wb.congested)) - atomic_inc(&nr_wb_congested[sync]); -} -EXPORT_SYMBOL(set_bdi_congested); - -/** - * congestion_wait - wait for a backing_dev to become uncongested - * @sync: SYNC or ASYNC IO - * @timeout: timeout in jiffies - * - * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit - * write congestion. If no backing_devs are congested then just wait for the - * next write to be completed. 
- */ -long congestion_wait(int sync, long timeout) -{ - long ret; - unsigned long start = jiffies; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[sync]; - - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - - trace_writeback_congestion_wait(jiffies_to_usecs(timeout), - jiffies_to_usecs(jiffies - start)); - - return ret; -} -EXPORT_SYMBOL(congestion_wait); From cb7a6ff6b5091fe4e37d6dc1c77a95236cc270e9 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Wed, 16 Feb 2022 15:30:53 +1100 Subject: [PATCH 030/334] mount: warn only once about timestamp range expiration Commit f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry") introduced a mount warning regarding filesystem timestamp limits, that is printed upon each writable mount or remount. This can result in a lot of unnecessary messages in the kernel log in setups where filesystems are being frequently remounted (or mounted multiple times). Avoid this by setting a superblock flag which indicates that the warning has been emitted at least once for any particular mount, as suggested in [1]. [1] https://lore.kernel.org/CAHk-=wim6VGnxQmjfK_tDg6fbHYKL4EFkmnTjVr9QnRqjDBAeA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220119202934.26495-1-ailiop@suse.com Signed-off-by: Anthony Iliopoulos Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Darrick J. 
Wong Cc: Alexander Viro Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namespace.c | 2 ++ include/linux/fs.h | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 40b994a29e90d..a090cf92e5057 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2567,6 +2567,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2581,6 +2582,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b07..a757d21d9bf6f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1435,6 +1435,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ /* Possible states of 'frozen' field */ enum { From ce6ebd105897ed5018f76cf168915db4e8499739 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:53 +1100 Subject: [PATCH 031/334] kasan, page_alloc: deduplicate should_skip_kasan_poison Patch series "kasan, vmalloc, arm64: add vmalloc tagging support for SW/HW_TAGS", v6. This patchset adds vmalloc tagging support for SW_TAGS and HW_TAGS KASAN modes. About half of patches are cleanups I went for along the way. 
None of them seem to be important enough to go through stable, so I decided not to split them out into separate patches/series. The patchset is partially based on an early version of the HW_TAGS patchset by Vincenzo that had vmalloc support. Thus, I added a Co-developed-by tag into a few patches. SW_TAGS vmalloc tagging support is straightforward. It reuses all of the generic KASAN machinery, but uses shadow memory to store tags instead of magic values. Naturally, vmalloc tagging requires adding a few kasan_reset_tag() annotations to the vmalloc code. HW_TAGS vmalloc tagging support stands out. HW_TAGS KASAN is based on Arm MTE, which can only assign tags to physical memory. As a result, HW_TAGS KASAN only tags vmalloc() allocations, which are backed by page_alloc memory. It ignores vmap() and others. This patch (of 39): Currently, should_skip_kasan_poison() has two definitions: one for when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, one for when it's not. Instead of duplicating the checks, add a deferred_pages_enabled() helper and use it in a single should_skip_kasan_poison() definition. Also move should_skip_kasan_poison() closer to its caller and clarify all conditions in the comment.
Link: https://lkml.kernel.org/r/cover.1643047180.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/658b79f5fb305edaf7dc16bc52ea870d3220d4a8.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Peter Collingbourne Cc: Evgenii Stepanov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 55 +++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d319..25d4f9ad35258 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly; */ static DEFINE_STATIC_KEY_TRUE(deferred_pages); -/* - * Calling kasan_poison_pages() only after deferred memory initialization - * has completed. Poisoning pages during deferred memory init will greatly - * lengthen the process and cause problem in large memory systems as the - * deferred pages initialization is done with interrupt disabled. - * - * Assuming that there will be no reference to those newly initialized - * pages before they are ever allocated, this should have no effect on - * KASAN memory tracking as the poison will be properly inserted at page - * allocation time. The only corner case is when pages are allocated by - * on-demand allocation and then freed again before the deferred pages - * initialization is done, but this is not likely to happen. 
- */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1271,6 +1253,35 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return ret; } +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. 
+ */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} + static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; From 46d2ff21c3aced658d34b257194e4db8ef7595bc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:53 +1100 Subject: [PATCH 032/334] kasan, page_alloc: move tag_clear_highpage out of kernel_init_free_pages Currently, kernel_init_free_pages() serves two purposes: it either only zeroes memory or zeroes both memory and memory tags via a different code path. As this function has only two callers, each using only one code path, this behaviour is confusing. Pull the code that zeroes both memory and tags out of kernel_init_free_pages(). As a result of this change, the code in free_pages_prepare() starts to look complicated, but this is improved in the few following patches. Those improvements are not integrated into this patch to make diffs easier to read. This patch does no functional changes. 
Link: https://lkml.kernel.org/r/7719874e68b23902629c7cf19f966c4fd5f57979.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25d4f9ad35258..012170b1c47aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1282,16 +1282,10 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) PageSkipKASanPoison(page); } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +static void kernel_init_free_pages(struct page *page, int numpages) { int i; - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } - /* s390's use of memset() could override KASAN redzones. 
*/ kasan_disable_current(); for (i = 0; i < numpages; i++) { @@ -1387,7 +1381,7 @@ static __always_inline bool free_pages_prepare(struct page *page, bool init = want_init_on_free(); if (init) - kernel_init_free_pages(page, 1 << order, false); + kernel_init_free_pages(page, 1 << order); if (!skip_kasan_poison) kasan_poison_pages(page, order, init); } @@ -2430,9 +2424,17 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); kasan_unpoison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, - gfp_flags & __GFP_ZEROTAGS); + + if (init) { + if (gfp_flags & __GFP_ZEROTAGS) { + int i; + + for (i = 0; i < 1 << order; i++) + tag_clear_highpage(page + i); + } else { + kernel_init_free_pages(page, 1 << order); + } + } } set_page_owner(page, order, gfp_flags); From 2c8208290d1340221e2ae3d10b0c479938b4c7f1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:53 +1100 Subject: [PATCH 033/334] kasan, page_alloc: merge kasan_free_pages into free_pages_prepare Currently, the code responsible for initializing and poisoning memory in free_pages_prepare() is scattered across two locations: kasan_free_pages() for HW_TAGS KASAN and free_pages_prepare() itself. This is confusing. This and a few following patches combine the code from these two locations. Along the way, these patches also simplify the performed checks to make them easier to follow. Replaces the only caller of kasan_free_pages() with its implementation. As kasan_has_integrated_init() is only true when CONFIG_KASAN_HW_TAGS is enabled, moving the code does no functional changes. This patch is not useful by itself but makes the simplifications in the following patches easier to follow. 
Link: https://lkml.kernel.org/r/303498d15840bb71905852955c6e2390ecc87139.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 8 -------- mm/kasan/common.c | 2 +- mm/kasan/hw_tags.c | 11 ----------- mm/page_alloc.c | 6 ++++-- 4 files changed, 5 insertions(+), 22 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4a45562d88937..a8bfe9f157c9c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,7 +96,6 @@ static inline bool kasan_hw_tags_enabled(void) } void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); -void kasan_free_pages(struct page *page, unsigned int order); #else /* CONFIG_KASAN_HW_TAGS */ @@ -117,13 +116,6 @@ static __always_inline void kasan_alloc_pages(struct page *page, BUILD_BUG(); } -static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 92196562687b6..a0082fad48b12 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) } /* - * The object will be poisoned by kasan_free_pages() or + * The object will be poisoned by kasan_poison_pages() or * kasan_slab_free_mempool(). 
*/ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 7355cb534e4f8..0b8225add2e48 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -213,17 +213,6 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) } } -void kasan_free_pages(struct page *page, unsigned int order) -{ - /* - * This condition should match the one in free_pages_prepare() in - * page_alloc.c. - */ - bool init = want_init_on_free(); - - kasan_poison_pages(page, order, init); -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 012170b1c47aa..e5f95c6ab0ac6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1368,15 +1368,17 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ if (kasan_has_integrated_init()) { + bool init = want_init_on_free(); + if (!skip_kasan_poison) - kasan_free_pages(page, order); + kasan_poison_pages(page, order, init); } else { bool init = want_init_on_free(); From 346de0c33dfaf1b80423c3cce4236b089b60462d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:53 +1100 Subject: [PATCH 034/334] kasan, page_alloc: simplify kasan_poison_pages call site Simplify the code around calling kasan_poison_pages() in free_pages_prepare(). This patch does no functional changes. 
Link: https://lkml.kernel.org/r/ae4f9bcf071577258e786bcec4798c145d718c46.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5f95c6ab0ac6..60bc838a4d853 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1302,6 +1302,7 @@ static __always_inline bool free_pages_prepare(struct page *page, { int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1374,19 +1375,10 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (kasan_has_integrated_init()) { - bool init = want_init_on_free(); - - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); - } else { - bool init = want_init_on_free(); - - if (init) - kernel_init_free_pages(page, 1 << order); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); - } + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); /* * arch_free_page() can make the page's contents inaccessible. 
s390 From 2e83cd716ac90a0e978d906a3ff2475828a01ea5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:54 +1100 Subject: [PATCH 035/334] kasan, page_alloc: init memory of skipped pages on free Since commit 7a3b83537188 ("kasan: use separate (un)poison implementation for integrated init"), when all init, kasan_has_integrated_init(), and skip_kasan_poison are true, free_pages_prepare() doesn't initialize the page. This is wrong. Fix it by remembering whether kasan_poison_pages() performed initialization, and call kernel_init_free_pages() if it didn't. Reordering kasan_poison_pages() and kernel_init_free_pages() is OK, since kernel_init_free_pages() can handle poisoned memory. Link: https://lkml.kernel.org/r/1d97df75955e52727a3dc1c4e33b3b50506fc3fd.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60bc838a4d853..f994fd68e3b11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1375,11 +1375,16 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); - if (!skip_kasan_poison) + if (!skip_kasan_poison) { kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; + } + if (init) + kernel_init_free_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. 
s390 * does this. So nothing which can access the page's contents should From 2c2a1dcb6edd17b227118a0eb9a92bd53ea2cd44 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:54 +1100 Subject: [PATCH 036/334] kasan: drop skip_kasan_poison variable in free_pages_prepare skip_kasan_poison is only used in a single place. Call should_skip_kasan_poison() directly for simplicity. Link: https://lkml.kernel.org/r/1d33212e79bc9ef0b4d3863f903875823e89046f.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f994fd68e3b11..8481420d25021 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1301,7 +1301,6 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1375,7 +1374,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (!skip_kasan_poison) { + if (!should_skip_kasan_poison(page, fpi_flags)) { kasan_poison_pages(page, order, init); /* Memory is already initialized if KASAN did it internally. 
*/ From abdb14bc162a9b7547daffae15b0f9f760ce142e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:54 +1100 Subject: [PATCH 037/334] mm: clarify __GFP_ZEROTAGS comment __GFP_ZEROTAGS is intended as an optimization: if memory is zeroed during allocation, it's possible to set memory tags at the same time with little performance impact. Clarify this intention of __GFP_ZEROTAGS in the comment. Link: https://lkml.kernel.org/r/cdffde013973c5634a447513e10ec0d21e8eee29.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 80f63c862be57..581a1f47b8a2c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -232,8 +232,10 @@ struct vm_area_struct; * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if - * __GFP_ZERO is set. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself + * is being zeroed (either via __GFP_ZERO or via init_on_alloc). This flag is + * intended for optimization: setting memory tags at the same time as zeroing + * memory has minimal additional performace impact. * * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned * on deallocation. Typically used for userspace pages. Currently only has an From 7bfcf77b30c91d90fe462d86c392c252812be9ab Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:54 +1100 Subject: [PATCH 038/334] kasan: only apply __GFP_ZEROTAGS when memory is zeroed __GFP_ZEROTAGS should only be effective if memory is being zeroed. 
Currently, hardware tag-based KASAN violates this requirement. Fix by including an initialization check along with checking for __GFP_ZEROTAGS. Link: https://lkml.kernel.org/r/f4f4593f7f675262d29d07c1938db5bd0cd5e285.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 0b8225add2e48..c643740b85996 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -199,11 +199,12 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) * page_alloc.c. */ bool init = !want_init_on_free() && want_init_on_alloc(flags); + bool init_tags = init && (flags & __GFP_ZEROTAGS); if (flags & __GFP_SKIP_KASAN_POISON) SetPageSkipKASanPoison(page); - if (flags & __GFP_ZEROTAGS) { + if (init_tags) { int i; for (i = 0; i != 1 << order; ++i) From fb151cf5eda4a75d27d9c35cb38a807f1e41cb4a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:54 +1100 Subject: [PATCH 039/334] kasan, page_alloc: refactor init checks in post_alloc_hook Separate code for zeroing memory from the code clearing tags in post_alloc_hook(). This patch is not useful by itself but makes the simplifications in the following patches easier to follow. This patch does no functional changes. 
Link: https://lkml.kernel.org/r/2283fde963adfd8a2b29a92066f106cc16661a3c.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8481420d25021..868480d463c78 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2420,19 +2420,21 @@ inline void post_alloc_hook(struct page *page, unsigned int order, kasan_alloc_pages(page, order, gfp_flags); } else { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); kasan_unpoison_pages(page, order, init); - if (init) { - if (gfp_flags & __GFP_ZEROTAGS) { - int i; + if (init_tags) { + int i; - for (i = 0; i < 1 << order; i++) - tag_clear_highpage(page + i); - } else { - kernel_init_free_pages(page, 1 << order); - } + for (i = 0; i < 1 << order; i++) + tag_clear_highpage(page + i); + + init = false; } + + if (init) + kernel_init_free_pages(page, 1 << order); } set_page_owner(page, order, gfp_flags); From b085cd7d1f26911c73b8209630f40902bde00be8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:55 +1100 Subject: [PATCH 040/334] kasan, page_alloc: merge kasan_alloc_pages into post_alloc_hook Currently, the code responsible for initializing and poisoning memory in post_alloc_hook() is scattered across two locations: kasan_alloc_pages() hook for HW_TAGS KASAN and post_alloc_hook() itself. This is confusing. This and a few following patches combine the code from these two locations. Along the way, these patches restructure the many performed checks step by step to make them easier to follow.
Replace the only caller of kasan_alloc_pages() with its implementation. As kasan_has_integrated_init() is only true when CONFIG_KASAN_HW_TAGS is enabled, moving the code does no functional changes. Also move init and init_tags variables definitions out of kasan_has_integrated_init() clause in post_alloc_hook(), as they have the same values regardless of what the if condition evaluates to. This patch is not useful by itself but makes the simplifications in the following patches easier to follow. Link: https://lkml.kernel.org/r/5ac7e0b30f5cbb177ec363ddd7878a3141289592.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 9 --------- mm/kasan/common.c | 2 +- mm/kasan/hw_tags.c | 22 ---------------------- mm/page_alloc.c | 20 +++++++++++++++----- 4 files changed, 16 insertions(+), 37 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a8bfe9f157c9c..b88ca6b97ba32 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -95,8 +95,6 @@ static inline bool kasan_hw_tags_enabled(void) return kasan_enabled(); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); - #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) @@ -109,13 +107,6 @@ static inline bool kasan_hw_tags_enabled(void) return false; } -static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order, gfp_t flags) -{ - /* Only available for integrated init. 
*/ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index a0082fad48b12..d9079ec11f313 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, return NULL; /* - * The object has already been unpoisoned by kasan_alloc_pages() for + * The object has already been unpoisoned by kasan_unpoison_pages() for * alloc_pages() or by kasan_krealloc() for krealloc(). */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index c643740b85996..76cf2b6229c79 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -192,28 +192,6 @@ void __init kasan_init_hw_tags(void) kasan_stack_collection_enabled() ? "on" : "off"); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) -{ - /* - * This condition should match the one in post_alloc_hook() in - * page_alloc.c. - */ - bool init = !want_init_on_free() && want_init_on_alloc(flags); - bool init_tags = init && (flags & __GFP_ZEROTAGS); - - if (flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); - - if (init_tags) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { - kasan_unpoison_pages(page, order, init); - } -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 868480d463c78..abed862d889d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2398,6 +2398,9 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + set_page_private(page, 0); set_page_refcounted(page); @@ -2413,15 +2416,22 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* 
* As memory initialization might be integrated into KASAN, - * kasan_alloc_pages and kernel_init_free_pages must be + * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior. */ if (kasan_has_integrated_init()) { - kasan_alloc_pages(page, order, gfp_flags); - } else { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + if (gfp_flags & __GFP_SKIP_KASAN_POISON) + SetPageSkipKASanPoison(page); + + if (init_tags) { + int i; + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + } else { + kasan_unpoison_pages(page, order, init); + } + } else { kasan_unpoison_pages(page, order, init); if (init_tags) { From fae194da61bf5e4932609f55f8e2b084cf9a4ffc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:55 +1100 Subject: [PATCH 041/334] kasan, page_alloc: combine tag_clear_highpage calls in post_alloc_hook Move tag_clear_highpage() loops out of the kasan_has_integrated_init() clause as a code simplification. This patch does no functional changes. Link: https://lkml.kernel.org/r/587e3fc36358b88049320a89cc8dc6deaecb0cda.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index abed862d889d1..b3959327e06ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2419,30 +2419,30 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior. 
*/ + + /* + * If memory tags should be zeroed (which happens only when memory + * should be initialized as well). + */ + if (init_tags) { + int i; + + /* Initialize both memory and tags. */ + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + + /* Note that memory is already initialized by the loop above. */ + init = false; + } if (kasan_has_integrated_init()) { if (gfp_flags & __GFP_SKIP_KASAN_POISON) SetPageSkipKASanPoison(page); - if (init_tags) { - int i; - - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { + if (!init_tags) kasan_unpoison_pages(page, order, init); - } } else { kasan_unpoison_pages(page, order, init); - if (init_tags) { - int i; - - for (i = 0; i < 1 << order; i++) - tag_clear_highpage(page + i); - - init = false; - } - if (init) kernel_init_free_pages(page, 1 << order); } From ac8130a6b8818d22889cd4c76b2b9f311fdf412a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:55 +1100 Subject: [PATCH 042/334] kasan, page_alloc: move SetPageSkipKASanPoison in post_alloc_hook Pull the SetPageSkipKASanPoison() call in post_alloc_hook() out of the big if clause for better code readability. This also allows for more simplifications in the following patches. Also turn the kasan_has_integrated_init() check into the proper kasan_hw_tags_enabled() one. These checks evaluate to the same value, but logically skipping kasan poisoning has nothing to do with integrated init. 
Link: https://lkml.kernel.org/r/7214c1698b754ccfaa44a792113c95cc1f807c48.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b3959327e06ce..c51d637cdab39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2435,9 +2435,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, init = false; } if (kasan_has_integrated_init()) { - if (gfp_flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); - if (!init_tags) kasan_unpoison_pages(page, order, init); } else { @@ -2446,6 +2443,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, if (init) kernel_init_free_pages(page, 1 << order); } + /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ + if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) + SetPageSkipKASanPoison(page); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); From f1ed584848e772e210e4df1d43b8a882f77a3eac Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:55 +1100 Subject: [PATCH 043/334] kasan, page_alloc: move kernel_init_free_pages in post_alloc_hook Pull the kernel_init_free_pages() call in post_alloc_hook() out of the big if clause for better code readability. This also allows for more simplifications in the following patch. This patch does no functional changes. 
Link: https://lkml.kernel.org/r/a7a76456501eb37ddf9fca6529cee9555e59cdb1.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c51d637cdab39..2784bd4789423 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2435,14 +2435,18 @@ inline void post_alloc_hook(struct page *page, unsigned int order, init = false; } if (kasan_has_integrated_init()) { - if (!init_tags) + if (!init_tags) { kasan_unpoison_pages(page, order, init); + + /* Note that memory is already initialized by KASAN. */ + init = false; + } } else { kasan_unpoison_pages(page, order, init); - - if (init) - kernel_init_free_pages(page, 1 << order); } + /* If memory is still not initialized, do it now. */ + if (init) + kernel_init_free_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) SetPageSkipKASanPoison(page); From 97d13822024f889cb6b16763584edf3dd821528f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:56 +1100 Subject: [PATCH 044/334] kasan, page_alloc: rework kasan_unpoison_pages call site Rework the checks around kasan_unpoison_pages() call in post_alloc_hook(). The logical condition for calling this function is: - If a software KASAN mode is enabled, we need to mark shadow memory. - Otherwise, HW_TAGS KASAN is enabled, and it only makes sense to set tags if they haven't already been cleared by tag_clear_highpage(), which is indicated by init_tags. This patch concludes the changes for post_alloc_hook(). 
Link: https://lkml.kernel.org/r/0ecebd0d7ccd79150e3620ea4185a32d3dfe912f.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2784bd4789423..3af38e3233914 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2434,15 +2434,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - if (kasan_has_integrated_init()) { - if (!init_tags) { - kasan_unpoison_pages(page, order, init); + /* + * If either a software KASAN mode is enabled, or, + * in the case of hardware tag-based KASAN, + * if memory tags have not been cleared via tag_clear_highpage(). + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS) || + kasan_hw_tags_enabled() && !init_tags) { + /* Mark shadow memory or set memory tags. */ + kasan_unpoison_pages(page, order, init); - /* Note that memory is already initialized by KASAN. */ + /* Note that memory is already initialized by KASAN. */ + if (kasan_has_integrated_init()) init = false; - } - } else { - kasan_unpoison_pages(page, order, init); } /* If memory is still not initialized, do it now. */ if (init) From d3b78b1b9fa3c8ef030c0ffaddadde4c1d236a2d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:56 +1100 Subject: [PATCH 045/334] kasan: clean up metadata byte definitions Most of the metadata byte values are only used for Generic KASAN. 
Remove KASAN_KMALLOC_FREETRACK definition for !CONFIG_KASAN_GENERIC case, and put it along with other metadata values for the Generic mode under a corresponding ifdef. Link: https://lkml.kernel.org/r/ac11d6e9e007c95e472e8fdd22efb6074ef3c6d8.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/kasan.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c17fa8d26ffe5..952cd6f9ca464 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -71,15 +71,16 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID #endif +#ifdef CONFIG_KASAN_GENERIC + +#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ #define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ @@ -110,6 +111,8 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_ABI_VERSION 1 #endif +#endif /* CONFIG_KASAN_GENERIC */ + /* Metadata layout customization. 
*/ #define META_BYTES_PER_BLOCK 1 #define META_BLOCKS_PER_ROW 16 From 92652a43714696328b719b9ba2a3fd182efe9f20 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:56 +1100 Subject: [PATCH 046/334] kasan: define KASAN_VMALLOC_INVALID for SW_TAGS In preparation for adding vmalloc support to SW_TAGS KASAN, provide a KASAN_VMALLOC_INVALID definition for it. HW_TAGS KASAN won't be using this value, as it falls back onto page_alloc for poisoning freed vmalloc() memory. Link: https://lkml.kernel.org/r/1daaaafeb148a7ae8285265edc97d7ca07b6a07d.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/kasan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 952cd6f9ca464..020f3e57a03f5 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -71,18 +71,19 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */ #endif #ifdef CONFIG_KASAN_GENERIC #define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in 
vmapped page */ /* * Stack redzone shadow values From fc4fc01e951df20a05e347d60cfbbb3b56d7c7cf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:57 +1100 Subject: [PATCH 047/334] kasan, x86, arm64, s390: rename functions for modules shadow Rename kasan_free_shadow to kasan_free_module_shadow and kasan_module_alloc to kasan_alloc_module_shadow. These functions are used to allocate/free shadow memory for kernel modules when KASAN_VMALLOC is not enabled. The new names better reflect their purpose. Also reword the comment next to their declaration to improve clarity. Link: https://lkml.kernel.org/r/36db32bde765d5d0b856f77d2d806e838513fe84.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kernel/module.c | 2 +- arch/s390/kernel/module.c | 2 +- arch/x86/kernel/module.c | 2 +- include/linux/kasan.h | 14 +++++++------- mm/kasan/shadow.c | 4 ++-- mm/vmalloc.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 309a27553c875..d3a1fa8183487 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b032e556eeb71..a7aefc278909b 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -45,7 +45,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, 
gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 95fa745e310a5..c9eb8aa3b7b8a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -78,7 +78,7 @@ void *module_alloc(unsigned long size) MODULES_END, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b88ca6b97ba32..55f1d4edf6b55 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -454,17 +454,17 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, !defined(CONFIG_KASAN_VMALLOC) /* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. + * These functions allocate and free shadow memory for kernel modules. + * They are only required when KASAN_VMALLOC is not supported, as otherwise + * shadow memory is allocated by the generic vmalloc handlers. 
*/ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); -void kasan_free_shadow(const struct vm_struct *vm); +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); +void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } +static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 94136f84b4497..e5c4393eb861e 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -534,7 +534,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) return -ENOMEM; } -void kasan_free_shadow(const struct vm_struct *vm) +void kasan_free_module_shadow(const struct vm_struct *vm) { if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4165304d35471..b6712a25c996e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2526,7 +2526,7 @@ struct vm_struct *remove_vm_area(const void *addr) va->vm = NULL; spin_unlock(&vmap_area_lock); - kasan_free_shadow(vm); + kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; From c6e07441809951a2aaa756d58071c3589dc7ad42 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:57 +1100 Subject: [PATCH 048/334] kasan, vmalloc: drop outdated 
VM_KASAN comment The comment about VM_KASAN in include/linux/vmalloc.h is outdated. VM_KASAN is currently only used to mark vm_areas allocated for kernel modules when CONFIG_KASAN_VMALLOC is disabled. Drop the comment. Link: https://lkml.kernel.org/r/780395afea83a147b3b5acc36cf2e38f7f8479f9.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/vmalloc.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 880227b9f0440..87f8cfec50a03 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,17 +35,6 @@ struct notifier_block; /* in notifier.h */ #define VM_DEFER_KMEMLEAK 0 #endif -/* - * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. - * - * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after - * shadow memory has been mapped. It's used to handle allocation errors so that - * we don't try to poison shadow on free if it was never allocated. - * - * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to - * determine which allocations need the module shadow freed. - */ - /* bits [20..32] reserved for arch specific ioremap internals */ /* From a94dcc7e0aefe5cb084ee41624c2bcc89080536f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:57 +1100 Subject: [PATCH 049/334] kasan: reorder vmalloc hooks Group functions that [de]populate shadow memory for vmalloc. Group functions that [un]poison memory for vmalloc. This patch does no functional changes but prepares KASAN code for adding vmalloc support to HW_TAGS KASAN.
Link: https://lkml.kernel.org/r/aeef49eb249c206c4c9acce2437728068da74c28.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 20 +++++++++----------- mm/kasan/shadow.c | 43 ++++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 55f1d4edf6b55..46a63374c86fb 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -418,34 +418,32 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); +void kasan_unpoison_vmalloc(const void *start, unsigned long size); +void kasan_poison_vmalloc(const void *start, unsigned long size); #else /* CONFIG_KASAN_VMALLOC */ +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } - -static inline void kasan_poison_vmalloc(const void *start, unsigned long size) -{ } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long 
free_region_end) {} + unsigned long free_region_end) { } -static inline void kasan_populate_early_vm_area_shadow(void *start, - unsigned long size) +static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ } +static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } #endif /* CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e5c4393eb861e..bf7ab62fbfb94 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; } -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed. - */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison(start, size, false); -} - static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -496,6 +475,28 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } + +void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + kasan_unpoison(start, size, false); +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. 
+ */ +void kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + #else /* CONFIG_KASAN_VMALLOC */ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) From 0de122c9a3232472023dd68d89fb14e36c85852b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:57 +1100 Subject: [PATCH 050/334] kasan: add wrappers for vmalloc hooks Add wrappers around functions that [un]poison memory for vmalloc allocations. These functions will be used by HW_TAGS KASAN and therefore need to be disabled when kasan=off command line argument is provided. This patch does no functional changes for software KASAN modes. Link: https://lkml.kernel.org/r/3b8728eac438c55389fb0f9a8a2145d71dd77487.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 17 +++++++++++++++-- mm/kasan/shadow.c | 5 ++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 46a63374c86fb..da320069e7cf8 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -424,8 +424,21 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); +void __kasan_unpoison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_unpoison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + 
__kasan_unpoison_vmalloc(start, size); +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_poison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_poison_vmalloc(start, size); +} #else /* CONFIG_KASAN_VMALLOC */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index bf7ab62fbfb94..39d0b32ebf708 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,8 +475,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) +void __kasan_unpoison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) return; @@ -488,7 +487,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) * Poison the shadow for a vmalloc region. Called as part of the * freeing process at the time the region is freed. */ -void kasan_poison_vmalloc(const void *start, unsigned long size) +void __kasan_poison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) return; From b117a7541c7cf8b1970ed6e225da456bdf624ed1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:57 +1100 Subject: [PATCH 051/334] kasan, vmalloc: reset tags in vmalloc functions In preparation for adding vmalloc support to SW/HW_TAGS KASAN, reset pointer tags in functions that use pointer values in range checks. vread() is a special case here. Despite the untagging of the addr pointer in its prologue, the accesses performed by vread() are checked. Instead of accessing the virtual mappings through addr directly, vread() recovers the physical address via page_address(vmalloc_to_page()) and accesses that. And as page_address() recovers the pointer tag, the accesses get checked.
Link: https://lkml.kernel.org/r/046003c5f683cacb0ba18e1079e9688bb3dca943.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b6712a25c996e..38bf3b418b816 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false; bool is_vmalloc_addr(const void *x) { - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } @@ -632,7 +632,7 @@ int is_vmalloc_or_module_addr(const void *x) * just put it in the vmalloc space. */ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif @@ -806,6 +806,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *tmp; @@ -827,6 +829,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *va; @@ -2145,7 +2149,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; - unsigned long addr = (unsigned long)mem; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); @@ -3404,6 +3408,8 @@ 
long vread(char *buf, char *addr, unsigned long count) unsigned long buflen = count; unsigned long n; + addr = kasan_reset_tag(addr); + /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; From d82c81f1160478fae61232fa4da5fd772d0e4bb0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:58 +1100 Subject: [PATCH 052/334] kasan, fork: reset pointer tags of vmapped stacks Once tag-based KASAN modes start tagging vmalloc() allocations, kernel stacks start getting tagged if CONFIG_VMAP_STACK is enabled. Reset the tag of kernel stack pointers after allocation in alloc_thread_stack_node(). For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation can't handle the SP register being tagged. For HW_TAGS KASAN, there's no instrumentation-related issues. However, the impact of having a tagged SP register needs to be properly evaluated, so keep it non-tagged for now. Note, that the memory for the stack allocation still gets tagged to catch vmalloc-into-stack out-of-bounds accesses. Link: https://lkml.kernel.org/r/c6c96f012371ecd80e1936509ebcd3b07a5956f7.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b219..57d624f05182e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -254,6 +254,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) * so cache the vm_struct. 
*/ if (stack) { + stack = kasan_reset_tag(stack); tsk->stack_vm_area = find_vm_area(stack); tsk->stack = stack; } From 35934bbf0c6bbdedbea61de2acc08df4934591a1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:58 +1100 Subject: [PATCH 053/334] kasan, arm64: reset pointer tags of vmapped stacks Once tag-based KASAN modes start tagging vmalloc() allocations, kernel stacks start getting tagged if CONFIG_VMAP_STACK is enabled. Reset the tag of kernel stack pointers after allocation in arch_alloc_vmap_stack(). For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation can't handle the SP register being tagged. For HW_TAGS KASAN, there's no instrumentation-related issues. However, the impact of having a tagged SP register needs to be properly evaluated, so keep it non-tagged for now. Note, that the memory for the stack allocation still gets tagged to catch vmalloc-into-stack out-of-bounds accesses. Link: https://lkml.kernel.org/r/698c5ab21743c796d46c15d075b9481825973e34.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/include/asm/vmap_stack.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 894e031b28d28..20873099c035c 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -17,10 +17,13 @@ */ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { + void *p; + BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, 
__builtin_return_address(0)); + return kasan_reset_tag(p); } #endif /* __ASM_VMAP_STACK_H */ From 02b0b3a35d41f6d5050cc994f48ab16ceb443204 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:58 +1100 Subject: [PATCH 054/334] kasan, vmalloc: add vmalloc tagging for SW_TAGS Add vmalloc tagging support to SW_TAGS KASAN. - __kasan_unpoison_vmalloc() now assigns a random pointer tag, poisons the virtual mapping accordingly, and embeds the tag into the returned pointer. - __get_vm_area_node() (used by vmalloc() and vmap()) and pcpu_get_vm_areas() save the tagged pointer into vm_struct->addr (note: not into vmap_area->addr). This requires putting kasan_unpoison_vmalloc() after setup_vmalloc_vm[_locked](); otherwise the latter will overwrite the tagged pointer. The tagged pointer then is naturally propagated to vmalloc() and vmap(). - vm_map_ram() returns the tagged pointer directly. As a result of this change, vm_struct->addr is now tagged. Enabling KASAN_VMALLOC with SW_TAGS is not yet allowed. 
Link: https://lkml.kernel.org/r/4a78f3c064ce905e9070c29733aca1dd254a74f1.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 16 ++++++++++------ mm/kasan/shadow.c | 6 ++++-- mm/vmalloc.c | 14 ++++++++------ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index da320069e7cf8..92c5dfa29a352 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -424,12 +424,13 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void __kasan_unpoison_vmalloc(const void *start, unsigned long size); -static __always_inline void kasan_unpoison_vmalloc(const void *start, - unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size); +static __always_inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size) { if (kasan_enabled()) - __kasan_unpoison_vmalloc(start, size); + return __kasan_unpoison_vmalloc(start, size); + return (void *)start; } void __kasan_poison_vmalloc(const void *start, unsigned long size); @@ -454,8 +455,11 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_start, unsigned long free_region_end) { } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } +static inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size) +{ + return (void *)start; +} static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 39d0b32ebf708..5a866f6663fc0 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ 
-475,12 +475,14 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } -void __kasan_unpoison_vmalloc(const void *start, unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size) { if (!is_vmalloc_or_module_addr(start)) - return; + return (void *)start; + start = set_tag(start, kasan_random_tag()); kasan_unpoison(start, size, false); + return (void *)start; } /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 38bf3b418b816..15e1a4fdfe0b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2210,7 +2210,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - kasan_unpoison_vmalloc(mem, size); + mem = kasan_unpoison_vmalloc(mem, size); if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { @@ -2443,10 +2443,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - setup_vmalloc_vm(area, va, flags, caller); + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + return area; } @@ -3795,9 +3795,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; - - kasan_unpoison_vmalloc((void *)vas[area]->va_start, - sizes[area]); } /* insert all vm's */ @@ -3810,6 +3807,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); + /* mark allocated areas as accessible */ + for (area = 0; area < nr_vms; area++) + vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, + vms[area]->size); + kfree(vas); return vms; From 07ed549d88b2b1cab913e0acb929deaf4f71b58d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:58 +1100 Subject: [PATCH 055/334] kasan, vmalloc, arm64: mark vmalloc mappings as pgprot_tagged HW_TAGS KASAN relies on ARM Memory Tagging Extension 
(MTE). With MTE, a memory region must be mapped as MT_NORMAL_TAGGED to allow setting memory tags via MTE-specific instructions. Add proper protection bits to vmalloc() allocations. These allocations are always backed by page_alloc pages, so the tags will actually be getting set on the corresponding physical memory. Link: https://lkml.kernel.org/r/983fc33542db2f6b1e77b34ca23448d4640bbb9e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/include/asm/vmalloc.h | 6 ++++++ include/linux/vmalloc.h | 7 +++++++ mm/vmalloc.c | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index b9185503feae2..38fafffe699f7 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -25,4 +25,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) #endif +#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return pgprot_tagged(prot); +} + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 87f8cfec50a03..7b879c77bec5f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) } #endif +#ifndef arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return prot; +} +#endif + /* * Highlevel APIs for driver use */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 15e1a4fdfe0b6..92e635b7490cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3108,6 +3108,15 @@ void *__vmalloc_node_range(unsigned 
long size, unsigned long align, goto fail; } + /* + * Modify protection bits to allow tagging. + * This must be done before mapping by __vmalloc_area_node(). + */ + if (kasan_hw_tags_enabled() && + pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) + prot = arch_vmap_pgprot_tagged(prot); + + /* Allocate physical pages and map them into vmalloc space. */ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) goto fail; From 112c1703d653e812b680843590437f628945d021 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:58 +1100 Subject: [PATCH 056/334] kasan, vmalloc: unpoison VM_ALLOC pages after mapping Make KASAN unpoison vmalloc mappings after they have been mapped in when it's possible: for vmalloc() (identified via VM_ALLOC) and vm_map_ram(). The reasons for this are: - For vmalloc() and vm_map_ram(): pages don't get unpoisoned in case mapping them fails. - For vmalloc(): HW_TAGS KASAN needs pages to be mapped to set tags via kasan_unpoison_vmalloc(). As a part of these changes, the return value of __vmalloc_node_range() is changed to area->addr. This is a non-functional change, as __vmalloc_area_node() returns area->addr anyway. 
Link: https://lkml.kernel.org/r/fcb98980e6fcd3c4be6acdcb5d6110898ef28548.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 92e635b7490cb..b65adac1cd802 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2210,14 +2210,15 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - mem = kasan_unpoison_vmalloc(mem, size); - if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + /* Mark the pages as accessible, now that they are mapped. */ + mem = kasan_unpoison_vmalloc(mem, size); + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2445,7 +2446,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, setup_vmalloc_vm(area, va, flags, caller); - area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + /* + * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a + * best-effort approach, as they can be mapped outside of vmalloc code. + * For VM_ALLOC mappings, the pages are marked as accessible after + * getting mapped in __vmalloc_node_range(). 
+ */ + if (!(flags & VM_ALLOC)) + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); return area; } @@ -3055,7 +3063,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3117,10 +3125,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, prot = arch_vmap_pgprot_tagged(prot); /* Allocate physical pages and map them into vmalloc space. */ - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; + /* Mark the pages as accessible, now that they are mapped. */ + area->addr = kasan_unpoison_vmalloc(area->addr, real_size); + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. @@ -3132,7 +3143,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: if (shift > PAGE_SHIFT) { @@ -3816,7 +3827,10 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); - /* mark allocated areas as accessible */ + /* + * Mark allocated areas as accessible. Do it now as a best-effort + * approach, as they can be mapped outside of vmalloc code. + */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size); From 2ee534e8c33a79b4ad6662149b95cefaa19523ca Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:59 +1100 Subject: [PATCH 057/334] kasan, mm: only define ___GFP_SKIP_KASAN_POISON with HW_TAGS Only define the ___GFP_SKIP_KASAN_POISON flag when CONFIG_KASAN_HW_TAGS is enabled. 
This patch is not useful by itself, but it prepares the code for additions of new KASAN-specific GFP patches. Link: https://lkml.kernel.org/r/44e5738a584c11801b2b8f1231898918efc8634a.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 8 +++++++- include/trace/events/mmflags.h | 12 +++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 581a1f47b8a2c..96f707931770c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -54,7 +54,11 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u +#ifdef CONFIG_KASAN_HW_TAGS #define ___GFP_SKIP_KASAN_POISON 0x1000000u +#else +#define ___GFP_SKIP_KASAN_POISON 0 +#endif #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x2000000u #else @@ -251,7 +255,9 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (24 + \ + IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 116ed4d5d0f88..cb4520374e2c8 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -49,12 +49,18 @@ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"}, \ - {(unsigned 
long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\ + {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ + +#ifdef CONFIG_KASAN_HW_TAGS +#define __def_gfpflag_names_kasan \ + , {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"} +#else +#define __def_gfpflag_names_kasan +#endif #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ - __def_gfpflag_names \ + __def_gfpflag_names __def_gfpflag_names_kasan \ ) : "none" #ifdef CONFIG_MMU From 12834da7c9596138207843c04e21d81f006fa502 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:59 +1100 Subject: [PATCH 058/334] kasan, page_alloc: allow skipping unpoisoning for HW_TAGS Add a new GFP flag __GFP_SKIP_KASAN_UNPOISON that allows skipping KASAN poisoning for page_alloc allocations. The flag is only effective with HW_TAGS KASAN. This flag will be used by vmalloc code for page_alloc allocations backing vmalloc() mappings in a following patch. The reason to skip KASAN poisoning for these pages in page_alloc is because vmalloc code will be poisoning them instead. Also reword the comment for __GFP_SKIP_KASAN_POISON. 
Link: https://lkml.kernel.org/r/35c97d77a704f6ff971dd3bfe4be95855744108e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 21 +++++++++++++-------- include/trace/events/mmflags.h | 5 +++-- mm/page_alloc.c | 31 ++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 96f707931770c..7303d1064460b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,12 +55,14 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_KASAN_POISON 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x1000000u +#define ___GFP_SKIP_KASAN_POISON 0x2000000u #else +#define ___GFP_SKIP_KASAN_UNPOISON 0 #define ___GFP_SKIP_KASAN_POISON 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x4000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -241,22 +243,25 @@ struct vm_area_struct; * intended for optimization: setting memory tags at the same time as zeroing * memory has minimal additional performace impact. * - * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned - * on deallocation. Typically used for userspace pages. Currently only has an - * effect in HW tags mode. + * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. + * Only effective in HW_TAGS mode. + * + * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation. + * Typically, used for userspace pages. Only effective in HW_TAGS mode. 
*/ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) -#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) +#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) +#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (24 + \ - IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ +#define __GFP_BITS_SHIFT (24 + \ + 2 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index cb4520374e2c8..134c45e62d918 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -52,8 +52,9 @@ {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ #ifdef CONFIG_KASAN_HW_TAGS -#define __def_gfpflag_names_kasan \ - , {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"} +#define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ + {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else #define __def_gfpflag_names_kasan #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3af38e3233914..94bfbc216ae9e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2395,6 +2395,26 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +{ + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; + + /* Skip, if hardware tag-based KASAN is not enabled. 
*/ + if (!kasan_hw_tags_enabled()) + return true; + + /* + * With hardware tag-based KASAN enabled, skip if either: + * + * 1. Memory tags have already been cleared via tag_clear_highpage(). + * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -2434,15 +2454,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - /* - * If either a software KASAN mode is enabled, or, - * in the case of hardware tag-based KASAN, - * if memory tags have not been cleared via tag_clear_highpage(). - */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || - IS_ENABLED(CONFIG_KASAN_SW_TAGS) || - kasan_hw_tags_enabled() && !init_tags) { - /* Mark shadow memory or set memory tags. */ + if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); /* Note that memory is already initialized by KASAN. */ From c615d2615488ea012e02f4b6c7668a1ca7f2c3b9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:59 +1100 Subject: [PATCH 059/334] kasan, page_alloc: allow skipping memory init for HW_TAGS Add a new GFP flag __GFP_SKIP_ZERO that allows to skip memory initialization. The flag is only effective with HW_TAGS KASAN. This flag will be used by vmalloc code for page_alloc allocations backing vmalloc() mappings in a following patch. The reason to skip memory initialization for these pages in page_alloc is because vmalloc code will be initializing them instead. With the current implementation, when __GFP_SKIP_ZERO is provided, __GFP_ZEROTAGS is ignored. This doesn't matter, as these two flags are never provided at the same time. However, if this is changed in the future, this particular implementation detail can be changed as well. 
Link: https://lkml.kernel.org/r/0d53efeff345de7d708e0baa0d8829167772521e.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/gfp.h | 18 +++++++++++------- include/trace/events/mmflags.h | 1 + mm/page_alloc.c | 13 ++++++++++++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7303d1064460b..7797c915ce54c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,14 +55,16 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_KASAN_UNPOISON 0x1000000u -#define ___GFP_SKIP_KASAN_POISON 0x2000000u +#define ___GFP_SKIP_ZERO 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u +#define ___GFP_SKIP_KASAN_POISON 0x4000000u #else +#define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN_UNPOISON 0 #define ___GFP_SKIP_KASAN_POISON 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP 0x8000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -239,9 +241,10 @@ struct vm_area_struct; * %__GFP_ZERO returns a zeroed page on success. * * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc). This flag is - * intended for optimization: setting memory tags at the same time as zeroing - * memory has minimal additional performace impact. + * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that + * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting + * memory tags at the same time as zeroing memory has minimal additional + * performace impact. 
* * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. * Only effective in HW_TAGS mode. @@ -253,6 +256,7 @@ struct vm_area_struct; #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) +#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) #define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) #define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) @@ -261,7 +265,7 @@ struct vm_area_struct; /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (24 + \ - 2 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + 3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 134c45e62d918..6532119a6bf1a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -53,6 +53,7 @@ #ifdef CONFIG_KASAN_HW_TAGS #define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} #else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94bfbc216ae9e..368c6c5bf42a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2415,10 +2415,21 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); } +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. 
*/ + return (flags & __GFP_SKIP_ZERO); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && + !should_skip_init(gfp_flags); bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); set_page_private(page, 0); From 25350803f9315992ada642396d4472a7ceddb0eb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:30:59 +1100 Subject: [PATCH 060/334] kasan, vmalloc: add vmalloc tagging for HW_TAGS Add vmalloc tagging support to HW_TAGS KASAN. The key difference between HW_TAGS and the other two KASAN modes when it comes to vmalloc: HW_TAGS KASAN can only assign tags to physical memory. The other two modes have shadow memory covering every mapped virtual memory region. Make __kasan_unpoison_vmalloc() for HW_TAGS KASAN: - Skip non-VM_ALLOC mappings as HW_TAGS KASAN can only tag a single mapping of normal physical memory; see the comment in the function. - Generate a random tag, tag the returned pointer and the allocation, and initialize the allocation at the same time. - Propagate the tag into the page structs to allow accesses through page_address(vmalloc_to_page()). The rest of vmalloc-related KASAN hooks are not needed: - The shadow-related ones are fully skipped. - __kasan_poison_vmalloc() is kept as a no-op with a comment. Poisoning and zeroing of physical pages that are backing vmalloc() allocations are skipped via __GFP_SKIP_KASAN_UNPOISON and __GFP_SKIP_ZERO: __kasan_unpoison_vmalloc() does that instead. Enabling CONFIG_KASAN_VMALLOC with HW_TAGS is not yet allowed. 
Link: https://lkml.kernel.org/r/d19b2e9e59a9abc59d05b72dea8429dcaea739c6.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 36 +++++++++++++++-- kernel/scs.c | 4 +- mm/kasan/hw_tags.c | 92 +++++++++++++++++++++++++++++++++++++++++++ mm/kasan/shadow.c | 10 ++++- mm/vmalloc.c | 51 ++++++++++++++++++------ 5 files changed, 175 insertions(+), 18 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 92c5dfa29a352..499f1573dba4c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -25,6 +25,12 @@ struct kunit_kasan_expectation { #endif +typedef unsigned int __bitwise kasan_vmalloc_flags_t; + +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -418,18 +424,39 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void *__kasan_unpoison_vmalloc(const void *start, unsigned long size); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } +static inline int kasan_populate_vmalloc(unsigned long start, + unsigned long size) +{ + return 0; +} +static inline void kasan_release_vmalloc(unsigned 
long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) { } + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags); static __always_inline void *kasan_unpoison_vmalloc(const void *start, - unsigned long size) + unsigned long size, + kasan_vmalloc_flags_t flags) { if (kasan_enabled()) - return __kasan_unpoison_vmalloc(start, size); + return __kasan_unpoison_vmalloc(start, size, flags); return (void *)start; } @@ -456,7 +483,8 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_end) { } static inline void *kasan_unpoison_vmalloc(const void *start, - unsigned long size) + unsigned long size, + kasan_vmalloc_flags_t flags) { return (void *)start; } diff --git a/kernel/scs.c b/kernel/scs.c index 579841be88646..b83bc9251f996 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,7 +32,7 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); memset(s, 0, SCS_SIZE); return s; } @@ -78,7 +78,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); vfree_atomic(s); } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 76cf2b6229c79..21104fd518727 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -192,6 +192,98 @@ void __init kasan_init_hw_tags(void) kasan_stack_collection_enabled() ? 
"on" : "off"); } +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) +{ + struct vm_struct *area; + int i; + + /* + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. + */ + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; + + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + + page_kasan_tag_set(page, tag); + } +} + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC + * mappings as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. + * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + return (void *)start; + + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. 
+ * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + + /* + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). + */ + unpoison_vmalloc_pages(start, tag); + + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + /* + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. + */ +} + +#endif + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 5a866f6663fc0..b958babc8feda 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -475,8 +475,16 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } -void *__kasan_unpoison_vmalloc(const void *start, unsigned long size) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) { + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. + * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + if (!is_vmalloc_or_module_addr(start)) return (void *)start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b65adac1cd802..6dcdf815576b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2216,8 +2216,12 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) return NULL; } - /* Mark the pages as accessible, now that they are mapped. 
*/ - mem = kasan_unpoison_vmalloc(mem, size); + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_NONE); return mem; } @@ -2451,9 +2455,12 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) - area->addr = kasan_unpoison_vmalloc(area->addr, requested_size); + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, + KASAN_VMALLOC_NONE); return area; } @@ -3064,6 +3071,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, { struct vm_struct *area; void *ret; + kasan_vmalloc_flags_t kasan_flags; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3116,21 +3124,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - /* - * Modify protection bits to allow tagging. - * This must be done before mapping by __vmalloc_area_node(). - */ + /* Prepare arguments for __vmalloc_area_node(). */ if (kasan_hw_tags_enabled() && - pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) + pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping in __vmalloc_area_node(). + */ prot = arch_vmap_pgprot_tagged(prot); + /* + * Skip page_alloc poisoning and zeroing for physical pages + * backing VM_ALLOC mapping. Memory is instead poisoned and + * zeroed by kasan_unpoison_vmalloc(). 
+ */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; - /* Mark the pages as accessible, now that they are mapped. */ - area->addr = kasan_unpoison_vmalloc(area->addr, real_size); + /* + * Mark the pages as accessible, now that they are mapped. + * The init condition should match the one in post_alloc_hook() + * (except for the should_skip_init() check) to make sure that memory + * is initialized under the same conditions regardless of the enabled + * KASAN mode. + */ + kasan_flags = KASAN_VMALLOC_VM_ALLOC; + if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + kasan_flags |= KASAN_VMALLOC_INIT; + area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -3830,10 +3856,13 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size); + vms[area]->size, + KASAN_VMALLOC_NONE); kfree(vas); return vms; From c523f9ee05e0776aea5e730810e6fc5164c3ffbd Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:00 +1100 Subject: [PATCH 061/334] kasan, vmalloc: only tag normal vmalloc allocations The kernel can use vmalloc to allocate executable memory. The only supported way to do that is via __vmalloc_node_range() with the executable bit set in the prot argument. (vmap() resets the bit via pgprot_nx()).
Once tag-based KASAN modes start tagging vmalloc allocations, executing code from such allocations will lead to the PC register getting a tag, which is not tolerated by the kernel. Only tag the allocations for normal kernel pages. Link: https://lkml.kernel.org/r/fbfd9939a4dc375923c9a5c6b9e7ab05c26b8c6b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kasan.h | 7 ++++--- mm/kasan/hw_tags.c | 7 +++++++ mm/kasan/shadow.c | 7 +++++++ mm/vmalloc.c | 49 +++++++++++++++++++++++++------------------ 4 files changed, 47 insertions(+), 23 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 499f1573dba4c..3593c95d1fa54 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -27,9 +27,10 @@ struct kunit_kasan_expectation { typedef unsigned int __bitwise kasan_vmalloc_flags_t; -#define KASAN_VMALLOC_NONE 0x00u -#define KASAN_VMALLOC_INIT 0x01u -#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_PROT_NORMAL 0x04u #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 21104fd518727..2e9378a4f07f1 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -247,6 +247,13 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, if (!(flags & KASAN_VMALLOC_VM_ALLOC)) return (void *)start; + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. 
+ */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + tag = kasan_random_tag(); start = set_tag(start, tag); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index b958babc8feda..7272e248db87d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -488,6 +488,13 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, if (!is_vmalloc_or_module_addr(start)) return (void *)start; + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + start = set_tag(start, kasan_random_tag()); kasan_unpoison(start, size, false); return (void *)start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dcdf815576b3..375b53fd939f1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2221,7 +2221,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ - mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_NONE); + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } @@ -2460,7 +2460,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, - KASAN_VMALLOC_NONE); + KASAN_VMALLOC_PROT_NORMAL); return area; } @@ -3071,7 +3071,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, { struct vm_struct *area; void *ret; - kasan_vmalloc_flags_t kasan_flags; + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3124,21 +3124,28 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - /* Prepare arguments for __vmalloc_area_node(). 
*/ - if (kasan_hw_tags_enabled() && - pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - /* - * Modify protection bits to allow tagging. - * This must be done before mapping in __vmalloc_area_node(). - */ - prot = arch_vmap_pgprot_tagged(prot); + /* + * Prepare arguments for __vmalloc_area_node() and + * kasan_unpoison_vmalloc(). + */ + if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + if (kasan_hw_tags_enabled()) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping. + */ + prot = arch_vmap_pgprot_tagged(prot); - /* - * Skip page_alloc poisoning and zeroing for physical pages - * backing VM_ALLOC mapping. Memory is instead poisoned and - * zeroed by kasan_unpoison_vmalloc(). - */ - gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + /* + * Skip page_alloc poisoning and zeroing for physical + * pages backing VM_ALLOC mapping. Memory is instead + * poisoned and zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + + /* Take note that the mapping is PAGE_KERNEL. */ + kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ @@ -3152,10 +3159,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, * (except for the should_skip_init() check) to make sure that memory * is initialized under the same conditions regardless of the enabled * KASAN mode. + * Tag-based KASAN modes only assign tags to normal non-executable + * allocations, see __kasan_unpoison_vmalloc(). */ - kasan_flags = KASAN_VMALLOC_VM_ALLOC; + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) kasan_flags |= KASAN_VMALLOC_INIT; + /* KASAN_VMALLOC_PROT_NORMAL already set if required. 
*/ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* @@ -3861,8 +3871,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size, - KASAN_VMALLOC_NONE); + vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; From 2ed5d97c9c567aca3615dab784e79546afca9ade Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:00 +1100 Subject: [PATCH 062/334] kasan, arm64: don't tag executable vmalloc allocations Besides asking vmalloc memory to be executable via the prot argument of __vmalloc_node_range() (see the previous patch), the kernel can skip that bit and instead mark memory as executable via set_memory_x(). Once tag-based KASAN modes start tagging vmalloc allocations, executing code from such allocations will lead to the PC register getting a tag, which is not tolerated by the kernel. Generic kernel code typically allocates memory via module_alloc() if it intends to mark memory as executable. (On arm64 module_alloc() uses __vmalloc_node_range() without setting the executable bit). Thus, reset pointer tags of pointers returned from module_alloc(). However, on arm64 there's an exception: the eBPF subsystem. Instead of using module_alloc(), it uses vmalloc() (via bpf_jit_alloc_exec()) to allocate its JIT region. Thus, reset pointer tags of pointers returned from bpf_jit_alloc_exec(). Resetting tags for these pointers results in untagged pointers being passed to set_memory_x(). This causes conflicts in arithmetic checks in change_memory_common(), as vm_struct->addr pointer returned by find_vm_area() is tagged. Reset pointer tag of find_vm_area(addr)->addr in change_memory_common(). 
Link: https://lkml.kernel.org/r/b7b2595423340cd7d76b770e5d519acf3b72f0ab.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kernel/module.c | 3 ++- arch/arm64/mm/pageattr.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index d3a1fa8183487..f2d4bb14bfabe 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -63,7 +63,8 @@ void *module_alloc(unsigned long size) return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(p); } enum aarch64_reloc_op { diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index a3bacd79507a4..64e985eaa52d8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages, */ area = find_vm_area((void *)addr); if (!area || - end > (unsigned long)area->addr + area->size || + end > (unsigned long)kasan_reset_tag(area->addr) + area->size || !(area->flags & VM_ALLOC)) return -EINVAL; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e96d4d87291f3..2198af06ae6a7 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1150,7 +1150,8 @@ u64 bpf_jit_alloc_exec_limit(void) void *bpf_jit_alloc_exec(unsigned long size) { - return vmalloc(size); + /* Memory is intended to be executable, reset the pointer tag. 
*/ + return kasan_reset_tag(vmalloc(size)); } void bpf_jit_free_exec(void *addr) From 5af65642ff149402099b85c7e9d5dc3a5296ce03 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:00 +1100 Subject: [PATCH 063/334] kasan: mark kasan_arg_stacktrace as __initdata As kasan_arg_stacktrace is only used in __init functions, mark it as __initdata instead of __ro_after_init to allow it to be freed after boot. The other enums for KASAN args are used in kasan_init_hw_tags_cpu(), which is not marked as __init as a CPU can be hot-plugged after boot. Clarify this in a comment. Link: https://lkml.kernel.org/r/7fa090865614f8e0c6c1265508efb1d429afaa50.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2e9378a4f07f1..6509809dd5d8c 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -40,7 +40,7 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* Whether KASAN is enabled at all. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); @@ -116,7 +116,10 @@ static inline const char *kasan_mode_info(void) return "sync"; } -/* kasan_init_hw_tags_cpu() is called for each CPU. */ +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot.
+ */ void kasan_init_hw_tags_cpu(void) { /* From 6f77c2ebf0484bec902606701e7f0278e3dc7616 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:01 +1100 Subject: [PATCH 064/334] kasan: clean up feature flags for HW_TAGS mode - Untie kasan_init_hw_tags() code from the default values of kasan_arg_mode and kasan_arg_stacktrace. - Move static_branch_enable(&kasan_flag_enabled) to the end of kasan_init_hw_tags_cpu(). - Remove excessive comments in kasan_arg_mode switch. - Add new comments. Link: https://lkml.kernel.org/r/76ebb340265be57a218564a497e1f52ff36a3879.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 38 +++++++++++++++++++++----------------- mm/kasan/kasan.h | 2 +- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 6509809dd5d8c..6a3146d1ccc55 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -42,16 +42,22 @@ static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; -/* Whether KASAN is enabled at all. */ +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); -/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/ +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to collect alloc/free stack traces. 
*/ -DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); /* kasan=off/on */ static int __init early_kasan_flag(char *arg) @@ -127,7 +133,11 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled via command line, don't initialize it. */ + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ if (kasan_arg == KASAN_ARG_OFF) return; @@ -154,42 +164,36 @@ void __init kasan_init_hw_tags(void) if (kasan_arg == KASAN_ARG_OFF) return; - /* Enable KASAN. */ - static_branch_enable(&kasan_flag_enabled); - switch (kasan_arg_mode) { case KASAN_ARG_MODE_DEFAULT: - /* - * Default to sync mode. - */ - fallthrough; + /* Default is specified by kasan_mode definition. */ + break; case KASAN_ARG_MODE_SYNC: - /* Sync mode enabled. */ kasan_mode = KASAN_MODE_SYNC; break; case KASAN_ARG_MODE_ASYNC: - /* Async mode enabled. */ kasan_mode = KASAN_MODE_ASYNC; break; case KASAN_ARG_MODE_ASYMM: - /* Asymm mode enabled. */ kasan_mode = KASAN_MODE_ASYMM; break; } switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default to enabling stack trace collection. */ - static_branch_enable(&kasan_flag_stacktrace); + /* Default is specified by kasan_flag_stacktrace definition. */ break; case KASAN_ARG_STACKTRACE_OFF: - /* Do nothing, kasan_flag_stacktrace keeps its default value. */ + static_branch_disable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); break; } + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", kasan_mode_info(), kasan_stack_collection_enabled() ? 
"on" : "off"); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 020f3e57a03f5..efda13a9ce6ad 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,7 +12,7 @@ #include #include "../slab.h" -DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, From a577812cc348baacebeb711b1f673aa70457fd47 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:01 +1100 Subject: [PATCH 065/334] kasan: add kasan.vmalloc command line flag Allow disabling vmalloc() tagging for HW_TAGS KASAN via a kasan.vmalloc command line switch. This is a fail-safe switch intended for production systems that enable HW_TAGS KASAN. In case vmalloc() tagging ends up having an issue not detected during testing but that manifests in production, kasan.vmalloc allows to turn vmalloc() tagging off while leaving page_alloc/slab tagging on. Link: https://lkml.kernel.org/r/904f6d4dfa94870cc5fc2660809e093fd0d27c3b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/hw_tags.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- mm/kasan/kasan.h | 6 ++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 6a3146d1ccc55..fad1887e54c05 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -32,6 +32,12 @@ enum kasan_arg_mode { KASAN_ARG_MODE_ASYMM, }; +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -40,6 +46,7 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum 
kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* @@ -56,6 +63,9 @@ EXPORT_SYMBOL(kasan_flag_enabled); enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); +/* Whether to enable vmalloc tagging. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); @@ -95,6 +105,23 @@ static int __init early_kasan_mode(char *arg) } early_param("kasan.mode", early_kasan_mode); +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -179,6 +206,18 @@ void __init kasan_init_hw_tags(void) break; } + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ + break; + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); + break; + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: /* Default is specified by kasan_flag_stacktrace definition. */ @@ -194,8 +233,9 @@ void __init kasan_init_hw_tags(void) /* KASAN is now initialized, enable it. */ static_branch_enable(&kasan_flag_enabled); - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", kasan_stack_collection_enabled() ? 
"on" : "off"); } @@ -228,6 +268,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, u8 tag; unsigned long redzone_start, redzone_size; + if (!kasan_vmalloc_enabled()) + return (void *)start; + if (!is_vmalloc_or_module_addr(start)) return (void *)start; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index efda13a9ce6ad..4d67408e84076 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,6 +12,7 @@ #include #include "../slab.h" +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { @@ -22,6 +23,11 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); From 4bb57b1d1b1c7ef5c42a55114e16180391437cdf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:01 +1100 Subject: [PATCH 066/334] kasan: allow enabling KASAN_VMALLOC and SW/HW_TAGS Allow enabling CONFIG_KASAN_VMALLOC with SW_TAGS and HW_TAGS KASAN modes. Also adjust CONFIG_KASAN_VMALLOC description: - Mention HW_TAGS support. - Remove unneeded internal details: they have no place in Kconfig description and are already explained in the documentation. 
Link: https://lkml.kernel.org/r/bfa0fdedfe25f65e5caa4e410f074ddbac7a0b59.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.kasan | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 879757b6dd149..1f3e620188a24 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -178,17 +178,17 @@ config KASAN_TAGS_IDENTIFY memory consumption. config KASAN_VMALLOC - bool "Back mappings in vmalloc space with real shadow memory" - depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC + bool "Check accesses to vmalloc allocations" + depends on HAVE_ARCH_KASAN_VMALLOC help - By default, the shadow region for vmalloc space is the read-only - zero page. This means that KASAN cannot detect errors involving - vmalloc space. - - Enabling this option will hook in to vmap/vmalloc and back those - mappings with real shadow memory allocated on demand. This allows - for KASAN to detect more sorts of errors (and to support vmapped - stacks), but at the cost of higher memory usage. + This mode makes KASAN check accesses to vmalloc allocations for + validity. + + With software KASAN modes, checking is done for all types of vmalloc + allocations. Enabling this option leads to higher memory usage. + + With hardware tag-based KASAN, only VM_ALLOC mappings are checked. + There is no additional memory usage. 
config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS From 6f208425249ae910d7b515222c11579d710f3677 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:01 +1100 Subject: [PATCH 067/334] arm64: select KASAN_VMALLOC for SW/HW_TAGS modes Generic KASAN already selects KASAN_VMALLOC to allow VMAP_STACK to be selected unconditionally, see commit acc3042d62cb9 ("arm64: Kconfig: select KASAN_VMALLOC if KANSAN_GENERIC is enabled"). The same change is needed for SW_TAGS KASAN. HW_TAGS KASAN does not require enabling KASAN_VMALLOC for VMAP_STACK; they already work together as is. Still, selecting KASAN_VMALLOC makes sense to make vmalloc() always protected. In case any bugs in KASAN's vmalloc() support are discovered, the command line kasan.vmalloc flag can be used to disable vmalloc() checking. Select KASAN_VMALLOC for all KASAN modes for arm64. Link: https://lkml.kernel.org/r/99d6b3ebf57fc1930ff71f9a4a71eea19881b270.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Catalin Marinas Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 09b885cc4db53..7cbab9fac9ffd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -205,7 +205,7 @@ config ARM64 select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOC if KASAN_GENERIC + select KASAN_VMALLOC if KASAN select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH From c455e19374a81e37763577ccbb6dce4e56f37cea Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:01 +1100 Subject: [PATCH 068/334]
kasan: documentation updates Update KASAN documentation: - Bump Clang version requirement for HW_TAGS as ARM64_MTE depends on AS_HAS_LSE_ATOMICS as of commit 2decad92f4731 ("arm64: mte: Ensure TIF_MTE_ASYNC_FAULT is set atomically"), which requires Clang 12. - Add description of the new kasan.vmalloc command line flag. - Mention that SW_TAGS and HW_TAGS modes now support vmalloc tagging. - Explicitly say that the "Shadow memory" section is only applicable to software KASAN modes. - Mention that shadow-based KASAN_VMALLOC is supported on arm64. Link: https://lkml.kernel.org/r/a61189128fa3f9fbcfd9884ff653d401864b8e74.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/dev-tools/kasan.rst | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 8089c559d339c..7614a1fc30fac 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -30,7 +30,7 @@ Software tag-based KASAN mode is only supported in Clang. The hardware KASAN mode (#3) relies on hardware to perform the checks but still requires a compiler version that supports memory tagging instructions. -This mode is supported in GCC 10+ and Clang 11+. +This mode is supported in GCC 10+ and Clang 12+. Both software KASAN modes work with SLUB and SLAB memory allocators, while the hardware tag-based KASAN currently only supports SLUB. @@ -206,6 +206,9 @@ additional boot parameters that allow disabling KASAN or controlling features: Asymmetric mode: a bad access is detected synchronously on reads and asynchronously on writes. 
+- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc + allocations (default: ``on``). + - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). @@ -279,8 +282,8 @@ Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Software tag-based KASAN currently only supports tagging of slab, page_alloc, +and vmalloc memory. Hardware tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,8 +306,8 @@ Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Hardware tag-based KASAN currently only supports tagging of slab, page_alloc, +and VM_ALLOC-based vmalloc memory. If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN will not be enabled. In this case, all KASAN boot parameters are ignored. @@ -319,6 +322,8 @@ checking gets disabled. Shadow memory ------------- +The contents of this section are only applicable to software KASAN modes. + The kernel maps memory in several different parts of the address space. The range of kernel virtual addresses is large: there is not enough real memory to support a real shadow region for every address that could be @@ -349,7 +354,7 @@ CONFIG_KASAN_VMALLOC With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the cost of greater memory usage. Currently, this is supported on x86, -riscv, s390, and powerpc. +arm64, riscv, s390, and powerpc. This works by hooking into vmalloc and vmap and dynamically allocating real shadow memory to back the mappings. 
From dcee870a49c2d587c4650d6f673a1247e8942e75 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:02 +1100 Subject: [PATCH 069/334] kasan: improve vmalloc tests Update the existing vmalloc_oob() test to account for the specifics of the tag-based modes. Also add a few new checks and comments. Add new vmalloc-related tests: - vmalloc_helpers_tags() to check that exported vmalloc helpers can handle tagged pointers. - vmap_tags() to check that SW_TAGS mode properly tags vmap() mappings. - vm_map_ram_tags() to check that SW_TAGS mode properly tags vm_map_ram() mappings. - vmalloc_percpu() to check that SW_TAGS mode tags regions allocated for __alloc_percpu(). The tagging of per-cpu mappings is best-effort; proper tagging is tracked in [1]. [1] https://bugzilla.kernel.org/show_bug.cgi?id=215019 Link: https://lkml.kernel.org/r/bbdc1c0501c5275e7f26fdb8e2a7b14a40a9f36b.1643047180.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Mark Rutland Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 189 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 183 insertions(+), 6 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 3b413f8c8a715..366d7aae4b38f 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -1057,21 +1058,181 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +static void vmalloc_helpers_tags(struct kunit *test) +{ + void *ptr; + int rv; + + /* This test is intended for tag-based modes. 
*/ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. */ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + + vfree(ptr); +} + static void vmalloc_oob(struct kunit *test) { - void *area; + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + /* - * We have to be careful not to hit the guard page. + * We have to be careful not to hit the guard page in vmalloc tests. * The MMU will catch that and crash us. */ - area = vmalloc(3000); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); - vfree(area); + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. 
*/ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. + */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + size_t order = 1; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, order); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1 << order, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, order); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + size_t order = 1; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. 
+ */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, order); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1 << order, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1 << order); + free_pages((unsigned long)p_ptr, order); +} + +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. */ + *c_ptr = 0; + } + + free_percpu(ptr); } /* @@ -1105,6 +1266,18 @@ static void match_all_not_assigned(struct kunit *test) KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); free_pages((unsigned long)ptr, order); } + + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = (get_random_int() % 1024) + 1; + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } } /* Check that 0xff works as a match-all pointer tag for tag-based modes. 
*/ @@ -1210,7 +1383,11 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), From 074d139b7068925cc4cc1b0abce27a31975934ab Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 16 Feb 2022 15:31:02 +1100 Subject: [PATCH 070/334] similar to "kasan: test: fix compatibility with FORTIFY_SOURCE" Link: https://lkml.kernel.org/r/20220128144801.73f5ced0@canb.auug.org.au Signed-off-by: Stephen Rothwell Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 366d7aae4b38f..4311cf5319557 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -1099,6 +1099,8 @@ static void vmalloc_oob(struct kunit *test) v_ptr = vmalloc(size); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + OPTIMIZER_HIDE_VAR(v_ptr); + /* * We have to be careful not to hit the guard page in vmalloc tests. * The MMU will catch that and crash us. From 87c84771ef07c2fbd396168fa49c195411a08ee7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 16 Feb 2022 15:31:02 +1100 Subject: [PATCH 071/334] fix for "kasan: improve vmalloc tests" vmap_tags() and vm_map_ram_tags() pass invalid page array size to vm_map_ram() and vm_unmap_ram(). It's supposed to be 1, but it's 1 << order == 2 currently. Remove order variable (it can only be 0 with the current code) and hardcode the number of pages in these tests. 
Link: https://lkml.kernel.org/r/865c91ba49b90623ab50c7526b79ccb955f544f0.1644950160.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_kasan.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 4311cf5319557..32770e225d345 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -1140,7 +1140,6 @@ static void vmap_tags(struct kunit *test) { char *p_ptr, *v_ptr; struct page *p_page, *v_page; - size_t order = 1; /* * This test is specifically crafted for the software tag-based mode, @@ -1150,12 +1149,12 @@ static void vmap_tags(struct kunit *test) KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); - p_page = alloc_pages(GFP_KERNEL, order); + p_page = alloc_pages(GFP_KERNEL, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); p_ptr = page_address(p_page); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); - v_ptr = vmap(&p_page, 1 << order, VM_MAP, PAGE_KERNEL); + v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); /* @@ -1177,14 +1176,13 @@ static void vmap_tags(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); vunmap(v_ptr); - free_pages((unsigned long)p_ptr, order); + free_pages((unsigned long)p_ptr, 1); } static void vm_map_ram_tags(struct kunit *test) { char *p_ptr, *v_ptr; struct page *page; - size_t order = 1; /* * This test is specifically crafted for the software tag-based mode, @@ -1192,12 +1190,12 @@ static void vm_map_ram_tags(struct kunit *test) */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); - page = alloc_pages(GFP_KERNEL, order); + page = alloc_pages(GFP_KERNEL, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); p_ptr = page_address(page); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); - v_ptr = vm_map_ram(&page, 1 << order, -1); + v_ptr = vm_map_ram(&page, 1, -1); 
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); @@ -1207,8 +1205,8 @@ static void vm_map_ram_tags(struct kunit *test) *p_ptr = 0; *v_ptr = 0; - vm_unmap_ram(v_ptr, 1 << order); - free_pages((unsigned long)p_ptr, order); + vm_unmap_ram(v_ptr, 1); + free_pages((unsigned long)p_ptr, 1); } static void vmalloc_percpu(struct kunit *test) From 76e36e956a142836b058047b1c9153f3c5dd6714 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:02 +1100 Subject: [PATCH 072/334] mm/memremap: avoid calling kasan_remove_zero_shadow() for device private memory For device private memory, we do not create a linear mapping for the memory because the device memory is un-accessible. Thus we do not add kasan zero shadow for it. So it's unnecessary to do kasan_remove_zero_shadow() for it. Link: https://lkml.kernel.org/r/20220126092602.1425-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memremap.c b/mm/memremap.c index d2a72cf2ff831..d9e05952fff66 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -302,7 +302,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, return 0; err_add_memory: - kasan_remove_zero_shadow(__va(range->start), range_len(range)); + if (!is_private) + kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); err_pfn_remap: From 295cda1765b5ccf92a0a35f4fae40989de8ce9cf Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Wed, 16 Feb 2022 15:31:02 +1100 Subject: [PATCH 073/334] tools/vm/page_owner_sort.c: sort by stacktrace before culling The contents of page_owner have changed to include more information than the stack trace. 
On a modern kernel, the blocks look like Page allocated via order 0, mask 0x0(), pid 1, ts 165564237 ns, free_ts 0 ns register_early_stack+0x4b/0x90 init_page_owner+0x39/0x250 kernel_init_freeable+0x11e/0x242 kernel_init+0x16/0x130 Sorting by the contents of .txt will result in almost no repeated pages, as the pid, ts, and free_ts will almost never be the same. Instead, sort by the contents of the stack trace, which we assume to be whatever is after the first line. Link: https://lkml.kernel.org/r/20211124193709.1805776-1-seanga2@gmail.com Signed-off-by: Sean Anderson Cc: Changhee Han Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 9ebb84a9c7310..9ad3772a294dc 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -23,6 +23,7 @@ struct block_list { char *txt; + char *stacktrace; int len; int num; int page_num; @@ -51,11 +52,11 @@ int read_block(char *buf, int buf_size, FILE *fin) return -1; /* EOF or no space left in buf. 
*/ } -static int compare_txt(const void *p1, const void *p2) +static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return strcmp(l1->txt, l2->txt); + return strcmp(l1->stacktrace ?: "", l2->stacktrace ?: ""); } static int compare_num(const void *p1, const void *p2) @@ -121,6 +122,7 @@ static void add_list(char *buf, int len) list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; + list[list_size].stacktrace = strchr(list[list_size].txt, '\n'); list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -199,7 +201,7 @@ int main(int argc, char **argv) printf("sorting ....\n"); - qsort(list, list_size, sizeof(list[0]), compare_txt); + qsort(list, list_size, sizeof(list[0]), compare_stacktrace); list2 = malloc(sizeof(*list) * list_size); if (!list2) { @@ -211,7 +213,7 @@ int main(int argc, char **argv) for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(list2[count-1].txt, list[i].txt) != 0) { + strcmp(list2[count-1].stacktrace, list[i].stacktrace) != 0) { list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; From 35b3896abf6c8b7744b82c584286df1866a2aabf Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Wed, 16 Feb 2022 15:31:03 +1100 Subject: [PATCH 074/334] tools/vm/page_owner_sort.c: fix NULL-pointer dereference when comparing stack traces If there is no newline in a block, then strchr returns NULL. We check for this in compare_stacktrace, but not when culling. Fix this (and any future bugs like it) by replacing NULL stack traces with "" in add_list.
Link: https://lkml.kernel.org/r/20211125162653.1855958-1-seanga2@gmail.com Fixes: d0abbab9e9e9 ("tools/vm/page_owner_sort.c: sort by stacktrace before culling") Signed-off-by: Sean Anderson Cc: Changhee Han Cc: Zhenliang Wei Cc: Zhang Shengju Cc: Tang Bin Cc: Sean Anderson Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 9ad3772a294dc..5582d8454d3bd 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -56,7 +56,7 @@ static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return strcmp(l1->stacktrace ?: "", l2->stacktrace ?: ""); + return strcmp(l1->stacktrace, l2->stacktrace); } static int compare_num(const void *p1, const void *p2) @@ -122,7 +122,7 @@ static void add_list(char *buf, int len) list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; - list[list_size].stacktrace = strchr(list[list_size].txt, '\n'); + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); From 24fe67fc67ae7e543e82837144f31950a7e26341 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Wed, 16 Feb 2022 15:31:03 +1100 Subject: [PATCH 075/334] tools/vm/page_owner_sort.c: support sorting by stack trace This adds the ability to sort by stacktraces. This is helpful when comparing multiple dumps of page_owner taken at different times, since blocks will not be reordered if they were allocated/free'd. 
Link: https://lkml.kernel.org/r/20211124193709.1805776-2-seanga2@gmail.com Signed-off-by: Sean Anderson Cc: Zhenliang Wei Cc: Changhee Han Cc: Tang Bin Cc: Zhang Shengju Cc: Stephen Rothwell Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 5582d8454d3bd..1b2acf02d3cd6 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -29,7 +29,6 @@ struct block_list { int page_num; }; -static int sort_by_memory; static regex_t order_pattern; static struct block_list *list; static int list_size; @@ -134,13 +133,16 @@ static void add_list(char *buf, int len) static void usage(void) { - printf("Usage: ./page_owner_sort [-m] \n" - "-m Sort by total memory. If this option is unset, sort by times\n" + printf("Usage: ./page_owner_sort [OPTIONS] \n" + "-m Sort by total memory.\n" + "-s Sort by the stack trace.\n" + "-t Sort by times (default).\n" ); } int main(int argc, char **argv) { + int (*cmp)(const void *, const void *) = compare_num; FILE *fin, *fout; char *buf; int ret, i, count; @@ -149,10 +151,16 @@ int main(int argc, char **argv) int err; int opt; - while ((opt = getopt(argc, argv, "m")) != -1) + while ((opt = getopt(argc, argv, "mst")) != -1) switch (opt) { case 'm': - sort_by_memory = 1; + cmp = compare_page_num; + break; + case 's': + cmp = compare_stacktrace; + break; + case 't': + cmp = compare_num; break; default: usage(); @@ -221,10 +229,7 @@ int main(int argc, char **argv) } } - if (sort_by_memory) - qsort(list2, count, sizeof(list[0]), compare_page_num); - else - qsort(list2, count, sizeof(list[0]), compare_num); + qsort(list2, count, sizeof(list[0]), cmp); for (i = 0; i < count; i++) fprintf(fout, "%d times, %d pages:\n%s\n", From e68cfb903955010e0afc0cef75aceeb2d8c67eb8 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Wed, 16 Feb 2022 
15:31:03 +1100 Subject: [PATCH 076/334] tools/vm/page_owner_sort.c: add switch between culling by stacktrace and txt Culling by comparing stacktrace would cause loss of some information. For example, if there exist 2 blocks which have the same stacktrace but different head info Page allocated via order 0, mask 0x108c48(...), pid 73696, ts 1578829190639010 ns, free_ts 1576583851324450 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 Page allocated via order 0, mask 0x108c48(...), pid 61806, ts 1354113726046100 ns, free_ts 1354104926841400 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 After culling, it would be like this 2 times, 2 pages: Page allocated via order 0, mask 0x108c48(...), pid 73696, ts 1578829190639010 ns, free_ts 1576583851324450 ns prep_new_page+0x80/0xb8 get_page_from_freelist+0x924/0xee8 __alloc_pages+0x138/0xc18 alloc_pages+0x80/0xf0 __page_cache_alloc+0x90/0xc8 The info of the second block is lost. So, add -c to turn on culling by stacktrace. By default, it will cull by txt. Link: https://lkml.kernel.org/r/20211129145658.2491-1-zhangyinan2019@email.szu.edu.cn Signed-off-by: Yinan Zhang Cc: Changhee Han Cc: Sean Anderson Cc: Stephen Rothwell Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 1b2acf02d3cd6..492be7f752c04 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -51,6 +51,13 @@ int read_block(char *buf, int buf_size, FILE *fin) return -1; /* EOF or no space left in buf.
*/ } +static int compare_txt(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->txt, l2->txt); +} + static int compare_stacktrace(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -137,12 +144,14 @@ static void usage(void) "-m Sort by total memory.\n" "-s Sort by the stack trace.\n" "-t Sort by times (default).\n" + "-c cull by comparing stacktrace instead of total block.\n" ); } int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; + int cull_st = 0; FILE *fin, *fout; char *buf; int ret, i, count; @@ -151,7 +160,7 @@ int main(int argc, char **argv) int err; int opt; - while ((opt = getopt(argc, argv, "mst")) != -1) + while ((opt = getopt(argc, argv, "mstc")) != -1) switch (opt) { case 'm': cmp = compare_page_num; @@ -162,6 +171,9 @@ int main(int argc, char **argv) case 't': cmp = compare_num; break; + case 'c': + cull_st = 1; + break; default: usage(); exit(1); @@ -209,7 +221,10 @@ int main(int argc, char **argv) printf("sorting ....\n"); - qsort(list, list_size, sizeof(list[0]), compare_stacktrace); + if (cull_st == 1) + qsort(list, list_size, sizeof(list[0]), compare_stacktrace); + else + qsort(list, list_size, sizeof(list[0]), compare_txt); list2 = malloc(sizeof(*list) * list_size); if (!list2) { @@ -219,9 +234,11 @@ int main(int argc, char **argv) printf("culling\n"); + long offset = cull_st ? 
&list[0].stacktrace - &list[0].txt : 0; + for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(list2[count-1].stacktrace, list[i].stacktrace) != 0) { + strcmp(*(&list2[count-1].txt+offset), *(&list[i].txt+offset)) != 0) { list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; From b085550fd2fcb06c394ae6effb094646b5071e21 Mon Sep 17 00:00:00 2001 From: Chongxi Zhao Date: Wed, 16 Feb 2022 15:31:03 +1100 Subject: [PATCH 077/334] tools/vm/page_owner_sort.c: support sorting pid and time When viewing the page owner information, we expect that the information can be sorted by PID, so that we can quickly combine PID with the program to check the information together. We also expect that the information can be sorted by time. Time sorting helps to view the running status of the program according to the time interval when the program hangs up. Finally, we hope that page_owner_sort.c can reduce part of the output and only output the information of blocks whose memory has not been released, which lets us locate the problem in the program faster. Therefore, the following adjustments have been made: 1. Add the static functions search_pattern and check_regcomp to improve the cleanliness. 2. Add member attributes and their corresponding sorting methods. When comparing timestamps, the difference of two unsigned 64-bit values would overflow an int, so the ternary operator is used instead. 3.
Add the -f parameter to filter out the information of blocks whose memory has not been released Link: https://lkml.kernel.org/r/20211206165653.5093-1-zhaochongxi2019@email.szu.edu.cn Signed-off-by: Chongxi Zhao Reviewed-by: Sean Anderson Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 177 +++++++++++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 29 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 492be7f752c04..c9fedc1806d50 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -20,6 +20,7 @@ #include #include #include +#include struct block_list { char *txt; @@ -27,9 +28,15 @@ struct block_list { int len; int num; int page_num; + pid_t pid; + __u64 ts_nsec; + __u64 free_ts_nsec; }; static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; @@ -79,34 +86,124 @@ static int compare_page_num(const void *p1, const void *p2) return l2->page_num - l1->page_num; } -static int get_page_num(char *buf) +static int compare_pid(const void *p1, const void *p2) { - int err, val_len, order_val; - char order_str[4] = {0}; - char *endptr; + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? 
-1 : 1; +} + +static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) +{ + int err, val_len; regmatch_t pmatch[2]; - err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL); + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); if (err != 0 || pmatch[1].rm_so == -1) { - printf("no order pattern in %s\n", buf); - return 0; + printf("no matching pattern in %s\n", buf); + return -1; } val_len = pmatch[1].rm_eo - pmatch[1].rm_so; - if (val_len > 2) /* max_order should not exceed 2 digits */ - goto wrong_order; - memcpy(order_str, buf + pmatch[1].rm_so, val_len); + memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); + + return 0; +} + +static void check_regcomp(regex_t *pattern, const char *regex) +{ + int err; + + err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); + if (err != 0 || pattern->re_nsub != 1) { + printf("Invalid pattern %s code %d\n", regex, err); + exit(1); + } +} + +# define FIELD_BUFF 25 + +static int get_page_num(char *buf) +{ + int order_val; + char order_str[FIELD_BUFF] = {0}; + char *endptr; + search_pattern(&order_pattern, order_str, buf); errno = 0; order_val = strtol(order_str, &endptr, 10); - if (errno != 0 || endptr == order_str || *endptr != '\0') - goto wrong_order; + if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { + printf("wrong order in follow buf:\n%s\n", buf); + return 0; + } return 1 << order_val; +} -wrong_order: - printf("wrong order in follow buf:\n%s\n", buf); - return 0; +static pid_t get_pid(char *buf) +{ + pid_t pid; + char pid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&pid_pattern, pid_str, buf); + errno = 0; + pid = strtol(pid_str, &endptr, 10); + if (errno != 0 || endptr == pid_str || *endptr != '\0') { + printf("wrong/invalid pid in follow buf:\n%s\n", buf); + return -1; + } + + return pid; + +} + +static __u64 get_ts_nsec(char *buf) +{ + __u64 ts_nsec; + char ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&ts_nsec_pattern, 
ts_nsec_str, buf); + errno = 0; + ts_nsec = strtoull(ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { + printf("wrong ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return ts_nsec; +} + +static __u64 get_free_ts_nsec(char *buf) +{ + __u64 free_ts_nsec; + char free_ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); + errno = 0; + free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { + printf("wrong free_ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return free_ts_nsec; } static void add_list(char *buf, int len) @@ -129,6 +226,11 @@ static void add_list(char *buf, int len) memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + list[list_size].pid = get_pid(buf); + list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -144,6 +246,9 @@ static void usage(void) "-m Sort by total memory.\n" "-s Sort by the stack trace.\n" "-t Sort by times (default).\n" + "-p Sort by pid.\n" + "-a Sort by memory allocate time.\n" + "-r Sort by memory release time.\n" "-c cull by comparing stacktrace instead of total block.\n" ); } @@ -152,28 +257,40 @@ int main(int argc, char **argv) { int (*cmp)(const void *, const void *) = compare_num; int cull_st = 0; + int filter = 0; FILE *fin, *fout; char *buf; int ret, i, count; struct block_list *list2; struct stat st; - int err; int opt; - while ((opt = getopt(argc, argv, "mstc")) != -1) + while ((opt = getopt(argc, argv, "acfmprst")) != -1) switch (opt) { + case 'a': + cmp = compare_ts; + break; + case 'c': + cull_st = 1; + break; + case 'f': + filter = 1; + break; case 
'm': cmp = compare_page_num; break; + case 'p': + cmp = compare_pid; + break; + case 'r': + cmp = compare_free_ts; + break; case 's': cmp = compare_stacktrace; break; case 't': cmp = compare_num; break; - case 'c': - cull_st = 1; - break; default: usage(); exit(1); @@ -192,13 +309,10 @@ int main(int argc, char **argv) exit(1); } - err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE); - if (err != 0 || order_pattern.re_nsub != 1) { - printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n", - argv[0], err); - exit(1); - } - + check_regcomp(&order_pattern, "order\\s*([0-9]*),"); + check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); + check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); + check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... */ @@ -248,10 +362,15 @@ int main(int argc, char **argv) qsort(list2, count, sizeof(list[0]), cmp); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { + if (filter == 1 && list2[i].free_ts_nsec != 0) + continue; fprintf(fout, "%d times, %d pages:\n%s\n", list2[i].num, list2[i].page_num, list2[i].txt); - + } regfree(&order_pattern); + regfree(&pid_pattern); + regfree(&ts_nsec_pattern); + regfree(&free_ts_nsec_pattern); return 0; } From 5a537441f3527199e1bc62fa0c404650aef712f1 Mon Sep 17 00:00:00 2001 From: Shenghong Han Date: Wed, 16 Feb 2022 15:31:03 +1100 Subject: [PATCH 078/334] tools/vm/page_owner_sort.c: two trivial fixes 1) There is an unused variable. It's better to delete it. 2) One case is missing in the usage(). 
Link: https://lkml.kernel.org/r/20211213164518.2461-1-hanshenghong2019@email.szu.edu.cn Signed-off-by: Shenghong Han Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c9fedc1806d50..284a5070402c3 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -41,8 +41,6 @@ static struct block_list *list; static int list_size; static int max_size; -struct block_list *block_head; - int read_block(char *buf, int buf_size, FILE *fin) { char *curr = buf, *const buf_end = buf + buf_size; @@ -249,7 +247,8 @@ static void usage(void) "-p Sort by pid.\n" "-a Sort by memory allocate time.\n" "-r Sort by memory release time.\n" - "-c cull by comparing stacktrace instead of total block.\n" + "-c Cull by comparing stacktrace instead of total block.\n" + "-f Filter out the information of blocks whose memory has not been released.\n" ); } From 65bb012bfc9be41511c18f1a49bd37627cea1cdf Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Wed, 16 Feb 2022 15:31:04 +1100 Subject: [PATCH 079/334] tools/vm/page_owner_sort.c: delete invalid duplicate code I noticed that there is two invalid lines of duplicate code. It's better to delete it. 
Link: https://lkml.kernel.org/r/20211213095743.3630-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Cc: Mark Brown Cc: Sean Anderson Cc: Zhenliang Wei Cc: Tang Bin Cc: Yinan Zhang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 284a5070402c3..c8ec2d6b314de 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -227,8 +227,6 @@ static void add_list(char *buf, int len) list[list_size].pid = get_pid(buf); list[list_size].ts_nsec = get_ts_nsec(buf); list[list_size].free_ts_nsec = get_free_ts_nsec(buf); - memcpy(list[list_size].txt, buf, len); - list[list_size].txt[len] = 0; list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); From 15f5ea01f22e6b4bc8f8619148d90542c6b8709a Mon Sep 17 00:00:00 2001 From: Shenghong Han Date: Wed, 16 Feb 2022 15:31:04 +1100 Subject: [PATCH 080/334] Documentation/vm/page_owner.rst: update the documentation Update the documentation of ``page_owner``. Link: https://lkml.kernel.org/r/20211214134736.2569-1-hanshenghong2019@email.szu.edu.cn Signed-off-by: Shenghong Han Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Georgi Djakov Cc: Liam Mark Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/page_owner.rst | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 9837fc8147dd6..7a28e7b0d9c29 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -97,7 +97,7 @@ Usage The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times - and pages of buf, and finally sorts them according to the times. 
+ and pages of buf, and finally sorts them according to the parameter(s). See the result about who allocated each page in the ``sorted_page_owner.txt``. General output: @@ -108,3 +108,22 @@ Usage By default, ``page_owner_sort`` is sorted according to the times of buf. If you want to sort by the pages nums of buf, use the ``-m`` parameter. + The detail parameters are shown as follows: + + fundamental function: + + Sort: + -a Sort by memory allocate time. + -m Sort by total memory. + -p Sort by pid. + -r Sort by memory release time. + -s Sort by the stack trace. + -t Sort by times (default). + + additional function: + + Cull: + -c Cull by comparing stacktrace instead of total block. + + Filter: + -f Filter out the information of blocks whose memory has not been released. From d29dfe232fc47687454c97dee7fd1824dcb6493b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:04 +1100 Subject: [PATCH 081/334] documentation-vm-page_ownerrst-update-the-documentation-fix small grammatical tweaks Cc: Georgi Djakov Cc: Jonathan Corbet Cc: Liam Mark Cc: Shenghong Han Cc: Tang Bin Cc: Vlastimil Babka Cc: Xiaoming Ni Cc: Zhang Shengju Cc: Zhenliang Wei Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/page_owner.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 7a28e7b0d9c29..602cf6eefcb5f 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -107,17 +107,17 @@ Usage // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. - If you want to sort by the pages nums of buf, use the ``-m`` parameter. - The detail parameters are shown as follows: + If you want to sort by the page nums of buf, use the ``-m`` parameter. + The detailed parameters are: fundamental function: Sort: - -a Sort by memory allocate time. + -a Sort by memory allocation time. -m Sort by total memory. 
-p Sort by pid. -r Sort by memory release time. - -s Sort by the stack trace. + -s Sort by stack trace. -t Sort by times (default). additional function: From 38835684cec1d8260f3d137f54b10fe0f41237c3 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 16 Feb 2022 15:31:04 +1100 Subject: [PATCH 082/334] Documentation/vm/page_owner.rst: fix unexpected indentation warns Fix Unexpected indentation warns in page_owner: Documentation/vm/page_owner.rst:92: WARNING: Unexpected indentation. Documentation/vm/page_owner.rst:96: WARNING: Unexpected indentation. Documentation/vm/page_owner.rst:107: WARNING: Unexpected indentation. Link: https://lkml.kernel.org/r/20211215001929.47866-1-skhan@linuxfoundation.org Signed-off-by: Shuah Khan Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/page_owner.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 602cf6eefcb5f..2b54e82b9fe15 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -89,11 +89,11 @@ Usage Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times @@ -104,7 +104,7 @@ Usage XXX times, XXX pages: Page allocated via order XXX, ... - // Detailed stack + // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. If you want to sort by the page nums of buf, use the ``-m`` parameter. From c3a2a1288b7908f4e1d5e8b057f09292505cee31 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:31:05 +1100 Subject: [PATCH 083/334] lib/vsprintf: avoid redundant work with 0 size Patch series "mm/page_owner: Extend page_owner to show memcg information", v4. 
While debugging the constant increase in percpu memory consumption on a system that spawned a large number of containers, it was found that a lot of offline mem_cgroup structures remained in place without being freed. Further investigation indicated that those mem_cgroup structures were pinned by some pages. In order to find out what those pages are, the existing page_owner debugging tool is extended to show memory cgroup information and whether those memcgs are offline or not. With the enhanced page_owner tool, the following is a typical page that pinned the mem_cgroup structure in my test case: Page allocated via order 0, mask 0x1100cca(GFP_HIGHUSER_MOVABLE), pid 162970 (podman), ts 1097761405537 ns, free_ts 1097760838089 ns PFN 1925700 type Movable Block 3761 type Movable Flags 0x17ffffc00c001c(uptodate|dirty|lru|reclaim|swapbacked|node=0|zone=2|lastcpupid=0x1fffff) prep_new_page+0xac/0xe0 get_page_from_freelist+0x1327/0x14d0 __alloc_pages+0x191/0x340 alloc_pages_vma+0x84/0x250 shmem_alloc_page+0x3f/0x90 shmem_alloc_and_acct_page+0x76/0x1c0 shmem_getpage_gfp+0x281/0x940 shmem_write_begin+0x36/0xe0 generic_perform_write+0xed/0x1d0 __generic_file_write_iter+0xdc/0x1b0 generic_file_write_iter+0x5d/0xb0 new_sync_write+0x11f/0x1b0 vfs_write+0x1ba/0x2a0 ksys_write+0x59/0xd0 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Charged to offline memcg libpod-conmon-15e4f9c758422306b73b2dd99f9d50a5ea53cbb16b4a13a2c2308a4253cc0ec8. So the page was not freed because it was part of a shmem segment. That is useful information that can help users to diagnose similar problems. With cgroup v1, /proc/cgroups can be read to find out the total number of memory cgroups (online + offline). With cgroup v2, the cgroup.stat of the root cgroup can be read to find the number of dying cgroups (most likely pinned by dying memcgs). The page_owner feature is not supposed to be enabled on production systems due to its memory overhead.
However, if it is suspected that dying memcgs are increasing over time, a test environment with page_owner enabled can then be set up with appropriate workload for further analysis on what may be causing the increasing number of dying memcgs. This patch (of 4): For *scnprintf(), vsnprintf() is always called even if the input size is 0. That is a waste of time, so just return 0 in this case. Note that vsnprintf() will never return -1 to indicate an error. So skipping the call to vsnprintf() when size is 0 will have no functional impact at all. Link: https://lkml.kernel.org/r/20220202203036.744010-1-longman@redhat.com Link: https://lkml.kernel.org/r/20220202203036.744010-2-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Reviewed-by: Sergey Senozhatsky Acked-by: Roman Gushchin Acked-by: Rafael Aquini Acked-by: Mike Rapoport Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Petr Mladek Cc: Steven Rostedt (Google) Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Ira Weiny Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/vsprintf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3b8129dd374cd..d419154b47bb8 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2895,13 +2895,15 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; + if (unlikely(!size)) + return 0; + i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; - if (size != 0) - return size - 1; - return 0; + + return size - 1; } EXPORT_SYMBOL(vscnprintf); From ed136879caea2abd841d1caaaeb9fcbe3fece0fe Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:31:05 +1100 Subject: [PATCH 084/334] mm/page_owner: use scnprintf() to avoid excessive buffer overrun check The snprintf() function can return a length greater than the given input size. That will require a check for buffer overrun after each invocation of snprintf(). 
scnprintf(), on the other hand, will never return a greater length. By using scnprintf() in selected places, we can avoid some buffer overrun checks except after stack_depot_snprint() and after the last snprintf(). Link: https://lkml.kernel.org/r/20220202203036.744010-3-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Reviewed-by: Sergey Senozhatsky Acked-by: Rafael Aquini Acked-by: Mike Rapoport Cc: Andy Shevchenko Cc: Ira Weiny Cc: Johannes Weiner Cc: Michal Hocko Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 99e360df94652..28dac73e0542d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -338,19 +338,16 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (!kbuf) return -ENOMEM; - ret = snprintf(kbuf, count, + ret = scnprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); - if (ret >= count) - goto err; - /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], @@ -358,19 +355,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[pageblock_mt], &page->flags); - if (ret >= count) - goto err; - ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { - ret += snprintf(kbuf + ret, count - ret, + ret += 
scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); - if (ret >= count) - goto err; } ret += snprintf(kbuf + ret, count - ret, "\n"); From 6f19db574bb98ccee49ce2ef1398f8db25f3eefe Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:31:05 +1100 Subject: [PATCH 085/334] mm/page_owner: print memcg information It was found that a number of offline memcgs were not freed because they were pinned by some charged pages that were present. Even "echo 1 > /proc/sys/vm/drop_caches" wasn't able to free those pages. These offline but not freed memcgs tend to increase in number over time with the side effect that percpu memory consumption as shown in /proc/meminfo also increases over time. In order to find out more information about those pages that pin offline memcgs, the page_owner feature is extended to print memory cgroup information especially whether the cgroup is offline or not. RCU read lock is taken when memcg is being accessed to make sure that it won't be freed. 
Link: https://lkml.kernel.org/r/20220202203036.744010-4-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Acked-by: Roman Gushchin Acked-by: Rafael Aquini Acked-by: Mike Rapoport Cc: Andy Shevchenko Cc: Ira Weiny Cc: Johannes Weiner Cc: Michal Hocko Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 28dac73e0542d..f7820357e4d4c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "internal.h" @@ -325,6 +326,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, seq_putc(m, '\n'); } +/* + * Looking for memcg information and print it out + */ +static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, + struct page *page) +{ +#ifdef CONFIG_MEMCG + unsigned long memcg_data; + struct mem_cgroup *memcg; + bool online; + char name[80]; + + rcu_read_lock(); + memcg_data = READ_ONCE(page->memcg_data); + if (!memcg_data) + goto out_unlock; + + if (memcg_data & MEMCG_DATA_OBJCGS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + + memcg = page_memcg_check(page); + if (!memcg) + goto out_unlock; + + online = (memcg->css.flags & CSS_ONLINE); + cgroup_name(memcg->css.cgroup, name, sizeof(name)); + ret += scnprintf(kbuf + ret, count - ret, + "Charged %sto %smemcg %s\n", + PageMemcgKmem(page) ? "(via objcg) " : "", + online ? 
"" : "offline ", + name); +out_unlock: + rcu_read_unlock(); +#endif /* CONFIG_MEMCG */ + + return ret; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, @@ -365,6 +405,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migrate_reason_names[page_owner->last_migrate_reason]); } + ret = print_page_owner_memcg(kbuf, count, ret, page); + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; From ecd86f320cc88ac939cb5dcb3ae01c16fd743b44 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:31:05 +1100 Subject: [PATCH 086/334] mm/page_owner: record task command name The page_owner information currently includes the pid of the calling task. That is useful as long as the task is still running. Otherwise, the number is meaningless. To have more information about the allocating tasks that had exited by the time the page_owner information is retrieved, we need to store the command name of the task. Add a new comm field into page_owner structure to store the command name and display it when the page_owner information is retrieved. 
Link: https://lkml.kernel.org/r/20220202203036.744010-5-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Rafael Aquini Cc: Andy Shevchenko Cc: David Rientjes Cc: Ira Weiny Cc: Johannes Weiner Cc: Michal Hocko Cc: Mike Rapoport Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_owner.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index f7820357e4d4c..d56afa9c792ed 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -29,6 +29,7 @@ struct page_owner { depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; + char comm[TASK_COMM_LEN]; pid_t pid; }; @@ -165,6 +166,8 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->ts_nsec = local_clock(); + strlcpy(page_owner->comm, current->comm, + sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -232,6 +235,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; + strcpy(new_page_owner->comm, old_page_owner->comm); /* * We don't clear the bit on the old folio as it's going to be freed @@ -379,10 +383,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = scnprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d (%s), ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->comm, 
page_owner->ts_nsec, + page_owner->free_ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); @@ -449,9 +454,10 @@ void __dump_page_owner(const struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->pid, page_owner->comm, page_owner->ts_nsec, + page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) From 9509efdb5d9379dbeac0a5235a14a4a4447582a9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:05 +1100 Subject: [PATCH 087/334] tools/vm/page_owner: filter out pid and timestamp Commit 9cc7e96aa846 ("mm/page_owner: record timestamp and pid") introduces timestamp and pid for page owner. However, it is hard to aggregate the stack since those are specific (especially timestamp). Filter out those information when aggregating. 
Link: https://lkml.kernel.org/r/20220215134045.12004-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Liam Mark Cc: Georgi Djakov Cc: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/vm/page_owner_sort.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c8ec2d6b314de..de7b547c26aa0 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -50,6 +50,12 @@ int read_block(char *buf, int buf_size, FILE *fin) return curr - buf; if (!strncmp(curr, "PFN", 3)) continue; + if (!strncmp(curr, "Page allocated via order", 24)) { + char *end = strstr(curr, ", pid "); + + if (end) + memcpy(end, "\n", 2); + } curr += strlen(curr); } From ba5dde2f66a2e868bbacf819fb9fa62f4962ebfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:06 +1100 Subject: [PATCH 088/334] mm: unexport page_init_poison page_init_poison is only used in core MM code, so unexport it. Link: https://lkml.kernel.org/r/20220207063446.1833404-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/debug.c b/mm/debug.c index bc9ac87f0e08d..8b43dbb2f17be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -265,5 +265,4 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } -EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ From 62d6ea564cfd70b92ce378db2e4b6b774e933fde Mon Sep 17 00:00:00 2001 From: zhanglianjie Date: Wed, 16 Feb 2022 15:31:06 +1100 Subject: [PATCH 089/334] mm: move page-writeback sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. 
To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the page-writeback sysctls to its own file. Link: https://lkml.kernel.org/r/20220129012955.26594-1-zhanglianjie@uniontech.com Signed-off-by: zhanglianjie Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/writeback.h | 15 ------ kernel/sysctl.c | 69 --------------------------- mm/page-writeback.c | 99 +++++++++++++++++++++++++++++++++++---- 3 files changed, 89 insertions(+), 94 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec5..dc2b94e6a94f0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -345,28 +345,13 @@ void wb_domain_exit(struct wb_domain *dom); extern struct wb_domain global_wb_domain; /* These are exported to sysctl. 
*/ -extern int dirty_background_ratio; -extern unsigned long dirty_background_bytes; -extern int vm_dirty_ratio; -extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; -extern int vm_highmem_is_dirtyable; extern int laptop_mode; -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e2..34371bcb8ffa8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,8 +100,6 @@ static const int six_hundred_forty_kb = 640 * 1024; #endif -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2401,55 +2399,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - 
.procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, @@ -2621,13 +2570,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, @@ -2725,17 +2667,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d163f8d36b2..f630681df9d29 100644 --- a/mm/page-writeback.c +++ 
b/mm/page-writeback.c @@ -70,30 +70,33 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ -int dirty_background_ratio = 10; +static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ -unsigned long dirty_background_bytes; +static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ -int vm_highmem_is_dirtyable; +static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +static int vm_dirty_ratio = 20; + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ -unsigned long vm_dirty_bytes; +static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks @@ -503,7 +506,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) return nr_pages <= limit; } -int dirty_background_ratio_handler(struct ctl_table *table, int write, +static int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -514,7 +517,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -int dirty_background_bytes_handler(struct ctl_table *table, int write, +static int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -525,7 +528,7 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, +static int dirty_ratio_handler(struct 
ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; @@ -539,7 +542,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, return ret; } -int dirty_bytes_handler(struct ctl_table *table, int write, +static int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; @@ -1996,7 +1999,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, +static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; @@ -2081,6 +2084,79 @@ static int page_writeback_cpu_online(unsigned int cpu) return 0; } +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_page_writeback_sysctls[] = { + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = 
sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif + /* * Called early on to tune the page writeback dirty limits. * @@ -2105,6 +2181,9 @@ void __init page_writeback_init(void) page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_page_writeback_sysctls); +#endif } /** From daa3321ea87b79401bed33b55ddba1a000b606ba Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:06 +1100 Subject: [PATCH 090/334] mm-move-page-writeback-sysctls-to-is-own-file-checkpatch-fixes WARNING: please, no spaces at the start of a line #249: FILE: mm/page-writeback.c:2089: + {$ ERROR: code indent should use tabs where possible #250: FILE: mm/page-writeback.c:2090: + .procname = "dirty_background_ratio",$ WARNING: please, no spaces at the start of a line #250: FILE: mm/page-writeback.c:2090: + .procname = "dirty_background_ratio",$ ERROR: code indent should use tabs where possible #251: FILE: mm/page-writeback.c:2091: + .data = &dirty_background_ratio,$ WARNING: please, no spaces at the start of a line #251: FILE: mm/page-writeback.c:2091: + .data = &dirty_background_ratio,$ ERROR: code indent should use tabs where possible #252: FILE: 
mm/page-writeback.c:2092: + .maxlen = sizeof(dirty_background_ratio),$ WARNING: please, no spaces at the start of a line #252: FILE: mm/page-writeback.c:2092: + .maxlen = sizeof(dirty_background_ratio),$ ERROR: code indent should use tabs where possible #253: FILE: mm/page-writeback.c:2093: + .mode = 0644,$ WARNING: please, no spaces at the start of a line #253: FILE: mm/page-writeback.c:2093: + .mode = 0644,$ ERROR: code indent should use tabs where possible #254: FILE: mm/page-writeback.c:2094: + .proc_handler = dirty_background_ratio_handler,$ WARNING: please, no spaces at the start of a line #254: FILE: mm/page-writeback.c:2094: + .proc_handler = dirty_background_ratio_handler,$ ERROR: code indent should use tabs where possible #255: FILE: mm/page-writeback.c:2095: + .extra1 = SYSCTL_ZERO,$ WARNING: please, no spaces at the start of a line #255: FILE: mm/page-writeback.c:2095: + .extra1 = SYSCTL_ZERO,$ ERROR: code indent should use tabs where possible #256: FILE: mm/page-writeback.c:2096: + .extra2 = SYSCTL_ONE_HUNDRED,$ WARNING: please, no spaces at the start of a line #256: FILE: mm/page-writeback.c:2096: + .extra2 = SYSCTL_ONE_HUNDRED,$ WARNING: please, no spaces at the start of a line #257: FILE: mm/page-writeback.c:2097: + },$ WARNING: please, no spaces at the start of a line #258: FILE: mm/page-writeback.c:2098: + {$ ERROR: code indent should use tabs where possible #259: FILE: mm/page-writeback.c:2099: + .procname = "dirty_background_bytes",$ WARNING: please, no spaces at the start of a line #259: FILE: mm/page-writeback.c:2099: + .procname = "dirty_background_bytes",$ ERROR: code indent should use tabs where possible #260: FILE: mm/page-writeback.c:2100: + .data = &dirty_background_bytes,$ WARNING: please, no spaces at the start of a line #260: FILE: mm/page-writeback.c:2100: + .data = &dirty_background_bytes,$ ERROR: code indent should use tabs where possible #261: FILE: mm/page-writeback.c:2101: + .maxlen = sizeof(dirty_background_bytes),$ WARNING: 
please, no spaces at the start of a line #261: FILE: mm/page-writeback.c:2101: + .maxlen = sizeof(dirty_background_bytes),$ ERROR: code indent should use tabs where possible #262: FILE: mm/page-writeback.c:2102: + .mode = 0644,$ WARNING: please, no spaces at the start of a line #262: FILE: mm/page-writeback.c:2102: + .mode = 0644,$ ERROR: code indent should use tabs where possible #263: FILE: mm/page-writeback.c:2103: + .proc_handler = dirty_background_bytes_handler,$ WARNING: please, no spaces at the start of a line #263: FILE: mm/page-writeback.c:2103: + .proc_handler = dirty_background_bytes_handler,$ ERROR: code indent should use tabs where possible #264: FILE: mm/page-writeback.c:2104: + .extra1 = SYSCTL_LONG_ONE,$ WARNING: please, no spaces at the start of a line #264: FILE: mm/page-writeback.c:2104: + .extra1 = SYSCTL_LONG_ONE,$ WARNING: please, no spaces at the start of a line #265: FILE: mm/page-writeback.c:2105: + },$ WARNING: please, no spaces at the start of a line #266: FILE: mm/page-writeback.c:2106: + {$ ERROR: code indent should use tabs where possible #267: FILE: mm/page-writeback.c:2107: + .procname = "dirty_ratio",$ WARNING: please, no spaces at the start of a line #267: FILE: mm/page-writeback.c:2107: + .procname = "dirty_ratio",$ ERROR: code indent should use tabs where possible #268: FILE: mm/page-writeback.c:2108: + .data = &vm_dirty_ratio,$ WARNING: please, no spaces at the start of a line #268: FILE: mm/page-writeback.c:2108: + .data = &vm_dirty_ratio,$ ERROR: code indent should use tabs where possible #269: FILE: mm/page-writeback.c:2109: + .maxlen = sizeof(vm_dirty_ratio),$ WARNING: please, no spaces at the start of a line #269: FILE: mm/page-writeback.c:2109: + .maxlen = sizeof(vm_dirty_ratio),$ ERROR: code indent should use tabs where possible #270: FILE: mm/page-writeback.c:2110: + .mode = 0644,$ WARNING: please, no spaces at the start of a line #270: FILE: mm/page-writeback.c:2110: + .mode = 0644,$ ERROR: code indent should use tabs 
where possible #271: FILE: mm/page-writeback.c:2111: + .proc_handler = dirty_ratio_handler,$ WARNING: please, no spaces at the start of a line #271: FILE: mm/page-writeback.c:2111: + .proc_handler = dirty_ratio_handler,$ ERROR: code indent should use tabs where possible #272: FILE: mm/page-writeback.c:2112: + .extra1 = SYSCTL_ZERO,$ WARNING: please, no spaces at the start of a line #272: FILE: mm/page-writeback.c:2112: + .extra1 = SYSCTL_ZERO,$ ERROR: code indent should use tabs where possible #273: FILE: mm/page-writeback.c:2113: + .extra2 = SYSCTL_ONE_HUNDRED,$ WARNING: please, no spaces at the start of a line #273: FILE: mm/page-writeback.c:2113: + .extra2 = SYSCTL_ONE_HUNDRED,$ WARNING: please, no spaces at the start of a line #274: FILE: mm/page-writeback.c:2114: + },$ WARNING: please, no spaces at the start of a line #275: FILE: mm/page-writeback.c:2115: + {$ ERROR: code indent should use tabs where possible #276: FILE: mm/page-writeback.c:2116: + .procname = "dirty_bytes",$ WARNING: please, no spaces at the start of a line #276: FILE: mm/page-writeback.c:2116: + .procname = "dirty_bytes",$ ERROR: code indent should use tabs where possible #277: FILE: mm/page-writeback.c:2117: + .data = &vm_dirty_bytes,$ WARNING: please, no spaces at the start of a line #277: FILE: mm/page-writeback.c:2117: + .data = &vm_dirty_bytes,$ ERROR: code indent should use tabs where possible #278: FILE: mm/page-writeback.c:2118: + .maxlen = sizeof(vm_dirty_bytes),$ WARNING: please, no spaces at the start of a line #278: FILE: mm/page-writeback.c:2118: + .maxlen = sizeof(vm_dirty_bytes),$ ERROR: code indent should use tabs where possible #279: FILE: mm/page-writeback.c:2119: + .mode = 0644,$ WARNING: please, no spaces at the start of a line #279: FILE: mm/page-writeback.c:2119: + .mode = 0644,$ ERROR: code indent should use tabs where possible #280: FILE: mm/page-writeback.c:2120: + .proc_handler = dirty_bytes_handler,$ WARNING: please, no spaces at the start of a line #280: FILE: 
mm/page-writeback.c:2120: + .proc_handler = dirty_bytes_handler,$ ERROR: code indent should use tabs where possible #281: FILE: mm/page-writeback.c:2121: + .extra1 = (void *)&dirty_bytes_min,$ WARNING: please, no spaces at the start of a line #281: FILE: mm/page-writeback.c:2121: + .extra1 = (void *)&dirty_bytes_min,$ WARNING: please, no spaces at the start of a line #282: FILE: mm/page-writeback.c:2122: + },$ WARNING: please, no spaces at the start of a line #283: FILE: mm/page-writeback.c:2123: + {$ ERROR: code indent should use tabs where possible #284: FILE: mm/page-writeback.c:2124: + .procname = "dirty_writeback_centisecs",$ WARNING: please, no spaces at the start of a line #284: FILE: mm/page-writeback.c:2124: + .procname = "dirty_writeback_centisecs",$ ERROR: code indent should use tabs where possible #285: FILE: mm/page-writeback.c:2125: + .data = &dirty_writeback_interval,$ WARNING: please, no spaces at the start of a line #285: FILE: mm/page-writeback.c:2125: + .data = &dirty_writeback_interval,$ ERROR: code indent should use tabs where possible #286: FILE: mm/page-writeback.c:2126: + .maxlen = sizeof(dirty_writeback_interval),$ WARNING: please, no spaces at the start of a line #286: FILE: mm/page-writeback.c:2126: + .maxlen = sizeof(dirty_writeback_interval),$ ERROR: code indent should use tabs where possible #287: FILE: mm/page-writeback.c:2127: + .mode = 0644,$ WARNING: please, no spaces at the start of a line #287: FILE: mm/page-writeback.c:2127: + .mode = 0644,$ ERROR: code indent should use tabs where possible #288: FILE: mm/page-writeback.c:2128: + .proc_handler = dirty_writeback_centisecs_handler,$ WARNING: please, no spaces at the start of a line #288: FILE: mm/page-writeback.c:2128: + .proc_handler = dirty_writeback_centisecs_handler,$ WARNING: please, no spaces at the start of a line #289: FILE: mm/page-writeback.c:2129: + },$ WARNING: please, no spaces at the start of a line #290: FILE: mm/page-writeback.c:2130: + {$ ERROR: code indent should 
use tabs where possible #291: FILE: mm/page-writeback.c:2131: + .procname = "dirty_expire_centisecs",$ WARNING: please, no spaces at the start of a line #291: FILE: mm/page-writeback.c:2131: + .procname = "dirty_expire_centisecs",$ ERROR: code indent should use tabs where possible #292: FILE: mm/page-writeback.c:2132: + .data = &dirty_expire_interval,$ WARNING: please, no spaces at the start of a line #292: FILE: mm/page-writeback.c:2132: + .data = &dirty_expire_interval,$ ERROR: code indent should use tabs where possible #293: FILE: mm/page-writeback.c:2133: + .maxlen = sizeof(dirty_expire_interval),$ WARNING: please, no spaces at the start of a line #293: FILE: mm/page-writeback.c:2133: + .maxlen = sizeof(dirty_expire_interval),$ ERROR: code indent should use tabs where possible #294: FILE: mm/page-writeback.c:2134: + .mode = 0644,$ WARNING: please, no spaces at the start of a line #294: FILE: mm/page-writeback.c:2134: + .mode = 0644,$ ERROR: code indent should use tabs where possible #295: FILE: mm/page-writeback.c:2135: + .proc_handler = proc_dointvec_minmax,$ WARNING: please, no spaces at the start of a line #295: FILE: mm/page-writeback.c:2135: + .proc_handler = proc_dointvec_minmax,$ ERROR: code indent should use tabs where possible #296: FILE: mm/page-writeback.c:2136: + .extra1 = SYSCTL_ZERO,$ WARNING: please, no spaces at the start of a line #296: FILE: mm/page-writeback.c:2136: + .extra1 = SYSCTL_ZERO,$ WARNING: please, no spaces at the start of a line #297: FILE: mm/page-writeback.c:2137: + },$ total: 37 errors, 49 warnings, 287 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. NOTE: Whitespace errors detected. You may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/mm-move-page-writeback-sysctls-to-is-own-file.patch has style problems, please review. 
NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Iurii Zaikin Cc: Kees Cook Cc: Luis Chamberlain Cc: zhanglianjie Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page-writeback.c | 98 ++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f630681df9d29..1b308c00908c7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2086,55 +2086,55 @@ static int page_writeback_cpu_online(unsigned int cpu) #ifdef CONFIG_SYSCTL static struct ctl_table vm_page_writeback_sysctls[] = { - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = 
SYSCTL_ZERO, - }, + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_HIGHMEM { .procname = "highmem_is_dirtyable", From 28d01b40b00d5504d60000beeaa4ddeb4572fb4e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:06 +1100 Subject: [PATCH 091/334] mm-move-page-writeback-sysctls-to-is-own-file-fix fix CONFIG_SYSCTL=n warnings Cc: Iurii Zaikin Cc: Kees Cook Cc: Luis Chamberlain Cc: zhanglianjie Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1b308c00908c7..48a8cf770e3f4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -506,6 +506,7 @@ bool node_dirty_ok(struct 
pglist_data *pgdat) return nr_pages <= limit; } +#ifdef CONFIG_SYSCTL static int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -555,6 +556,7 @@ static int dirty_bytes_handler(struct ctl_table *table, int write, } return ret; } +#endif static unsigned long wp_next_time(unsigned long cur_time) { @@ -1996,6 +1998,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } +#ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ @@ -2020,6 +2023,7 @@ static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } +#endif void laptop_mode_timer_fn(struct timer_list *t) { From 57c83b300464cdf7f389d6f27405cb820eb9cb08 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:06 +1100 Subject: [PATCH 092/334] filemap: remove find_get_pages() It's unused now. Remove it and clean up the relevant comment. Link: https://lkml.kernel.org/r/20220208134149.47299-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: David Howells Cc: William Kucharski Cc: Vlastimil Babka Cc: Kirill A. 
Shutemov Cc: Johannes Weiner Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/pagemap.h | 7 ------- mm/filemap.c | 11 ++++++----- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 270bf5136c34e..dc31eb981ea2b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -594,13 +594,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); -static inline unsigned find_get_pages(struct address_space *mapping, - pgoff_t *start, unsigned int nr_pages, - struct page **pages) -{ - return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, - pages); -} unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, diff --git a/mm/filemap.c b/mm/filemap.c index ad8c39d90bf94..90afe301cd527 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2229,8 +2229,9 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages_contig() works exactly like find_get_pages(), except - * that the returned number of pages are guaranteed to be contiguous. + * find_get_pages_contig() works exactly like find_get_pages_range(), + * except that the returned number of pages are guaranteed to be + * contiguous. * * Return: the number of pages which were found. */ @@ -2290,9 +2291,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages(), except we only return head pages which are tagged - * with @tag. 
@index is updated to the index immediately after the last - * page we return, ready for the next iteration. + * Like find_get_pages_range(), except we only return head pages which are + * tagged with @tag. @index is updated to the index immediately after the + * last page we return, ready for the next iteration. * * Return: the number of pages which were found. */ From 7d7ffe8f960584282d5f17ec0fe9d9c263fa8e16 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 16 Feb 2022 15:31:07 +1100 Subject: [PATCH 093/334] mm: fix invalid page pointer returned with FOLL_PIN gups Patch series "mm/gup: some cleanups", v4. This patch (of 5): Alex reported an invalid page pointer returned with pin_user_pages_remote() from vfio after upstream commit 4b6c33b32296 ("vfio/type1: Prepare for batched pinning with struct vfio_batch"). It turns out that it's not the fault of the vfio commit; however, after vfio switched to a full page buffer to store the page pointers, it started to expose the problem more easily. The problem is that for VM_PFNMAP vmas we should normally fail with -EFAULT, and then vfio will carry on to handle the MMIO regions. However, when the bug triggered, follow_page_mask() returned -EEXIST for such a page, which will jump over the current page, leaving that entry in **pages untouched. The caller is not aware of this, hence it will reference the page as usual even though the pointer data can be anything. We have had that -EEXIST logic since commit 1027e4436b6a ("mm: make GUP handle pfn mapping unless FOLL_GET is requested"), which seems very reasonable. It could be that when we reworked GUP with FOLL_PIN we overlooked that special path in commit 3faa52c03f44 ("mm/gup: track FOLL_PIN pages"), even though that commit rightfully touched up follow_devmap_pud() to check FOLL_PIN when it needs to return -EEXIST. Attaching the Fixes to the FOLL_PIN rework commit, as it happened later than 1027e4436b6a.
[jhubbard@nvidia.com: added some tags, removed a reference to an out of tree module.] Link: https://lkml.kernel.org/r/20220207062213.235127-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20220204020010.68930-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20220204020010.68930-2-jhubbard@nvidia.com Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Signed-off-by: Peter Xu Signed-off-by: John Hubbard Reviewed-by: Claudio Imbrenda Reported-by: Alex Williamson Debugged-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Andrea Arcangeli Cc: Kirill A. Shutemov Cc: Jason Gunthorpe Cc: David Hildenbrand Cc: Lukas Bulwahn Cc: Matthew Wilcox (Oracle) Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index a9d4d724aef74..80229ecf0114a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -465,7 +465,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { /* No page to get reference */ - if (flags & FOLL_GET) + if (flags & (FOLL_GET | FOLL_PIN)) return -EFAULT; if (flags & FOLL_TOUCH) { From 6376f59698799bc0189fbb125565c3ceb8713a66 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 16 Feb 2022 15:31:07 +1100 Subject: [PATCH 094/334] mm/gup: follow_pfn_pte(): -EEXIST cleanup Remove a quirky special case from follow_pfn_pte(), and adjust its callers to match. Caller changes include: __get_user_pages(): Regardless of any FOLL_* flags, get_user_pages() and its variants should handle PFN-only entries by stopping early, if the caller expected **pages to be filled in. This makes for a more reliable API, as compared to the previous approach of skipping over such entries (and thus leaving them silently unwritten). 
move_pages(): squash the -EEXIST error return from follow_page() into -EFAULT, because -EFAULT is listed in the man page, whereas -EEXIST is not. Link: https://lkml.kernel.org/r/20220204020010.68930-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Suggested-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Peter Xu Cc: Lukas Bulwahn Cc: Matthew Wilcox Cc: Claudio Imbrenda Cc: Alex Williamson Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 13 ++++++++----- mm/migrate.c | 7 +++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 80229ecf0114a..2df0d0103c43b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -464,10 +464,6 @@ static struct page *no_page_table(struct vm_area_struct *vma, static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { - /* No page to get reference */ - if (flags & (FOLL_GET | FOLL_PIN)) - return -EFAULT; - if (flags & FOLL_TOUCH) { pte_t entry = *pte; @@ -1205,8 +1201,15 @@ static long __get_user_pages(struct mm_struct *mm, } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding - * struct page. + * struct page. If the caller expects **pages to be + * filled in, bail out now, because that can't be done + * for this page. */ + if (pages) { + ret = PTR_ERR(page); + goto out; + } + goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); diff --git a/mm/migrate.c b/mm/migrate.c index c7da064b4781b..be0d5ae36dc10 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1761,6 +1761,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, continue; } + /* + * The move_pages() man page does not have an -EEXIST choice, so + * use -EFAULT instead. 
+ */ + if (err == -EEXIST) + err = -EFAULT; + /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. From 67baccf07405a86455d5fea2e6f0aea31e8245da Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 16 Feb 2022 15:31:07 +1100 Subject: [PATCH 095/334] mm/gup: remove unused pin_user_pages_locked() This routine was used for a short while, but then the calling code was refactored and the only caller was removed. Link: https://lkml.kernel.org/r/20220204020010.68930-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Claudio Imbrenda Cc: Alex Williamson Cc: Andrea Arcangeli Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lukas Bulwahn Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 -- mm/gup.c | 29 ----------------------------- 2 files changed, 31 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b1922..80c540c17d83c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1918,8 +1918,6 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/mm/gup.c b/mm/gup.c index 2df0d0103c43b..6664dfa7a38f8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3146,32 +3146,3 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, 
gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); - -/* - * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). - * Behavior is the same, except that this one sets FOLL_PIN and rejects - * FOLL_GET. - */ -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - gup_flags |= FOLL_PIN; - return __get_user_pages_locked(current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(pin_user_pages_locked); From 0cbdd7d136e90a99387f12c4d10e5c100a65444c Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 16 Feb 2022 15:31:07 +1100 Subject: [PATCH 096/334] mm: change lookup_node() to use get_user_pages_fast() The purpose of calling get_user_pages_locked() from lookup_node() was to allow for unlocking the mmap_lock when reading a page from the disk during a page fault (hidden behind VM_FAULT_RETRY). The idea was to reduce contention on the heavily-used mmap_lock. (Thanks to Jan Kara for clearly pointing that out, and in fact I've used some of his wording here.) However, it is unlikely for lookup_node() to take a page fault. With that in mind, change over to calling get_user_pages_fast(). This simplifies the code, runs a little faster in the expected case, and allows removing get_user_pages_locked() entirely, in a subsequent patch. 
Link: https://lkml.kernel.org/r/20220204020010.68930-5-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Jason Gunthorpe Reviewed-by: Claudio Imbrenda Reviewed-by: Christoph Hellwig Cc: Alex Williamson Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lukas Bulwahn Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 028e8dd82b442..3f8dc58da3e80 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -907,17 +907,14 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; - int err; + int ret; - int locked = 1; - err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err > 0) { - err = page_to_nid(p); + ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); + if (ret > 0) { + ret = page_to_nid(p); put_page(p); } - if (locked) - mmap_read_unlock(mm); - return err; + return ret; } /* Retrieve NUMA policy */ @@ -968,14 +965,14 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* - * Take a refcount on the mpol, lookup_node() - * will drop the mmap_lock, so after calling - * lookup_node() only "pol" remains valid, "vma" - * is stale. + * Take a refcount on the mpol, because we are about to + * drop the mmap_lock, after which only "pol" remains + * valid, "vma" is stale. 
*/ pol_refcount = pol; vma = NULL; mpol_get(pol); + mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; From 6f45c98b05cd726a8fd70ee3e5f2177cfb7caf32 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 16 Feb 2022 15:31:07 +1100 Subject: [PATCH 097/334] mm/gup: remove unused get_user_pages_locked() Now that the last caller of get_user_pages_locked() is gone, remove it. Link: https://lkml.kernel.org/r/20220204020010.68930-6-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Jason Gunthorpe Reviewed-by: Claudio Imbrenda Reviewed-by: Christoph Hellwig Cc: Alex Williamson Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lukas Bulwahn Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 -- mm/gup.c | 59 ---------------------------------------------- 2 files changed, 61 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80c540c17d83c..528ef1cb4f3a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1916,8 +1916,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/mm/gup.c b/mm/gup.c index 6664dfa7a38f8..7337520afa499 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2145,65 +2145,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/** - * get_user_pages_locked() - variant of get_user_pages() - * - * @start: starting user address - * 
@nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * - * It is suitable to replace the form: - * - * mmap_read_lock(mm); - * do_something() - * get_user_pages(mm, ..., pages, NULL); - * mmap_read_unlock(mm); - * - * to: - * - * int locked = 1; - * mmap_read_lock(mm); - * do_something() - * get_user_pages_locked(mm, ..., pages, &locked); - * if (locked) - * mmap_read_unlock(mm); - * - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * - */ -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. 
- */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) - return -EINVAL; - - return __get_user_pages_locked(current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages_locked); - /* * get_user_pages_unlocked() is suitable to replace the form: * From 2d6269c090afa5b93c59d882eae75659c8bad3ec Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Wed, 16 Feb 2022 15:31:08 +1100 Subject: [PATCH 098/334] tmpfs: support for file creation time Various filesystems (including ext4) now support file creation time. This patch adds such support for tmpfs-based filesystems. Link: https://lkml.kernel.org/r/20220211213628.GA1919658@xavier-xps Signed-off-by: Xavier Roche Tested-by: Jean Delvare Reviewed-by: Jean Delvare Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/shmem_fs.h | 1 + mm/shmem.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e65b80ed09e77..29787767c3b95 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -25,6 +25,7 @@ struct shmem_inode_info { struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct inode vfs_inode; + struct timespec64 i_crtime; /* file creation time */ }; struct shmem_sb_info { diff --git a/mm/shmem.c b/mm/shmem.c index a09b29ec2b45c..5a3907712c4f7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1061,6 +1061,12 @@ static int shmem_getattr(struct user_namespace *mnt_userns, if (shmem_is_huge(NULL, inode, 0)) stat->blksize = HPAGE_PMD_SIZE; + if ((request_mask & STATX_BTIME)) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = info->i_crtime.tv_sec; + stat->btime.tv_nsec = info->i_crtime.tv_nsec; + } + return 0; 
} @@ -2265,6 +2271,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; + info->i_crtime = inode->i_mtime; INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -3196,6 +3203,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { + .getattr = shmem_getattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3203,6 +3211,7 @@ static const struct inode_operations shmem_short_symlink_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { + .getattr = shmem_getattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3790,6 +3799,7 @@ static const struct inode_operations shmem_inode_operations = { static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS + .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, @@ -3811,6 +3821,7 @@ static const struct inode_operations shmem_dir_inode_operations = { }; static const struct inode_operations shmem_special_inode_operations = { + .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif From 3aae582fc96a35f30d76e83e880a45839ab3d347 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 16 Feb 2022 15:31:08 +1100 Subject: [PATCH 099/334] memcg: replace in_interrupt() with !in_task() Replace the deprecated in_interrupt() with !in_task() because in_interrupt() returns true for BH disabled even if the call happens in the task context. in_task() is the right interface to differentiate task context from NMI, hard IRQ and softirq contexts. 
Link: https://lkml.kernel.org/r/20220127162636.3461256-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Vasily Averin Cc: Johannes Weiner Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36e9f38c919d0..209e66893da6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2688,7 +2688,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, READ_ONCE(memcg->swap.high); /* Don't bother a random interrupted task */ - if (in_interrupt()) { + if (!in_task()) { if (mem_high) { schedule_work(&memcg->high_work); break; @@ -6968,7 +6968,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; /* Do not associate the sock with unrelated interrupted task's memcg. */ - if (in_interrupt()) + if (!in_task()) return; rcu_read_lock(); From 18a5393ed62d0a608677dacf58e4b78fc239273f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 16 Feb 2022 15:31:08 +1100 Subject: [PATCH 100/334] memcg: add per-memcg total kernel memory stat Currently memcg stats show several types of kernel memory: kernel stack, page tables, sock, vmalloc, and slab. However, there are other allocations with __GFP_ACCOUNT (or supersets such as GFP_KERNEL_ACCOUNT) that are not accounted in any of those stats, a few examples are: - various kvm allocations (e.g. allocated pages to create vcpus) - io_uring - tmp_page in pipes during pipe_write() - bpf ringbuffers - unix sockets Keeping track of the total kernel memory is essential for the ease of migration from cgroup v1 to v2 as there are large discrepancies between v1's kmem.usage_in_bytes and the sum of the available kernel memory stats in v2. Adding separate memcg stats for all __GFP_ACCOUNT kernel allocations is an impractical maintenance burden as there a lot of those all over the kernel code, with more use cases likely to show up in the future. 
Therefore, add a "kernel" memcg stat that is analogous to kmem page counter, with added benefits such as using rstat infrastructure which aggregates stats more efficiently. Additionally, this provides a lighter alternative in case the legacy kmem is deprecated in the future Link: https://lkml.kernel.org/r/20220201200823.3283171-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 24 ++++++++++++++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5aa368d165dab..a0027d570a7f3 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1317,6 +1317,11 @@ PAGE_SIZE multiple when read back. vmalloc (npn) Amount of memory used for vmap backed memory. + kernel (npn) + Amount of total kernel memory, including + (kernel_stack, pagetables, percpu, vmalloc, slab) in + addition to other kernel memory use cases. 
+ shmem Amount of cached filesystem data that is swap-backed, such as tmpfs, shm segments, shared anonymous mmap()s diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0abbd685703b9..8612d7dd08594 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -34,6 +34,7 @@ enum memcg_stat_item { MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, + MEMCG_KMEM, MEMCG_NR_STAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 209e66893da6e..afece32e67774 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1376,6 +1376,7 @@ static const struct memory_stat memory_stats[] = { { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, + { "kernel", MEMCG_KMEM }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -2979,6 +2980,19 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } +static void mem_cgroup_kmem_record(struct mem_cgroup *memcg, + int nr_pages) +{ + mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (nr_pages > 0) + page_counter_charge(&memcg->kmem, nr_pages); + else + page_counter_uncharge(&memcg->kmem, -nr_pages); + } +} + + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2991,8 +3005,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); + mem_cgroup_kmem_record(memcg, -nr_pages); refill_stock(memcg, nr_pages); css_put(&memcg->css); @@ -3018,8 +3031,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_charge(&memcg->kmem, nr_pages); + mem_cgroup_kmem_record(memcg, nr_pages); out: css_put(&memcg->css); @@ -6801,8 +6813,8 @@ static void 
uncharge_batch(const struct uncharge_gather *ug) page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + if (ug->nr_kmem) + mem_cgroup_kmem_record(ug->memcg, -ug->nr_kmem); memcg_oom_recover(ug->memcg); } From 02f9c4a77a28a160ac77ae9377b2aab53ccf0a32 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 16 Feb 2022 15:31:08 +1100 Subject: [PATCH 101/334] memcg-add-per-memcg-total-kernel-memory-stat-v2 - Moved "kernel" stat ahead of other subset kernel stats. - Renamed mem_cgroup_kmem_record() to memcg_account_kmem(), following Johannes's review to avoid the line wrap, but keeping a memcg_ prefix to stay consistent with other static functions in the file. - Fixed a build error when CONFIG_MEMCG_KMEM is not set (added an empty version if the config is not set). Link: https://lkml.kernel.org/r/20220203193856.972500-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reported-by: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/cgroup-v2.rst | 10 +++++----- mm/memcontrol.c | 15 +++++++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a0027d570a7f3..69d7a6983f781 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1301,6 +1301,11 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel (npn) + Amount of total kernel memory, including + (kernel_stack, pagetables, percpu, vmalloc, slab) in + addition to other kernel memory use cases. + kernel_stack Amount of memory allocated to kernel stacks. 
@@ -1317,11 +1322,6 @@ PAGE_SIZE multiple when read back. vmalloc (npn) Amount of memory used for vmap backed memory. - kernel (npn) - Amount of total kernel memory, including - (kernel_stack, pagetables, percpu, vmalloc, slab) in - addition to other kernel memory use cases. - shmem Amount of cached filesystem data that is swap-backed, such as tmpfs, shm segments, shared anonymous mmap()s diff --git a/mm/memcontrol.c b/mm/memcontrol.c index afece32e67774..e64a276837b0a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1371,12 +1371,12 @@ struct memory_stat { static const struct memory_stat memory_stats[] = { { "anon", NR_ANON_MAPPED }, { "file", NR_FILE_PAGES }, + { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, - { "kernel", MEMCG_KMEM }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -2115,6 +2115,7 @@ static DEFINE_MUTEX(percpu_charge_mutex); static void drain_obj_stock(struct obj_stock *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); #else static inline void drain_obj_stock(struct obj_stock *stock) @@ -2125,6 +2126,9 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { return false; } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ +} #endif /** @@ -2980,8 +2984,7 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } -static void mem_cgroup_kmem_record(struct mem_cgroup *memcg, - int nr_pages) +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) { mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { @@ -3005,7 +3008,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = 
get_mem_cgroup_from_objcg(objcg); - mem_cgroup_kmem_record(memcg, -nr_pages); + memcg_account_kmem(memcg, -nr_pages); refill_stock(memcg, nr_pages); css_put(&memcg->css); @@ -3031,7 +3034,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - mem_cgroup_kmem_record(memcg, nr_pages); + memcg_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -6814,7 +6817,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) if (do_memsw_account()) page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); if (ug->nr_kmem) - mem_cgroup_kmem_record(ug->memcg, -ug->nr_kmem); + memcg_account_kmem(ug->memcg, -ug->nr_kmem); memcg_oom_recover(ug->memcg); } From 2f168d8527c54e9fee8ca3efb07dd51ee59aa759 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 16 Feb 2022 15:31:08 +1100 Subject: [PATCH 102/334] mm/memcg: mem_cgroup_per_node is already set to 0 on allocation kzalloc_node() would set data to 0, so it's not necessary to set it again. Link: https://lkml.kernel.org/r/20220201004643.8391-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Muchun Song Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Reviewed-by: Mike Rapoport Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e64a276837b0a..5318c3fb92f4d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5105,8 +5105,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) } lruvec_init(&pn->lruvec); - pn->usage_in_excess = 0; - pn->on_tree = false; pn->memcg = memcg; memcg->nodeinfo[node] = pn; From fa6a939878e036bd3d1c668dfda8c6f75728a7f4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 16 Feb 2022 15:31:09 +1100 Subject: [PATCH 103/334] mm/memcg: retrieve parent memcg from css.parent The 
parent we get from page_counter is correct, but these are two different hierarchies. Let's retrieve the parent memcg from css.parent just like parent_cs(), blkcg_parent(), etc. Link: https://lkml.kernel.org/r/20220201004643.8391-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Muchun Song Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Yang Shi Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8612d7dd08594..ef4b445392a9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -842,9 +842,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); + return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, From a2bee4cfc1cdd73dbb10e4eb2a30e6cb084b35f1 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 16 Feb 2022 15:31:09 +1100 Subject: [PATCH 104/334] memcg: refactor mem_cgroup_oom Patch series "memcg: robust enforcement of memory.high", v2. Due to the semantics of memory.high enforcement i.e. throttle the workload without oom-kill, we are trying to use it for right sizing the workloads in our production environment. However we observed the mechanism fails for some specific applications which do a big chunk of allocations in a single syscall. This failure is due to the limitation of the memory.high enforcement's current implementation.
This patch series solves this issue by enforcing the memory.high synchronously if the current process has accumulated a large amount of high overcharge. This patch (of 4): The function mem_cgroup_oom returns enum which has four possible values but the caller does not care about such values and only cares if the return value is OOM_SUCCESS or not. So, remove the enum altogether and make mem_cgroup_oom returns a simple bool. Link: https://lkml.kernel.org/r/20220211064917.2028469-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220211064917.2028469-2-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5318c3fb92f4d..0451cc06b157c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1796,20 +1796,16 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -enum oom_status { - OOM_SUCCESS, - OOM_FAILED, - OOM_ASYNC, - OOM_SKIPPED -}; - -static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +/* + * Returns true if successfully killed one or more processes. Though in some + * corner cases it can return true even without killing any process. + */ +static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - enum oom_status ret; - bool locked; + bool locked, ret; if (order > PAGE_ALLOC_COSTLY_ORDER) - return OOM_SKIPPED; + return false; memcg_memory_event(memcg, MEMCG_OOM); @@ -1832,14 +1828,13 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int * victim and then we have to bail out from the charge path. 
*/ if (memcg->oom_kill_disable) { - if (!current->in_user_fault) - return OOM_SKIPPED; - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; - - return OOM_ASYNC; + if (current->in_user_fault) { + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + } + return false; } mem_cgroup_mark_under_oom(memcg); @@ -1850,10 +1845,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int mem_cgroup_oom_notify(memcg); mem_cgroup_unmark_under_oom(memcg); - if (mem_cgroup_out_of_memory(memcg, mask, order)) - ret = OOM_SUCCESS; - else - ret = OOM_FAILED; + ret = mem_cgroup_out_of_memory(memcg, mask, order); if (locked) mem_cgroup_oom_unlock(memcg); @@ -2546,7 +2538,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, int nr_retries = MAX_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; - enum oom_status oom_status; unsigned long nr_reclaimed; bool passed_oom = false; bool may_swap = true; @@ -2649,9 +2640,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, * a forward progress or bypass the charge if the oom killer * couldn't make any progress. */ - oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, - get_order(nr_pages * PAGE_SIZE)); - if (oom_status == OOM_SUCCESS) { + if (mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE))) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; From 3f4401713dbbe70903ff8217c2326d452d89526e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 16 Feb 2022 15:31:09 +1100 Subject: [PATCH 105/334] memcg: unify force charging conditions Currently the kernel force charges the allocations which have __GFP_HIGH flag without triggering the memory reclaim. 
__GFP_HIGH indicates that the caller is high priority and since commit 869712fd3de5 ("mm: memcontrol: fix network errors from failing __GFP_ATOMIC charges") the kernel lets such allocations do force charging. Please note that __GFP_ATOMIC has been replaced by __GFP_HIGH. __GFP_HIGH does not tell if the caller can block or can trigger reclaim. There are separate checks to determine that. So, there is no need to skip reclaiming for __GFP_HIGH allocations. So, handle __GFP_HIGH together with __GFP_NOFAIL which also does force charging. Please note that this is a noop change as there are no __GFP_HIGH allocators in the kernel which also have __GFP_ACCOUNT (or SLAB_ACCOUNT) and does not allow reclaim for now. Link: https://lkml.kernel.org/r/20220211064917.2028469-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Chris Down Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0451cc06b157c..0e8a58d6e374b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2565,15 +2565,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, goto retry; } - /* - * Memcg doesn't have a dedicated reserve for atomic - * allocations. But like the global atomic pool, we need to - * put the burden of reclaim on regular allocation requests - * and let these go through as privileged allocations. - */ - if (gfp_mask & __GFP_ATOMIC) - goto force; - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. This might exceed the limits temporarily, @@ -2647,7 +2638,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, goto retry; } nomem: - if (!(gfp_mask & __GFP_NOFAIL)) + /* + * Memcg doesn't have a dedicated reserve for atomic + * allocations. 
But like the global atomic pool, we need to + * put the burden of reclaim on regular allocation requests + * and let these go through as privileged allocations. + */ + if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) return -ENOMEM; force: /* From 9793cffc4f7d41a1c591e36ec164e71a382fdc2d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 16 Feb 2022 15:31:09 +1100 Subject: [PATCH 106/334] selftests: memcg: test high limit for single entry allocation Test the enforcement of memory.high limit for large amount of memory allocation within a single kernel entry. There are valid use-cases where the application can trigger large amount of memory allocation within a single syscall e.g. mlock() or mmap(MAP_POPULATE). Make sure memory.high limit enforcement works for such use-cases. Link: https://lkml.kernel.org/r/20220211064917.2028469-4-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Chris Down Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/cgroup/cgroup_util.c | 15 +++- tools/testing/selftests/cgroup/cgroup_util.h | 1 + .../selftests/cgroup/test_memcontrol.c | 78 +++++++++++++++++++ 3 files changed, 91 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 0cf7e90c0052e..dbaa7aabbb4a2 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -583,7 +583,7 @@ int clone_into_cgroup_run_wait(const char *cgroup) return 0; } -int cg_prepare_for_wait(const char *cgroup) +static int __prepare_for_wait(const char *cgroup, const char *filename) { int fd, ret = -1; @@ -591,8 +591,7 @@ int cg_prepare_for_wait(const char *cgroup) if (fd == -1) return fd; - ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"), - IN_MODIFY); + ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY); if (ret == -1) { close(fd); fd = -1; @@ 
-601,6 +600,16 @@ int cg_prepare_for_wait(const char *cgroup) return fd; } +int cg_prepare_for_wait(const char *cgroup) +{ + return __prepare_for_wait(cgroup, "cgroup.events"); +} + +int memcg_prepare_for_wait(const char *cgroup) +{ + return __prepare_for_wait(cgroup, "memory.events"); +} + int cg_wait_for(int fd) { int ret = -1; diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 4f66d10626d29..628738532ac9b 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -55,4 +55,5 @@ extern int clone_reap(pid_t pid, int options); extern int clone_into_cgroup_run_wait(const char *cgroup); extern int dirfd_open_opath(const char *dir); extern int cg_prepare_for_wait(const char *cgroup); +extern int memcg_prepare_for_wait(const char *cgroup); extern int cg_wait_for(int fd); diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index c19a97dd02d49..36ccf2322e216 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "../kselftest.h" #include "cgroup_util.h" @@ -628,6 +629,82 @@ static int test_memcg_high(const char *root) return ret; } +static int alloc_anon_mlock(const char *cgroup, void *arg) +{ + size_t size = (size_t)arg; + void *buf; + + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, + 0, 0); + if (buf == MAP_FAILED) + return -1; + + mlock(buf, size); + munmap(buf, size); + return 0; +} + +/* + * This test checks that memory.high is able to throttle big single shot + * allocation i.e. large allocation within one kernel entry. 
+ */ +static int test_memcg_high_sync(const char *root) +{ + int ret = KSFT_FAIL, pid, fd = -1; + char *memcg; + long pre_high, pre_max; + long post_high, post_max; + + memcg = cg_name(root, "memcg_test"); + if (!memcg) + goto cleanup; + + if (cg_create(memcg)) + goto cleanup; + + pre_high = cg_read_key_long(memcg, "memory.events", "high "); + pre_max = cg_read_key_long(memcg, "memory.events", "max "); + if (pre_high < 0 || pre_max < 0) + goto cleanup; + + if (cg_write(memcg, "memory.swap.max", "0")) + goto cleanup; + + if (cg_write(memcg, "memory.high", "30M")) + goto cleanup; + + if (cg_write(memcg, "memory.max", "140M")) + goto cleanup; + + fd = memcg_prepare_for_wait(memcg); + if (fd < 0) + goto cleanup; + + pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200)); + if (pid < 0) + goto cleanup; + + cg_wait_for(fd); + + post_high = cg_read_key_long(memcg, "memory.events", "high "); + post_max = cg_read_key_long(memcg, "memory.events", "max "); + if (post_high < 0 || post_max < 0) + goto cleanup; + + if (pre_high == post_high || pre_max != post_max) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (fd >= 0) + close(fd); + cg_destroy(memcg); + free(memcg); + + return ret; +} + /* * This test checks that memory.max limits the amount of * memory which can be consumed by either anonymous memory @@ -1180,6 +1257,7 @@ struct memcg_test { T(test_memcg_min), T(test_memcg_low), T(test_memcg_high), + T(test_memcg_high_sync), T(test_memcg_max), T(test_memcg_oom_events), T(test_memcg_swap_max), From ce3e1bfd5ebf1683f91ca9835fdf850a8758c85f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 16 Feb 2022 15:31:09 +1100 Subject: [PATCH 107/334] memcg: synchronously enforce memory.high for large overcharges The high limit is used to throttle the workload without invoking the oom-killer. Recently we tried to use the high limit to right size our internal workloads. 
More specifically, dynamically adjusting the limits of the workload without letting the workload get oom-killed. However due to the limitation of the implementation of high limit enforcement, we observed the mechanism fails for some real workloads. The high limit is enforced on return-to-userspace i.e. the kernel lets the usage go over the limit and when the execution returns to userspace, the high reclaim is triggered and the process can get throttled as well. However this mechanism fails for workloads which do large allocations in a single kernel entry e.g. applications that mlock() a large chunk of memory in a single syscall. Such applications bypass the high limit and can trigger the oom-killer. To make high limit enforcement more robust, this patch makes the limit enforcement synchronous only if the accumulated overcharge becomes larger than MEMCG_CHARGE_BATCH. So, most of the allocations would still be throttled on the return-to-userspace path but only the extreme allocations which accumulate a large amount of overcharge without returning to the userspace will be throttled synchronously. The value MEMCG_CHARGE_BATCH is a bit arbitrary but most other places in the memcg codebase use this constant, so for now use the same one.
Link: https://lkml.kernel.org/r/20220211064917.2028469-5-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Chris Down Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e8a58d6e374b..17398e7601f6c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2704,6 +2704,11 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, } } while ((memcg = parent_mem_cgroup(memcg))); + if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && + !(current->flags & PF_MEMALLOC) && + gfpflags_allow_blocking(gfp_mask)) { + mem_cgroup_handle_over_high(); + } return 0; } From d849db0f5cadf363a6966b41dbedeb55e49cda46 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 16 Feb 2022 15:31:10 +1100 Subject: [PATCH 108/334] mm: generalize ARCH_HAS_FILTER_PGPROT ARCH_HAS_FILTER_PGPROT config has duplicate definitions on platforms that subscribe it. Instead make it a generic config option which can be selected on applicable platforms when required. Link: https://lkml.kernel.org/r/1643004823-16441-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 3 --- mm/Kconfig | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7cbab9fac9ffd..6820d404d22e5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,6 +23,7 @@ config ARM64 select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE @@ -1252,9 +1253,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_HAS_FILTER_PGPROT - def_bool y - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9f5bd41bf660c..b1ce75d0ab0c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -333,9 +333,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_HAS_CPU_RELAX def_bool y -config ARCH_HAS_FILTER_PGPROT - def_bool y - config ARCH_HIBERNATION_POSSIBLE def_bool y diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f33..257ed9c86de34 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -744,6 +744,9 @@ config IDLE_PAGE_TRACKING config ARCH_HAS_CACHE_LINE_SIZE bool +config ARCH_HAS_FILTER_PGPROT + bool + config ARCH_HAS_PTE_DEVMAP bool From debac8bffb55f01c0ec4340e2df67d78ac065c2a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:10 +1100 Subject: [PATCH 109/334] mm: optimize do_wp_page() for exclusive pages in the swapcache Patch series "mm: COW fixes part 1: fix the COW security issue for THP and swap", v3. 
This series attempts to optimize and streamline the COW logic for ordinary anon pages and THP anon pages, fixing two remaining instances of CVE-2020-29374 in do_swap_page() and do_huge_pmd_wp_page(): information can leak from a parent process to a child process via anonymous pages shared during fork(). This issue, including other related COW issues, has been summarized in [2]: " 1. Observing Memory Modifications of Private Pages From A Child Process Long story short: process-private memory might not be as private as you think once you fork(): successive modifications of private memory regions in the parent process can still be observed by the child process, for example, by smart use of vmsplice()+munmap(). The core problem is that pinning pages readable in a child process, such as done via the vmsplice system call, can result in a child process observing memory modifications done in the parent process the child is not supposed to observe. [1] contains an excellent summary and [2] contains further details. This issue was assigned CVE-2020-29374 [9]. For this to trigger, it's required to use a fork() without subsequent exec(), for example, as used under Android zygote. Without further details about an application that forks less-privileged child processes, one cannot really say what's actually affected and what's not -- see the details section the end of this mail for a short sshd/openssh analysis. While commit 17839856fd58 ("gup: document and work around "COW can break either way" issue") fixed this issue and resulted in other problems (e.g., ptrace on pmem), commit 09854ba94c6a ("mm: do_wp_page() simplification") re-introduced part of the problem unfortunately. The original reproducer can be modified quite easily to use THP [3] and make the issue appear again on upstream kernels. I modified it to use hugetlb [4] and it triggers as well. 
The problem is certainly less severe with hugetlb than with THP; it merely highlights that we still have plenty of open holes we should be closing/fixing. Regarding vmsplice(), the only known workaround is to disallow the vmsplice() system call ... or disable THP and hugetlb. But who knows what else is affected (RDMA? O_DIRECT?) to achieve the same goal -- in the end, it's a more generic issue. " This security issue was first reported by Jann Horn on 27 May 2020 and it currently affects anonymous pages during swapin, anonymous THP and hugetlb. This series tackles anonymous pages during swapin and anonymous THP: * do_swap_page() for handling COW on PTEs during swapin directly * do_huge_pmd_wp_page() for handling COW on PMD-mapped THP during write faults With this series, we'll apply the same COW logic we have in do_wp_page() to all swappable anon pages: don't reuse (map writable) the page in case there are additional references (page_count() != 1). All users of reuse_swap_page() are removed, and consequently reuse_swap_page() is removed. In general, we're struggling with the following COW-related issues: (1) "missed COW": we fail to copy on write and reuse the page (map it writable) although we must copy because there are pending references from another process to this page. The result is a security issue. (2) "wrong COW": we copy on write although we wouldn't have to and shouldn't: if there are valid GUP references, they will become out of sync with the pages mapped into the page table. We fail to detect that such a page can be reused safely, especially if never more than a single process mapped the page. The result is an intra-process memory corruption. (3) "unnecessary COW": we copy on write although we wouldn't have to: performance degradation and temporarily increased swap+memory consumption can be the result. 
While this series fixes (1) for swappable anon pages, it tries to reduce reported cases of (3) first as well and as easily as possible to limit the impact when streamlining. The individual patches try to describe in which cases we will run into (3). This series certainly makes (2) worse for THP, because a THP will now get PTE-mapped on write faults if there are additional references, even if there was only ever a single process involved: once PTE-mapped, we'll copy each and every subpage and won't reuse any subpage as long as the underlying compound page wasn't split. I'm working on an approach to fix (2) and improve (3): PageAnonExclusive to mark anon pages that are exclusive to a single process, allow GUP pins only on such exclusive pages, and allow turning exclusive pages shared (clearing PageAnonExclusive) only if there are no GUP pins. Anon pages with PageAnonExclusive set never have to be copied during write faults, but eventually during fork() if they cannot be turned shared. The improved reuse logic in this series will essentially also be the logic to reset PageAnonExclusive. This work will certainly take a while, but I'm planning on sharing details before having code fully ready. #1-#5 can be applied independently of the rest. #6-#9 are mostly only cleanups related to reuse_swap_page(). Notes: * For now, I'll leave hugetlb code untouched: "unnecessary COW" might easily break existing setups because hugetlb pages are a scarce resource and we could just end up having to crash the application when we run out of hugetlb pages. We have to be very careful and the security aspect with hugetlb is most certainly less relevant than for unprivileged anon pages. * Instead of lru_add_drain() we might actually just drain the lru_add list or even just remove the single page of interest from the lru_add list. This would require a new helper function, and could be added if the conditional lru_add_drain() turns out to be a problem. 
* I extended the test case already included in [1] to also test for the newly found do_swap_page() case. I'll send that out separately once/if this part is merged. [1] https://lkml.kernel.org/r/20211217113049.23850-1-david@redhat.com [2] https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com This patch (of 9): Liang Zhang reported [1] that the current COW logic in do_wp_page() is sub-optimal when it comes to swap+read fault+write fault of anonymous pages that have a single user, visible via a performance degradation in the redis benchmark. Something similar was previously reported [2] by Nadav with a simple reproducer. After we put an anon page into the swapcache and unmapped it from a single process, that process might read that page again and refault it read-only. If that process then writes to that page, the process is actually the exclusive user of the page, however, the COW logic in do_wp_page() won't be able to reuse it due to the additional reference from the swapcache. Let's optimize for pages that have been added to the swapcache but only have an exclusive user. Try removing the swapcache reference if there is hope that we're the exclusive user. We will fail removing the swapcache reference in two scenarios: (1) There are additional swap entries referencing the page: copying instead of reusing is the right thing to do. (2) The page is under writeback: theoretically we might be able to reuse in some cases, however, we cannot remove the additional reference and will have to copy. Note that we'll only try removing the page from the swapcache when it's highly likely that we'll be the exclusive owner after removing the page from the swapcache. As we're about to map that page writable and redirty it, that should not affect reclaim but is rather the right thing to do. Further, we might have additional references from the LRU pagevecs, which will force us to copy instead of being able to reuse. 
We'll try handling such references for some scenarios next. Concurrent writeback cannot be handled easily and we'll always have to copy. While at it, remove the superfluous page_mapcount() check: it's implicitly covered by the page_count() for ordinary anon pages. [1] https://lkml.kernel.org/r/20220113140318.11117-1-zhangliang5@huawei.com [2] https://lkml.kernel.org/r/0480D692-D9B2-429A-9A88-9BBA1331AC3A@gmail.com Link: https://lkml.kernel.org/r/20220131162940.210846-2-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Liang Zhang Reported-by: Nadav Amit Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: David Rientjes Cc: Shakeel Butt Cc: John Hubbard Cc: Jason Gunthorpe Cc: Mike Kravetz Cc: Mike Rapoport Cc: Yang Shi Cc: Kirill A. Shutemov Cc: Jann Horn Cc: Michal Hocko Cc: Rik van Riel Cc: Roman Gushchin Cc: Andrea Arcangeli Cc: Peter Xu Cc: Don Dutile Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c125c4969913a..bcd3b7c508915 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3291,19 +3291,27 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (PageAnon(vmf->page)) { struct page *page = vmf->page; - /* PageKsm() doesn't necessarily raise the page refcount */ - if (PageKsm(page) || page_count(page) != 1) + /* + * We have to verify under page lock: these early checks are + * just an optimization to avoid locking the page and freeing + * the swapcache if there is little hope that we can reuse. + * + * PageKsm() doesn't necessarily raise the page refcount. 
+ */ + if (PageKsm(page) || page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; - if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { + if (PageSwapCache(page)) + try_to_free_swap(page); + if (PageKsm(page) || page_count(page) != 1) { unlock_page(page); goto copy; } /* - * Ok, we've got the only map reference, and the only - * page count reference, and the page is locked, - * it's dark out, and we're wearing sunglasses. Hit it. + * Ok, we've got the only page reference from our mapping + * and the page is locked, it's dark out, and we're wearing + * sunglasses. Hit it. */ unlock_page(page); wp_page_reuse(vmf); From 9426834cdd4f0241031aeaabead3168f3ee2f844 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:10 +1100 Subject: [PATCH 110/334] mm: optimize do_wp_page() for fresh pages in local LRU pagevecs For example, if a page just got swapped in via a read fault, the LRU pagevecs might still hold a reference to the page. If we trigger a write fault on such a page, the additional reference from the LRU pagevecs will prohibit reusing the page. Let's conditionally drain the local LRU pagevecs when we stumble over a !PageLRU() page. We cannot easily drain remote LRU pagevecs and it might not be desirable performance-wise. Consequently, this will only avoid copying in some cases. Add a simple "page_count(page) > 3" check first but keep the "page_count(page) > 1 + PageSwapCache(page)" check in place, as we want to minimize cases where we remove a page from the swapcache but won't be able to reuse it, for example, because another process has it mapped R/O, to not affect reclaim. We cannot easily handle the following cases and we will always have to copy: (1) The page is referenced in the LRU pagevecs of other CPUs. We really would have to drain the LRU pagevecs of all CPUs -- most probably copying is much cheaper. 
(2) The page is already PageLRU() but is getting moved between LRU lists, for example, for activation (e.g., mark_page_accessed()), deactivation (MADV_COLD), or lazyfree (MADV_FREE). We'd have to drain mostly unconditionally, which might be bad performance-wise. Most probably this won't happen too often in practice. Note that there are other reasons why an anon page might temporarily not be PageLRU(): for example, compaction and migration have to isolate LRU pages from the LRU lists first (isolate_lru_page()), moving them to temporary local lists and clearing PageLRU() and holding an additional reference on the page. In that case, we'll always copy. This change seems to be fairly effective with the reproducer [1] shared by Nadav, as long as writeback is done synchronously, for example, using zram. However, with asynchronous writeback, we'll usually fail to free the swapcache because the page is still under writeback: something we cannot easily optimize for, and maybe it's not really relevant in practice. [1] https://lkml.kernel.org/r/0480D692-D9B2-429A-9A88-9BBA1331AC3A@gmail.com Link: https://lkml.kernel.org/r/20220131162940.210846-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index bcd3b7c508915..923165b4c27e4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3298,7 +3298,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * * PageKsm() doesn't necessarily raise the page refcount. */ - if (PageKsm(page) || page_count(page) > 1 + PageSwapCache(page)) + if (PageKsm(page) || page_count(page) > 3) + goto copy; + if (!PageLRU(page)) + /* + * Note: We cannot easily detect+handle references from + * remote LRU pagevecs or references to PageLRU() pages. + */ + lru_add_drain(); + if (page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; From b44b3409f46f12ba5dc5a18330b4e8ecbc40e495 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:10 +1100 Subject: [PATCH 111/334] mm: slightly clarify KSM logic in do_swap_page() Let's make it clearer that KSM might only have to copy a page in case we have a page in the swapcache, not if we allocated a fresh page and bypassed the swapcache. While at it, add a comment why this is usually necessary and merge the two swapcache conditions. Link: https://lkml.kernel.org/r/20220131162940.210846-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 923165b4c27e4..3c91294cca983 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3615,21 +3615,29 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - /* - * Make sure try_to_free_swap or reuse_swap_page or swapoff did not - * release the swapcache from under us. The page pin, and pte_same - * test below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not changed. - */ - if (unlikely((!PageSwapCache(page) || - page_private(page) != entry.val)) && swapcache) - goto out_page; - - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - goto out_page; + if (swapcache) { + /* + * Make sure try_to_free_swap or reuse_swap_page or swapoff did + * not release the swapcache from under us. The page pin, and + * pte_same test below, are not enough to exclude that. Even if + * it is still swapcache, we need to check that the page's swap + * has not changed. + */ + if (unlikely(!PageSwapCache(page) || + page_private(page) != entry.val)) + goto out_page; + + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout. 
+ */ + page = ksm_might_need_to_copy(page, vma, vmf->address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; + } } cgroup_throttle_swaprate(page, GFP_KERNEL); From 75aa784c4eaadf56e02db27c856438eef821f627 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:10 +1100 Subject: [PATCH 112/334] mm: streamline COW logic in do_swap_page() Currently we have a different COW logic when: * triggering a read-fault to swapin first and then trigger a write-fault -> do_swap_page() + do_wp_page() * triggering a write-fault to swapin -> do_swap_page() + do_wp_page() only if we fail reuse in do_swap_page() The COW logic in do_swap_page() is different than our reuse logic in do_wp_page(). The COW logic in do_wp_page() -- page_count() == 1 -- makes currently sure that we certainly don't have a remaining reference, e.g., via GUP, on the target page we want to reuse: if there is any unexpected reference, we have to copy to avoid information leaks. As do_swap_page() behaves differently, in environments with swap enabled we can currently have an unintended information leak from the parent to the child, similar as known from CVE-2020-29374: 1. Parent writes to anonymous page -> Page is mapped writable and modified 2. Page is swapped out -> Page is unmapped and replaced by swap entry 3. fork() -> Swap entries are copied to child 4. Child pins page R/O -> Page is mapped R/O into child 5. Child unmaps page -> Child still holds GUP reference 6. Parent writes to page -> Page is reused in do_swap_page() -> Child can observe changes Exchanging 2. and 3. should have the same effect. Let's apply the same COW logic as in do_wp_page(), conditionally trying to remove the page from the swapcache after freeing the swap entry, however, before actually mapping our page. We can change the order now that we use try_to_free_swap(), which doesn't care about the mapcount, instead of reuse_swap_page(). 
To handle references from the LRU pagevecs, conditionally drain the local LRU pagevecs when required, however, don't consider the page_count() when deciding whether to drain to keep it simple for now. Link: https://lkml.kernel.org/r/20220131162940.210846-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 55 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3c91294cca983..c6177d8979643 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3497,6 +3497,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } +static inline bool should_try_to_free_swap(struct page *page, + struct vm_area_struct *vma, + unsigned int fault_flags) +{ + if (!PageSwapCache(page)) + return false; + if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || + PageMlocked(page)) + return true; + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * user. Try freeing the swapcache to get rid of the swapcache + * reference only in case it's likely that we'll be the exlusive user. + */ + return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + page_count(page) == 2; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. 
@@ -3638,6 +3657,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = swapcache; goto out_page; } + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * pagevecs if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + !PageKsm(page) && !PageLRU(page)) + lru_add_drain(); } cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3656,19 +3685,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* - * The page isn't present yet, go ahead with the fault. - * - * Be careful about the sequence of operations here. - * To get its accounting right, reuse_swap_page() must be called - * while the page is counted on swap but not yet in mapcount i.e. - * before page_add_anon_rmap() and swap_free(); try_to_free_swap() - * must be called after the swap_free(), or it will never succeed. + * Remove the swap entry and conditionally try to free up the swapcache. + * We're already holding a reference on the page but haven't mapped it + * yet. */ + swap_free(entry); + if (should_try_to_free_swap(page, vma, vmf->flags)) + try_to_free_swap(page); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + + /* + * Same logic as in do_wp_page(); however, optimize for fresh pages + * that are certainly not shared because we just allocated them without + * exposing them to the swapcache. 
+ */ + if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + (page != swapcache || page_count(page) == 1)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3694,10 +3729,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - swap_free(entry); - if (mem_cgroup_swap_full(page) || - (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* From c28564603a18bc0b53241dc6eec859d23e47e564 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:11 +1100 Subject: [PATCH 113/334] mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page() We currently have a different COW logic for anon THP than we have for ordinary anon pages in do_wp_page(): the effect is that the issue reported in CVE-2020-29374 is currently still possible for anon THP: an unintended information leak from the parent to the child. Let's apply the same logic (page_count() == 1), with similar optimizations to remove additional references first as we really want to avoid PTE-mapping the THP and copying individual pages best we can. If we end up with a page that has page_count() != 1, we'll have to PTE-map the THP and fallback to do_wp_page(), which will always copy the page. Note that KSM does not apply to THP. I. Interaction with the swapcache and writeback While a THP is in the swapcache, the swapcache holds one reference on each subpage of the THP. So with PageSwapCache() set, we expect as many additional references as we have subpages. If we manage to remove the THP from the swapcache, all these references will be gone. Usually, a THP is not split when entered into the swapcache and stays a compound page. However, try_to_unmap() will PTE-map the THP and use PTE swap entries. 
There are no PMD swap entries for that purpose, consequently, we always only swapin subpages into PTEs. Removing a page from the swapcache can fail either when there are remaining swap entries (in which case COW is the right thing to do) or if the page is currently under writeback. Having a locked, R/O PMD-mapped THP that is in the swapcache seems to be possible only in corner cases, for example, if try_to_unmap() failed after adding the page to the swapcache. However, it's comparatively easy to handle. As we have to fully unmap a THP before starting writeback, and swapin is always done on the PTE level, we shouldn't find a R/O PMD-mapped THP in the swapcache that is under writeback. This should at least leave writeback out of the picture. II. Interaction with GUP references Having a R/O PMD-mapped THP with GUP references (i.e., R/O references) will result in PTE-mapping the THP on a write fault. Similar to ordinary anon pages, do_wp_page() will have to copy sub-pages and result in a disconnect between the GUP references and the pages actually mapped into the page tables. To improve the situation in the future, we'll need additional handling to mark anonymous pages as definitely exclusive to a single process, only allow GUP pins on exclusive anon pages, and disallow sharing of exclusive anon pages with GUP pins e.g., during fork(). III. Interaction with references from LRU pagevecs There is no need to try draining the (local) LRU pagevecs in case we would stumble over a !PageLRU() page: folio_add_lru() and friends will always flush the affected pagevec after adding a compound page to it immediately -- pagevec_add_and_need_flush() always returns "true" for them. Note that the LRU pagevecs will hold a reference on the compound page for a very short time, between adding the page to the pagevec and draining it immediately afterwards. IV. 
Interaction with speculative/temporary references Similar to ordinary anon pages, other speculative/temporary references on the THP, for example, from the pagecache or page migration code, will disallow exclusive reuse of the page. We'll have to PTE-map the THP. Link: https://lkml.kernel.org/r/20220131162940.210846-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/huge_memory.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 406a3c28c0266..f34ebc5cb827c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1303,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); @@ -1319,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. + * See do_wp_page(): we can only map the page writable if there are + * no additional references. Note that we always drain the LRU + * pagevecs immediately after adding a THP. 
*/ - if (reuse_swap_page(page)) { + if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + goto unlock_fallback; + if (PageSwapCache(page)) + try_to_free_swap(page); + if (page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1333,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) return VM_FAULT_WRITE; } +unlock_fallback: unlock_page(page); spin_unlock(vmf->ptl); fallback: From 6b28af07a3a08eb31d10c2305683ef0e509f804b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:11 +1100 Subject: [PATCH 114/334] mm/khugepaged: remove reuse_swap_page() usage reuse_swap_page() currently indicates if we can write to an anon page without COW. A COW is required if the page is shared by multiple processes (either already mapped or via swap entries) or if there is concurrent writeback that cannot tolerate concurrent page modifications. However, in the context of khugepaged we're not actually going to write to a read-only mapped page, we'll copy the page content to our newly allocated THP and map that THP writable. All we have to make sure is that the read-only mapped page we're about to copy won't get reused by another process sharing the page, otherwise, page content would get modified. But that is already guaranteed via multiple mechanisms (e.g., holding a reference, holding the page lock, removing the rmap after copying the page). The swapcache handling was introduced in commit 10359213d05a ("mm: incorporate read-only pages into transparent huge pages") and it sounds like it merely wanted to mimic what do_swap_page() would do when trying to map a page obtained via the swapcache writable. As that logic is unnecessary, let's just remove it, removing the last user of reuse_swap_page(). 
Link: https://lkml.kernel.org/r/20220131162940.210846-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/trace/events/huge_memory.h | 1 - mm/khugepaged.c | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4fdb14a81108b..d651f3437367d 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -29,7 +29,6 @@ EM( SCAN_VMA_NULL, "vma_null") \ EM( SCAN_VMA_CHECK, "vma_check_failed") \ EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \ - EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 131492fd1148b..a325a646be33e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,7 +46,6 @@ enum scan_result { SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, @@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PAGE_COUNT; goto out; } - if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page)) { - /* - * Page is in the swap cache and cannot be re-used. - * It cannot be collapsed into a THP. 
- */ - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } /* * Isolate the page to avoid collapsing an hugepage From b8d1ad4a52a5f61ec3fbebd78796f575e1b4b11a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:11 +1100 Subject: [PATCH 115/334] mm/swapfile: remove stale reuse_swap_page() All users are gone, let's remove it. We'll let SWP_STABLE_WRITES stick around for now, as it might come in handy in the near future. Link: https://lkml.kernel.org/r/20220131162940.210846-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 4 -- mm/swapfile.c | 104 ------------------------------------------- 2 files changed, 108 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 1d38d9475c4d0..b546e4bd5c5a2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,9 +679,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page) == 1) - static inline int try_to_free_swap(struct page *page) { return 0; diff --git 
a/mm/swapfile.c b/mm/swapfile.c index bf0df7aa7158f..a5183315dc585 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1167,16 +1167,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) return NULL; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) - spin_lock(&p->lock); - return p; -} - static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { @@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, - int *total_swapcount) -{ - int i, map_swapcount, _total_swapcount; - unsigned long offset = 0; - struct swap_info_struct *si; - struct swap_cluster_info *ci = NULL; - unsigned char *map = NULL; - int swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page); - } - - page = compound_head(page); - - _total_swapcount = map_swapcount = 0; - if (PageSwapCache(page)) { - swp_entry_t entry; - - entry.val = page_private(page); - si = _swap_info_get(entry); - if (si) { - map = si->swap_map; - offset = swp_offset(entry); - } - } - if (map) - ci = lock_cluster(si, offset); - for (i = 0; i < HPAGE_PMD_NR; i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - if (map) { - swapcount = swap_count(map[offset + i]); - _total_swapcount += swapcount; - } - map_swapcount = max(map_swapcount, mapcount + swapcount); - } - unlock_cluster(ci); - - if (PageDoubleMap(page)) - map_swapcount -= 1; - - if (total_swapcount) - *total_swapcount = _total_swapcount; - - return map_swapcount + compound_mapcount(page); -} - -/* - * We can write to an anon page without COW if there are 
no other references - * to it. And as a side-effect, free up its swap: because the old content - * on disk will never be read, and seeking back there to write new content - * later would only waste time away from clustering. - */ -bool reuse_swap_page(struct page *page) -{ - int count, total_swapcount; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (unlikely(PageKsm(page))) - return false; - count = page_trans_huge_map_swapcount(page, &total_swapcount); - if (count == 1 && PageSwapCache(page) && - (likely(!PageTransCompound(page)) || - /* The remaining swap count will be freed soon */ - total_swapcount == page_swapcount(page))) { - if (!PageWriteback(page)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } else { - swp_entry_t entry; - struct swap_info_struct *p; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (p->flags & SWP_STABLE_WRITES) { - spin_unlock(&p->lock); - return false; - } - spin_unlock(&p->lock); - } - } - - return count <= 1; -} - /* * If swap is getting full, or if there are no more mappings of this page, * then try_to_free_swap is called to free its swap space. From 1e80a02e51b88efeacbced8efe160eedf8b8d8b1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:11 +1100 Subject: [PATCH 116/334] mm/huge_memory: remove stale page_trans_huge_mapcount() All users are gone, let's remove it. Link: https://lkml.kernel.org/r/20220131162940.210846-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 5 ----- mm/huge_memory.c | 48 ---------------------------------------------- 2 files changed, 53 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 528ef1cb4f3a7..ac11fc132ed10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -820,16 +820,11 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page) -{ - return page_mapcount(page); -} #endif static inline struct page *virt_to_head_page(const void *x) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f34ebc5cb827c..a6dc5af1a7639 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2523,54 +2523,6 @@ int total_mapcount(struct page *page) return ret; } -/* - * This calculates accurately how many mappings a transparent hugepage - * has (unlike page_mapcount() which isn't fully accurate). This full - * accuracy is primarily needed to know if copy-on-write faults can - * reuse the page and change the mapping to read-write instead of - * copying them. At the same time this returns the total_mapcount too. - * - * The function returns the highest mapcount any one of the subpages - * has. If the return value is one, even if different processes are - * mapping different subpages of the transparent hugepage, they can - * all reuse it, because each process is reusing a different subpage. - * - * The total_mapcount is instead counting all virtual mappings of the - * subpages. 
If the total_mapcount is equal to "one", it tells the - * caller all mappings belong to the same "mm" and in turn the - * anon_vma of the transparent hugepage can become the vma->anon_vma - * local one as no other process may be mapping any of the subpages. - * - * It would be more accurate to replace page_mapcount() with - * page_trans_huge_mapcount(), however we only use - * page_trans_huge_mapcount() in the copy-on-write faults where we - * need full accuracy to avoid breaking page pinning, because - * page_trans_huge_mapcount() is slower than page_mapcount(). - */ -int page_trans_huge_mapcount(struct page *page) -{ - int i, ret; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (likely(!PageTransCompound(page))) - return atomic_read(&page->_mapcount) + 1; - - page = compound_head(page); - - ret = 0; - for (i = 0; i < thp_nr_pages(page); i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - ret = max(ret, mapcount); - } - - if (PageDoubleMap(page)) - ret -= 1; - - return ret + compound_mapcount(page); -} - /* Racy check whether the huge page can be split */ bool can_split_huge_page(struct page *page, int *pextra_pins) { From dbe67be3d9f30a56d41ef643582420703cc4333d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:11 +1100 Subject: [PATCH 117/334] mm/huge_memory: remove stale locking logic from __split_huge_pmd() Let's remove the stale logic that was required for reuse_swap_page(). Link: https://lkml.kernel.org/r/20220131162940.210846-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oleg Nesterov Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/huge_memory.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a6dc5af1a7639..cda88d8ac1bd5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2152,8 +2152,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; - bool do_unlock_page = false; - pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@ -2172,35 +2170,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, goto out; } -repeat: if (pmd_trans_huge(*pmd)) { - if (!page) { + if (!page) page = pmd_page(*pmd); - /* - * An anonymous page must be locked, to ensure that a - * concurrent reuse_swap_page() sees stable mapcount; - * but reuse_swap_page() is not used on shmem or file, - * and page lock must not be taken when zap_pmd_range() - * calls __split_huge_pmd() while i_mmap_lock is held. 
- */ - if (PageAnon(page)) { - if (unlikely(!trylock_page(page))) { - get_page(page); - _pmd = *pmd; - spin_unlock(ptl); - lock_page(page); - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, _pmd))) { - unlock_page(page); - put_page(page); - page = NULL; - goto repeat; - } - put_page(page); - } - do_unlock_page = true; - } - } if (PageMlocked(page)) clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) @@ -2208,8 +2180,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); - if (do_unlock_page) - unlock_page(page); /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): From 8ac6487d4d76996e42d0e7a9a524d649ddc8b24f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 16 Feb 2022 15:31:12 +1100 Subject: [PATCH 118/334] mm: merge pte_mkhuge() call into arch_make_huge_pte() Each call into pte_mkhuge() is invariably followed by arch_make_huge_pte(). Instead arch_make_huge_pte() can accommodate pte_mkhuge() at the beginning. This updates generic fallback stub for arch_make_huge_pte() and available platforms definitions. This makes huge pte creation much cleaner and easier to follow. Link: https://lkml.kernel.org/r/1643860669-26307-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Acked-by: Mike Kravetz Acked-by: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: "David S. 
Miller" Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/mm/hugetlbpage.c | 1 + arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 4 ++-- arch/sparc/mm/hugetlbpage.c | 1 + include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 3 +-- mm/vmalloc.c | 1 - 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index ffb9c229610ab..228226c5fa809 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -347,6 +347,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; + entry = pte_mkhuge(entry); if (pagesize == CONT_PTE_SIZE) { entry = pte_mkcont(entry); } else if (pagesize == CONT_PMD_SIZE) { diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 64b6c608eca43..de092b04ee1a1 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -71,9 +71,9 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags size_t size = 1UL << shift; if (size == SZ_16K) - return __pte(pte_val(entry) & ~_PAGE_HUGE); + return __pte(pte_val(entry) | _PAGE_SPS); else - return entry; + return __pte(pte_val(entry) | _PAGE_SPS | _PAGE_HUGE); } #define arch_make_huge_pte arch_make_huge_pte #endif diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 0f49fada20938..d8e0e3c7038d0 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -181,6 +181,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { pte_t pte; + entry = pte_mkhuge(entry); pte = hugepage_shift_to_tte(entry, shift); #ifdef CONFIG_SPARC64 diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1897a69c5406..52c462390aee3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -754,7 +754,7 @@ static inline void 
arch_clear_hugepage_flags(struct page *page) { } static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { - return entry; + return pte_mkhuge(entry); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f294db835f4bc..a404af0b49a05 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4637,7 +4637,6 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; @@ -6171,7 +6170,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); - pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); + pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 375b53fd939f1..4ad2275fde825 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -118,7 +118,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry); pfn += PFN_DOWN(size); From 42725f7e4c6c179b031f96291a087b6c23fa329a Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Wed, 16 Feb 2022 15:31:12 +1100 Subject: [PATCH 119/334] mm: remove mmu_gathers storage from remaining architectures Originally the mmu_gathers were removed in commit 1c3951769621 ("mm: now that all old mmu_gather code is gone, remove the storage"). However, the openrisc and hexagon architecture were merged around the same time and mmu_gathers was not removed. 
This patch removes them from openrisc, hexagon and nds32: Noticed while cleaning this warning: arch/openrisc/mm/init.c:41:1: warning: symbol 'mmu_gathers' was not declared. Should it be static? Link: https://lkml.kernel.org/r/20220205141956.3315419-1-shorne@gmail.com Signed-off-by: Stafford Horne Acked-by: Mike Rapoport Cc: Brian Cain Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Russell King Cc: David Hildenbrand Cc: Dave Hansen Cc: Kefeng Wang Cc: Christophe Leroy Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/hexagon/mm/init.c | 2 -- arch/nds32/mm/init.c | 1 - arch/openrisc/mm/init.c | 2 -- 3 files changed, 5 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f01e91e10d95d..3167a3b5c97b0 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -29,8 +29,6 @@ int max_kernel_seg = 0x303; /* indicate pfn's of high memory */ unsigned long highstart_pfn, highend_pfn; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* Default cache attribute for newly created page tables */ unsigned long _dflt_cache_att = CACHEDEF; diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index f63f839738c46..825c85cab1a1d 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -18,7 +18,6 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); DEFINE_SPINLOCK(anon_alias_lock); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 97305bde1b169..3a021ab6f1aef 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -38,8 +38,6 @@ int mem_init_done; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static void __init zone_sizes_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; From e3a81a5ae5a8885d65fac5d003a29b29bbab3a2d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:12 +1100 Subject: [PATCH 120/334] mm: thp: fix wrong cache flush in 
remove_migration_pmd() Patch series "Fix some cache flush bugs", v5. This series focuses on fixing cache maintenance. This patch (of 7): The flush_cache_range() is supposed to be justified only if the page is already placed in process page table, and that is done right after flush_cache_range(). So using this interface is wrong. And there is no need to invalidate the cache since it was non-present before in remove_migration_pmd(). So just remove it. Link: https://lkml.kernel.org/r/20220210123058.79206-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220210123058.79206-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Zi Yan Cc: Kirill A. Shutemov Cc: David Rientjes Cc: Lars Persson Cc: Mike Kravetz Cc: Zi Yan Cc: Xiongchun Duan Cc: Fam Zheng Cc: Muchun Song Cc: Axel Rasmussen Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/huge_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cda88d8ac1bd5..0bda8b43c95f0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3124,7 +3124,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); - flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else @@ -3132,6 +3131,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); + + /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif From 607342ad83305b33c1a76281972dcff259b76b80 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:12 +1100 Subject: [PATCH 121/334] mm: fix missing cache flush for all tail pages of compound page The D-cache maintenance
inside move_to_new_page() only considers one page, so there is still a D-cache maintenance issue for the tail pages of a compound page (e.g. THP or HugeTLB). THP migration is only enabled on x86_64, ARM64 and powerpc, while powerpc and arm64 need to maintain the consistency between I-Cache and D-Cache, which depends on flush_dcache_page() to maintain the consistency between I-Cache and D-Cache. But there are no issues on arm64 and powerpc since they already consider compound page cache flushing in their icache flush functions. HugeTLB migration is enabled on arm, arm64, mips, parisc, powerpc, riscv, s390 and sh, while arm has handled the compound page cache flush in flush_dcache_page(), but most others do not. In theory, the issue exists on many architectures. Fix this by flushing every subpage with flush_dcache_page() rather than using flush_dcache_folio(), since the latter is not backportable. Link: https://lkml.kernel.org/r/20220210123058.79206-3-songmuchun@bytedance.com Fixes: 290408d4a250 ("hugetlb: hugepage migration core") Signed-off-by: Muchun Song Reviewed-by: Zi Yan Cc: Axel Rasmussen Cc: David Rientjes Cc: Fam Zheng Cc: Kirill A.
Shutemov Cc: Lars Persson Cc: Mike Kravetz Cc: Peter Xu Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index be0d5ae36dc10..996c0e386734a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -916,9 +916,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (!PageMappingFlags(page)) page->mapping = NULL; - if (likely(!is_zone_device_page(newpage))) - flush_dcache_page(newpage); + if (likely(!is_zone_device_page(newpage))) { + int i, nr = compound_nr(newpage); + for (i = 0; i < nr; i++) + flush_dcache_page(newpage + i); + } } out: return rc; From 6062f43127d48368e80c0469b6a6b8ce1960ee73 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:12 +1100 Subject: [PATCH 122/334] mm: hugetlb: fix missing cache flush in copy_huge_page_from_user() userfaultfd calls copy_huge_page_from_user() which does not do any cache flushing for the target page. Then the target page will be mapped to the user space with a different address (user address), which might have an alias issue with the kernel address used to copy the data from the user to. Fix this issue by flushing dcache in copy_huge_page_from_user(). Link: https://lkml.kernel.org/r/20220210123058.79206-4-songmuchun@bytedance.com Fixes: fa4d75c1de13 ("userfaultfd: hugetlbfs: add copy_huge_page_from_user for hugetlb userfaultfd support") Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Axel Rasmussen Cc: David Rientjes Cc: Fam Zheng Cc: Kirill A. 
Shutemov Cc: Lars Persson Cc: Peter Xu Cc: Xiongchun Duan Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index c6177d8979643..f4c0226fda489 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5499,6 +5499,8 @@ long copy_huge_page_from_user(struct page *dst_page, if (rc) break; + flush_dcache_page(subpage); + cond_resched(); } return ret_val; From a9ec93fc66179f01b711468830c50124e58af7f9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:13 +1100 Subject: [PATCH 123/334] mm: hugetlb: fix missing cache flush in hugetlb_mcopy_atomic_pte() folio_copy() will copy the data from one page to the target page, then the target page will be mapped to the user space address, which might have an alias issue with the kernel address used to copy the data from the page to. There are 2 ways to fix this issue. 1) insert flush_dcache_page() after folio_copy(). 2) replace folio_copy() with copy_user_huge_page() which already considers the cache maintenance. We chose way 2) to fix the issue since architectures can optimize this situation. It also makes backports easier. Link: https://lkml.kernel.org/r/20220210123058.79206-5-songmuchun@bytedance.com Fixes: 8cc5fcbb5be8 ("mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY") Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Axel Rasmussen Cc: David Rientjes Cc: Fam Zheng Cc: Kirill A.
Shutemov Cc: Lars Persson Cc: Peter Xu Cc: Xiongchun Duan Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a404af0b49a05..3d450f8028233 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5816,7 +5816,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; goto out; } - folio_copy(page_folio(page), page_folio(*pagep)); + copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } From c966793c308d22cf301f4595d76fb6bb595f4680 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:13 +1100 Subject: [PATCH 124/334] mm: shmem: fix missing cache flush in shmem_mfill_atomic_pte() userfaultfd calls shmem_mfill_atomic_pte() which does not do any cache flushing for the target page. Then the target page will be mapped to the user space with a different address (user address), which might have an alias issue with the kernel address used to copy the data from the user to. Insert flush_dcache_page() in non-zero-page case. And replace clear_highpage() with clear_user_highpage() which already considers the cache maintenance. Link: https://lkml.kernel.org/r/20220210123058.79206-6-songmuchun@bytedance.com Fixes: 8d1039634206 ("userfaultfd: shmem: add shmem_mfill_zeropage_pte for userfaultfd support") Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support") Signed-off-by: Muchun Song Cc: Axel Rasmussen Cc: David Rientjes Cc: Fam Zheng Cc: Kirill A. 
Shutemov Cc: Lars Persson Cc: Mike Kravetz Cc: Peter Xu Cc: Xiongchun Duan Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/shmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 5a3907712c4f7..ccf152292ff66 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2364,8 +2364,10 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ goto out_unacct_blocks; } + + flush_dcache_page(page); } else { /* ZEROPAGE */ - clear_highpage(page); + clear_user_highpage(page, dst_addr); } } else { page = *pagep; From 559e5d2c4891d5e58d648a1f331614e149167264 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:13 +1100 Subject: [PATCH 125/334] mm: userfaultfd: fix missing cache flush in mcopy_atomic_pte() and __mcopy_atomic() userfaultfd calls mcopy_atomic_pte() and __mcopy_atomic() which do not do any cache flushing for the target page. Then the target page will be mapped to the user space with a different address (user address), which might have an alias issue with the kernel address used to copy the data from the user to. Fix this by inserting flush_dcache_page() after copy_from_user() succeeds. Link: https://lkml.kernel.org/r/20220210123058.79206-7-songmuchun@bytedance.com Fixes: b6ebaedb4cb1 ("userfaultfd: avoid mmap_sem read recursion in mcopy_atomic") Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation") Signed-off-by: Muchun Song Cc: Axel Rasmussen Cc: David Rientjes Cc: Fam Zheng Cc: Kirill A.
Shutemov Cc: Lars Persson Cc: Mike Kravetz Cc: Peter Xu Cc: Xiongchun Duan Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/userfaultfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0780c2a57ff11..6ccc534d1c1cb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -150,6 +150,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ goto out; } + + flush_dcache_page(page); } else { page = *pagep; *pagep = NULL; @@ -625,6 +627,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, err = -EFAULT; goto out; } + flush_dcache_page(page); goto retry; } else BUG_ON(page); From 2ef5bbce1471cadf1013fbfa357537ae6ce7df4d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:13 +1100 Subject: [PATCH 126/334] mm: replace multiple dcache flush with flush_dcache_folio() Simplify the code by using flush_dcache_folio(). Link: https://lkml.kernel.org/r/20220210123058.79206-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Axel Rasmussen Cc: David Rientjes Cc: Fam Zheng Cc: Kirill A. 
Shutemov Cc: Lars Persson Cc: Peter Xu Cc: Xiongchun Duan Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 996c0e386734a..54b168a3b84a5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -916,12 +916,8 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (!PageMappingFlags(page)) page->mapping = NULL; - if (likely(!is_zone_device_page(newpage))) { - int i, nr = compound_nr(newpage); - - for (i = 0; i < nr; i++) - flush_dcache_page(newpage + i); - } + if (likely(!is_zone_device_page(newpage))) + flush_dcache_folio(page_folio(newpage)); } out: return rc; From 27323748616bf2fd2f3b70e1a5fff9843dae5840 Mon Sep 17 00:00:00 2001 From: wangyong Date: Wed, 16 Feb 2022 15:31:13 +1100 Subject: [PATCH 127/334] memfd: fix shmem huge page failed to set F_SEAL_WRITE attribute problem After enabling tmpfs filesystem to support transparent hugepage with the following command: echo always > /sys/kernel/mm/transparent_hugepage/shmem_enabled The docker program adds F_SEAL_WRITE through the following command which will prompt EBUSY. fcntl(5, F_ADD_SEALS, F_SEAL_WRITE)=-1. It is found that in memfd_wait_for_pins function, the page_count of hugepage is 512 and page_mapcount is 0, which does not meet the conditions: page_count(page) - page_mapcount(page) != 1. But the page is not busy at this time, therefore, the page_order of hugepage should be taken into account in the calculation. Link: https://lkml.kernel.org/r/20220215073743.1769979-1-cgel.zte@gmail.com Signed-off-by: wangyong Reported-by: Zeal Robot Cc: Hugh Dickins Cc: Mike Kravetz Cc: Kirill A. 
Shutemov Cc: Song Liu Cc: Yang Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memfd.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 9f80f162791a5..26d1d390a22a7 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -31,6 +31,7 @@ static void memfd_tag_pins(struct xa_state *xas) { struct page *page; + int count = 0; unsigned int tagged = 0; lru_add_drain(); @@ -39,8 +40,12 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); - if (page_count(page) - page_mapcount(page) > 1) + count = page_count(page); + if (PageTransCompound(page)) + count -= (1 << compound_order(compound_head(page))) - 1; + if (count - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); if (++tagged % XA_CHECK_SCHED) @@ -67,11 +72,12 @@ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); struct page *page; - int error, scan; + int error, scan, count; memfd_tag_pins(&xas); error = 0; + count = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { unsigned int tagged = 0; @@ -89,8 +95,12 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); - if (page_count(page) - page_mapcount(page) != 1) { + count = page_count(page); + if (PageTransCompound(page)) + count -= (1 << compound_order(compound_head(page))) - 1; + if (count - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still From 8d54b58e3eb78382b58d87d6bf1fad1c06ec4804 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:14 +1100 Subject: [PATCH 128/334] mm/sparse: make mminit_validate_memmodel_limits() static It's only used in the sparse.c now. So we can make it static and further clean up the relevant code. 
Link: https://lkml.kernel.org/r/20220127093221.63524-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 11 ----------- mm/sparse.c | 2 +- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index d80300392a194..9c298afb96884 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -572,17 +572,6 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ -/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ -#if defined(CONFIG_SPARSEMEM) -extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, - unsigned long *end_pfn); -#else -static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, - unsigned long *end_pfn) -{ -} -#endif /* CONFIG_SPARSEMEM */ - #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 diff --git a/mm/sparse.c b/mm/sparse.c index d21c6e5910d07..952f06d8f3731 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -126,7 +126,7 @@ static inline int sparse_early_nid(struct mem_section *section) } /* Validate the physical addressing limitations of the model */ -void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, +static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); From edecc06b4d34e92a5cd306d3436833e344eb3fa2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:31:14 +1100 Subject: [PATCH 129/334] mm/sparsemem: fix 'mem_section' will never be NULL gcc 12 warning The gcc 12 compiler reports a "'mem_section' will never be NULL" warning on the following code: static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section) return NULL; #endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) return 
NULL; : It happens with both CONFIG_SPARSEMEM_EXTREME on and off. The mem_section definition is #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif In the CONFIG_SPARSEMEM_EXTREME case, mem_section obviously cannot be NULL, but *mem_section can be if memory hasn't been allocated for the dynamic mem_section[] array yet. In the !CONFIG_SPARSEMEM_EXTREME case, mem_section is a static 2-dimensional array and so the check "!mem_section[SECTION_NR_TO_ROOT(nr)]" doesn't make sense. Fix this warning by checking for "!*mem_section" instead of "!mem_section" and moving the "!mem_section[SECTION_NR_TO_ROOT(nr)]" check up inside the CONFIG_SPARSEMEM_EXTREME block. Link: https://lkml.kernel.org/r/20220201192924.672675-1-longman@redhat.com Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Fixes: 3e347261a80b ("sparsemem extreme implementation") Signed-off-by: Waiman Long Reported-by: Justin Forbes Cc: Kirill A. 
Shutemov Cc: Ingo Molnar Cc: Justin Forbes Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aed44e9b5d899..bd1b19925f3b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1390,7 +1390,7 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms) static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME - if (!mem_section) + if (!*mem_section) return NULL; #endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) From fff3b2a167db5495b47548cc71054e064c440031 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:31:14 +1100 Subject: [PATCH 130/334] mm-sparsemem-fix-mem_section-will-never-be-null-gcc-12-warning-v2 Link: https://lkml.kernel.org/r/20220202003550.698768-1-longman@redhat.com Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Fixes: 3e347261a80b ("sparsemem extreme implementation") Reported-by: Justin Forbes Signed-off-by: Waiman Long Cc: Ingo Molnar Cc: Kirill A. 
Shutemov Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bd1b19925f3b2..08517376c7658 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1390,11 +1390,9 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms) static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME - if (!*mem_section) + if (!*mem_section || !mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; #endif - if (!mem_section[SECTION_NR_TO_ROOT(nr)]) - return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); From 393dff331c41ab2e0e6cf4d7eeb5efa092e806b4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:14 +1100 Subject: [PATCH 131/334] mm/vmalloc: remove unneeded function forward declaration The forward declaration for lazy_max_pages() is unnecessary. Remove it. 
Link: https://lkml.kernel.org/r/20220124133752.60663-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4ad2275fde825..0e7f36511d7b7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -791,7 +791,6 @@ RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); -static unsigned long lazy_max_pages(void); static atomic_long_t nr_vmalloc_pages; From ae377d7c8649db0bf9cac913f9799285590be38b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 16 Feb 2022 15:31:14 +1100 Subject: [PATCH 132/334] mm/vmalloc: Move draining areas out of caller context A caller initiates the drain process from its context once the drain threshold is reached or passed. There are at least two drawbacks of doing so: a) a caller can be a high-prio or RT task. In that case it can get stuck doing the actual drain of all lazily freed areas. This is not optimal because such tasks usually are latency sensitive where the control should be returned back as soon as possible in order to drive such workloads in time. See 96e2db456135 ("mm/vmalloc: rework the drain logic") b) It is not safe to call vfree() while holding a spinlock due to the vmap_purge_lock mutex. There was a report about this from Zeal Robot here: https://lore.kernel.org/all/20211222081026.484058-1-chi.minghao@zte.com.cn Moving the drain to the separate work context addresses those issues. v1->v2: - Added suffix "_work" to the drain worker function. v2->v3: - Remove the drain_vmap_work_in_progress. Extra queuing is expected under heavy load but it can be disregarded because a work will bail out if nothing to be done.
Link: https://lkml.kernel.org/r/20220131144058.35608-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Uladzislau Rezki Cc: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0e7f36511d7b7..25dfcf1405699 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -791,6 +791,8 @@ RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static void drain_vmap_area_work(struct work_struct *work); +static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; @@ -1721,18 +1723,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) return true; } -/* - * Kick off a purge of the outstanding lazy areas. Don't bother if somebody - * is already purging. - */ -static void try_purge_vmap_area_lazy(void) -{ - if (mutex_trylock(&vmap_purge_lock)) { - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - } -} - /* * Kick off a purge of the outstanding lazy areas. */ @@ -1744,6 +1734,20 @@ static void purge_vmap_area_lazy(void) mutex_unlock(&vmap_purge_lock); } +static void drain_vmap_area_work(struct work_struct *work) +{ + unsigned long nr_lazy; + + do { + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + + /* Recheck if further work is required. 
*/ + nr_lazy = atomic_long_read(&vmap_lazy_nr); + } while (nr_lazy > lazy_max_pages()); +} + /* * Free a vmap area, caller ensuring that the area has been unmapped * and flush_cache_vunmap had been called for the correct range @@ -1770,7 +1774,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) - try_purge_vmap_area_lazy(); + schedule_work(&drain_vmap_work); } /* From cfd0720b9d718a0d7abe87fe67835c7aa43b9b08 Mon Sep 17 00:00:00 2001 From: Uladzislau Rezki Date: Wed, 16 Feb 2022 15:31:15 +1100 Subject: [PATCH 133/334] mm/vmalloc: add adjust_search_size parameter Extend the find_vmap_lowest_match() function with one more parameter. It is "adjust_search_size" boolean variable, so it is possible to control an accuracy of search block if a specific alignment is required. With this patch, a search size is always adjusted, to serve a request as fast as possible because of performance reason. But there is one exception though, it is short ranges where requested size corresponds to passed vstart/vend restriction together with a specific alignment request. In such scenario an adjustment wold not lead to success allocation. Link: https://lkml.kernel.org/r/20220119143540.601149-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 25dfcf1405699..8aa4c57ef357e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1193,22 +1193,28 @@ is_within_this_va(struct vmap_area *va, unsigned long size, /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing - * parameters. 
+ * parameters. Please note, with an alignment bigger than PAGE_SIZE, + * a search length is adjusted to account for worst case alignment + * overhead. */ static __always_inline struct vmap_area * -find_vmap_lowest_match(unsigned long size, - unsigned long align, unsigned long vstart) +find_vmap_lowest_match(unsigned long size, unsigned long align, + unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; + unsigned long length; /* Start from the root. */ node = free_vmap_area_root.rb_node; + /* Adjust the search size for alignment overhead. */ + length = adjust_search_size ? size + align - 1 : size; + while (node) { va = rb_entry(node, struct vmap_area, rb_node); - if (get_subtree_max_size(node->rb_left) >= size && + if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { @@ -1218,9 +1224,9 @@ find_vmap_lowest_match(unsigned long size, /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is - * equal or bigger to the requested search size. + * equal or bigger to the requested search length. */ - if (get_subtree_max_size(node->rb_right) >= size) { + if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } @@ -1236,7 +1242,7 @@ find_vmap_lowest_match(unsigned long size, if (is_within_this_va(va, size, align, vstart)) return va; - if (get_subtree_max_size(node->rb_right) >= size && + if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. 
Please note, we update it with @@ -1284,7 +1290,7 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart); + va_1 = find_vmap_lowest_match(size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(size, align, vstart); if (va_1 != va_2) @@ -1435,12 +1441,25 @@ static __always_inline unsigned long __alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { + bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; enum fit_type type; int ret; - va = find_vmap_lowest_match(size, align, vstart); + /* + * Do not adjust when: + * a) align <= PAGE_SIZE, because it does not make any sense. + * All blocks(their start addresses) are at least PAGE_SIZE + * aligned anyway; + * b) a short range where a requested size corresponds to exactly + * specified [vstart:vend] interval and an alignment > PAGE_SIZE. + * With adjusted search length an allocation would not succeed. + */ + if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) + adjust_search_size = false; + + va = find_vmap_lowest_match(size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; From 4aef14c30b62fe47fc452b386bdb26a50e1fa1b3 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 16 Feb 2022 15:31:15 +1100 Subject: [PATCH 134/334] mm/vmalloc: eliminate an extra orig_gfp_mask That extra variable has been introduced just for keeping an original passed gfp_mask because it is updated with __GFP_NOWARN on entry, thus error handling messages were broken. Instead we can keep an original gfp_mask without modifying it and add an extra __GFP_NOWARN flag together with gfp_mask as a parameter to the vm_area_alloc_pages() function. It will make it less confused. 
Link: https://lkml.kernel.org/r/20220119143540.601149-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Vasily Averin Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8aa4c57ef357e..5229488fc3c47 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2965,7 +2965,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t orig_gfp_mask = gfp_mask; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); @@ -2989,7 +2988,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); @@ -2999,8 +2998,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - area->nr_pages = vm_area_alloc_pages(gfp_mask, node, - page_order, nr_small_pages, area->pages); + area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { @@ -3016,7 +3015,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via __vfree() if any. 
*/ if (area->nr_pages != nr_small_pages) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; @@ -3044,7 +3043,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, memalloc_noio_restore(flags); if (ret < 0) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; From 41a33a6c9cfee5bb5da5f6cc146e9f4dc2e40074 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 16 Feb 2022 15:31:15 +1100 Subject: [PATCH 135/334] mm/vmalloc.c: fix "unused function" warning compute_subtree_max_size() is unused, when building with DEBUG_AUGMENT_PROPAGATE_CHECK=y. mm/vmalloc.c:785:1: warning: unused function 'compute_subtree_max_size' [-Wunused-function]. Link: https://lkml.kernel.org/r/20220129034652.75359-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jiapeng Chong Reported-by: Abaci Robot Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5229488fc3c47..e163372d39679 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -775,17 +775,6 @@ get_subtree_max_size(struct rb_node *node) return va ? va->subtree_max_size : 0; } -/* - * Gets called when remove the node and rotate. - */ -static __always_inline unsigned long -compute_subtree_max_size(struct vmap_area *va) -{ - return max3(va_size(va), - get_subtree_max_size(va->rb_node.rb_left), - get_subtree_max_size(va->rb_node.rb_right)); -} - RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) @@ -977,6 +966,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root) } #if DEBUG_AUGMENT_PROPAGATE_CHECK +/* + * Gets called when remove the node and rotate. 
+ */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) +{ + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); +} + static void augment_tree_propagate_check(void) { From fc1749521c8c63363172be68f5f14ea36464be5e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 16 Feb 2022 15:31:15 +1100 Subject: [PATCH 136/334] mm/vmalloc.c: vmap(): don't allow invalid pages vmap() takes struct page *pages as one of arguments, and user may provide an invalid pointer which would lead to data abort at address translation later. Currently, kernel checks the pages against NULL. In my case, however, the address was not NULL, and was big enough so that the hardware generated Address Size Abort on arm64. Interestingly, this abort happens even if copy_from_kernel_nofault() is used, which is quite inconvenient for debugging purposes. This patch adds a pfn_valid() check into vmap() path, so that invalid mapping will not be created. 
Link: https://lkml.kernel.org/r/20220119012109.551931-1-yury.norov@gmail.com Signed-off-by: Yury Norov Suggested-by: Matthew Wilcox (Oracle) Cc: Catalin Marinas Cc: Will Deacon Cc: Nicholas Piggin Cc: Ding Tianhong Cc: Anshuman Khandual Cc: Matthew Wilcox Cc: Alexey Klimov Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e163372d39679..b454cf1a261f9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -478,6 +478,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); From 4833404b8ae71681ac6b540e619b3fefde36bf0b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 16 Feb 2022 15:31:15 +1100 Subject: [PATCH 137/334] mm: page_alloc: avoid merging non-fallbackable pageblocks with others This is done in addition to MIGRATE_ISOLATE pageblock merge avoidance. It prepares for the upcoming removal of the MAX_ORDER-1 alignment requirement for CMA and alloc_contig_range(). MIGRATE_HIGHATOMIC should not merge with other migratetypes like MIGRATE_ISOLATE and MIGRATE_CMA[1], so this commit prevents that too. Remove MIGRATE_CMA and MIGRATE_ISOLATE from fallbacks list, since they are never used.
[1] https://lore.kernel.org/linux-mm/20211130100853.GP3366@techsingularity.net/ Link: https://lkml.kernel.org/r/20220124175957.1261961-1-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: Mel Gorman Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 11 +++++++++++ mm/page_alloc.c | 44 ++++++++++++++++++++---------------------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 08517376c7658..c15f58ad5f338 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,6 +83,17 @@ static inline bool is_migrate_movable(int mt) return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } +/* + * Check whether a migratetype can be merged with another migratetype. + * + * It is only mergeable when it can fall back to other migratetypes for + * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. + */ +static inline bool migratetype_is_mergeable(int mt) +{ + return mt < MIGRATE_PCPTYPES; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 368c6c5bf42a9..b72898d79e618 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1099,25 +1099,24 @@ static inline void __free_one_page(struct page *page, } if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. - * We want to prevent merge between freepages on isolate - * pageblock and normal pageblock. Without this, pageblock - * isolation could cause incorrect freepage or CMA accounting. + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. 
* * We don't want to hit this code for the more frequent * low-order merging. */ - if (unlikely(has_isolate_pageblock(zone))) { - int buddy_mt; + int buddy_mt; - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - buddy_mt = get_pageblock_migratetype(buddy); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + buddy_mt = get_pageblock_migratetype(buddy); - if (migratetype != buddy_mt - && (is_migrate_isolate(migratetype) || - is_migrate_isolate(buddy_mt))) - goto done_merging; - } + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; max_order = order + 1; goto continue_merging; } @@ -2535,17 +2534,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted + * + * The other migratetypes do not have fallbacks. */ static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, -#ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ -#endif }; #ifdef CONFIG_CMA @@ -2851,8 +2846,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! 
*/ mt = get_pageblock_migratetype(page); - if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) - && !is_migrate_cma(mt)) { + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); @@ -3601,8 +3596,11 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && !is_migrate_highatomic(mt)) + /* + * Only change normal pageblocks (i.e., they can merge + * with others) + */ + if (migratetype_is_mergeable(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } From 807fb5116c0eabd54d93095b158b73994ec11e68 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 16 Feb 2022 15:31:16 +1100 Subject: [PATCH 138/334] mm/page_alloc: adding same penalty is enough to get round-robin order To make node order in round-robin in the same distance group, we add a penalty to the first node we got in each round. To get a round-robin order in the same distance group, we don't need to decrease the penalty since: * find_next_best_node() always iterates node in the same order * distance matters more then penalty in find_next_best_node() * in nodes with the same distance, the first one would be picked up So it is fine to increase same penalty when we get the first node in the same distance group. 
Link: https://lkml.kernel.org/r/20220123013537.20491-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Krupa Ramakrishnan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b72898d79e618..ea5202e672774 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6319,13 +6319,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; - int node, load, nr_nodes = 0; + int node, nr_nodes = 0; nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = nr_online_nodes; prev_node = local_node; memset(node_order, 0, sizeof(node_order)); @@ -6337,11 +6336,10 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] += load; + node_load[node] += nr_online_nodes; node_order[nr_nodes++] = node; prev_node = node; - load--; } build_zonelists_in_node_order(pgdat, node_order, nr_nodes); From 72eca1089b93ceab69dd7e6d06b1b2cb35c5edff Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 16 Feb 2022 15:31:16 +1100 Subject: [PATCH 139/334] mm/page_alloc: add penalty to local_node Commit 54d032ced983 ("mm/page_alloc: use accumulated load when building node fallback list") fixed a bug on zonelist order. This made me think about what would happen if we have a node system with the following distance matrix. 
Node 0 1 2 3 4 5 6 7 ---------------------------- 0 10 12 12 12 32 32 32 32 1 12 10 12 12 32 32 32 32 2 12 12 10 12 32 32 32 32 3 12 12 12 10 32 32 32 32 4 32 32 32 32 10 12 12 12 5 32 32 32 32 12 10 12 12 6 32 32 32 32 12 12 10 12 7 32 32 32 32 12 12 12 10 Unfortunately for this case, the node fallback list gets built like this: Node Fallback list --------------------- 0: 0 1 2 3 4 5 6 7 1: 1 0 2 3 5 6 7 4 2: 2 3 0 1 6 7 4 5 3: 3 2 0 1 7 4 5 6 4: 4 5 6 7 0 1 2 3 5: 5 4 6 7 1 2 3 0 6: 6 7 4 5 2 3 0 1 7: 7 6 4 5 3 0 1 2 We found the order in the diagonal blocks is not as expected. The reason is we don't penalize the local node. After penalizing the local node, the node fallback list gets built like this: Node Fallback list --------------------- 0: 0 1 2 3 4 5 6 7 1: 1 2 3 0 5 6 7 4 2: 2 3 0 1 6 7 4 5 3: 3 0 1 2 7 4 5 6 4: 4 5 6 7 0 1 2 3 5: 5 6 7 4 1 2 3 0 6: 6 7 4 5 2 3 0 1 7: 7 4 5 6 3 0 1 2 Now the fallback list is in round-robin order. I am not very familiar with the node distance pattern, but I tried the following distance matrices. Both of them work with this change.
Node 0 1 2 3 ---------------- 0 10 10 10 10 1 10 10 10 10 2 10 10 10 10 3 10 10 10 10 Node 0 1 2 3 4 5 6 7 ---------------------------- 0 10 10 10 10 32 32 32 32 1 10 10 10 10 32 32 32 32 2 10 10 10 10 32 32 32 32 3 10 10 10 10 32 32 32 32 4 32 32 32 32 10 10 10 10 5 32 32 32 32 10 10 10 10 6 32 32 32 32 10 10 10 10 7 32 32 32 32 10 10 10 10 Link: https://lkml.kernel.org/r/20220123013537.20491-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Krupa Ramakrishnan Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea5202e672774..229ea7f317325 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6334,8 +6334,9 @@ static void build_zonelists(pg_data_t *pgdat) * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + if ((node_distance(local_node, node) != + node_distance(local_node, prev_node)) || + node == local_node) node_load[node] += nr_online_nodes; node_order[nr_nodes++] = node; From be309555a716fda11d3491b67775f515f61c9bbe Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 16 Feb 2022 15:31:16 +1100 Subject: [PATCH 140/334] mm/mmzone.c: use try_cmpxchg() in page_cpupid_xchg_last() This will let us avoid an additional read from page->flags when retrying the compare-exchange on some architectures. 
Link: https://lkml.kernel.org/r/20220120011200.1322836-1-pcc@google.com Link: https://linux-review.googlesource.com/id/I2e1f5b5b080ac9c4e0eb7f98768dba6fd7821693 Signed-off-by: Peter Collingbourne Suggested-by: Peter Zijlstra Cc: Andrey Konovalov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mmzone.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/mmzone.c b/mm/mmzone.c index eb89d6e018e29..d8a9b0e1b5267 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -89,13 +89,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid) unsigned long old_flags, flags; int last_cpupid; + old_flags = READ_ONCE(page->flags); do { - old_flags = flags = page->flags; - last_cpupid = page_cpupid_last(page); + flags = old_flags; + last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); return last_cpupid; } From c79da81ae59ac23a2f65f9288b7543826b60be03 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2022 15:31:16 +1100 Subject: [PATCH 141/334] mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. 
It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. This patch: - removes __GFP_ATOMIC - causes __GFP_HIGH to set ALLOC_HARDER unless __GFP_NOMEMALLOC is set (as well as ALLOC_HIGH). - makes other adjustments as suggested by the above. The net result is no change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocations are more likely to succeed quickly. Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Signed-off-by: NeilBrown Reviewed-by: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Thierry Reding Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/vm/balance.rst | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- include/linux/gfp.h | 12 ++++-------- include/trace/events/mmflags.h | 1 - lib/test_printf.c | 8 ++++---- mm/internal.h | 2 +- mm/page_alloc.c | 16 ++++------------ tools/perf/builtin-kmem.c | 1 - tools/testing/radix-tree/linux/gfp.h | 3 +-- 9 files changed, 17 insertions(+), 32 deletions(-) diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst index 6a1fadf3e1735..e38e9d83c1c72 100644 --- a/Documentation/vm/balance.rst +++ b/Documentation/vm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations.
The first reason why a caller may avoid reclaim is that the caller can not diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e900e3c46903b..c5fa8b8673b6a 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -676,12 +676,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7797c915ce54c..6eef3e4475401 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,7 +39,7 @@ struct vm_area_struct; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -124,11 +124,8 @@ struct vm_area_struct; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -143,7 +140,6 @@ struct vm_area_struct; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. 
* This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -339,7 +335,7 @@ struct vm_area_struct; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 6532119a6bf1a..0698c5d0f1947 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -29,7 +29,6 @@ {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ - {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ diff --git a/lib/test_printf.c b/lib/test_printf.c index 07309c45f3279..8010de49b6c5d 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -673,17 +673,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= 
__GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index 9c298afb96884..927dfba5111f2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 229ea7f317325..343724c57a2cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4002,12 +4002,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4735,12 +4735,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. 
*/ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -4933,14 +4933,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - retry_cpuset: compaction_retries = 0; no_progress_loops = 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 99d7ff9a8effe..e5b38d0b08fb5 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -640,7 +640,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index 32159c08a52e5..0a0741104dfeb 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h @@ -12,7 +12,6 @@ #define __GFP_FS 0x80u #define __GFP_NOWARN 0x200u #define __GFP_ZERO 0x8000u -#define __GFP_ATOMIC 0x80000u #define __GFP_ACCOUNT 0x100000u #define __GFP_DIRECT_RECLAIM 0x400000u #define __GFP_KSWAPD_RECLAIM 0x2000000u @@ -20,7 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) #define GFP_ZONEMASK 0x0fu -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) From 673be4d40e751532dd4e67292f7f97faa3016cdf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:17 +1100 
Subject: [PATCH 142/334] mm/mmzone.h: remove unused macros Remove pgdat_page_nr, nid_page_nr and NODE_MEM_MAP. They are unused now. Link: https://lkml.kernel.org/r/20220127093210.62293-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c15f58ad5f338..0ac8ef50cea38 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -931,12 +931,6 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) -#ifdef CONFIG_FLATMEM -#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) -#else -#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) -#endif -#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) @@ -1112,7 +1106,6 @@ static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } -#define NODE_MEM_MAP(nid) mem_map #else /* CONFIG_NUMA */ From f8edcc2a03bba7c09ec272134b82e32a6f232e25 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 16 Feb 2022 15:31:17 +1100 Subject: [PATCH 143/334] mm/page_alloc: don't pass pfn to free_unref_page_commit() free_unref_page_commit() doesn't make use of its pfn argument, so get rid of it. 
Link: https://lkml.kernel.org/r/20220202140451.415928-1-nsaenzju@redhat.com Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 343724c57a2cf..b40a63353ed33 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3422,8 +3422,8 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype, unsigned int order) +static void free_unref_page_commit(struct page *page, int migratetype, + unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -3472,7 +3472,7 @@ void free_unref_page(struct page *page, unsigned int order) } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype, order); + free_unref_page_commit(page, migratetype, order); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3482,13 +3482,13 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; - unsigned long flags, pfn; + unsigned long flags; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - pfn = page_to_pfn(page); + unsigned long pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); continue; @@ -3504,15 +3504,10 @@ void free_unref_page_list(struct list_head *list) free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); continue; } - - set_page_private(page, pfn); } local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - pfn = page_private(page); - set_page_private(page, 0); - /* * 
Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. @@ -3522,7 +3517,7 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype, 0); + free_unref_page_commit(page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get From f7242481486b005f28940a5b9baa3aada53b634a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:17 +1100 Subject: [PATCH 144/334] cma: factor out minimum alignment requirement Patch series "mm: enforce pageblock_order < MAX_ORDER". Having pageblock_order >= MAX_ORDER seems to be able to happen in corner cases and some parts of the kernel are not prepared for it. For example, Aneesh has shown [1] that such kernels can be compiled on ppc64 with 64k base pages by setting FORCE_MAX_ZONEORDER=8, which will run into a WARN_ON_ONCE(order >= MAX_ORDER) in compaction code right during boot. We can get pageblock_order >= MAX_ORDER when the default hugetlb size is bigger than the maximum allocation granularity of the buddy, in which case we are no longer talking about huge pages but instead gigantic pages. Having pageblock_order >= MAX_ORDER can only make alloc_contig_range() of such gigantic pages more likely to succeed. Reliable use of gigantic pages either requires boot time allocation or CMA, no need to overcomplicate some places in the kernel to optimize for corner cases that are broken in other areas of the kernel. This patch (of 2): Let's enforce pageblock_order < MAX_ORDER and simplify. Especially patch #1 can be regarded as a cleanup before: [PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range alignment.
[2] [1] https://lkml.kernel.org/r/87r189a2ks.fsf@linux.ibm.com [2] https://lkml.kernel.org/r/20220211164135.1803616-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20220214174132.219303-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Rob Herring Cc: Frank Rowand Cc: Michael S. Tsirkin Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Minchan Kim Cc: Vlastimil Babka Cc: John Garry via iommu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/include/asm/fadump-internal.h | 5 ----- arch/powerpc/kernel/fadump.c | 2 +- drivers/of/of_reserved_mem.c | 9 +++------ include/linux/cma.h | 9 +++++++++ kernel/dma/contiguous.c | 4 +--- mm/cma.c | 20 +++++--------------- 6 files changed, 19 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 52189928ec081..81bcb9abb3718 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -19,11 +19,6 @@ #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) -/* Alignment per CMA requirement. 
*/ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, \ - pageblock_order)) - /* FAD commands */ #define FADUMP_REGISTER 1 #define FADUMP_UNREGISTER 2 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d03e488cfe9ca..7eb67201ea415 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -544,7 +544,7 @@ int __init fadump_reserve_mem(void) if (!fw_dump.nocma) { fw_dump.boot_memory_size = ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + CMA_MIN_ALIGNMENT_BYTES); } #endif diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 9c0fb962c22b0..75caa6f5d36f4 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "of_private.h" @@ -116,12 +117,8 @@ static int __init __reserved_mem_alloc_size(unsigned long node, if (IS_ENABLED(CONFIG_CMA) && of_flat_dt_is_compatible(node, "shared-dma-pool") && of_get_flat_dt_prop(node, "reusable", NULL) - && !nomap) { - unsigned long order = - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - - align = max(align, (phys_addr_t)PAGE_SIZE << order); - } + && !nomap) + align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES); prop = of_get_flat_dt_prop(node, "alloc-ranges", &len); if (prop) { diff --git a/include/linux/cma.h b/include/linux/cma.h index bd801023504b2..75fe188ec4a1c 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -20,6 +20,15 @@ #define CMA_MAX_NAME 64 +/* + * TODO: once the buddy -- especially pageblock merging and alloc_contig_range() + * -- can deal with only some pageblocks of a higher-order page being + * MIGRATE_CMA, we can use pageblock_nr_pages. 
+ */ +#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \ + pageblock_nr_pages) +#define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) + struct cma; extern unsigned long totalcma_pages; diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5cf..6ea80ae426228 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = { static int __init rmem_cma_setup(struct reserved_mem *rmem) { - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; @@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if ((rmem->base & mask) || (rmem->size & mask)) { + if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } diff --git a/mm/cma.c b/mm/cma.c index bc9ca8f3c4871..5a2cd58516589 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -168,7 +168,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma) { struct cma *cma; - phys_addr_t alignment; /* Sanity checks */ if (cma_area_count == ARRAY_SIZE(cma_areas)) { @@ -179,15 +178,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment required by mm core */ - alignment = PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - /* alignment should be aligned with order_per_bit */ - if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit)) return -EINVAL; - if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + /* 
ensure minimal alignment required by mm core */ + if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; /* @@ -262,14 +258,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; - /* - * Sanitise input arguments. - * Pages both ends in CMA area could be merged into adjacent unmovable - * migratetype page by page allocator's buddy algorithm. In the case, - * you couldn't get a contiguous memory, which is not what we want. - */ - alignment = max(alignment, (phys_addr_t)PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + /* Sanitise input arguments. */ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { ret = -EINVAL; pr_err("Region at %pa must be aligned to %pa bytes\n", From 2a034a7529c30785c81e672e1af5f8aabafe6932 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:17 +1100 Subject: [PATCH 145/334] mm: enforce pageblock_order < MAX_ORDER Some places in the kernel don't really expect pageblock_order >= MAX_ORDER, and it looks like this is only possible in corner cases: 1) CONFIG_DEFERRED_STRUCT_PAGE_INIT we'll end up freeing pageblock_order pages via __free_pages_core(), which cannot possibly work. 2) find_zone_movable_pfns_for_nodes() will roundup the ZONE_MOVABLE start PFN to MAX_ORDER_NR_PAGES. Consequently with a bigger pageblock_order, we could have a single pageblock partially managed by two zones. 3) compaction code runs into __fragmentation_index() with order >= MAX_ORDER, when checking WARN_ON_ONCE(order >= MAX_ORDER). [1] 4) mm/page_reporting.c won't be reporting any pages with default page_reporting_order == pageblock_order, as we'll be skipping the reporting loop inside page_reporting_process_zone(). 5) __rmqueue_fallback() will never be able to steal with ALLOC_NOFRAGMENT. 
pageblock_order >= MAX_ORDER is weird either way: it's a pure optimization for making alloc_contig_range(), as used for allocation of gigantic pages, a little more reliable to succeed. However, if there is demand for somewhat reliable allocation of gigantic pages, affected setups should be using CMA or boottime allocations instead. So let's make sure that pageblock_order < MAX_ORDER and simplify. [1] https://lkml.kernel.org/r/87r189a2ks.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20220214174132.219303-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Christoph Hellwig Cc: Frank Rowand Cc: John Garry via iommu Cc: Marek Szyprowski Cc: Michael Ellerman Cc: Michael S. Tsirkin Cc: Minchan Kim Cc: Paul Mackerras Cc: Rob Herring Cc: Robin Murphy Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/virtio/virtio_mem.c | 9 +++------ include/linux/cma.h | 3 +-- include/linux/pageblock-flags.h | 7 +++++-- mm/Kconfig | 3 +++ mm/page_alloc.c | 32 ++++++++------------------------ 5 files changed, 20 insertions(+), 34 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 38becd8d578c7..e7d6b679596d3 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2476,13 +2476,10 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm) VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); /* - * We want subblocks to span at least MAX_ORDER_NR_PAGES and - * pageblock_nr_pages pages. This: - * - Is required for now for alloc_contig_range() to work reliably - - * it doesn't properly handle smaller granularity on ZONE_NORMAL. + * TODO: once alloc_contig_range() works reliably with pageblock + * granularity on ZONE_NORMAL, use pageblock_nr_pages instead.
*/ - sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) * PAGE_SIZE; + sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES; sb_size = max_t(uint64_t, vm->device_block_size, sb_size); if (sb_size < memory_block_size_bytes() && !force_bbm) { diff --git a/include/linux/cma.h b/include/linux/cma.h index 75fe188ec4a1c..b1ba94f1cc9c5 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -25,8 +25,7 @@ * -- can deal with only some pageblocks of a higher-order page being * MIGRATE_CMA, we can use pageblock_nr_pages. */ -#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \ - pageblock_nr_pages) +#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) struct cma; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 973fd731a5206..83c7248053a1e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -37,8 +37,11 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -/* Huge pages are a constant size */ -#define pageblock_order HUGETLB_PAGE_ORDER +/* + * Huge pages are a constant size, but don't exceed the maximum allocation + * granularity. + */ +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ diff --git a/mm/Kconfig b/mm/Kconfig index 257ed9c86de34..0ac5dbad9ed07 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,6 +262,9 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. + Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be + clamped down to MAX_ORDER - 1. 
+ config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b40a63353ed33..4f1ce52985c8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1054,14 +1054,12 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); + unsigned int max_order = pageblock_order; unsigned long buddy_pfn; unsigned long combined_pfn; - unsigned int max_order; struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); - VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -2262,19 +2260,8 @@ void __init init_cma_reserved_pageblock(struct page *page) } while (++p, --i); set_pageblock_migratetype(page, MIGRATE_CMA); - - if (pageblock_order >= MAX_ORDER) { - i = pageblock_nr_pages; - p = page; - do { - set_page_refcounted(p); - __free_pages(p, MAX_ORDER - 1); - p += MAX_ORDER_NR_PAGES; - } while (i -= MAX_ORDER_NR_PAGES); - } else { - set_page_refcounted(page); - __free_pages(page, pageblock_order); - } + set_page_refcounted(page); + __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; @@ -7429,16 +7416,15 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order; + unsigned int order = MAX_ORDER - 1; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; - if (HPAGE_SHIFT > PAGE_SHIFT) + /* Don't let pageblocks exceed the maximum allocation granularity. */ + if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) order = HUGETLB_PAGE_ORDER; - else - order = MAX_ORDER - 1; /* * Assume the largest contiguous order of interest is a huge page. 
@@ -9026,14 +9012,12 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, #ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { - return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) - 1); + return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES); } static unsigned long pfn_max_align_up(unsigned long pfn) { - return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages)); + return ALIGN(pfn, MAX_ORDER_NR_PAGES); } #if defined(CONFIG_DYNAMIC_DEBUG) || \ From ab7e803d908317d0c44bb491a817db79de773106 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 16 Feb 2022 15:31:17 +1100 Subject: [PATCH 146/334] mm/page_alloc: mark pagesets as __maybe_unused Commit 9983a9d577db ("locking/local_lock: Make the empty local_lock_*() function a macro.") in the -tip tree converted the local_lock_*() functions into macros, which causes a warning with clang with CONFIG_PREEMPT_RT=n + CONFIG_DEBUG_LOCK_ALLOC=n: mm/page_alloc.c:131:40: error: variable 'pagesets' is not needed and will not be emitted [-Werror,-Wunneeded-internal-declaration] static DEFINE_PER_CPU(struct pagesets, pagesets) = { ^ 1 error generated. Prior to that change, clang was not able to tell that pagesets was unused in this configuration because it does not perform cross function analysis in the frontend. After that change, it sees that the macros just do a typecheck on the lock member of pagesets, which is evaluated at compile time (so the variable is technically "used"), meaning the variable is not needed in the final assembly, as the warning states. Mark the variable as __maybe_unused to make it clear to clang that this is expected in this configuration so there is no more warning. 
Link: https://github.com/ClangBuiltLinux/linux/issues/1593 Link: https://lkml.kernel.org/r/20220215184322.440969-1-nathan@kernel.org Signed-off-by: Nathan Chancellor Suggested-by: Nick Desaulniers Reported-by: "kernelci.org bot" Cc: Sebastian Andrzej Siewior Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f1ce52985c8d..4f53d2d12f4cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -128,7 +128,7 @@ static DEFINE_MUTEX(pcp_batch_high_lock); struct pagesets { local_lock_t lock; }; -static DEFINE_PER_CPU(struct pagesets, pagesets) = { +static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = { .lock = INIT_LOCAL_LOCK(lock), }; From 8c8732ea39e2cc4ffa175ad1c69349637af16ff0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 16 Feb 2022 15:31:18 +1100 Subject: [PATCH 147/334] mm/page_alloc: fetch the correct pcp buddy during bulk free Patch series "Follow-up on high-order PCP caching". Commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") was primarily aimed at reducing the cost of SLUB cache refills of high-order pages in two ways. Firstly, zone lock acquisitions were reduced and secondly, there were fewer buddy list modifications. This is a follow-up series fixing some issues that became apparent after merging. Patch 1 is a functional fix. It's harmless but inefficient. Patches 2-4 reduce the overhead of bulk freeing of PCP pages. While the overhead is small, it's cumulative and noticeable when truncating large files. The changelog for patch 4 includes results of a microbench that deletes large sparse files with data in page cache. Sparse files were used to eliminate filesystem overhead. Patch 5 addresses issues with high-order PCP pages being stored on PCP lists for too long.
Pages freed on a CPU potentially may not be quickly reused and in some cases this can increase cache miss rates. Details are included in the changelog. This patch (of 5): free_pcppages_bulk() prefetches buddies about to be freed but the order must also be passed in as PCP lists store multiple orders. Link: https://lkml.kernel.org/r/20220215145111.27082-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20220215145111.27082-2-mgorman@techsingularity.net Fixes: 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") Signed-off-by: Mel Gorman Cc: Aaron Lu Cc: Dave Hansen Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f53d2d12f4cc..baea2203c3fb4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1432,10 +1432,10 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ -static inline void prefetch_buddy(struct page *page) +static inline void prefetch_buddy(struct page *page, unsigned int order) { unsigned long pfn = page_to_pfn(page); - unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); + unsigned long buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy = page + (buddy_pfn - pfn); prefetch(buddy); @@ -1512,7 +1512,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, * prefetch buddy for the first pcp->batch nr of pages. */ if (prefetch_nr) { - prefetch_buddy(page); + prefetch_buddy(page, order); prefetch_nr--; } } while (count > 0 && --batch_free && !list_empty(list)); From 09c0fc2b3db8e79d6e9b033f6a619d23903b0586 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 16 Feb 2022 15:31:18 +1100 Subject: [PATCH 148/334] mm/page_alloc: track range of active PCP lists during bulk free free_pcppages_bulk() frees pages in a round-robin fashion. 
Originally, this was dealing only with migratetypes but storing high-order pages means that there can be many more empty lists that are uselessly checked. Track the minimum and maximum active pindex to reduce the search space. Link: https://lkml.kernel.org/r/20220215145111.27082-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Aaron Lu Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index baea2203c3fb4..61cf0f1ca4ad3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1450,6 +1450,8 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { int pindex = 0; + int min_pindex = 0; + int max_pindex = NR_PCP_LISTS - 1; int batch_free = 0; int nr_freed = 0; unsigned int order; @@ -1478,10 +1480,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (++pindex == NR_PCP_LISTS) pindex = 0; list = &pcp->lists[pindex]; - } while (list_empty(list)); + if (!list_empty(list)) + break; + + if (pindex == max_pindex) + max_pindex--; + if (pindex == min_pindex) + min_pindex++; + } while (1); /* This is the only non-empty list. Free them all. */ - if (batch_free == NR_PCP_LISTS) + if (batch_free >= max_pindex - min_pindex) batch_free = count; order = pindex_to_order(pindex); From 45b27a3f3396e3c4a95d0ba068621c8d2e7ceb8d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 16 Feb 2022 15:31:18 +1100 Subject: [PATCH 149/334] mm/page_alloc: simplify how many pages are selected per pcp list during bulk free free_pcppages_bulk() selects pages to free by round-robining between lists. Originally this was to evenly shrink pages by migratetype but uneven freeing is inevitable due to high pages. 
Simplify list selection by starting with a list that definitely has pages on it in free_unref_page_commit() and for drain, it does not matter where draining starts as all pages are removed. Link: https://lkml.kernel.org/r/20220215145111.27082-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Aaron Lu Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61cf0f1ca4ad3..5a7520d466176 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1447,13 +1447,11 @@ static inline void prefetch_buddy(struct page *page, unsigned int order) * count is the number of pages to free. */ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct per_cpu_pages *pcp, + int pindex) { - int pindex = 0; int min_pindex = 0; int max_pindex = NR_PCP_LISTS - 1; - int batch_free = 0; - int nr_freed = 0; unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; @@ -1467,16 +1465,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, count = min(pcp->count, count); while (count > 0) { struct list_head *list; + int nr_pages; - /* - * Remove pages from lists in a round-robin fashion. A - * batch_free count is maintained that is incremented when an - * empty list is encountered. This is so more pages are freed - * off fuller lists instead of spinning excessively around empty - * lists - */ + /* Remove pages from lists in a round-robin fashion. */ do { - batch_free++; if (++pindex == NR_PCP_LISTS) pindex = 0; list = &pcp->lists[pindex]; @@ -1489,18 +1481,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, min_pindex++; } while (1); - /* This is the only non-empty list. Free them all. 
*/ - if (batch_free >= max_pindex - min_pindex) - batch_free = count; - order = pindex_to_order(pindex); + nr_pages = 1 << order; BUILD_BUG_ON(MAX_ORDER >= (1<lru); - nr_freed += 1 << order; - count -= 1 << order; + count -= nr_pages; + pcp->count -= nr_pages; if (bulkfree_pcp_prepare(page)) continue; @@ -1524,9 +1513,8 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page, order); prefetch_nr--; } - } while (count > 0 && --batch_free && !list_empty(list)); + } while (count > 0 && !list_empty(list)); } - pcp->count -= nr_freed; /* * local_lock_irq held so equivalent to spin_lock_irqsave for @@ -3133,7 +3121,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) - free_pcppages_bulk(zone, to_drain, pcp); + free_pcppages_bulk(zone, to_drain, pcp, 0); local_unlock_irqrestore(&pagesets.lock, flags); } #endif @@ -3154,7 +3142,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp); + free_pcppages_bulk(zone, pcp->count, pcp, 0); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3435,7 +3423,7 @@ static void free_unref_page_commit(struct page *page, int migratetype, if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp, pindex); } } From af2da3ea0948ef39ee6de7edd6046226849188ea Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 16 Feb 2022 15:31:18 +1100 Subject: [PATCH 150/334] mm/page_alloc: free pages in a single pass during bulk free free_pcppages_bulk() has taken two passes through the pcp lists since commit 0a5f4e5b4562 ("mm/free_pcppages_bulk: do not hold lock when picking pages to free") due to deferring the cost of selecting PCP lists until the zone lock is held. 
Now that list selection is simpler, the main cost during selection is bulkfree_pcp_prepare() which in the normal case is a simple check and prefetching. As the list manipulations have cost in itself, go back to freeing pages in a single pass. The series up to this point was evaulated using a trunc microbenchmark that is truncating sparse files stored in page cache (mmtests config config-io-trunc). Sparse files were used to limit filesystem interaction. The results versus a revert of storing high-order pages in the PCP lists is 1-socket Skylake 5.17.0-rc3 5.17.0-rc3 5.17.0-rc3 vanilla mm-reverthighpcp-v1r1 mm-highpcpopt-v1 Min elapsed 540.00 ( 0.00%) 530.00 ( 1.85%) 530.00 ( 1.85%) Amean elapsed 543.00 ( 0.00%) 530.00 * 2.39%* 530.00 * 2.39%* Stddev elapsed 4.83 ( 0.00%) 0.00 ( 100.00%) 0.00 ( 100.00%) CoeffVar elapsed 0.89 ( 0.00%) 0.00 ( 100.00%) 0.00 ( 100.00%) Max elapsed 550.00 ( 0.00%) 530.00 ( 3.64%) 530.00 ( 3.64%) BAmean-50 elapsed 540.00 ( 0.00%) 530.00 ( 1.85%) 530.00 ( 1.85%) BAmean-95 elapsed 542.22 ( 0.00%) 530.00 ( 2.25%) 530.00 ( 2.25%) BAmean-99 elapsed 542.22 ( 0.00%) 530.00 ( 2.25%) 530.00 ( 2.25%) 2-socket CascadeLake 5.17.0-rc3 5.17.0-rc3 5.17.0-rc3 vanilla mm-reverthighpcp-v1 mm-highpcpopt-v1 Min elapsed 510.00 ( 0.00%) 500.00 ( 1.96%) 500.00 ( 1.96%) Amean elapsed 529.00 ( 0.00%) 521.00 ( 1.51%) 516.00 * 2.46%* Stddev elapsed 16.63 ( 0.00%) 12.87 ( 22.64%) 9.66 ( 41.92%) CoeffVar elapsed 3.14 ( 0.00%) 2.47 ( 21.46%) 1.87 ( 40.45%) Max elapsed 550.00 ( 0.00%) 540.00 ( 1.82%) 530.00 ( 3.64%) BAmean-50 elapsed 516.00 ( 0.00%) 512.00 ( 0.78%) 510.00 ( 1.16%) BAmean-95 elapsed 526.67 ( 0.00%) 518.89 ( 1.48%) 514.44 ( 2.32%) BAmean-99 elapsed 526.67 ( 0.00%) 518.89 ( 1.48%) 514.44 ( 2.32%) The original motivation for multi-passes was will-it-scale page_fault1 using $nr_cpu processes. 
2-socket CascadeLake (40 cores, 80 CPUs HT enabled) 5.17.0-rc3 5.17.0-rc3 vanilla mm-highpcpopt-v1r4 Hmean page_fault1-processes-2 2694662.26 ( 0.00%) 2696801.07 ( 0.08%) Hmean page_fault1-processes-5 6425819.34 ( 0.00%) 6426573.21 ( 0.01%) Hmean page_fault1-processes-8 9642169.10 ( 0.00%) 9647444.94 ( 0.05%) Hmean page_fault1-processes-12 12167502.10 ( 0.00%) 12073323.10 * -0.77%* Hmean page_fault1-processes-21 15636859.03 ( 0.00%) 15587449.50 * -0.32%* Hmean page_fault1-processes-30 25157348.61 ( 0.00%) 25111707.15 * -0.18%* Hmean page_fault1-processes-48 27694013.85 ( 0.00%) 27728568.63 ( 0.12%) Hmean page_fault1-processes-79 25928742.64 ( 0.00%) 25920933.41 ( -0.03%) <--- Hmean page_fault1-processes-110 25730869.75 ( 0.00%) 25695727.57 * -0.14%* Hmean page_fault1-processes-141 25626992.42 ( 0.00%) 25675346.68 * 0.19%* Hmean page_fault1-processes-172 25611651.35 ( 0.00%) 25650940.14 * 0.15%* Hmean page_fault1-processes-203 25577298.75 ( 0.00%) 25584848.65 ( 0.03%) Hmean page_fault1-processes-234 25580686.07 ( 0.00%) 25601794.52 * 0.08%* Hmean page_fault1-processes-265 25570215.47 ( 0.00%) 25553191.25 ( -0.07%) Hmean page_fault1-processes-296 25549488.62 ( 0.00%) 25530311.58 ( -0.08%) Hmean page_fault1-processes-320 25555149.05 ( 0.00%) 25585059.83 ( 0.12%) The differences are mostly within the noise and the difference close to $nr_cpus is negligible. 
Link: https://lkml.kernel.org/r/20220215145111.27082-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Aaron Lu Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 57 +++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a7520d466176..2974f3340e7c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1455,14 +1455,21 @@ static void free_pcppages_bulk(struct zone *zone, int count, unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; - struct page *page, *tmp; - LIST_HEAD(head); + struct page *page; /* * Ensure proper count is passed which otherwise would stuck in the * below while (list_empty(list)) loop. */ count = min(pcp->count, count); + + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. 
+ */ + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + while (count > 0) { struct list_head *list; int nr_pages; @@ -1485,7 +1492,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, nr_pages = 1 << order; BUILD_BUG_ON(MAX_ORDER >= (1<lru); count -= nr_pages; @@ -1494,12 +1505,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (bulkfree_pcp_prepare(page)) continue; - /* Encode order with the migratetype */ - page->index <<= NR_PCP_ORDER_WIDTH; - page->index |= order; - - list_add_tail(&page->lru, &head); - /* * We are going to put the page back to the global * pool, prefetch its buddy to speed up later access @@ -1513,36 +1518,18 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page, order); prefetch_nr--; } - } while (count > 0 && !list_empty(list)); - } - - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); - - /* - * Use safe version since after __free_one_page(), - * page->lru.next will not point to original list. 
- */ - list_for_each_entry_safe(page, tmp, &head, lru) { - int mt = get_pcppage_migratetype(page); - /* mt has been encoded with the order (see above) */ - order = mt & NR_PCP_ORDER_MASK; - mt >>= NR_PCP_ORDER_WIDTH; + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, order, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); } + spin_unlock(&zone->lock); } From 9dfc6caa212de73039edd34411653208f8f2332f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 16 Feb 2022 15:31:18 +1100 Subject: [PATCH 151/334] mm/page_alloc: limit number of high-order pages on PCP during bulk free When a PCP is mostly used for frees then high-order pages can exist on PCP lists for some time. This is problematic when the allocation pattern is all allocations from one CPU and all frees from another resulting in colder pages being used. When bulk freeing pages, limit the number of high-order pages that are stored on the PCP lists. Netperf running on localhost exhibits this pattern and while it does not matter for some machines, it does matter for others with smaller caches where cache misses cause problems due to reduced page reuse. Pages freed directly to the buddy list may be reused quickly while still cache hot where as storing on the PCP lists may be cold by the time free_pcppages_bulk() is called. 
Using perf kmem:mm_page_alloc, the 5 most used page frames were 5.17-rc3 13041 pfn=0x111a30 13081 pfn=0x5814d0 13097 pfn=0x108258 13121 pfn=0x689598 13128 pfn=0x5814d8 5.17-revert-highpcp 192009 pfn=0x54c140 195426 pfn=0x1081d0 200908 pfn=0x61c808 243515 pfn=0xa9dc20 402523 pfn=0x222bb8 5.17-full-series 142693 pfn=0x346208 162227 pfn=0x13bf08 166413 pfn=0x2711e0 166950 pfn=0x2702f8 The spread is wider as there is still time before pages freed to one PCP get released with a tradeoff between fast reuse and reduced zone lock acquisition. On the machine used to gather the traces, the headline performance was equivalent. netperf-tcp 5.17.0-rc3 5.17.0-rc3 5.17.0-rc3 vanilla mm-reverthighpcp-v1r1 mm-highpcplimit-v1r12 Hmean 64 839.93 ( 0.00%) 840.77 ( 0.10%) 835.34 * -0.55%* Hmean 128 1614.22 ( 0.00%) 1622.07 * 0.49%* 1604.18 * -0.62%* Hmean 256 2952.00 ( 0.00%) 2953.19 ( 0.04%) 2959.46 ( 0.25%) Hmean 1024 10291.67 ( 0.00%) 10239.17 ( -0.51%) 10287.05 ( -0.04%) Hmean 2048 17335.08 ( 0.00%) 17399.97 ( 0.37%) 17125.73 * -1.21%* Hmean 3312 22628.15 ( 0.00%) 22471.97 ( -0.69%) 22414.24 * -0.95%* Hmean 4096 25009.50 ( 0.00%) 24752.83 * -1.03%* 24620.03 * -1.56%* Hmean 8192 32745.01 ( 0.00%) 31682.63 * -3.24%* 32475.31 ( -0.82%) Hmean 16384 39759.59 ( 0.00%) 36805.78 * -7.43%* 39291.42 ( -1.18%) On a 1-socket skylake machine with a small CPU cache that suffers more if cache misses are too high netperf-tcp 5.17.0-rc3 5.17.0-rc3 5.17.0-rc3 vanilla mm-reverthighpcp-v1 mm-highpcplimit-v1 Min 64 935.38 ( 0.00%) 939.40 ( 0.43%) 940.11 ( 0.51%) Min 128 1831.69 ( 0.00%) 1856.15 ( 1.34%) 1849.30 ( 0.96%) Min 256 3560.61 ( 0.00%) 3659.25 ( 2.77%) 3654.12 ( 2.63%) Min 1024 13165.24 ( 0.00%) 13444.74 ( 2.12%) 13281.71 ( 0.88%) Min 2048 22706.44 ( 0.00%) 23219.67 ( 2.26%) 23027.31 ( 1.41%) Min 3312 30960.26 ( 0.00%) 31985.01 ( 3.31%) 31484.40 ( 1.69%) Min 4096 35149.03 ( 0.00%) 35997.44 ( 2.41%) 35891.92 ( 2.11%) Min 8192 48064.73 ( 0.00%) 49574.05 ( 3.14%) 48928.89 ( 1.80%) Min 16384 
58017.25 ( 0.00%) 60352.93 ( 4.03%) 60691.14 ( 4.61%) Hmean 64 938.95 ( 0.00%) 941.50 * 0.27%* 940.47 ( 0.16%) Hmean 128 1843.10 ( 0.00%) 1857.58 * 0.79%* 1855.83 * 0.69%* Hmean 256 3573.07 ( 0.00%) 3667.45 * 2.64%* 3662.08 * 2.49%* Hmean 1024 13206.52 ( 0.00%) 13487.80 * 2.13%* 13351.11 * 1.09%* Hmean 2048 22870.23 ( 0.00%) 23337.96 * 2.05%* 23149.68 * 1.22%* Hmean 3312 31001.99 ( 0.00%) 32206.50 * 3.89%* 31849.40 * 2.73%* Hmean 4096 35364.59 ( 0.00%) 36490.96 * 3.19%* 36112.91 * 2.12%* Hmean 8192 48497.71 ( 0.00%) 49954.05 * 3.00%* 49384.50 * 1.83%* Hmean 16384 58410.86 ( 0.00%) 60839.80 * 4.16%* 61362.12 * 5.05%* Note that this was a machine that did not benefit from caching high-order pages and performance is almost restored with the series applied. It's not fully restored as cache misses are still higher. This is a trade-off between optimising for a workload that does all allocs on one CPU and frees on another or more general workloads that need high-order pages for SLUB and benefit from avoiding zone->lock for every SLUB refill/drain. Link: https://lkml.kernel.org/r/20220215145111.27082-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Aaron Lu Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2974f3340e7c3..89969a1c144c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3352,10 +3352,15 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, + bool free_high) { int min_nr_free, max_nr_free; + /* Free everything if batch freeing high-order pages. 
*/ + if (unlikely(free_high)) + return pcp->count; + /* Check for PCP disabled or boot pageset */ if (unlikely(high < batch)) return 1; @@ -3376,11 +3381,12 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) return batch; } -static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, + bool free_high) { int high = READ_ONCE(pcp->high); - if (unlikely(!high)) + if (unlikely(!high || free_high)) return 0; if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) @@ -3400,17 +3406,27 @@ static void free_unref_page_commit(struct page *page, int migratetype, struct per_cpu_pages *pcp; int high; int pindex; + bool free_high; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); list_add(&page->lru, &pcp->lists[pindex]); pcp->count += 1 << order; - high = nr_pcp_high(pcp, zone); + + /* + * As high-order pages other than THP's stored on PCP can contribute + * to fragmentation, limit the number stored when PCP is heavily + * freeing without allocation. The remainder after bulk freeing + * stops will be drained from vmstat refresh context. + */ + free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); + + high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp, pindex); + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); } } From ea4ae9f64bd835bd42f9697c3fb46c78e2778017 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 16 Feb 2022 15:31:19 +1100 Subject: [PATCH 152/334] mm/pages_alloc.c: don't create ZONE_MOVABLE beyond the end of a node ZONE_MOVABLE uses the remaining memory in each node. Its starting pfn is also aligned to MAX_ORDER_NR_PAGES. 
It is possible for the remaining memory in a node to be less than MAX_ORDER_NR_PAGES, meaning there is not enough room for ZONE_MOVABLE on that node. Unfortunately this condition is not checked for. This leads to zone_movable_pfn[] getting set to a pfn greater than the last pfn in a node. calculate_node_totalpages() then sets zone->present_pages to be greater than zone->spanned_pages which is invalid, as spanned_pages represents the maximum number of pages in a zone assuming no holes. Subsequently it is possible free_area_init_core() will observe a zone of size zero with present pages. In this case it will skip setting up the zone, including the initialisation of free_lists[]. However populated_zone() checks zone->present_pages to see if a zone has memory available. This is used by iterators such as walk_zones_in_node(). pagetypeinfo_showfree() uses this to walk the free_list of each zone in each node, which are assumed to be initialised due to the zone not being empty. As free_area_init_core() never initialised the free_lists[] this results in the following kernel crash when trying to read /proc/pagetypeinfo: [ 67.534914] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 67.535429] #PF: supervisor read access in kernel mode [ 67.535789] #PF: error_code(0x0000) - not-present page [ 67.536128] PGD 0 P4D 0 [ 67.536305] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI [ 67.536696] CPU: 0 PID: 456 Comm: cat Not tainted 5.16.0 #461 [ 67.537096] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 [ 67.537638] RIP: 0010:pagetypeinfo_show+0x163/0x460 [ 67.537992] Code: 9e 82 e8 80 57 0e 00 49 8b 06 b9 01 00 00 00 4c 39 f0 75 16 e9 65 02 00 00 48 83 c1 01 48 81 f9 a0 86 01 00 0f 84 48 02 00 00 <48> 8b 00 4c 39 f0 75 e7 48 c7 c2 80 a2 e2 82 48 c7 c6 79 ef e3 82 [ 67.538259] RSP: 0018:ffffc90001c4bd10 EFLAGS: 00010003 [ 67.538259] RAX: 0000000000000000 RBX: ffff88801105f638 RCX: 0000000000000001 [ 67.538259] RDX: 0000000000000001 RSI: 
000000000000068b RDI: ffff8880163dc68b [ 67.538259] RBP: ffffc90001c4bd90 R08: 0000000000000001 R09: ffff8880163dc67e [ 67.538259] R10: 656c6261766f6d6e R11: 6c6261766f6d6e55 R12: ffff88807ffb4a00 [ 67.538259] R13: ffff88807ffb49f8 R14: ffff88807ffb4580 R15: ffff88807ffb3000 [ 67.538259] FS: 00007f9c83eff5c0(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 [ 67.538259] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 67.538259] CR2: 0000000000000000 CR3: 0000000013c8e000 CR4: 0000000000350ef0 [ 67.538259] Call Trace: [ 67.538259] [ 67.538259] seq_read_iter+0x128/0x460 [ 67.538259] ? aa_file_perm+0x1af/0x5f0 [ 67.538259] proc_reg_read_iter+0x51/0x80 [ 67.538259] ? lock_is_held_type+0xea/0x140 [ 67.538259] new_sync_read+0x113/0x1a0 [ 67.538259] vfs_read+0x136/0x1d0 [ 67.538259] ksys_read+0x70/0xf0 [ 67.538259] __x64_sys_read+0x1a/0x20 [ 67.538259] do_syscall_64+0x3b/0xc0 [ 67.538259] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 67.538259] RIP: 0033:0x7f9c83e23cce [ 67.538259] Code: c0 e9 b6 fe ff ff 50 48 8d 3d 6e 13 0a 00 e8 c9 e3 01 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28 [ 67.538259] RSP: 002b:00007fff116e1a08 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 67.538259] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f9c83e23cce [ 67.538259] RDX: 0000000000020000 RSI: 00007f9c83a2c000 RDI: 0000000000000003 [ 67.538259] RBP: 00007f9c83a2c000 R08: 00007f9c83a2b010 R09: 0000000000000000 [ 67.538259] R10: 00007f9c83f2d7d0 R11: 0000000000000246 R12: 0000000000000000 [ 67.538259] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [ 67.538259] Fix this by checking that the aligned zone_movable_pfn[] does not exceed the end of the node, and if it does skip creating a movable zone on this node. 
Link: https://lkml.kernel.org/r/20220215025831.2113067-1-apopple@nvidia.com Signed-off-by: Alistair Popple Fixes: 2a1e274acf0b ("Create the ZONE_MOVABLE zone") Cc: John Hubbard Cc: Zi Yan Cc: Anshuman Khandual Cc: Oscar Salvador Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89969a1c144c1..79d4ad7a4f051 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7998,10 +7998,17 @@ static void __init find_zone_movable_pfns_for_nodes(void) out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ - for (nid = 0; nid < MAX_NUMNODES; nid++) + for (nid = 0; nid < MAX_NUMNODES; nid++) { + unsigned long start_pfn, end_pfn; + zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (zone_movable_pfn[nid] >= end_pfn) + zone_movable_pfn[nid] = 0; + } + out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; From 376bbe43f1739939b585c66d5d4fe58d3afa3778 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 16 Feb 2022 15:31:19 +1100 Subject: [PATCH 153/334] mm/memory-failure.c: remove obsolete comment With the introduction of mf_mutex, most of the memory error handling process is mutually exclusive, so the in-line comment about the subtlety of double-checking PageHWPoison is no longer correct. So remove it.
Link: https://lkml.kernel.org/r/20220125025601.3054511-1-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Suggested-by: Mike Kravetz Reviewed-by: Miaohe Lin Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Reviewed-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97a9ed8f87a96..0f6413a2f3016 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2150,12 +2150,6 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - /* - * Check PageHWPoison again inside page lock because PageHWPoison - * is set by memory_failure() outside page lock. Note that - * memory_failure() also double-checks PageHWPoison inside page lock, - * so there's no race between soft_offline_page() and memory_failure(). - */ lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); From 6e1a291573d3a8d06c57b3caae2efa27f77ba7e3 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 16 Feb 2022 15:31:19 +1100 Subject: [PATCH 154/334] mm/hwpoison: fix error page recovered but reported "not recovered" When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting an SRAR signature machine check when the data is about to be consumed. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure() and the machine check processing code finds the page already poisoned. It calls kill_accessing_process() to make sure a SIGBUS is sent, but it returns the wrong error code.
Console log looks like this: [34775.674296] mce: Uncorrected hardware memory error in user-access at 3710b3400 [34775.675413] Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered [34775.690310] Memory failure: 0x3710b3: already hardware poisoned [34775.696247] Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption [34775.706072] mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already sent to the process and kill_me_maybe() doesn't have to send it again. But the current code simply fails to do this, so fix it to make sure it works as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. [tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0f6413a2f3016..2e2f740c63dc7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -707,8 +707,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { From a5824cab6598281441500555af202fee501efb91 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:19 +1100 Subject: [PATCH 155/334] mm/memory-failure.c: minor clean up for memory_failure_dev_pagemap Patch series "mm/memory-failure.c: A few cleanup patches for memory failure".
This series contains a few patches to simplify the code logic, remove an unneeded variable and remove obsolete comments. More details can be found in the respective changelogs. This patch (of 8): The flags argument always has MF_ACTION_REQUIRED and MF_MUST_KILL set. So we do not need to check these flags again. Link: https://lkml.kernel.org/r/20220210141733.1908-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220210141733.1908-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2e2f740c63dc7..1e1cb1191ab77 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1640,7 +1640,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(page, &tokill, true); list_for_each_entry(tk, &tokill, nd) if (tk->size_shift) @@ -1655,7 +1655,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, start = (page->index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(page->mapping, start, size, 0); } - kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags); + kill_procs(&tokill, true, false, pfn, flags); rc = 0; unlock: dax_unlock_page(page, cookie); From 7ccad0ac903d5c195ae012fb9f5ab21118d9fa55 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:19 +1100 Subject: [PATCH 156/334] mm/memory-failure.c: avoid walking page table when vma_address() return -EFAULT It's unnecessary to walk the page table when vma_address() returns -EFAULT. Return early if so to save some cpu cycles.
Link: https://lkml.kernel.org/r/20220210141733.1908-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1e1cb1191ab77..2768cd1aa8af3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -315,6 +315,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, pmd_t *pmd; pte_t *pte; + if (address == -EFAULT) + return 0; pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return 0; From bd92b445e02d9c30fdca995eb42ba06ad35fc0a8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:20 +1100 Subject: [PATCH 157/334] mm/memory-failure.c: rework the signaling logic in kill_proc BUS_MCEERR_AR code is only sent when MF_ACTION_REQUIRED is set and the target is current. Rework the code to make this clear. Link: https://lkml.kernel.org/r/20220210141733.1908-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2768cd1aa8af3..dc61c133cd249 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -258,16 +258,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); - if (flags & MF_ACTION_REQUIRED) { - if (t == current) - ret = force_sig_mceerr(BUS_MCEERR_AR, - (void __user *)tk->addr, addr_lsb); - else - /* Signal other processes sharing the page if they have PF_MCE_EARLY set. 
*/ - ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, - addr_lsb, t); - } else { + if ((flags & MF_ACTION_REQUIRED) && (t == current)) + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); + else /* + * Signal other processes sharing the page if they have + * PF_MCE_EARLY set. * Don't use force here, it's convenient if the signal * can be temporarily blocked. * This could cause a loop when the user sets SIGBUS @@ -275,7 +272,6 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) */ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); /* synchronous? */ - } if (ret < 0) pr_info("Memory failure: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); From 46c0856fea3bfa49e1e5cdcae78ef7f991f7ba31 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:20 +1100 Subject: [PATCH 158/334] mm/memory-failure.c: remove unneeded orig_head orig_head is used to check whether the page has changed compound pages during the locking. But it's always equal to hpage. So we can use hpage directly and remove this redundant one.
Link: https://lkml.kernel.org/r/20220210141733.1908-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dc61c133cd249..6e629ce73a292 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1687,7 +1687,6 @@ int memory_failure(unsigned long pfn, int flags) { struct page *p; struct page *hpage; - struct page *orig_head; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; @@ -1733,7 +1732,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; } - orig_head = hpage = compound_head(p); + hpage = compound_head(p); num_poisoned_pages_inc(); /* @@ -1817,7 +1816,7 @@ int memory_failure(unsigned long pfn, int flags) * The page could have changed compound pages during the locking. * If this happens just bail out. */ - if (PageCompound(p) && compound_head(p) != orig_head) { + if (PageCompound(p) && compound_head(p) != hpage) { action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto unlock_page; From 78863155b67ac3cb4347e1c9bb07e55bbee61325 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:20 +1100 Subject: [PATCH 159/334] mm/memory-failure.c: remove PageSlab check in hwpoison_filter_dev Since commit 03e5ac2fc3bf ("mm: fix crash when using XFS on loopback"), page_mapping() can handle the Slab pages. So remove this unnecessary PageSlab check and obsolete comment. 
Link: https://lkml.kernel.org/r/20220210141733.1908-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6e629ce73a292..83a28d214643f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -130,12 +130,6 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - /* - * page_mapping() does not accept slab pages. - */ - if (PageSlab(p)) - return -EINVAL; - mapping = page_mapping(p); if (mapping == NULL || mapping->host == NULL) return -EINVAL; From 0782ae63ec48e65052e9349bfbec1f876134c35d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:20 +1100 Subject: [PATCH 160/334] mm/memory-failure.c: rework the try_to_unmap logic in hwpoison_user_mappings() Only for hugetlb pages in shared mappings, try_to_unmap should take semaphore in write mode here. Rework the code to make it clear. Link: https://lkml.kernel.org/r/20220210141733.1908-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 83a28d214643f..7fc907ab48965 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1405,26 +1405,22 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - if (!PageHuge(hpage)) { - try_to_unmap(hpage, ttu); + if (PageHuge(hpage) && !PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. 
Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); } else { - if (!PageAnon(hpage)) { - /* - * For hugetlb pages in shared mappings, try_to_unmap - * could potentially call huge_pmd_unshare. Because of - * this, take semaphore in write mode here and set - * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higher level. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - if (mapping) { - try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); - } else { - try_to_unmap(hpage, ttu); - } + try_to_unmap(hpage, ttu); } unmap_success = !page_mapped(hpage); From 6a3a84fe608a5159a5e18e32bc51b91abf8fcc39 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:20 +1100 Subject: [PATCH 161/334] mm/memory-failure.c: remove obsolete comment in __soft_offline_page Since commit add05cecef80 ("mm: soft-offline: don't free target page in successful page migration"), set_migratetype_isolate logic is removed. Remove this obsolete comment. 
Link: https://lkml.kernel.org/r/20220210141733.1908-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7fc907ab48965..98c0fd7489664 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2157,10 +2157,6 @@ static int __soft_offline_page(struct page *page) ret = invalidate_inode_page(page); unlock_page(page); - /* - * RED-PEN would be better to keep it isolated here, but we - * would need to fix isolation locking first. - */ if (ret) { pr_info("soft_offline: %#lx: invalidated\n", pfn); page_handle_poison(page, false, true); From 0311b4c473e1e8c56c162f7b1fbde4c5f6d85116 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:21 +1100 Subject: [PATCH 162/334] mm/memory-failure.c: remove unnecessary PageTransTail check When we reach here, we're guaranteed to have a non-compound page as the THP has already been split. Remove this unnecessary PageTransTail check. Link: https://lkml.kernel.org/r/20220210141733.1908-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 98c0fd7489664..f092013c7f5df 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1834,7 +1834,7 @@ int memory_failure(unsigned long pfn, int flags) * page_lock. We need wait writeback completion for this page or it * may trigger vfs BUG while evict inode.
*/ - if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) + if (!PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* From e14533e49ce3f4ffc937025e7add48cb3c5c87a1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 16 Feb 2022 15:31:21 +1100 Subject: [PATCH 163/334] mm: invalidate hwpoison page cache page in fault path Sometimes the page offlining code can leave behind a hwpoisoned clean page cache page. This can lead to programs being killed over and over and over again as they fault in the hwpoisoned page, get killed, and then get re-spawned by whatever wanted to run them. This is particularly embarrassing when the page was offlined due to having too many corrected memory errors. Now we are killing tasks due to them trying to access memory that probably isn't even corrupted. This problem can be avoided by invalidating the page from the page fault handler, which already has a branch for dealing with these kinds of pages. With this patch we simply pretend the page fault was successful if the page was invalidated, return to userspace, incur another page fault, read in the file from disk (to a new memory page), and then everything works again. Link: https://lkml.kernel.org/r/20220212213740.423efcea@imladris.surriel.com Signed-off-by: Rik van Riel Reviewed-by: Miaohe Lin Cc: Mel Gorman Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f4c0226fda489..ca9b2f5751f8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3926,11 +3926,16 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; if (unlikely(PageHWPoison(vmf->page))) { - if (ret & VM_FAULT_LOCKED) + vm_fault_t poisonret = VM_FAULT_HWPOISON; + if (ret & VM_FAULT_LOCKED) { + /* Retry if a clean page was removed from the cache. 
*/ + if (invalidate_inode_page(vmf->page)) + poisonret = 0; unlock_page(vmf->page); + } put_page(vmf->page); vmf->page = NULL; - return VM_FAULT_HWPOISON; + return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) From ea2c33eafa55294bc9c964af45fc51fa29c6414f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:21 +1100 Subject: [PATCH 164/334] mm/munlock: delete page_mlock() and all its works Patch series "mm/munlock: rework of mlock+munlock page handling", v2. This patch (of 13): We have recommended some applications to mlock their userspace, but that turns out to be counter-productive: when many processes mlock the same file, contention on rmap's i_mmap_rwsem can become intolerable at exit: it is needed for write, to remove any vma mapping that file from rmap's tree; but hogged for read by those with mlocks calling page_mlock() (formerly known as try_to_munlock()) on *each* page mapped from the file (the purpose being to find out whether another process has the page mlocked, so therefore it should not be unmlocked yet). Several optimizations have been made in the past: one is to skip page_mlock() when mapcount tells that nothing else has this page mapped; but that doesn't help at all when others do have it mapped. This time around, I initially intended to add a preliminary search of the rmap tree for overlapping VM_LOCKED ranges; but that gets messy with locking order, when in doubt whether a page is actually present; and risks adding even more contention on the i_mmap_rwsem. A solution would be much easier, if only there were space in struct page for an mlock_count... but actually, most of the time, there is space for it - an mlocked page spends most of its life on an unevictable LRU, but since 3.18 removed the scan_unevictable_pages sysctl, that "LRU" has been redundant. Let's try to reuse its page->lru. 
But leave that until a later patch: in this patch, clear the ground by removing page_mlock(), and all the infrastructure that has gathered around it - which mostly hinders understanding, and will make reviewing new additions harder. Don't mind those old comments about THPs, they date from before 4.5's refcounting rework: splitting is not a risk here. Just keep a minimal version of munlock_vma_page(), as reminder of what it should attend to (in particular, the odd way PGSTRANDED is counted out of PGMUNLOCKED), and likewise a stub for munlock_vma_pages_range(). Move unchanged __mlock_posix_error_return() out of the way, down to above its caller: this series then makes no further change after mlock_fixup(). After this and each following commit, the kernel builds, boots and runs; but with deficiencies which may show up in testing of mlock and munlock. The system calls succeed or fail as before, and mlock remains effective in preventing page reclaim; but meminfo's Unevictable and Mlocked amounts may be shown too low after mlock, grow, then stay too high after munlock: with previously mlocked pages remaining unevictable for too long, until finally unmapped and freed and counts corrected. Normal service will be resumed in "mm/munlock: mlock_pte_range() when mlocking or munlocking". Link: https://lkml.kernel.org/r/55a49083-37f9-3766-1de9-9feea7428ac@google.com Link: https://lkml.kernel.org/r/48c44eae-4cf0-a8ce-454c-5ec88457ffea@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Vlastimil Babka Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Alistair Popple Cc: Johannes Weiner Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Yu Zhao Cc: Greg Thelen Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/rmap.h | 6 - mm/internal.h | 2 +- mm/mlock.c | 375 +++---------------------------------------- mm/rmap.c | 80 --------- 4 files changed, 25 insertions(+), 438 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e704b1a4c06c0..dc48aa8c2c947 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -237,12 +237,6 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int folio_mkclean(struct folio *); -/* - * called in munlock()/munmap() path to check for other vmas holding - * the page mlocked. - */ -void page_mlock(struct page *page); - void remove_migration_ptes(struct page *old, struct page *new, bool locked); /* diff --git a/mm/internal.h b/mm/internal.h index 927dfba5111f2..67059d49fed22 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -409,7 +409,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) * must be called with vma's mmap_lock held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); -extern unsigned int munlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); diff --git a/mm/mlock.c b/mm/mlock.c index 8f584eddd3053..aec4ce7919dae 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -46,12 +46,6 @@ EXPORT_SYMBOL(can_do_mlock); * be placed on the LRU "unevictable" list, rather than the [in]active lists. * The unevictable list is an LRU sibling list to the [in]active lists. * PageUnevictable is set to indicate the unevictable state. 
- * - * When lazy mlocking via vmscan, it is important to ensure that the - * vma's VM_LOCKED status is not concurrently being modified, otherwise we - * may have mlocked a page that is being munlocked. So lazy mlock must take - * the mmap_lock for read, and verify that the vma really is locked - * (see mm/rmap.c). */ /* @@ -106,299 +100,28 @@ void mlock_vma_page(struct page *page) } } -/* - * Finish munlock after successful page isolation - * - * Page must be locked. This is a wrapper for page_mlock() - * and putback_lru_page() with munlock accounting. - */ -static void __munlock_isolated_page(struct page *page) -{ - /* - * Optimization: if the page was mapped just once, that's our mapping - * and we don't need to check all the other vmas. - */ - if (page_mapcount(page) > 1) - page_mlock(page); - - /* Did try_to_unlock() succeed or punt? */ - if (!PageMlocked(page)) - count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); - - putback_lru_page(page); -} - -/* - * Accounting for page isolation fail during munlock - * - * Performs accounting when page isolation fails in munlock. There is nothing - * else to do because it means some other task has already removed the page - * from the LRU. putback_lru_page() will take care of removing the page from - * the unevictable list, if necessary. vmscan [page_referenced()] will move - * the page back to the unevictable list if some other vma has it mlocked. - */ -static void __munlock_isolation_failed(struct page *page) -{ - int nr_pages = thp_nr_pages(page); - - if (PageUnevictable(page)) - __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - else - __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); -} - /** * munlock_vma_page - munlock a vma page * @page: page to be unlocked, either a normal page or THP page head - * - * returns the size of the page as a page mask (0 for normal page, - * HPAGE_PMD_NR - 1 for THP head page) - * - * called from munlock()/munmap() path with page supposedly on the LRU. 
- * When we munlock a page, because the vma where we found the page is being - * munlock()ed or munmap()ed, we want to check whether other vmas hold the - * page locked so that we can leave it on the unevictable lru list and not - * bother vmscan with it. However, to walk the page's rmap list in - * page_mlock() we must isolate the page from the LRU. If some other - * task has removed the page from the LRU, we won't be able to do that. - * So we clear the PageMlocked as we might not get another chance. If we - * can't isolate the page, we leave it for putback_lru_page() and vmscan - * [page_referenced()/try_to_unmap()] to deal with. */ -unsigned int munlock_vma_page(struct page *page) +void munlock_vma_page(struct page *page) { - int nr_pages; - - /* For page_mlock() and to serialize with page migration */ + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - - if (!TestClearPageMlocked(page)) { - /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ - return 0; - } - - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - - if (!isolate_lru_page(page)) - __munlock_isolated_page(page); - else - __munlock_isolation_failed(page); - - return nr_pages - 1; -} - -/* - * convert get_user_pages() return value to posix mlock() error - */ -static int __mlock_posix_error_return(long retval) -{ - if (retval == -EFAULT) - retval = -ENOMEM; - else if (retval == -ENOMEM) - retval = -EAGAIN; - return retval; -} - -/* - * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() - * - * The fast path is available only for evictable pages with single mapping. - * Then we can bypass the per-cpu pvec and get better performance. - * when mapcount > 1 we need page_mlock() which can fail. - * when !page_evictable(), we need the full redo logic of putback_lru_page to - * avoid leaving evictable page in unevictable list. 
- * - * In case of success, @page is added to @pvec and @pgrescued is incremented - * in case that the page was previously unevictable. @page is also unlocked. - */ -static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, - int *pgrescued) -{ - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - - if (page_mapcount(page) <= 1 && page_evictable(page)) { - pagevec_add(pvec, page); - if (TestClearPageUnevictable(page)) - (*pgrescued)++; - unlock_page(page); - return true; - } - - return false; -} -/* - * Putback multiple evictable pages to the LRU - * - * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of - * the pages might have meanwhile become unevictable but that is OK. - */ -static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) -{ - count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); - /* - *__pagevec_lru_add() calls release_pages() so we don't call - * put_page() explicitly - */ - __pagevec_lru_add(pvec); - count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); -} - -/* - * Munlock a batch of pages from the same zone - * - * The work is split to two main phases. First phase clears the Mlocked flag - * and attempts to isolate the pages, all under a single zone lru lock. - * The second phase finishes the munlock only for pages where isolation - * succeeded. - * - * Note that the pagevec may be modified during the process. - */ -static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) -{ - int i; - int nr = pagevec_count(pvec); - int delta_munlocked = -nr; - struct pagevec pvec_putback; - struct lruvec *lruvec = NULL; - int pgrescued = 0; - - pagevec_init(&pvec_putback); - - /* Phase 1: page isolation */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); - - if (TestClearPageMlocked(page)) { - /* - * We already have pin from follow_page_mask() - * so we can spare the get_page() here. 
- */ - if (TestClearPageLRU(page)) { - lruvec = folio_lruvec_relock_irq(folio, lruvec); - del_page_from_lru_list(page, lruvec); - continue; - } else - __munlock_isolation_failed(page); - } else { - delta_munlocked++; - } + VM_BUG_ON_PAGE(PageTail(page), page); - /* - * We won't be munlocking this page in the next phase - * but we still need to release the follow_page_mask() - * pin. We cannot do it under lru_lock however. If it's - * the last pin, __page_cache_release() would deadlock. - */ - pagevec_add(&pvec_putback, pvec->pages[i]); - pvec->pages[i] = NULL; - } - if (lruvec) { - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - unlock_page_lruvec_irq(lruvec); - } else if (delta_munlocked) { - mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - } + if (TestClearPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); - /* Now we can release pins of pages that we are not munlocking */ - pagevec_release(&pvec_putback); - - /* Phase 2: page munlock */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - - if (page) { - lock_page(page); - if (!__putback_lru_fast_prepare(page, &pvec_putback, - &pgrescued)) { - /* - * Slow path. 
We don't want to lose the last - * pin before unlock_page() - */ - get_page(page); /* for putback_lru_page() */ - __munlock_isolated_page(page); - unlock_page(page); - put_page(page); /* from follow_page_mask() */ - } + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + } else if (PageUnevictable(page)) { + count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } } - - /* - * Phase 3: page putback for pages that qualified for the fast path - * This will also call put_page() to return pin from follow_page_mask() - */ - if (pagevec_count(&pvec_putback)) - __putback_lru_fast(&pvec_putback, pgrescued); -} - -/* - * Fill up pagevec for __munlock_pagevec using pte walk - * - * The function expects that the struct page corresponding to @start address is - * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. - * - * The rest of @pvec is filled by subsequent pages within the same pmd and same - * zone, as long as the pte's are present and vm_normal_page() succeeds. These - * pages also get pinned. - * - * Returns the address of the next page that should be scanned. This equals - * @start + PAGE_SIZE when no page could be added by the pte walk. - */ -static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, - struct vm_area_struct *vma, struct zone *zone, - unsigned long start, unsigned long end) -{ - pte_t *pte; - spinlock_t *ptl; - - /* - * Initialize pte walk starting at the already pinned page where we - * are sure that there is a pte, as it was pinned under the same - * mmap_lock write op. 
- */ - pte = get_locked_pte(vma->vm_mm, start, &ptl); - /* Make sure we do not cross the page table boundary */ - end = pgd_addr_end(start, end); - end = p4d_addr_end(start, end); - end = pud_addr_end(start, end); - end = pmd_addr_end(start, end); - - /* The page next to the pinned page is the first we will try to get */ - start += PAGE_SIZE; - while (start < end) { - struct page *page = NULL; - pte++; - if (pte_present(*pte)) - page = vm_normal_page(vma, start, *pte); - /* - * Break if page could not be obtained or the page's node+zone does not - * match - */ - if (!page || page_zone(page) != zone) - break; - - /* - * Do not use pagevec for PTE-mapped THP, - * munlock_vma_pages_range() will handle them. - */ - if (PageTransCompound(page)) - break; - - get_page(page); - /* - * Increase the address that will be returned *before* the - * eventual break due to pvec becoming full by adding the page - */ - start += PAGE_SIZE; - if (pagevec_add(pvec, page) == 0) - break; - } - pte_unmap_unlock(pte, ptl); - return start; } /* @@ -413,75 +136,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, * * Returns with VM_LOCKED cleared. Callers must be prepared to * deal with this. - * - * We don't save and restore VM_LOCKED here because pages are - * still on lru. In unmap path, pages might be scanned by reclaim - * and re-mlocked by page_mlock/try_to_unmap before we unmap and - * free them. This will result in freeing mlocked pages. 
*/ void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - while (start < end) { - struct page *page; - unsigned int page_mask = 0; - unsigned long page_increm; - struct pagevec pvec; - struct zone *zone; - - pagevec_init(&pvec); - /* - * Although FOLL_DUMP is intended for get_dump_page(), - * it just so happens that its special treatment of the - * ZERO_PAGE (returning an error instead of doing get_page) - * suits munlock very well (and if somehow an abnormal page - * has sneaked into the range, we won't oops here: great). - */ - page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); - - if (page && !IS_ERR(page)) { - if (PageTransTail(page)) { - VM_BUG_ON_PAGE(PageMlocked(page), page); - put_page(page); /* follow_page_mask() */ - } else if (PageTransHuge(page)) { - lock_page(page); - /* - * Any THP page found by follow_page_mask() may - * have gotten split before reaching - * munlock_vma_page(), so we need to compute - * the page_mask here instead. - */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); /* follow_page_mask() */ - } else { - /* - * Non-huge pages are handled in batches via - * pagevec. The pin from follow_page_mask() - * prevents them from collapsing by THP. - */ - pagevec_add(&pvec, page); - zone = page_zone(page); - - /* - * Try to fill the rest of pagevec using fast - * pte walk. This will also update start to - * the next page to process. Then munlock the - * pagevec. 
- */ - start = __munlock_pagevec_fill(&pvec, vma, - zone, start, end); - __munlock_pagevec(&pvec, zone); - goto next; - } - } - page_increm = 1 + page_mask; - start += page_increm * PAGE_SIZE; -next: - cond_resched(); - } + /* Reimplementation to follow in later commit */ } /* @@ -645,6 +306,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, return count >> PAGE_SHIFT; } +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; diff --git a/mm/rmap.c b/mm/rmap.c index 6a1e8c7f62136..7ce7f1946cff1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1996,76 +1996,6 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) rmap_walk(page, &rwc); } -/* - * Walks the vma's mapping a page and mlocks the page if any locked vma's are - * found. Once one is found the page is locked and the scan can be terminated. - */ -static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, void *unused) -{ - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; - - /* An un-locked vma doesn't have any pages to lock, continue the scan */ - if (!(vma->vm_flags & VM_LOCKED)) - return true; - - while (page_vma_mapped_walk(&pvmw)) { - /* - * Need to recheck under the ptl to serialise with - * __munlock_pagevec_fill() after VM_LOCKED is cleared in - * munlock_vma_pages_range(). - */ - if (vma->vm_flags & VM_LOCKED) { - /* - * PTE-mapped THP are never marked as mlocked; but - * this function is never called on a DoubleMap THP, - * nor on an Anon THP (which may still be PTE-mapped - * after DoubleMap was cleared). 
- */ - mlock_vma_page(page); - /* - * No need to scan further once the page is marked - * as mlocked. - */ - page_vma_mapped_walk_done(&pvmw); - return false; - } - } - - return true; -} - -/** - * page_mlock - try to mlock a page - * @page: the page to be mlocked - * - * Called from munlock code. Checks all of the VMAs mapping the page and mlocks - * the page if any are found. The page will be returned with PG_mlocked cleared - * if it is not mapped by any locked vmas. - */ -void page_mlock(struct page *page) -{ - struct rmap_walk_control rwc = { - .rmap_one = page_mlock_one, - .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, - - }; - - VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - - /* Anon THP are only marked as mlocked when singly mapped */ - if (PageTransCompound(page) && PageAnon(page)) - return; - - rmap_walk(page, &rwc); -} - #ifdef CONFIG_DEVICE_PRIVATE struct make_exclusive_args { struct mm_struct *mm; @@ -2291,11 +2221,6 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) @@ -2344,11 +2269,6 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. 
That should be OK, because that vma shouldn't be - * LOCKED. */ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) From 05a5ae4a296ee9feb0dbb93a5830aa687ac938a7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:21 +1100 Subject: [PATCH 165/334] mm/munlock: delete FOLL_MLOCK and FOLL_POPULATE If counting page mlocks, we must not double-count: follow_page_pte() can tell if a page has already been Mlocked or not, but cannot tell if a pte has already been counted or not: that will have to be done when the pte is mapped in (which lru_cache_add_inactive_or_unevictable() already tracks for new anon pages, but there's no such tracking yet for others). Delete all the FOLL_MLOCK code - faulting in the missing pages will do all that is necessary, without special mlock_vma_page() calls from here. But then FOLL_POPULATE turns out to serve no purpose - it was there so that its absence would tell faultin_page() not to faultin page when setting up VM_LOCKONFAULT areas; but if there's no special work needed here for mlock, then there's no work at all here for VM_LOCKONFAULT. Have I got that right? I've not looked into the history, but see that FOLL_POPULATE goes back before VM_LOCKONFAULT: did it serve a different purpose before? Ah, yes, it was used to skip the old stack guard page. And is it intentional that COW is not broken on existing pages when setting up a VM_LOCKONFAULT area? I can see that being argued either way, and have no reason to disagree with current behaviour. Link: https://lkml.kernel.org/r/cbed9c9f-1747-f06a-15ad-b2d9fb6025eb@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 -- mm/gup.c | 43 ++++++++----------------------------------- mm/huge_memory.c | 33 --------------------------------- 3 files changed, 8 insertions(+), 70 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ac11fc132ed10..dafad4448724e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2916,13 +2916,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ diff --git a/mm/gup.c b/mm/gup.c index 7337520afa499..45f828be6b1d1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -593,32 +593,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* Do not mlock pte-mapped THP */ - if (PageTransCompound(page)) - goto out; - - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. 
- * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } out: pte_unmap_unlock(ptep, ptl); return page; @@ -941,9 +915,6 @@ static int faultin_page(struct vm_area_struct *vma, unsigned int fault_flags = 0; vm_fault_t ret; - /* mlock all present pages, but do not fault in new pages */ - if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) - return -ENOENT; if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) @@ -1194,8 +1165,6 @@ static long __get_user_pages(struct mm_struct *mm, case -ENOMEM: case -EHWPOISON: goto out; - case -ENOENT: - goto next_page; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { @@ -1500,9 +1469,14 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + /* + * Rightly or wrongly, the VM_LOCKONFAULT case has never used + * faultin_page() to break COW, so it has no work to do here. + */ if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; + return nr_pages; + + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1569,10 +1543,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. - * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. * !FOLL_FORCE: Require proper access permissions. 
*/ - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON; if (write) gup_flags |= FOLL_WRITE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0bda8b43c95f0..9e3eff1511487 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1385,39 +1385,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags); - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * We don't mlock() pte-mapped THPs. This way we can avoid - * leaking mlocked pages into non-VM_LOCKED VMAs. - * - * For anon THP: - * - * In most cases the pmd is the only mapping of the page as we - * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for - * writable private mappings in populate_vma_page_range(). - * - * The only scenario when we have the page shared here is if we - * mlocking read-only mapping shared over fork(). We skip - * mlocking such pages. - * - * For file THP: - * - * We can expect PageDoubleMap() to be stable under page lock: - * for file pages we set it in page_add_file_rmap(), which - * requires page to be locked. 
- */ - - if (PageAnon(page) && compound_mapcount(page) != 1) - goto skip_mlock; - if (PageDoubleMap(page) || !page->mapping) - goto skip_mlock; - if (!trylock_page(page)) - goto skip_mlock; - if (page->mapping && !PageDoubleMap(page)) - mlock_vma_page(page); - unlock_page(page); - } -skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); From 8140d8d6b00ddee35da73ed376d00a283eaf5892 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:21 +1100 Subject: [PATCH 166/334] mm/munlock: delete munlock_vma_pages_all(), allow oomreap munlock_vma_pages_range() will still be required, when munlocking but not munmapping a set of pages; but when unmapping a pte, the mlock count will be maintained in much the same way as it will be maintained when mapping in the pte. Which removes the need for munlock_vma_pages_all() on mlocked vmas when munmapping or exiting: eliminating the catastrophic contention on i_mmap_rwsem, and the need for page lock on the pages. There is still a need to update locked_vm accounting according to the munmapped vmas when munmapping: do that in detach_vmas_to_be_unmapped(). exit_mmap() does not need locked_vm updates, so delete unlock_range(). And wasn't I the one who forbade the OOM reaper to attack mlocked vmas, because of the uncertainty in blocking on all those page locks? No fear of that now, so permit the OOM reaper on mlocked vmas. Link: https://lkml.kernel.org/r/d9a9f8c3-1ee0-4c81-7017-6ecb78554a7@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 16 ++-------------- mm/madvise.c | 5 +++++ mm/mlock.c | 4 ++-- mm/mmap.c | 32 ++------------------------------ mm/oom_kill.c | 2 +- 5 files changed, 12 insertions(+), 47 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 67059d49fed22..758e5457a36c7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -71,11 +71,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); -static inline bool can_madv_lru_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); -} - struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -398,12 +393,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -static inline void munlock_vma_pages_all(struct vm_area_struct *vma) -{ - munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); -} +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); /* * must be called with vma's mmap_lock held for read or write, and page locked. @@ -411,9 +402,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) extern void mlock_vma_page(struct page *page); extern void munlock_vma_page(struct page *page); -extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len); - /* * Clear the page's PageMlocked(). 
This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., diff --git a/mm/madvise.c b/mm/madvise.c index 1807778a5f70e..bed872a2ad5fa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -554,6 +554,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) diff --git a/mm/mlock.c b/mm/mlock.c index aec4ce7919dae..5d7ced8303beb 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -137,8 +137,8 @@ void munlock_vma_page(struct page *page) * Returns with VM_LOCKED cleared. Callers must be prepared to * deal with this. */ -void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { vma->vm_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/mmap.c b/mm/mmap.c index d445c1b9d6065..cca69e24ec88b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2674,6 +2674,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = NULL; do { vma_rb_erase(vma, &mm->mm_rb); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2778,22 +2780,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline void -unlock_range(struct vm_area_struct *start, unsigned long limit) -{ - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - - while (tmp && tmp->vm_start < limit) { - if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); - munlock_vma_pages_all(tmp); - } - - tmp = tmp->vm_next; - } -} - /* Munmap is split 
into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -2874,12 +2860,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return error; } - /* - * unlock any mlock()ed ranges before detaching vmas - */ - if (mm->locked_vm) - unlock_range(vma, end); - /* Detach vmas from rbtree */ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) downgrade = false; @@ -3147,20 +3127,12 @@ void exit_mmap(struct mm_struct *mm) * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. - * - * This needs to be done before calling unlock_range(), - * which clears VM_LOCKED, otherwise the oom reaper cannot - * reliably test it. */ (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); } mmap_write_lock(mm); - if (mm->locked_vm) - unlock_range(mm->mmap, ULONG_MAX); - arch_exit_mmap(mm); vma = mm->mmap; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 832fb330376ef..6b875acabd1e7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -526,7 +526,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_lru_vma(vma)) + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; /* From c1879365d325d0f44337fedcfffe8bf200092f95 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:22 +1100 Subject: [PATCH 167/334] mm/munlock: rmap call mlock_vma_page() munlock_vma_page() Add vma argument to mlock_vma_page() and munlock_vma_page(), make them inline functions which check (vma->vm_flags & VM_LOCKED) before calling mlock_page() and munlock_page() in mm/mlock.c. 
Add bool compound to mlock_vma_page() and munlock_vma_page(): this is because we have understandable difficulty in accounting pte maps of THPs, and if passed a PageHead page, mlock_page() and munlock_page() cannot tell whether it's a pmd map to be counted or a pte map to be ignored. Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the others, and use that to call mlock_vma_page() at the end of the page adds, and munlock_vma_page() at the end of page_remove_rmap() (end or beginning? unimportant, but end was easier for assertions in testing). No page lock is required (although almost all adds happen to hold it): delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s. Certainly page lock did serialize with page migration, but I'm having difficulty explaining why that was ever important. Mlock accounting on THPs has been hard to define, differed between anon and file, involved PageDoubleMap in some places and not others, required clear_page_mlock() at some points. Keep it simple now: just count the pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks. page_add_new_anon_rmap() callers unchanged: they have long been calling lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED handling (it also checks for not VM_SPECIAL: I think that's overcautious, and inconsistent with other checks, that mmap_region() already prevents VM_LOCKED on VM_SPECIAL; but haven't quite convinced myself to change it). Link: https://lkml.kernel.org/r/501673c-a5a-6c5f-ab65-38545dfb723d@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/rmap.h | 17 +++++++------ kernel/events/uprobes.c | 7 ++---- mm/huge_memory.c | 17 ++++++------- mm/hugetlb.c | 4 +-- mm/internal.h | 36 ++++++++++++++++++++++---- mm/khugepaged.c | 4 +-- mm/ksm.c | 12 +-------- mm/memory.c | 45 +++++++++++---------------------- mm/migrate.c | 9 ++----- mm/mlock.c | 21 ++++++---------- mm/rmap.c | 56 +++++++++++++++++++---------------------- mm/userfaultfd.c | 14 ++++++----- 12 files changed, 113 insertions(+), 129 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dc48aa8c2c947..ac29b076082b7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -167,18 +167,19 @@ struct anon_vma *page_get_anon_vma(struct page *page); */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, int); + unsigned long address, int flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); -void page_add_file_rmap(struct page *, bool); -void page_remove_rmap(struct page *, bool); - + unsigned long address, bool compound); +void page_add_file_rmap(struct page *, struct vm_area_struct *, + bool compound); +void page_remove_rmap(struct page *, struct vm_area_struct *, + bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); static inline void page_dup_rmap(struct page *page, bool compound) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 
6357c3580d07b..eed2f7437d963 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9e3eff1511487..14aa18f9d4164 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1582,7 +1582,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1967,7 +1967,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); @@ -2101,6 +2101,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); } smp_wmb(); /* make pte visible before pmd */ @@ -2108,7 +2111,7 @@ static void __split_huge_pmd_locked(struct 
vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, vma, false); put_page(page + i); } } @@ -2140,8 +2143,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (pmd_trans_huge(*pmd)) { if (!page) page = pmd_page(*pmd); - if (PageMlocked(page)) - clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); @@ -3065,7 +3066,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } @@ -3094,10 +3095,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else - page_add_file_rmap(new, true); + page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) - mlock_vma_page(new); /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3d450f8028233..1f0cca036f7fb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5012,7 +5012,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5257,7 +5257,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - page_remove_rmap(old_page, true); + page_remove_rmap(old_page, vma, true); 
hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); diff --git a/mm/internal.h b/mm/internal.h index 758e5457a36c7..75cec0ade36dc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -395,12 +395,35 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); - /* - * must be called with vma's mmap_lock held for read or write, and page locked. + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. + * + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed in lru_cache_add_inactive_or_unevictable(). + * + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. */ -extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +void mlock_page(struct page *page); +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + mlock_page(page); +} +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} /* * Clear the page's PageMlocked(). 
This can be useful in a situation where @@ -487,7 +510,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } -static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a325a646be33e..ab3ae46f5dbf4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -763,7 +763,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1502,7 +1502,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); } pte_unmap_unlock(start_pte, ptl); diff --git a/mm/ksm.c b/mm/ksm.c index c20bd4d9a0d9e..c5a4403b5dc9d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, err = replace_page(vma, page, kpage, orig_pte); } - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - 
lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - out_unlock: unlock_page(page); out: diff --git a/mm/memory.c b/mm/memory.c index ca9b2f5751f8b..13a2fe911bf84 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, ptep, pte); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); - /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. @@ -1377,7 +1374,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1397,10 +1394,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); continue; } @@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. 
*/ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1842,7 +1836,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { 
pte_unmap_unlock(start_pte, pte_lock); @@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -4007,7 +3991,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -4056,7 +4041,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } diff --git a/mm/migrate.c b/mm/migrate.c index 54b168a3b84a5..d3def9f044edc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,14 +248,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); - - if 
(PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -2337,7 +2332,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * drop page refcount. Page won't be freed, as we took * a reference just above. */ - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); put_page(page); if (pte_present(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 5d7ced8303beb..92f28258b4ae5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -78,17 +78,13 @@ void clear_page_mlock(struct page *page) } } -/* - * Mark page as mlocked if not already. - * If page on LRU, isolate and putback to move to unevictable list. +/** + * mlock_page - mlock a page + * @page: page to be mlocked, either a normal page or a THP head. */ -void mlock_vma_page(struct page *page) +void mlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); if (!TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); @@ -101,14 +97,11 @@ void mlock_vma_page(struct page *page) } /** - * munlock_vma_page - munlock a vma page - * @page: page to be unlocked, either a normal page or THP page head + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. 
*/ -void munlock_vma_page(struct page *page) +void munlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); if (TestClearPageMlocked(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index 7ce7f1946cff1..6cc8bf129f185 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1181,17 +1181,17 @@ void do_page_add_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) { + if (unlikely(PageKsm(page))) unlock_page_memcg(page); - return; - } /* address might be in next vma when migration races vma_adjust */ - if (first) + else if (first) __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); } /** @@ -1232,12 +1232,14 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @compound: charge the page as compound or small page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. 
*/ -void page_add_file_rmap(struct page *page, bool compound) +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { int i, nr = 1; @@ -1260,13 +1262,8 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { - struct page *head = compound_head(page); - VM_WARN_ON_ONCE(!PageLocked(page)); - - SetPageDoubleMap(head); - if (PageMlocked(page)) - clear_page_mlock(head); + SetPageDoubleMap(compound_head(page)); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1274,6 +1271,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); } static void page_remove_file_rmap(struct page *page, bool compound) @@ -1368,11 +1367,13 @@ static void page_remove_anon_compound_rmap(struct page *page) /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { lock_page_memcg(page); @@ -1414,6 +1415,8 @@ void page_remove_rmap(struct page *page, bool compound) */ out: unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); } /* @@ -1469,28 +1472,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + /* - * If the page is mlock()d, we cannot swap it out. + * If the page is in an mlock()d vma, we must not swap it out. 
*/ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * PTE-mapped THP are never marked as mlocked: so do - * not set it on a DoubleMap THP, nor on an Anon THP - * (which may still be PTE-mapped after DoubleMap was - * cleared). But stop unmapping even in those cases. - */ - if (!PageTransCompound(page) || (PageHead(page) && - !PageDoubleMap(page) && !PageAnon(page))) - mlock_vma_page(page); + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, false); page_vma_mapped_walk_done(&pvmw); ret = false; break; } - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; @@ -1668,7 +1664,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -1942,7 +1938,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -2078,7 +2074,7 @@ static bool page_make_device_exclusive_one(struct page *page, * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. 
*/ - page_remove_rmap(subpage, false); + page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 6ccc534d1c1cb..0cb8e5ef17136 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none(*dst_pte)) goto out_unlock; - if (page_in_cache) - page_add_file_rmap(page, false); - else + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } /* * Must happen after rmap, as mm_counter() checks mapping (via @@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(page)); - if (newly_allocated) - lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ From ea6f65faf203023ecaaa612c9b7821c571936170 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:22 +1100 Subject: [PATCH 168/334] mm/munlock: replace clear_page_mlock() by final clearance Placing munlock_vma_page() at the end of page_remove_rmap() shifts most of the munlocking to clear_page_mlock(), since PageMlocked is typically still set when mapcount has fallen to 0. That is not what we want: we want /proc/vmstat's unevictable_pgs_cleared to remain as a useful check on the integrity of the mlock/munlock protocol - small numbers are not surprising, but big numbers mean the protocol is not working.
That could be easily fixed by placing munlock_vma_page() at the start of page_remove_rmap(); but later in the series we shall want to batch the munlocking, and that too would tend to leave PageMlocked still set at the point when it is checked. So delete clear_page_mlock() now: leave it instead to release_pages() (and __page_cache_release()) to do this backstop clearing of Mlocked, when page refcount has fallen to 0. If a pinned page occasionally gets counted as Mlocked and Unevictable until it is unpinned, that's okay. A slightly regrettable side-effect of this change is that, since release_pages() and __page_cache_release() may be called at interrupt time, those places which update NR_MLOCK with interrupts enabled had better use mod_zone_page_state() than __mod_zone_page_state() (but holding the lruvec lock always has interrupts disabled). This change, forcing Mlocked off when refcount 0 instead of earlier when mapcount 0, is not fundamental: it can be reversed if performance or something else is found to suffer; but this is the easiest way to separate the stats - let's not complicate that without good reason. Link: https://lkml.kernel.org/r/ba15e6e-bdd5-7712-76b9-6278209e827a@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 12 ------------ mm/mlock.c | 30 ------------------------------ mm/rmap.c | 9 --------- mm/swap.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 24 insertions(+), 59 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 75cec0ade36dc..6e6a210a08382 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -425,17 +425,6 @@ static inline void munlock_vma_page(struct page *page, munlock_page(page); } -/* - * Clear the page's PageMlocked(). 
This can be useful in a situation where - * we want to unconditionally remove a page from the pagecache -- e.g., - * on truncation or freeing. - * - * It is legal to call this function for any page, mlocked or not. - * If called for a page that is still mapped by mlocked vmas, all we do - * is revert to lazy LRU behaviour -- semantics are not broken. - */ -extern void clear_page_mlock(struct page *page); - extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* @@ -509,7 +498,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, diff --git a/mm/mlock.c b/mm/mlock.c index 92f28258b4ae5..3c26473050a36 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -48,36 +48,6 @@ EXPORT_SYMBOL(can_do_mlock); * PageUnevictable is set to indicate the unevictable state. */ -/* - * LRU accounting for clear_page_mlock() - */ -void clear_page_mlock(struct page *page) -{ - int nr_pages; - - if (!TestClearPageMlocked(page)) - return; - - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); - /* - * The previous TestClearPageMlocked() corresponds to the smp_mb() - * in __pagevec_lru_add_fn(). - * - * See __pagevec_lru_add_fn for more explanation. - */ - if (!isolate_lru_page(page)) { - putback_lru_page(page); - } else { - /* - * We lost the race. the page already moved to evictable list. - */ - if (PageUnevictable(page)) - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - } -} - /** * mlock_page - mlock a page * @page: page to be mlocked, either a normal page or a THP head. 
diff --git a/mm/rmap.c b/mm/rmap.c index 6cc8bf129f185..5442a5c97a857 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1315,9 +1315,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); - - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1357,9 +1354,6 @@ static void page_remove_anon_compound_rmap(struct page *page) nr = thp_nr_pages(page); } - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } @@ -1398,9 +1392,6 @@ void page_remove_rmap(struct page *page, */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (PageTransCompound(page)) deferred_split_huge_page(compound_head(page)); diff --git a/mm/swap.c b/mm/swap.c index bcf3ac288b56d..ff4810e4a4bca 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { }; /* - * This path almost never happens for VM activity - pages are normally - * freed via pagevecs. But it gets used by networking. + * This path almost never happens for VM activity - pages are normally freed + * via pagevecs. But it gets used by networking - and for compound pages. 
*/ static void __page_cache_release(struct page *page) { @@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page) __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } + /* See comment on PageMlocked in release_pages() */ + if (unlikely(PageMlocked(page))) { + int nr_pages = thp_nr_pages(page); + + __ClearPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); + } __ClearPageWaiters(page); } @@ -489,12 +497,8 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; if (unlikely(unevictable) && !TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); - /* - * We use the irq-unsafe __mod_zone_page_state because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } lru_cache_add(page); @@ -969,6 +973,18 @@ void release_pages(struct page **pages, int nr) __clear_page_lru_flags(page); } + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. 
+ */ if (unlikely(PageMlocked(page))) { + __ClearPageMlocked(page); + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + } + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); From 1b412d12edeceab8ea485ea1e959b8828ebe494f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:22 +1100 Subject: [PATCH 169/334] mm/munlock: maintain page->mlock_count while unevictable Previous patches have been preparatory: now implement page->mlock_count. The ordering of the "Unevictable LRU" is of no significance, and there is no point holding unevictable pages on a list: place page->mlock_count to overlay page->lru.prev (since page->lru.next is overlaid by compound_head, which needs to be even so as not to satisfy PageTail - though 2 could be added instead of 1 for each mlock, if that's ever an improvement). But it's only safe to rely on or modify page->mlock_count while lruvec lock is held and page is on unevictable "LRU" - we can save lots of edits by continuing to pretend that there's an imaginary LRU here (there is an unevictable count which still needs to be maintained, but not a list). The mlock_count technique suffers from an unreliability much like with page_mlock(): while someone else has the page off LRU, not much can be done. As before, err on the safe side (behave as if mlock_count 0), and let try_to_unmap_one() move the page to unevictable if reclaim finds out later on - a few misplaced pages don't matter, what we want to avoid is imbalancing reclaim by flooding evictable lists with unevictable pages. I am not a fan of "if (!isolate_lru_page(page)) putback_lru_page(page);": if we have taken lruvec lock to get the page off its present list, then we save everyone trouble (and however many extra atomic ops) by putting it on its destination list immediately.
Link: https://lkml.kernel.org/r/cd14eda-5be0-b8b9-4273-cf28818cfef9@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm_inline.h | 11 +++++-- include/linux/mm_types.h | 19 +++++++++-- mm/huge_memory.c | 5 ++- mm/memcontrol.c | 3 +- mm/mlock.c | 68 +++++++++++++++++++++++++++++++-------- mm/mmzone.c | 7 ++++ mm/swap.c | 1 + 7 files changed, 92 insertions(+), 22 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2ad9b28499b13..0668ceb1736b9 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); - list_add(&folio->lru, &lruvec->lists[lru]); + if (lru != LRU_UNEVICTABLE) + list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list(struct page *page, @@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); + /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } @@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { - list_del(&folio->lru); - update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + enum lru_list lru = folio_lru_list(folio); + + if (lru != LRU_UNEVICTABLE) + list_del(&folio->lru); + update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } diff --git a/include/linux/mm_types.h 
b/include/linux/mm_types.h index 5140e5feb4866..475bdb2827697 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -85,7 +85,16 @@ struct page { * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ - struct list_head lru; + union { + struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ + struct { + /* Always even, to negate PageTail */ + void *__filler; + /* Count page's or folio's mlocks */ + unsigned int mlock_count; + }; + }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -241,7 +250,13 @@ struct folio { struct { /* public: */ unsigned long flags; - struct list_head lru; + union { + struct list_head lru; + struct { + void *__filler; + unsigned int mlock_count; + }; + }; struct address_space *mapping; pgoff_t index; void *private; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14aa18f9d4164..be003d6099923 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2275,8 +2275,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!PageLRU(head)); + if (PageUnevictable(tail)) + tail->mlock_count = 0; + else + list_add_tail(&tail->lru, &head->lru); SetPageLRU(tail); - list_add_tail(&tail->lru, &head->lru); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17398e7601f6c..a03959f9881f6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1257,8 +1257,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added - * to or just after a page is removed from an lru list (that ordering being - * so as to allow it to check that lru_size 0 is consistent with list_empty). + * to or just after a page is removed from an lru list. 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages) diff --git a/mm/mlock.c b/mm/mlock.c index 3c26473050a36..f8a3a54687dd6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -54,16 +54,35 @@ EXPORT_SYMBOL(can_do_mlock); */ void mlock_page(struct page *page) { + struct lruvec *lruvec; + int nr_pages = thp_nr_pages(page); + VM_BUG_ON_PAGE(PageTail(page), page); if (!TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - if (!isolate_lru_page(page)) - putback_lru_page(page); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return; + + lruvec = folio_lruvec_lock_irq(page_folio(page)); + if (PageUnevictable(page)) { + page->mlock_count++; + goto out; } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = 1; + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); +out: + SetPageLRU(page); + unlock_page_lruvec_irq(lruvec); } /** @@ -72,19 +91,40 @@ void mlock_page(struct page *page) */ void munlock_page(struct page *page) { + struct lruvec *lruvec; + int nr_pages = thp_nr_pages(page); + VM_BUG_ON_PAGE(PageTail(page), page); + lock_page_memcg(page); + lruvec = folio_lruvec_lock_irq(page_folio(page)); + if (PageLRU(page) && PageUnevictable(page)) { + /* Then mlock_count is maintained, but might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + if (TestClearPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (!isolate_lru_page(page)) { - putback_lru_page(page); - count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); - 
} else if (PageUnevictable(page)) { - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - } + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (PageLRU(page) || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (PageLRU(page) && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } +out: + unlock_page_lruvec_irq(lruvec); + unlock_page_memcg(page); } /* diff --git a/mm/mmzone.c b/mm/mmzone.c index d8a9b0e1b5267..0ae7571e35abb 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); + /* + * The "Unevictable LRU" is imaginary: though its size is maintained, + * it is never scanned, and unevictable pages are not threaded on it + * (so that their lru fields can be reused to hold mlock_count). + * Poison its list head, so that any operations on it would crash. 
+ */ + list_del(&lruvec->lists[LRU_UNEVICTABLE]); } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) diff --git a/mm/swap.c b/mm/swap.c index ff4810e4a4bca..682a03301a2c0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1062,6 +1062,7 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) } else { folio_clear_active(folio); folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } From 5ca3031cbdca549e0408c7426f2d347b1464a93f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:22 +1100 Subject: [PATCH 170/334] mm/munlock: mlock_pte_range() when mlocking or munlocking Fill in missing pieces: reimplementation of munlock_vma_pages_range(), required to lower the mlock_counts when munlocking without munmapping; and its complement, implementation of mlock_vma_pages_range(), required to raise the mlock_counts on pages already there when a range is mlocked. Combine them into just the one function mlock_vma_pages_range(), using walk_page_range() to run mlock_pte_range(). This approach fixes the "Very slow unlockall()" of unpopulated PROT_NONE areas, reported in https://lore.kernel.org/linux-mm/70885d37-62b7-748b-29df-9e94f3291736@gmail.com/ Munlock clears VM_LOCKED at the start, under exclusive mmap_lock; but if a racing truncate or holepunch (depending on i_mmap_rwsem) gets to the pte first, it will not try to munlock the page: leaving release_pages() to correct it when the last reference to the page is gone - that's okay, a page is not evictable anyway while it is held by an extra reference. Mlock sets VM_LOCKED at the start, under exclusive mmap_lock; but if a racing remove_migration_pte() or try_to_unmap_one() (depending on i_mmap_rwsem) gets to the pte first, it will try to mlock the page, then mlock_pte_range() mlock it a second time. 
This is harder to reproduce, but a more serious race because it could leave the page unevictable indefinitely though the area is munlocked afterwards. Guard against it by setting the (inappropriate) VM_IO flag, and modifying mlock_vma_page() to decline such vmas. Link: https://lkml.kernel.org/r/d39f6e4d-aa4f-731a-68ee-e77cdbf1d7bb@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 3 +- mm/mlock.c | 111 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 91 insertions(+), 23 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 6e6a210a08382..47715cd599b70 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -412,7 +412,8 @@ void mlock_page(struct page *page); static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { - if (unlikely(vma->vm_flags & VM_LOCKED) && + /* VM_IO check prevents migration from double-counting during mlock */ + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) && (compound || !PageTransCompound(page))) mlock_page(page); } diff --git a/mm/mlock.c b/mm/mlock.c index f8a3a54687dd6..581ea8bf1b83c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -127,25 +128,91 @@ void munlock_page(struct page *page) unlock_page_memcg(page); } +static int mlock_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *start_pte, *pte; + struct page *page; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (!pmd_present(*pmd)) + goto out; + if (is_huge_zero_pmd(*pmd)) + goto out; + page = pmd_page(*pmd); + if 
(vma->vm_flags & VM_LOCKED) + mlock_page(page); + else + munlock_page(page); + goto out; + } + + start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page) + continue; + if (PageTransCompound(page)) + continue; + if (vma->vm_flags & VM_LOCKED) + mlock_page(page); + else + munlock_page(page); + } + pte_unmap(start_pte); +out: + spin_unlock(ptl); + cond_resched(); + return 0; +} + /* - * munlock_vma_pages_range() - munlock all pages in the vma range.' - * @vma - vma containing range to be munlock()ed. + * mlock_vma_pages_range() - mlock any pages already in the range, + * or munlock all pages in the range. + * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range - * @end - end of range in @vma. - * - * For mremap(), munmap() and exit(). + * @end - end of range in @vma + * @newflags - the new set of flags for @vma. * - * Called with @vma VM_LOCKED. - * - * Returns with VM_LOCKED cleared. Callers must be prepared to - * deal with this. + * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; + * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ -static void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t newflags) { - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + static const struct mm_walk_ops mlock_walk_ops = { + .pmd_entry = mlock_pte_range, + }; - /* Reimplementation to follow in later commit */ + /* + * There is a slight chance that concurrent page migration, + * or page reclaim finding a page of this now-VM_LOCKED vma, + * will call mlock_vma_page() and raise page's mlock_count: + * double counting, leaving the page unevictable indefinitely. 
+ * Communicate this danger to mlock_vma_page() with VM_IO, + * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. + * mmap_lock is held in write mode here, so this weird + * combination should not be visible to other mmap_lock users; + * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. + */ + if (newflags & VM_LOCKED) + newflags |= VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); + + lru_add_drain(); + walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); + lru_add_drain(); + + if (newflags & VM_IO) { + newflags &= ~VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); + } } /* @@ -164,10 +231,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = !!(newflags & VM_LOCKED); - vm_flags_t old_flags = vma->vm_flags; + vm_flags_t oldflags = vma->vm_flags; - if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ @@ -199,9 +265,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!lock) + if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; - else if (old_flags & VM_LOCKED) + else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; @@ -211,11 +277,12 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, * set VM_LOCKED, populate_vma_page_range will bring it back. 
*/ - if (lock) + if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + /* No work to do, and mlocking twice would be wrong */ vma->vm_flags = newflags; - else - munlock_vma_pages_range(vma, start, end); - + } else { + mlock_vma_pages_range(vma, start, end, newflags); + } out: *prev = vma; return ret; From 31e2ed74337d368cb2e981c40f0bee82a6186b37 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:23 +1100 Subject: [PATCH 171/334] mm/migrate: __unmap_and_move() push good newpage to LRU Compaction, NUMA page movement, THP collapse/split, and memory failure do isolate unevictable pages from their "LRU", losing the record of mlock_count in doing so (isolators are likely to use page->lru for their own private lists, so mlock_count has to be presumed lost). That's unfortunate, and we should put in some work to correct that: one can imagine a function to build up the mlock_count again - but it would require i_mmap_rwsem for read, so be careful where it's called. Or page_referenced_one() and try_to_unmap_one() might do that extra work. But one place that can very easily be improved is page migration's __unmap_and_move(): a small adjustment to where the successful new page is put back on LRU, and its mlock_count (if any) is built back up by remove_migration_ptes(). Link: https://lkml.kernel.org/r/269eec24-978a-984a-8a85-1d29f36ad343@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d3def9f044edc..d6c3ec428f2c7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1031,6 +1031,21 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (!page_mapped(page)) rc = move_to_new_page(newpage, page, mode); + /* + * When successful, push newpage to LRU immediately: so that if it + * turns out to be an mlocked page, remove_migration_ptes() will + * automatically build up the correct newpage->mlock_count for it. + * + * We would like to do something similar for the old page, when + * unsuccessful, and other cases when a page has been temporarily + * isolated from the unevictable LRU: but this case is the easiest. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + lru_cache_add(newpage); + if (page_was_mapped) + lru_add_drain(); + } + if (page_was_mapped) remove_migration_ptes(page, rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); @@ -1044,20 +1059,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, unlock_page(page); out: /* - * If migration is successful, decrease refcount of the newpage + * If migration is successful, decrease refcount of the newpage, * which will not free the page because new page owner increased - * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. Use the old state of the isolated source page to - * determine if we migrated a LRU page. newpage was already unlocked - * and possibly modified by its owner - don't rely on the page - * state. + * refcounter. 
*/ - if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(!is_lru)) - put_page(newpage); - else - putback_lru_page(newpage); - } + if (rc == MIGRATEPAGE_SUCCESS) + put_page(newpage); return rc; } From d80c0ad21f8b4e9927db42551ee2fdf15f0acaa3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:23 +1100 Subject: [PATCH 172/334] mm/munlock: delete smp_mb() from __pagevec_lru_add_fn() My reading of comment on smp_mb__after_atomic() in __pagevec_lru_add_fn() says that it can now be deleted; and that remains so when the next patch is added. Link: https://lkml.kernel.org/r/28a7c6ff-6270-9060-8df0-862bdcaac366@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/swap.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 682a03301a2c0..3f770b1ea2c12 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1025,37 +1025,18 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + folio_set_lru(folio); /* - * A folio becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. - * 2) Before acquiring LRU lock to put the folio on the correct LRU - * and then - * a) do PageLRU check with lock [check_move_unevictable_pages] - * b) do PageLRU check before lock [clear_page_mlock] - * - * (1) & (2a) are ok as LRU lock will serialize them. 
For (2b), we need - * following strict ordering: - * - * #0: __pagevec_lru_add_fn #1: clear_page_mlock - * - * folio_set_lru() folio_test_clear_mlocked() - * smp_mb() // explicit ordering // above provides strict - * // ordering - * folio_test_mlocked() folio_test_lru() + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests PageMlocked, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because munlock_page() only clears PageMlocked while the LRU + * lock is held. * - * - * if '#1' does not observe setting of PG_lru by '#0' and - * fails isolation, the explicit barrier will make sure that - * folio_evictable check will put the folio on the correct - * LRU. Without smp_mb(), folio_set_lru() can be reordered - * after folio_test_mlocked() check and can make '#1' fail the - * isolation of the folio whose mlocked bit is cleared (#0 is - * also looking at the same folio) and the evictable folio will - * be stranded on an unevictable LRU. + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear PageMlocked after + * put_page_testzero() has excluded any other users of the page.) */ - folio_set_lru(folio); - smp_mb__after_atomic(); - if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); From 20c014c7531564c70eba8fb0e8797903efd676ca Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:23 +1100 Subject: [PATCH 173/334] mm/munlock: mlock_page() munlock_page() batch by pagevec A weakness of the page->mlock_count approach is the need for lruvec lock while holding page table lock. That is not an overhead we would allow on normal pages, but I think acceptable just for pages in an mlocked area. But let's try to amortize the extra cost by gathering on per-cpu pagevec before acquiring the lruvec lock. 
I have an unverified conjecture that the mlock pagevec might work out well for delaying the mlock processing of new file pages until they have got off lru_cache_add()'s pagevec and on to LRU. The initialization of page->mlock_count is subject to races and awkward: 0 or !!PageMlocked or 1? Was it wrong even in the implementation before this commit, which just widens the window? I haven't gone back to think it through. Maybe someone can point out a better way to initialize it. Bringing lru_cache_add_inactive_or_unevictable()'s mlock initialization into mm/mlock.c has helped: mlock_new_page(), using the mlock pagevec, rather than lru_cache_add()'s pagevec. Experimented with various orderings: the right thing seems to be for mlock_page() and mlock_new_page() to TestSetPageMlocked before adding to pagevec, but munlock_page() to leave TestClearPageMlocked to the later pagevec processing. Dropped the VM_BUG_ON_PAGE(PageTail)s this time around: they have made their point, and the thp_nr_page()s already contain a VM_BUG_ON_PGFLAGS() for that. This still leaves acquiring lruvec locks under page table lock each time the pagevec fills (or a THP is added): which I suppose is rather silly, since they sit on pagevec waiting to be processed long after page table lock has been dropped; but I'm disinclined to uglify the calling sequence until some load shows an actual problem with it (nothing wrong with taking lruvec lock under page table lock, just "nicer" to do it less). Link: https://lkml.kernel.org/r/1abb94ee-fe72-dba9-3eb0-d1e576d148e6@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/internal.h | 9 ++- mm/mlock.c | 212 ++++++++++++++++++++++++++++++++++++++++++-------- mm/swap.c | 27 ++++--- 3 files changed, 201 insertions(+), 47 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 47715cd599b70..827a2e4133c18 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -402,7 +402,8 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * * mlock is usually called at the end of page_add_*_rmap(), * munlock at the end of page_remove_rmap(); but new anon - * pages are managed in lru_cache_add_inactive_or_unevictable(). + * pages are managed by lru_cache_add_inactive_or_unevictable() + * calling mlock_new_page(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -425,6 +426,9 @@ static inline void munlock_vma_page(struct page *page, (compound || !PageTransCompound(page))) munlock_page(page); } +void mlock_new_page(struct page *page); +bool need_mlock_page_drain(int cpu); +void mlock_page_drain(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -503,6 +507,9 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } +static inline void mlock_new_page(struct page *page) { } +static inline bool need_mlock_page_drain(int cpu) { return false; } +static inline void mlock_page_drain(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/mlock.c b/mm/mlock.c index 581ea8bf1b83c..93d616ba3e224 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,6 +28,8 @@ #include "internal.h" +static DEFINE_PER_CPU(struct pagevec, mlock_pvec); + bool 
can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) @@ -49,57 +51,79 @@ EXPORT_SYMBOL(can_do_mlock); * PageUnevictable is set to indicate the unevictable state. */ -/** - * mlock_page - mlock a page - * @page: page to be mlocked, either a normal page or a THP head. - */ -void mlock_page(struct page *page) +static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) { - struct lruvec *lruvec; - int nr_pages = thp_nr_pages(page); + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return lruvec; - VM_BUG_ON_PAGE(PageTail(page), page); + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + if (unlikely(page_evictable(page))) { + /* + * This is a little surprising, but quite possible: + * PageMlocked must have got cleared already by another CPU. + * Could this page be on the Unevictable LRU? I'm not sure, + * but move it now if so. 
+ */ + if (PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, + thp_nr_pages(page)); + } + goto out; } - /* There is nothing more we can do while it's off LRU */ - if (!TestClearPageLRU(page)) - return; - - lruvec = folio_lruvec_lock_irq(page_folio(page)); if (PageUnevictable(page)) { - page->mlock_count++; + if (PageMlocked(page)) + page->mlock_count++; goto out; } del_page_from_lru_list(page, lruvec); ClearPageActive(page); SetPageUnevictable(page); - page->mlock_count = 1; + page->mlock_count = !!PageMlocked(page); add_page_to_lru_list(page, lruvec); - __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); out: SetPageLRU(page); - unlock_page_lruvec_irq(lruvec); + return lruvec; } -/** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. - */ -void munlock_page(struct page *page) +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; + + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) { - struct lruvec *lruvec; int nr_pages = thp_nr_pages(page); + bool isolated = false; - VM_BUG_ON_PAGE(PageTail(page), page); + if (!TestClearPageLRU(page)) + goto munlock; - lock_page_memcg(page); - lruvec = folio_lruvec_lock_irq(page_folio(page)); - if (PageLRU(page) && PageUnevictable(page)) { + isolated = true; + lruvec = 
folio_lruvec_relock_irq(page_folio(page), lruvec); + + if (PageUnevictable(page)) { /* Then mlock_count is maintained, but might undercount */ if (page->mlock_count) page->mlock_count--; @@ -108,24 +132,144 @@ void munlock_page(struct page *page) } /* else assume that was the last mlock: reclaim will fix it if not */ +munlock: if (TestClearPageMlocked(page)) { __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (PageLRU(page) || !PageUnevictable(page)) + if (isolated || !PageUnevictable(page)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* page_evictable() has to be checked *after* clearing Mlocked */ - if (PageLRU(page) && PageUnevictable(page) && page_evictable(page)) { + if (isolated && PageUnevictable(page) && page_evictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); add_page_to_lru_list(page, lruvec); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: - unlock_page_lruvec_irq(lruvec); - unlock_page_memcg(page); + if (isolated) + SetPageLRU(page); + return lruvec; +} + +/* + * Flags held in the low bits of a struct page pointer on the mlock_pvec. + */ +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +#define mlock_lru(page) ((struct page *)((unsigned long)page + LRU_PAGE)) +#define mlock_new(page) ((struct page *)((unsigned long)page + NEW_PAGE)) + +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). 
+ */ +static void mlock_pagevec(struct pagevec *pvec) +{ + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); + } + + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + +void mlock_page_drain(int cpu) +{ + struct pagevec *pvec; + + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); +} + +bool need_mlock_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(mlock_pvec, cpu)); +} + +/** + * mlock_page - mlock a page already on (or temporarily off) LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ +void mlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + if (!TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + get_page(page); + if (!pagevec_add(pvec, mlock_lru(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. 
+ */ +void mlock_new_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + + get_page(page); + if (!pagevec_add(pvec, mlock_new(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. + */ +void munlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + /* + * TestClearPageMlocked(page) must be left to __munlock_page(), + * which will check whether the page is multiply mlocked. + */ + + get_page(page); + if (!pagevec_add(pvec, page) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, diff --git a/mm/swap.c b/mm/swap.c index 3f770b1ea2c12..842d5cd92cf64 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -490,18 +490,12 @@ EXPORT_SYMBOL(folio_add_lru); void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { - bool unevictable; - VM_BUG_ON_PAGE(PageLRU(page), page); - unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; - if (unlikely(unevictable) && !TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - } - lru_cache_add(page); + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) + mlock_new_page(page); + else + lru_cache_add(page); } /* @@ -640,6 +634,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + mlock_page_drain(cpu); } /** @@ -842,6 +837,7 @@ inline void __lru_add_drain_all(bool force_all_cpus) 
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || + need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -1030,7 +1026,7 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests PageMlocked, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because munlock_page() only clears PageMlocked while the LRU + * not, because __munlock_page() only clears PageMlocked while the LRU * lock is held. * * (That is not true of __page_cache_release(), and not necessarily @@ -1043,7 +1039,14 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) } else { folio_clear_active(folio); folio_set_unevictable(folio); - folio->mlock_count = !!folio_test_mlocked(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } From 00b7c8a6293956e8a7ca711be25ebff26a9ecb03 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:23 +1100 Subject: [PATCH 174/334] mm-munlock-mlock_page-munlock_page-batch-by-pagevec-fix implement mlock_lru() and mlock_new() as inlines, per Matthew Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mlock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 93d616ba3e224..d28e56529e5b3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -159,8 +159,15 @@ static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) */ #define LRU_PAGE 0x1 #define NEW_PAGE 0x2 -#define mlock_lru(page) ((struct page *)((unsigned long)page + LRU_PAGE)) -#define mlock_new(page) ((struct page *)((unsigned long)page + NEW_PAGE)) +static inline struct page *mlock_lru(struct page *page) +{ + return (struct page *)((unsigned long)page + LRU_PAGE); +} + +static inline struct page *mlock_new(struct page *page) +{ + return (struct page *)((unsigned long)page + NEW_PAGE); +} /* * mlock_pagevec() is derived from pagevec_lru_move_fn(): From 8cc303b585287cdcfe24191ee2c7e06e03bebffa Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:23 +1100 Subject: [PATCH 175/334] mm/munlock: page migration needs mlock pagevec drained Page migration of a VM_LOCKED page tends to fail, because when the old page is unmapped, it is put on the mlock pagevec with raised refcount, which then fails the freeze. At first I thought this would be fixed by a local mlock_page_drain() at the upper rmap_walk() level - which would have nicely batched all the munlocks of that page; but tests show that the task can too easily move to another cpu, leaving pagevec residue behind which fails the migration. So try_to_migrate_one() drain the local pagevec after page_remove_rmap() from a VM_LOCKED vma; and do the same in try_to_unmap_one(), whose TTU_IGNORE_MLOCK users would want the same treatment; and do the same in remove_migration_pte() - not important when successfully inserting a new page, but necessary when hoping to retry after failure. 
Any new pagevec runs the risk of adding a new way of stranding, and we might discover other corners where mlock_page_drain() or lru_add_drain() would now help. Link: https://lkml.kernel.org/r/9e2ed861-951a-6e86-e298-a09d2d8e9b9f@google.com Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 2 ++ mm/rmap.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index d6c3ec428f2c7..7e8a6b0a531df 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -251,6 +251,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); diff --git a/mm/rmap.c b/mm/rmap.c index 5442a5c97a857..714bfdc72c7b7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1656,6 +1656,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); put_page(page); } @@ -1930,6 +1932,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); put_page(page); } From a014878e7e506d06ea93e23d888c3c57229f055c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:24 +1100 Subject: [PATCH 176/334] mm/thp: collapse_file() do 
try_to_unmap(TTU_BATCH_FLUSH) collapse_file() is using unmap_mapping_pages(1) on each small page found mapped, unlike others (reclaim, migration, splitting, memory-failure) who use try_to_unmap(). There are four advantages to try_to_unmap(): first, its TTU_IGNORE_MLOCK option now avoids leaving mlocked page in pagevec; second, its vma lookup uses i_mmap_lock_read() not i_mmap_lock_write(); third, it breaks out early if page is not mapped everywhere it might be; fourth, its TTU_BATCH_FLUSH option can be used, as in page reclaim, to save up all the TLB flushing until all of the pages have been unmapped. Wild guess: perhaps it was originally written to use try_to_unmap(), but hit the VM_BUG_ON_PAGE(page_mapped) after unmapping, because without TTU_SYNC it may skip page table locks; but unmap_mapping_pages() never skips them, so fixed the issue. I did once hit that VM_BUG_ON_PAGE() since making this change: we could pass TTU_SYNC here, but I think just delete the check - the race is very rare, this is an ordinary small page so we don't need to be so paranoid about mapcount surprises, and the page_ref_freeze() just below already handles the case adequately. Link: https://lkml.kernel.org/r/c390e7b-7648-b3e9-9ae1-87c9b9e95ed4@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/khugepaged.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ab3ae46f5dbf4..7d45d463acf55 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1823,13 +1823,12 @@ static void collapse_file(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_pages(mapping, index, 1, false); + try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); - VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: @@ -1893,6 +1892,13 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); xa_unlocked: + /* + * If collapse is successful, flush must be done now before copying. + * If collapse is unsuccessful, does flush actually need to be done? + * Do it anyway, to clear the state. + */ + try_to_unmap_flush(); + if (result == SCAN_SUCCEED) { struct page *page, *tmp; From 00c00f90b15f7d5024cca4d3624cad078d5d68a6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Feb 2022 15:31:24 +1100 Subject: [PATCH 177/334] mm/thp: shrink_page_list() avoid splitting VM_LOCKED THP 4.8 commit 7751b2da6be0 ("vmscan: split file huge pages before paging them out") inserted a split_huge_page_to_list() into shrink_page_list() without considering the mlock case: no problem if the page has already been marked as Mlocked (the !page_evictable check much higher up will have skipped all this), but it has always been the case that races or omissions in setting Mlocked can rely on page reclaim to detect this and correct it before actually reclaiming - and that remains so, but what a shame if a hugepage is needlessly split before discovering it. 
It is surprising that page_check_references() returns PAGEREF_RECLAIM when VM_LOCKED, but there was a good reason for that: try_to_unmap_one() is where the condition is detected and corrected; and until now it could not be done in page_referenced_one(), because that does not always have the page locked. Now that mlock's requirement for page lock has gone, copy try_to_unmap_one()'s mlock restoration into page_referenced_one(), and let page_check_references() return PAGEREF_ACTIVATE in this case. But page_referenced_one() may find a pte mapping one part of a hugepage: what hold should a pte mapped in a VM_LOCKED area exert over the entire huge page? That's debatable. The approach taken here is to treat that pte mapping in page_referenced_one() as if not VM_LOCKED, and if no VM_LOCKED pmd mapping is found later in the walk, and lack of reference permits, then PAGEREF_RECLAIM take it to attempted splitting as before. Link: https://lkml.kernel.org/r/531d13ee-bc7d-329a-9748-5e272f699d78@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/rmap.c | 7 +++++-- mm/vmscan.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 714bfdc72c7b7..c7921c102bc0e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -812,7 +812,10 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; - if (vma->vm_flags & VM_LOCKED) { + if ((vma->vm_flags & VM_LOCKED) && + (!PageTransCompound(page) || !pvmw.pte)) { + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, !pvmw.pte); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ @@ -851,7 +854,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, if (referenced) { pra->referenced++; - pra->vm_flags |= vma->vm_flags; + pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e1469887afa8..2443ebaf17671 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1375,11 +1375,11 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* - * Mlock lost the isolation race with us. Let try_to_unmap() - * move the page to the unevictable list. + * The supposedly reclaimable page was found to be in a VM_LOCKED vma. + * Let the page, now marked Mlocked, be moved to the unevictable list. 
*/ if (vm_flags & VM_LOCKED) - return PAGEREF_RECLAIM; + return PAGEREF_ACTIVATE; if (referenced_ptes) { /* From cdb057ace4bf9af0e21dc5fa04e3eed1092bf800 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:24 +1100 Subject: [PATCH 178/334] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page Patch series "Free the 2nd vmemmap page associated with each HugeTLB page", v7. This series can minimize the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB. It is a nice gain. Comments and reviews are welcome. Thanks. The main implementation and details can refer to the commit log of patch 1. In this series, I have changed the following four helpers, the following table shows the impact of the overhead of those helpers. +------------------+-----------------------+ | APIs | head page | tail page | +------------------+-----------+-----------+ | PageHead() | Y | N | +------------------+-----------+-----------+ | PageTail() | Y | N | +------------------+-----------+-----------+ | PageCompound() | N | N | +------------------+-----------+-----------+ | compound_head() | Y | N | +------------------+-----------+-----------+ Y: Overhead is increased. N: Overhead is _NOT_ increased. It shows that the overhead of those helpers on a tail page don't change between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off". But the overhead on a head page will be increased when "hugetlb_free_vmemmap=on" (except PageCompound()). So I believe that Matthew Wilcox's folio series will help with this. The users of PageHead() and PageTail() are much less than compound_head() and most users of PageTail() are VM_BUG_ON(), so I have done some tests about the overhead of compound_head() on head pages. 
I have tested the overhead of calling compound_head() on a head page, which is 2.11ns (Measure the call time of 10 million times compound_head(), and then average). For a head page whose address is not aligned with PAGE_SIZE or a non-compound page, the overhead of compound_head() is 2.54ns which is increased by 20%. For a head page whose address is aligned with PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by 40%. Most pages are the former. I do not think the overhead is significant since the overhead of compound_head() itself is low. This patch (of 5): This patch minimizes the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB (2MB type). After the feature of "Free some vmemmap pages of HugeTLB page" is enabled, the mapping of the vmemmap addresses associated with a 2MB HugeTLB page becomes the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | ----------------^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | 3 | ------------------+ | | | | | | +-----------+ | | | | | | | 4 | --------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | ----------------------+ | | | | +-----------+ | | | | | 6 | ------------------------+ | | | +-----------+ | | | | 7 | --------------------------+ | | +-----------+ | | | | | | +-----------+ As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and remapped. However, the 2nd vmemmap page frame can also be freed to the buddy allocator, so we can change the mapping from the figure above to the figure below.
HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | | 2 | -----------------+ | | | | | | | +-----------+ | | | | | | | | 3 | -------------------+ | | | | | | +-----------+ | | | | | | | 4 | ---------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | -----------------------+ | | | | +-----------+ | | | | | 6 | -------------------------+ | | | +-----------+ | | | | 7 | ---------------------------+ | | +-----------+ | | | | | | +-----------+ After we do this, all tail vmemmap pages (1-7) are mapped to the head vmemmap page frame (0). In other words, there are more than one page struct with PG_head associated with each HugeTLB page. We __know__ that there is only one head page struct, the tail page structs with PG_head are fake head page structs. We need an approach to distinguish between those two different types of page structs so that compound_head(), PageHead() and PageTail() can work properly if the parameter is the tail page struct but with PG_head. The following code snippet describes how to distinguish between real and fake head page struct. if (test_bit(PG_head, &page->flags)) { unsigned long head = READ_ONCE(page[1].compound_head); if (head & 1) { if (head == (unsigned long)page + 1) ==> head page struct else ==> tail page struct } else ==> head page struct } We can safely access the field of the @page[1] with PG_head because the @page is a compound page composed with at least two contiguous pages. 
Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Mike Kravetz Cc: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Chen Huang Cc: Bodeddula Balasubramaniam Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Xiongchun Duan Cc: Fam Zheng Cc: Qi Zheng Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/kernel-parameters.txt | 2 +- include/linux/page-flags.h | 78 ++++++++++++++++++- mm/hugetlb_vmemmap.c | 62 ++++++++------- mm/sparse-vmemmap.c | 21 +++++ 4 files changed, 130 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f5a27f067db9e..85f096fddad9d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1625,7 +1625,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) } on: enable the feature diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1c3b6e5c8bfd3..111e453f23d22 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,13 +190,69 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. 
We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. 
+ */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) @@ -231,12 +287,13 @@ static inline unsigned long _compound_head(const struct page *page) static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -695,7 +752,20 @@ static inline bool test_set_page_writeback(struct page *page) return set_page_writeback(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline bool folio_test_head(struct folio *folio) +{ + return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); +} + +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5b..4977f5a520c22 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. * @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the 
buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. 
*/ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +bool hugetlb_free_vmemmap_enabled __read_mostly = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); return 0; } @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head); @@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index db6df27c852a7..e881f5db70915 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. 
The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } From 49e987e20fc5375c9919a230a03be2a029f7001b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:24 +1100 Subject: [PATCH 179/334] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key The page_fixed_fake_head() is used throughout memory management and the conditional check requires checking a global variable, although the overhead of this check may be small, it increases when the memory cache comes under pressure. Also, the global variable will not be modified after system boot, so it is very appropriate to use static key mechanism.
Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/hugetlb.h | 6 ------ include/linux/page-flags.h | 16 ++++++++++++++-- mm/hugetlb_vmemmap.c | 12 ++++++------ mm/memory_hotplug.c | 2 +- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 52c462390aee3..08357b4c7be73 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1075,12 +1075,6 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 111e453f23d22..340cb81565683 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -191,7 +191,14 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +} /* * If the feature of freeing some vmemmap pages associated with each HugeTLB @@ -211,7 +218,7 @@ extern bool hugetlb_free_vmemmap_enabled; */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { 
- if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return page; /* @@ -239,6 +246,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif static __always_inline int page_is_fake_head(struct page *page) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4977f5a520c22..791626983c2e1 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,9 +188,9 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled __read_mostly = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2a9627dc784c3..0139b77c51d5d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1327,7 +1327,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * populate a single PMD. 
*/ return memmap_on_memory && - !hugetlb_free_vmemmap_enabled && + !hugetlb_free_vmemmap_enabled() && IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && From 49a2e542b78f9870b17949be821d45f4f396134f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:24 +1100 Subject: [PATCH 180/334] mm: sparsemem: use page table lock to protect kernel pmd operations The init_mm.page_table_lock is used to protect kernel page tables, we can use it to serialize splitting vmemmap PMD mappings instead of mmap write lock, which can increase the concurrency of vmemmap_remap_free(). Actually, it increases the concurrency between allocations of HugeTLB pages. But it is not the only benefit. There are a lot of users of mmap read lock of init_mm. The mmap write lock is held throughout vmemmap_remap_free(); removing the mmap write lock usage means it no longer affects other users of the mmap read lock. It is not making anything worse and always a win to move. Now the kernel page table walker does not hold the page_table_lock when walking pmd entries. There may be a consistency issue of a pmd entry, because pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of kernel page table walker, namely ptdump. The ptdump already considers the consistency, which uses a local variable to cache the value of pmd entry. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when concurrent thread has split the huge pmd.
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/ptdump.c b/mm/ptdump.c index da751448d0e4e..eea3d28d173c2 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index e881f5db70915..c64d1aa3c4b50 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head 
*vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in pmd_install(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in pmd_install(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && 
walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* From c35aacd6e354bbb61f51bda0dce3c7c822040b57 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:25 +1100 Subject: [PATCH 181/334] selftests: vm: add a hugetlb test case Since the head vmemmap page frame associated with each HugeTLB page is reused, we should hide the PG_head flag of tail struct page from the user. Add a test case to check whether it works properly. The test steps are as follows. 1) alloc 2MB hugeTLB 2) get each page frame 3) apply those APIs in each page frame 4) Those APIs work completely the same as before. Reading the flags of a page by /proc/kpageflags is done in stable_page_flags(), which has invoked PageHead(), PageTail(), PageCompound() and compound_head(). If those APIs work properly, the head page must have 15 and 17 bits set. And tail pages must have 16 and 17 bits set but 15 bit unset. Those flags are checked in check_page_flags(). Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 11 ++ 4 files changed, 157 insertions(+) create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 2e7e86e852828..3b5faec3c04f4 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -2,6 +2,7 @@ hugepage-mmap hugepage-mremap hugepage-shm +hugepage-vmemmap khugepaged map_hugetlb map_populate
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 1607322a112c9..7d100a7dc4624 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -31,6 +31,7 @@ TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += khugepaged TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 0000000000000..557bdbd4f87e8 --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
+ */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. 
*/ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 71d2dc198fc17..e10d50e0b8e83 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -120,6 +120,17 @@ else fi rm -f $mnt/huge_mremap +echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." From e261b1da58e222a1c2bba0714f318be629448c30 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 16 Feb 2022 15:31:25 +1100 Subject: [PATCH 182/334] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP The vmemmap_remap_free/alloc are relevant to HugeTLB, so move those functions to the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP.
Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 ++ mm/sparse-vmemmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index dafad4448724e..ac15f116e95b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3140,10 +3140,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c64d1aa3c4b50..8aecd6b3896c7 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map From 0c1ae7edbe1b7ef9b32a58ddab4c58f6aa28ea67 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 16 Feb 2022 15:31:25 +1100 Subject: [PATCH 183/334] mm/hugetlb: generalize ARCH_WANT_GENERAL_HUGETLB ARCH_WANT_GENERAL_HUGETLB config has duplicate definitions on platforms that subscribe it. Instead make it a generic config option which can be selected on applicable platforms when required. 
Link: https://lkml.kernel.org/r/1643718465-4324-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Russell King Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/Kconfig | 4 +--- arch/riscv/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4c97cb40eebb6..ba6ba78a9cb6d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -37,6 +37,7 @@ config ARM select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_LD_ORPHAN_WARN select BINFMT_FLAT_ARGVP_ENVP_ON_STACK @@ -1508,9 +1509,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARM_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 5adcbd9b5e886..0804b9a11934d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -40,6 +40,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU @@ -171,9 +172,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SELECT_MEMORY_MODEL def_bool ARCH_SPARSEMEM_ENABLE -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b1ce75d0ab0c8..4831e78c844b6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -118,6 +118,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select 
ARCH_WANTS_NO_INSTR + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 @@ -344,9 +345,6 @@ config ARCH_NR_GPIO config ARCH_SUSPEND_POSSIBLE def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config AUDIT_ARCH def_bool y if X86_64 diff --git a/mm/Kconfig b/mm/Kconfig index 0ac5dbad9ed07..67998bd3352e2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -414,6 +414,9 @@ choice benefit. endchoice +config ARCH_WANT_GENERAL_HUGETLB + bool + config ARCH_WANTS_THP_SWAP def_bool n From 245db2b3a5e934553e7f89a9743585c37e2344f2 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 16 Feb 2022 15:31:25 +1100 Subject: [PATCH 184/334] mm: enable MADV_DONTNEED for hugetlb mappings Patch series "Add hugetlb MADV_DONTNEED support", v3. Userfaultfd selftests for hugetlb does not perform UFFD_EVENT_REMAP testing. However, mremap support was recently added in commit 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma"). While attempting to enable mremap support in the test, it was discovered that the mremap test indirectly depends on MADV_DONTNEED. madvise does not allow MADV_DONTNEED for hugetlb mappings. However, that is primarily due to the check in can_madv_lru_vma(). By simply removing the check and adding huge page alignment, MADV_DONTNEED can be made to work for hugetlb mappings. Do note that there is no compelling use case for adding this support. This was discussed in the RFC [1]. However, adding support makes sense as it is fairly trivial and brings hugetlb functionality more in line with 'normal' memory. After enabling support, add selftest for MADV_DONTNEED as well as MADV_REMOVE. Then update userfaultfd selftest. If new functionality is accepted, then madvise man page will be updated to indicate hugetlb is supported. It will also be updated to clarify what happens to the passed length argument. 
This patch (of 3): MADV_DONTNEED is currently disabled for hugetlb mappings. This certainly makes sense in shared file mappings as the pagecache maintains a reference to the page and it will never be freed. However, it could be useful to unmap and free pages in private mappings. In addition, userfaultfd minor fault users may be able to simplify code by using MADV_DONTNEED. The primary thing preventing MADV_DONTNEED from working on hugetlb mappings is a check in can_madv_lru_vma(). To allow support for hugetlb mappings create and use a new routine madvise_dontneed_free_valid_vma() that allows hugetlb mappings in this specific case. For normal mappings, madvise requires the start address be PAGE aligned and rounds up length to the next multiple of PAGE_SIZE. Do similarly for hugetlb mappings: require start address be huge page size aligned and round up length to the next multiple of huge page size. Use the new madvise_dontneed_free_valid_vma routine to check alignment and round up length/end. zap_page_range requires this alignment for hugetlb vmas otherwise we will hit BUGs. 
Link: https://lkml.kernel.org/r/20220215002348.128823-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220215002348.128823-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Naoya Horiguchi Cc: David Hildenbrand Cc: Axel Rasmussen Cc: Mina Almasry Cc: Michal Hocko Cc: Peter Xu Cc: Andrea Arcangeli Cc: Shuah Khan Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/madvise.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index bed872a2ad5fa..ede6affa1350a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -554,9 +554,14 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_non_huge_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP)); +} + static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); + return can_madv_lru_non_huge_vma(vma) && !is_vm_hugetlb_page(vma); } static long madvise_cold(struct vm_area_struct *vma, @@ -829,6 +834,23 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, return 0; } +static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, + unsigned long start, + unsigned long *end, + int behavior) +{ + if (!is_vm_hugetlb_page(vma)) + return can_madv_lru_non_huge_vma(vma); + + if (behavior != MADV_DONTNEED) + return false; + if (start & ~huge_page_mask(hstate_vma(vma))) + return false; + + *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + return true; +} + static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, @@ -837,7 +859,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; *prev = vma; - if (!can_madv_lru_vma(vma)) + if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return 
-EINVAL; if (!userfaultfd_remove(vma, start, end)) { @@ -859,7 +881,12 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ return -ENOMEM; } - if (!can_madv_lru_vma(vma)) + /* + * Potential end adjustment for hugetlb vma is OK as + * the check below keeps end within vma. + */ + if (!madvise_dontneed_free_valid_vma(vma, start, &end, + behavior)) return -EINVAL; if (end > vma->vm_end) { /* From 5e3a57a806aefd320cbef4aa7cb760804232b404 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 16 Feb 2022 15:31:25 +1100 Subject: [PATCH 185/334] selftests/vm: add hugetlb madvise MADV_DONTNEED MADV_REMOVE test Now that MADV_DONTNEED support for hugetlb is enabled, add corresponding tests. MADV_REMOVE has been enabled for some time, but no tests exist so add them as well. Link: https://lkml.kernel.org/r/20220215002348.128823-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Shuah Khan Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugetlb-madvise.c | 410 +++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 12 + 4 files changed, 424 insertions(+) create mode 100644 tools/testing/selftests/vm/hugetlb-madvise.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 3b5faec3c04f4..d7507f3c7c76a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -3,6 +3,7 @@ hugepage-mmap hugepage-mremap hugepage-shm hugepage-vmemmap +hugetlb-madvise khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 7d100a7dc4624..c2f2f99be5182 100644 --- a/tools/testing/selftests/vm/Makefile +++ 
b/tools/testing/selftests/vm/Makefile @@ -28,6 +28,7 @@ LDLIBS = -lrt -lpthread TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c new file mode 100644 index 0000000000000..6c6af40f57478 --- /dev/null +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-madvise: + * + * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE + * on hugetlb mappings. + * + * Before running this test, make sure the administrator has pre-allocated + * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, + * the test takes an argument that is the path to a file in a hugetlbfs + * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some + * directory. 
+ */
+
+#define _GNU_SOURCE	/* fallocate(), FALLOC_FL_*; must precede all includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define USAGE	"USAGE: %s <hugepagefile_name>\n"
+#define MIN_FREE_PAGES	20
+#define NR_HUGE_PAGES	10	/* common number of pages to map/allocate */
+
+#define validate_free_pages(exp_free)					\
+	do {								\
+		unsigned long fhp = get_free_hugepages();		\
+		if (fhp != (exp_free)) {				\
+			printf("Unexpected number of free huge "	\
+				"pages line %d\n", __LINE__);		\
+			exit(1);					\
+		}							\
+	} while (0)
+
+unsigned long huge_page_size;
+unsigned long base_page_size;
+
+/*
+ * default_huge_page_size copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
+unsigned long get_free_hugepages(void)
+{
+	unsigned long fhp = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return fhp;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1)
+			break;
+	}
+
+	free(line);
+	fclose(f);
+	return fhp;
+}
+
+void write_fault_pages(void *addr, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		*((unsigned long *)(addr + (i * huge_page_size))) = i;
+}
+
+void read_fault_pages(void *addr, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)	/* volatile: the load must really happen */
+		(void)*((volatile unsigned long *)(addr + (i * huge_page_size)));
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long free_hugepages;
+	void *addr, *addr2;
+	int fd;
+	int ret;
+
+	if (argc != 2) {
+		printf(USAGE, argv[0]);
+		exit(1);
+	}
+
+	huge_page_size = default_huge_page_size();
+	if (!huge_page_size) {
+		printf("Unable to determine huge page size, exiting!\n");
+		exit(1);
+	}
+	base_page_size = sysconf(_SC_PAGE_SIZE);
+	if ((long)base_page_size <= 0) {	/* sysconf() returns -1 on error */
+		printf("Unable to determine base page size, exiting!\n");
+		exit(1);
+	}
+
+	free_hugepages = get_free_hugepages();
+	if (free_hugepages < MIN_FREE_PAGES) {
+		printf("Not enough free huge pages to test, exiting!\n");
+		exit(1);
+	}
+
+	fd = open(argv[1], O_CREAT | O_RDWR, 0755);
+	if (fd < 0) {
+		perror("Open failed");
+		exit(1);
+	}
+
+	/*
+	 * Test validity of MADV_DONTNEED addr and length arguments. mmap
+	 * size is NR_HUGE_PAGES + 2. One page at the beginning and end of
+	 * the mapping will be unmapped so we KNOW there is nothing mapped
+	 * there.
+	 */
+	addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			-1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	if (munmap(addr, huge_page_size) ||
+			munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
+				huge_page_size)) {
+		perror("munmap");
+		exit(1);
+	}
+	addr = addr + huge_page_size;
+
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* addr before mapping should fail */
+	ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
+		MADV_DONTNEED);
+	if (!ret) {
+		printf("Unexpected success of madvise call with invalid addr line %d\n",
+				__LINE__);
+		exit(1);
+	}
+
+	/* addr + length after mapping should fail */
+	ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
+		MADV_DONTNEED);
+	if (!ret) {
+		printf("Unexpected success of madvise call with invalid length line %d\n",
+				__LINE__);
+		exit(1);
+	}
+
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+	/*
+	 * Test alignment of MADV_DONTNEED addr and length arguments
+	 */
+	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			-1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/*
addr is not huge page size aligned and should fail */ + ret = madvise(addr + base_page_size, + NR_HUGE_PAGES * huge_page_size - base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with unaligned start address %d\n", + __LINE__); + exit(1); + } + + /* addr + length should be aligned up to huge page size */ + if (madvise(addr, + ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on anonymous private mapping + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* read should not consume any pages */ + read_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages 
*/ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise should free private pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * The fallocate below certainly should free the pages associated + * with the file. However, pages in the private mapping are also + * freed. This is not the 'correct' behavior, but is expected + * because this is how it has worked since the initial hugetlb + * implementation. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on shared mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* write should not consume any pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* + * Test MADV_REMOVE on shared mapping of hugetlb file + * + * madvise is same as hole punch and should free all pages. 
+ */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_REMOVE on shared and private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* shared write should not consume any additional pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* private read should not consume any pages */ + read_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of shared mapping should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of private mapping should free private pages */ + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages again */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * madvise should free both file and private pages although this is + * not correct. 
private pages should not be freed, but this is + * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); + + close(fd); + unlink(argv[1]); + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index e10d50e0b8e83..1948098f431d1 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -131,6 +131,18 @@ else echo "[PASS]" fi +echo "-----------------------" +echo "running hugetlb-madvise" +echo "-----------------------" +./hugetlb-madvise $mnt/madvise-test +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi +rm -f $mnt/madvise-test + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." From d2600b33fd65ee936b055253abf063677934d362 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 16 Feb 2022 15:31:26 +1100 Subject: [PATCH 186/334] userfaultfd/selftests: enable hugetlb remap and remove event testing With MADV_DONTNEED support added to hugetlb mappings, mremap testing can also be enabled for hugetlb. Modify the tests to use madvise MADV_DONTNEED and MADV_REMOVE instead of fallocate hole punch for releasing hugetlb pages.
Link: https://lkml.kernel.org/r/20220215002348.128823-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/run_vmtests.sh | 3 +- tools/testing/selftests/vm/userfaultfd.c | 69 ++++++++++++----------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 1948098f431d1..3b265f140c25c 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -208,14 +208,13 @@ echo "running userfaultfd_hugetlb" echo "---------------------------" # Test requires source and destination huge pages. Size of source # (half_ufd_size_MB) is passed as argument to test. -./userfaultfd hugetlb $half_ufd_size_MB 32 $mnt/ufd_test_file +./userfaultfd hugetlb $half_ufd_size_MB 32 if [ $? 
-ne 0 ]; then echo "[FAIL]" exitcode=1 else echo "[PASS]" fi -rm -f $mnt/ufd_test_file echo "-------------------------" echo "running userfaultfd_shmem" diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 2f49c9af1b582..96bf54fbca5c6 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -88,7 +88,6 @@ static bool test_uffdio_minor = false; static bool map_shared; static int shm_fd; static int huge_fd; -static char *huge_fd_off0; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; @@ -124,9 +123,9 @@ const char *examples = "./userfaultfd anon 100 99999\n\n" "# Run share memory test on 1GiB region with 99 bounces:\n" "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n" - "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n" - "# Run the same hugetlb test but using shmem:\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./userfaultfd hugetlb 256 50\n\n" + "# Run the same hugetlb test but using shared file:\n" "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" "# 10MiB-~6GiB 999 bounces anonymous test, " "continue forever unless an error triggers\n" @@ -223,10 +222,13 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) static void hugetlb_release_pages(char *rel_area) { - if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - rel_area == huge_fd_off0 ? 
0 : nr_pages * page_size, - nr_pages * page_size)) - err("fallocate() failed"); + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } } static void hugetlb_allocate_area(void **alloc_area) @@ -234,26 +236,37 @@ static void hugetlb_allocate_area(void **alloc_area) void *area_alias = NULL; char **alloc_area_alias; - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), - huge_fd, *alloc_area == area_src ? 0 : - nr_pages * page_size); + if (!map_shared) + *alloc_area = mmap(NULL, + nr_pages * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | + (*alloc_area == area_src ? 0 : MAP_NORESERVE), + -1, + 0); + else + *alloc_area = mmap(NULL, + nr_pages * page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | + (*alloc_area == area_src ? 0 : MAP_NORESERVE), + huge_fd, + *alloc_area == area_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); if (map_shared) { - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_HUGETLB, - huge_fd, *alloc_area == area_src ? 0 : - nr_pages * page_size); + area_alias = mmap(NULL, + nr_pages * page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + huge_fd, + *alloc_area == area_src ? 
0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } if (*alloc_area == area_src) { - huge_fd_off0 = *alloc_area; alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -266,12 +279,7 @@ static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset { if (!map_shared) return; - /* - * We can't zap just the pagetable with hugetlbfs because - * MADV_DONTEED won't work. So exercise -EEXIST on a alias - * mapping where the pagetables are not established initially, - * this way we'll exercise the -EEXEC at the fs level. - */ + *start = (unsigned long) area_dst_alias + offset; } @@ -424,7 +432,6 @@ static void uffd_test_ctx_clear(void) uffd = -1; } - huge_fd_off0 = NULL; munmap_area((void **)&area_src); munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); @@ -922,10 +929,7 @@ static int faulting_process(int signal_test) struct sigaction act; unsigned long signalled = 0; - if (test_type != TEST_HUGETLB) - split_nr_pages = (nr_pages + 1) / 2; - else - split_nr_pages = nr_pages; + split_nr_pages = (nr_pages + 1) / 2; if (signal_test) { sigbuf = &jbuf; @@ -982,9 +986,6 @@ static int faulting_process(int signal_test) if (signal_test) return signalled != split_nr_pages; - if (test_type == TEST_HUGETLB) - return 0; - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); if (area_dst == MAP_FAILED) @@ -1672,7 +1673,7 @@ int main(int argc, char **argv) } nr_pages = nr_pages_per_cpu * nr_cpus; - if (test_type == TEST_HUGETLB) { + if (test_type == TEST_HUGETLB && map_shared) { if (argc < 5) usage(); huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); From 20b9a2201212d7db69ea527b5c4c9ae49473487a Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Wed, 16 Feb 2022 15:31:26 +1100 Subject: [PATCH 187/334] mm/mempolicy: convert from atomic_t to refcount_t on mempolicy->refcnt refcount_t type and corresponding API can 
protect refcounters from accidental underflow and overflow and further use-after-free situations. Link: https://lkml.kernel.org/r/1626683671-64407-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Acked-by: Ben Widawsky Reviewed-by: Muchun Song Cc: Feng Tang Cc: Mike Kravetz Cc: Muchun Song Cc: Yanfei Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mempolicy.h | 5 +++-- mm/mempolicy.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d7..44383ab8af554 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -6,6 +6,7 @@ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 +#include #include #include #include @@ -42,7 +43,7 @@ struct mm_struct; * to 1, representing the caller of mpol_dup(). */ struct mempolicy { - atomic_t refcnt; + refcount_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ @@ -94,7 +95,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) static inline void mpol_get(struct mempolicy *pol) { if (pol) - atomic_inc(&pol->refcnt); + refcount_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3f8dc58da3e80..fa49f422dc1e5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -295,7 +295,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); - atomic_set(&policy->refcnt, 1); + refcount_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; @@ -306,7 +306,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, /* Slow path of a mpol destructor. 
*/ void __mpol_put(struct mempolicy *p) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!refcount_dec_and_test(&p->refcnt)) return; kmem_cache_free(policy_cache, p); } @@ -2406,7 +2406,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } - atomic_set(&new->refcnt, 1); + refcount_set(&new->refcnt, 1); return new; } @@ -2703,7 +2703,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, goto alloc_new; *mpol_new = *n->policy; - atomic_set(&mpol_new->refcnt, 1); + refcount_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); From 7ad3a5a131ced4e517f40f4da0897367db9664db Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:26 +1100 Subject: [PATCH 188/334] mm-mempolicy-convert-from-atomic_t-to-refcount_t-on-mempolicy-refcnt-fix fix warnings mm/mempolicy.c:125:42: warning: missing braces around initializer [-Wmissing-braces] 125 | static struct mempolicy default_policy = { | ^ mm/mempolicy.c:125:42: warning: missing braces around initializer [-Wmissing-braces] mm/mempolicy.c: In function 'numa_policy_init': mm/mempolicy.c:2815:32: warning: missing braces around initializer [-Wmissing-braces] 2815 | preferred_node_policy[nid] = (struct mempolicy) { | ^ mm/mempolicy.c:2815:32: warning: missing braces around initializer [-Wmissing-braces] Cc: Xiyu Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fa49f422dc1e5..7c852793d9e85 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,7 +123,7 @@ enum zone_type policy_zone = 0; * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { - .refcnt = ATOMIC_INIT(1), /* never free it */ + .refcnt = { ATOMIC_INIT(1), }, /* never free it */ .mode = 
MPOL_LOCAL, }; @@ -2897,7 +2897,7 @@ void __init numa_policy_init(void) for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { - .refcnt = ATOMIC_INIT(1), + .refcnt = { ATOMIC_INIT(1), }, .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), From 5c4a1da27403648d9c7cdb45af5fcfab44e5a1fb Mon Sep 17 00:00:00 2001 From: sujiaxun Date: Wed, 16 Feb 2022 15:31:26 +1100 Subject: [PATCH 189/334] mm: move oom_kill sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the oom_kill sysctls to their own file, mm/oom_kill.c Link: https://lkml.kernel.org/r/20220215093203.31032-1-sujiaxun@uniontech.com Signed-off-by: sujiaxun Cc: Kees Cook Cc: Iurii Zaikin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/oom.h | 4 ---- kernel/sysctl.c | 23 ----------------------- mm/oom_kill.c | 37 ++++++++++++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 2db9a14325112..02d1e7bbd8cd5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -123,8 +123,4 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -/* sysctls */ -extern int sysctl_oom_dump_tasks; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_panic_on_oom; #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34371bcb8ffa8..156e5b6d485d3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2354,29 +2354,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - 
.procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6b875acabd1e7..07ff90c920bcc 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -52,9 +52,37 @@ #define CREATE_TRACE_POINTS #include -int sysctl_panic_on_oom; -int sysctl_oom_kill_allocating_task; -int sysctl_oom_dump_tasks = 1; +static int sysctl_panic_on_oom; +static int sysctl_oom_kill_allocating_task; +static int sysctl_oom_dump_tasks = 1; + +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_oom_kill_table[] = { + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + } +}; +#endif /* * Serializes oom killer invocations (out_of_memory()) from all contexts to @@ -680,6 +708,9 @@ static void wake_oom_reaper(struct task_struct *tsk) static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", 
vm_oom_kill_table); +#endif return 0; } subsys_initcall(oom_init) From 604d9103bd02af3785f016bfb7547ef354af1151 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 16 Feb 2022 19:09:18 -0800 Subject: [PATCH 190/334] fix up for "mm: move oom_kill sysctls to their own file" Link: https://lkml.kernel.org/r/20220216193202.28838626@canb.auug.org.au Signed-off-by: Stephen Rothwell Cc: sujiaxun Signed-off-by: Andrew Morton --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 07ff90c920bcc..9787e2cc51b5b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -80,7 +80,8 @@ static struct ctl_table vm_oom_kill_table[] = { .maxlen = sizeof(sysctl_oom_dump_tasks), .mode = 0644, .proc_handler = proc_dointvec, - } + }, + {} }; #endif From e366a2e0dd2081e8c1b6ac88b702e8e0bdb82e2b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 16 Feb 2022 15:31:26 +1100 Subject: [PATCH 191/334] mm/migration: add trace events for THP migrations Patch series "mm/migration: Add trace events", v3. This adds trace events for all migration scenarios including base page, THP and HugeTLB. This patch (of 3): This adds two trace events for PMD based THP migration without split. These events closely follow the implementation details like setting and removing of PMD migration entries, which are essential operations for THP migration. This moves CREATE_TRACE_POINTS into generic THP from powerpc for these new trace events to be available on other platforms as well. 
Link: https://lkml.kernel.org/r/1643368182-9588-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1643368182-9588-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Steven Rostedt Cc: Ingo Molnar Cc: Zi Yan Cc: Naoya Horiguchi Cc: John Hubbard Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/mm/book3s64/trace.c | 1 - include/trace/events/thp.h | 27 +++++++++++++++++++++++++++ mm/huge_memory.c | 5 +++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c index b86e7b9062571..ccd64b5e6cac7 100644 --- a/arch/powerpc/mm/book3s64/trace.c +++ b/arch/powerpc/mm/book3s64/trace.c @@ -3,6 +3,5 @@ * This file is for defining trace points and trace related helpers. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define CREATE_TRACE_POINTS #include #endif diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index ca3f2767828a6..202b3e3e67ff2 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -48,6 +48,33 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); +DECLARE_EVENT_CLASS(migration_pmd, + + TP_PROTO(unsigned long addr, unsigned long pmd), + + TP_ARGS(addr, pmd), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pmd) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pmd = pmd; + ), + TP_printk("addr=%lx, pmd=%lx", __entry->addr, __entry->pmd) +); + +DEFINE_EVENT(migration_pmd, set_migration_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); + +DEFINE_EVENT(migration_pmd, remove_migration_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); #endif /* _TRACE_THP_H */ /* This part must be outside protection */ 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index be003d6099923..097b1dfa81eb5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -39,6 +39,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * By default, transparent hugepage support is disabled in order to avoid * risking an increased memory footprint for applications that are not @@ -3071,6 +3074,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); + trace_set_migration_pmd(address, pmd_val(pmdswp)); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@ -3103,5 +3107,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); + trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif From 3222fc960d6a89de049b39d96cb8b812a8635057 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 16 Feb 2022 15:31:27 +1100 Subject: [PATCH 192/334] mm/migration: add trace events for base page and HugeTLB migrations This adds two trace events for base page and HugeTLB page migrations. These events, closely follow the implementation details like setting and removing of PTE migration entries, which are essential operations for migration. The new CREATE_TRACE_POINTS in covers both and based trace events. Hence drop redundant CREATE_TRACE_POINTS from other places which could have otherwise conflicted during build. 
Link: https://lkml.kernel.org/r/1643368182-9588-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reported-by: kernel test robot Cc: Steven Rostedt Cc: Ingo Molnar Cc: Zi Yan Cc: Naoya Horiguchi Cc: John Hubbard Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/mm/init.c | 1 - include/trace/events/migrate.h | 31 +++++++++++++++++++++++++++++++ mm/migrate.c | 4 +++- mm/rmap.c | 6 ++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4ba024d5b63ae..d8cfce221275e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -31,7 +31,6 @@ * We need to define the tracepoints somewhere, and tlb.c * is only compiled when SMP=y. */ -#define CREATE_TRACE_POINTS #include #include "mm_internal.h" diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 779f3fad9ecd5..061b5128f335a 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -105,6 +105,37 @@ TRACE_EVENT(mm_migrate_pages_start, __print_symbolic(__entry->reason, MIGRATE_REASON)) ); +DECLARE_EVENT_CLASS(migration_pte, + + TP_PROTO(unsigned long addr, unsigned long pte, int order), + + TP_ARGS(addr, pte, order), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pte) + __field(int, order) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pte = pte; + __entry->order = order; + ), + + TP_printk("addr=%lx, pte=%lx order=%d", __entry->addr, __entry->pte, __entry->order) +); + +DEFINE_EVENT(migration_pte, set_migration_pte, + TP_PROTO(unsigned long addr, unsigned long pte, int order), + TP_ARGS(addr, pte, order) +); + +DEFINE_EVENT(migration_pte, remove_migration_pte, + TP_PROTO(unsigned long addr, unsigned long pte, int order), + TP_ARGS(addr, pte, order) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/migrate.c b/mm/migrate.c index 
7e8a6b0a531df..4d167d71e9a98 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -54,7 +54,6 @@ #include -#define CREATE_TRACE_POINTS #include #include "internal.h" @@ -254,6 +253,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) mlock_page_drain(smp_processor_id()); + trace_remove_migration_pte(pvmw.address, pte_val(pte), + compound_order(new)); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } diff --git a/mm/rmap.c b/mm/rmap.c index c7921c102bc0e..a239a68cbd257 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,9 @@ #include +#define CREATE_TRACE_POINTS #include +#include #include "internal.h" @@ -1853,6 +1855,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + trace_set_migration_pte(pvmw.address, pte_val(swp_pte), + compound_order(page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1921,6 +1925,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + trace_set_migration_pte(address, pte_val(swp_pte), + compound_order(page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. From acdd6ac60778a04efffedb3896adb2c716a346f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 16 Feb 2022 15:31:27 +1100 Subject: [PATCH 193/334] mm,migrate: fix establishing demotion target In commit ac16ec835314 ("mm: migrate: support multiple target nodes demotion"), after the first demotion target node is found, we will continue to check the next candidate obtained via find_next_best_node(). This is to find all demotion target nodes with same NUMA distance. 
But one side effect of find_next_best_node() is that the candidate node it returns is set in the "used" nodemask. So even if the candidate node then fails the subsequent NUMA distance check, it can no longer be used as a demotion target node for the following nodes. For example, for a system as follows, node distances: node 0 1 2 3 0: 10 21 17 28 1: 21 10 28 17 2: 17 28 10 28 3: 28 17 28 10 when we establish the demotion target node for node 0, in the first round node 2 is added to the demotion target node set. Then in the second round, node 3 is checked and rejected because distance(0, 3) > distance(0, 2). But node 3 is set in the "used" nodemask too. When we establish the demotion target node for node 1, there is no available node. This is wrong: node 3 should be set as the demotion target of node 1. To fix this, if the candidate node fails the distance check, it is cleared in the "used" nodemask so that it can be used for the following nodes. The bug can be reproduced, and is fixed by this patch, on a 2 socket server machine with DRAM and PMEM.
Link: https://lkml.kernel.org/r/20220128055940.1792614-1-ying.huang@intel.com Fixes: ac16ec835314 ("mm: migrate: support multiple target nodes demotion") Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Cc: Baolin Wang Cc: Dave Hansen Cc: Zi Yan Cc: Oscar Salvador Cc: Yang Shi Cc: zhongjiang-ali Cc: Xunlei Pang Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 4d167d71e9a98..3f6b196e98400 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -3094,18 +3094,21 @@ static int establish_migrate_target(int node, nodemask_t *used, if (best_distance != -1) { val = node_distance(node, migration_target); if (val > best_distance) - return NUMA_NO_NODE; + goto out_clear; } index = nd->nr; if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, "Exceeds maximum demotion target nodes\n")) - return NUMA_NO_NODE; + goto out_clear; nd->nodes[index] = migration_target; nd->nr++; return migration_target; +out_clear: + node_clear(migration_target, *used); + return NUMA_NO_NODE; } /* From 84eddca5fe272b4bd5e1537c12d8c400615cde41 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 16 Feb 2022 15:31:27 +1100 Subject: [PATCH 194/334] mm/cma: provide option to opt out from exposing pages on activation failure Patch series "powerpc/fadump: handle CMA activation failure appropriately", v3. Commit 072355c1cf2d ("mm/cma: expose all pages to the buddy if activation of an area fails") started exposing all pages to buddy allocator on CMA activation failure. But there can be CMA users that want to handle the reserved memory differently on CMA allocation failure. Provide an option to opt out from exposing pages to buddy for such cases. 
Link: https://lkml.kernel.org/r/20220117075246.36072-1-hbathini@linux.ibm.com Link: https://lkml.kernel.org/r/20220117075246.36072-2-hbathini@linux.ibm.com Signed-off-by: Hari Bathini Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Mike Kravetz Cc: Mahesh Salgaonkar Cc: Sourabh Jain Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/cma.h | 2 ++ mm/cma.c | 11 +++++++++-- mm/cma.h | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index b1ba94f1cc9c5..90fd742fd1ef5 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -58,4 +58,6 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); + +extern void cma_reserve_pages_on_error(struct cma *cma); #endif diff --git a/mm/cma.c b/mm/cma.c index 5a2cd58516589..eaa4b5c920a20 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -131,8 +131,10 @@ static void __init cma_activate_area(struct cma *cma) bitmap_free(cma->bitmap); out_error: /* Expose all pages to the buddy, they are useless for CMA. 
*/ - for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) - free_reserved_page(pfn_to_page(pfn)); + if (!cma->reserve_pages_on_error) { + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + } totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); @@ -150,6 +152,11 @@ static int __init cma_init_reserved_areas(void) } core_initcall(cma_init_reserved_areas); +void __init cma_reserve_pages_on_error(struct cma *cma) +{ + cma->reserve_pages_on_error = true; +} + /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area diff --git a/mm/cma.h b/mm/cma.h index 2c775877eae24..88a0595670b76 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -30,6 +30,7 @@ struct cma { /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif + bool reserve_pages_on_error; }; extern struct cma cma_areas[MAX_CMA_AREAS]; From c6f65d14b9155b3ed9f99ee6003567d7bf254b8c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 16 Feb 2022 15:31:27 +1100 Subject: [PATCH 195/334] powerpc/fadump: opt out from freeing pages on cma activation failure With commit a4e92ce8e4c8 ("powerpc/fadump: Reservationless firmware assisted dump"), Linux kernel's Contiguous Memory Allocator (CMA) based reservation was introduced in fadump. That change was aimed at using CMA to let applications utilize the memory reserved for fadump while blocking it from being used for kernel pages. The assumption was, even if CMA activation fails for whatever reason, the memory still remains reserved to avoid it from being used for kernel pages. But commit 072355c1cf2d ("mm/cma: expose all pages to the buddy if activation of an area fails") breaks this assumption as it started exposing all pages to buddy allocator on CMA activation failure. 
It led to warning messages like below while running crash-utility on vmcore of a kernel having above two commits: crash: seek error: kernel virtual address: To fix this problem, opt out from exposing pages to buddy allocator on CMA activation failure for fadump reserved memory. Link: https://lkml.kernel.org/r/20220117075246.36072-3-hbathini@linux.ibm.com Signed-off-by: Hari Bathini Acked-by: David Hildenbrand Acked-by: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Mike Kravetz Cc: Oscar Salvador Cc: Sourabh Jain Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/kernel/fadump.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7eb67201ea415..4fdb7c77fda1e 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -112,6 +112,12 @@ static int __init fadump_cma_init(void) return 1; } + /* + * If CMA activation fails, keep the pages reserved, instead of + * exposing them to buddy allocator. Same as 'fadump=nocma' case. + */ + cma_reserve_pages_on_error(fadump_cma); + /* * So we now have successfully initialized cma area for fadump. */ From 514fa8276211b193777e130867b19aa8b1b4789f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 16 Feb 2022 15:31:27 +1100 Subject: [PATCH 196/334] NUMA Balancing: add page promotion counter With the advent of various new memory types, some machines will have multiple types of memory, e.g. DRAM and PMEM (persistent memory). The memory subsystem of these machines can be called memory tiering system, because the performance of the different types of memory are different. After commit c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM"), the PMEM could be used as the cost-effective volatile memory in separate NUMA nodes. In a typical memory tiering system, there are CPUs, DRAM and PMEM in each physical NUMA node. 
The CPUs and the DRAM will be put in one logical node, while the PMEM will be put in another (faked) logical node. To optimize the overall system performance, the hot pages should be placed in the DRAM node. To do that, we need to identify the hot pages in the PMEM node and migrate them to the DRAM node via NUMA migration. In the original NUMA balancing, there is already a set of existing mechanisms to identify the pages recently accessed by the CPUs in a node and migrate the pages to the node. So we can reuse these mechanisms to build the mechanisms to optimize the page placement in the memory tiering system. This is implemented in this patchset. On the other hand, the cold pages should be placed in the PMEM node. So, we also need to identify the cold pages in the DRAM node and migrate them to the PMEM node. In commit 26aa2d199d6f ("mm/migrate: demote pages during reclaim"), a mechanism to demote the cold DRAM pages to the PMEM node under memory pressure is implemented. Based on that, the cold DRAM pages can be demoted to the PMEM node proactively to free some memory space on the DRAM node to accommodate the promoted hot PMEM pages. This is implemented in this patchset too. We have tested the solution with the pmbench memory accessing benchmark with the 80:20 read/write ratio and the Gauss access address distribution on a 2 socket Intel server with Optane DC Persistent Memory Model. The test results show that the pmbench score can improve by up to 95.9%. This patch (of 3): In a system with multiple memory types, e.g. DRAM and PMEM, the CPU and DRAM in one socket will be put in one NUMA node as before, while the PMEM will be put in another NUMA node as described in the description of the commit c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM"). So, the NUMA balancing mechanism will identify all PMEM accesses as remote access and try to promote the PMEM pages to DRAM.
To distinguish the number of the inter-type promoted pages from that of the inter-socket migrated pages. A new vmstat count is added. The counter is per-node (count in the target node). So this can be used to identify promotion imbalance among the NUMA nodes. Link: https://lkml.kernel.org/r/20220128082751.593478-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20220128082751.593478-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Dave Hansen Cc: Zi Yan Cc: Wei Xu Cc: Shakeel Butt Cc: zhongjiang-ali Cc: Oscar Salvador Cc: Feng Tang Cc: Hasan Al Maruf Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mmzone.h | 3 +++ include/linux/node.h | 5 +++++ mm/migrate.c | 13 ++++++++++--- mm/vmstat.c | 3 +++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0ac8ef50cea38..3fff6deca2c08 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,9 @@ enum node_stat_item { NR_PAGETABLE, /* used for pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, +#endif +#ifdef CONFIG_NUMA_BALANCING + PGPROMOTE_SUCCESS, /* promote successfully */ #endif NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/node.h b/include/linux/node.h index bb21fd631b162..81bbf1c0afd37 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -181,4 +181,9 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg, #define to_node(device) container_of(device, struct node, dev) +static inline bool node_is_toptier(int node) +{ + return node_state(node, N_CPU); +} + #endif /* _LINUX_NODE_H_ */ diff --git a/mm/migrate.c b/mm/migrate.c index 3f6b196e98400..9adc6a4cd4899 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2084,6 +2084,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct 
*vma, pg_data_t *pgdat = NODE_DATA(node); int isolated; int nr_remaining; + int nr_succeeded; LIST_HEAD(migratepages); new_page_t *new; bool compound; @@ -2122,7 +2123,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, + &nr_succeeded); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); @@ -2131,8 +2133,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, putback_lru_page(page); } isolated = 0; - } else - count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages); + } + if (nr_succeeded) { + count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); + if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) + mod_node_page_state(NODE_DATA(node), PGPROMOTE_SUCCESS, + nr_succeeded); + } BUG_ON(!list_empty(&migratepages)); return isolated; diff --git a/mm/vmstat.c b/mm/vmstat.c index 4057372745d04..846b670dd346a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "nr_swapcached", #endif +#ifdef CONFIG_NUMA_BALANCING + "pgpromote_success", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", From 2416aed403ed9b299503554dd0a7fdab3f4cd484 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 16 Feb 2022 15:31:28 +1100 Subject: [PATCH 197/334] NUMA balancing: optimize page placement for memory tiering system With the advent of various new memory types, some machines will have multiple types of memory, e.g. DRAM and PMEM (persistent memory). The memory subsystem of these machines can be called memory tiering system, because the performance of the different types of memory are usually different. In such system, because of the memory accessing pattern changing etc, some pages in the slow memory may become hot globally. 
So in this patch, the NUMA balancing mechanism is enhanced to optimize the page placement among the different memory types according to hot/cold dynamically. In a typical memory tiering system, there are CPUs, fast memory and slow memory in each physical NUMA node. The CPUs and the fast memory will be put in one logical node (called fast memory node), while the slow memory will be put in another (faked) logical node (called slow memory node). That is, the fast memory is regarded as local while the slow memory is regarded as remote. So it's possible for the recently accessed pages in the slow memory node to be promoted to the fast memory node via the existing NUMA balancing mechanism. The original NUMA balancing mechanism will stop migrating pages if the free memory of the target node drops below the high watermark. This is a reasonable policy if there's only one memory type. But this makes the original NUMA balancing mechanism almost ineffective at optimizing page placement among different memory types. Details are as follows. It's the common case that the working-set size of the workload is larger than the size of the fast memory nodes. Otherwise, it's unnecessary to use the slow memory at all. So, there are almost never enough free pages in the fast memory nodes, so that the globally hot pages in the slow memory node cannot be promoted to the fast memory node. To solve the issue, we have 2 choices as follows, a. Ignore the free pages watermark checking when promoting hot pages from the slow memory node to the fast memory node. This will create some memory pressure in the fast memory node, thus trigger the memory reclaiming. So that, the cold pages in the fast memory node will be demoted to the slow memory node. b. Make kswapd of the fast memory node reclaim pages until the free pages are a little (for example, high_watermark / 4) more than the high watermark.
Then, if the free pages of the fast memory node reach the high watermark, and some hot pages need to be promoted, kswapd of the fast memory node will be woken up to demote more cold pages in the fast memory node to the slow memory node. This will free some extra space in the fast memory node, so the hot pages in the slow memory node can be promoted to the fast memory node. The choice "a" may create high memory pressure in the fast memory node. If the memory pressure of the workload is high, the memory pressure may become so high that the memory allocation latency of the workload is influenced, e.g. the direct reclaiming may be triggered. The choice "b" works much better in this respect. If the memory pressure of the workload is high, the hot pages promotion will stop earlier because its allocation watermark is higher than that of the normal memory allocation. So in this patch, choice "b" is implemented. In addition to the original page placement optimization among sockets, the NUMA balancing mechanism is extended to be used to optimize page placement according to hot/cold among different memory types. So the sysctl user space interface (numa_balancing) is extended in a backward compatible way as follows, so that the users can enable/disable these functionalities individually. The sysctl is converted from a Boolean value to a bit field. The definition of the flags is, - 0x0: NUMA_BALANCING_DISABLED - 0x1: NUMA_BALANCING_NORMAL - 0x2: NUMA_BALANCING_MEMORY_TIERING We have tested the patch with the pmbench memory accessing benchmark with the 80:20 read/write ratio and the Gauss access address distribution on a 2 socket Intel server with Optane DC Persistent Memory Model. The test results show that the pmbench score can improve by up to 95.9%.
Link: https://lkml.kernel.org/r/20220128082751.593478-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Baolin Wang Reviewed-by: Baolin Wang Cc: Michal Hocko Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Dave Hansen Cc: Yang Shi Cc: Zi Yan Cc: Wei Xu Cc: Shakeel Butt Cc: zhongjiang-ali Cc: Feng Tang Cc: Hasan Al Maruf Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 29 ++++++++++++++------- include/linux/sched/sysctl.h | 10 +++++++ kernel/sched/core.c | 21 ++++++++++++--- kernel/sysctl.c | 2 +- mm/migrate.c | 19 ++++++++++++-- mm/vmscan.c | 17 ++++++++++++ 6 files changed, 82 insertions(+), 16 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index d359bcfadd39a..ea32ba0c5d3cc 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -595,16 +595,23 @@ Documentation/admin-guide/kernel-parameters.rst). numa_balancing ============== -Enables/disables automatic page fault based NUMA memory -balancing. Memory is moved automatically to nodes -that access it often. +Enables/disables and configure automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes that access it +often. The value to set can be the result to OR the following, -Enables/disables automatic NUMA memory balancing. On NUMA machines, there -is a performance penalty if remote memory is accessed by a CPU. When this -feature is enabled the kernel samples what task thread is accessing memory -by periodically unmapping pages and later trapping a page fault. At the -time of the page fault, it is determined if the data being accessed should -be migrated to a local memory node. 
+= ================================= +0x0 NUMA_BALANCING_DISABLED +0x1 NUMA_BALANCING_NORMAL +0x2 NUMA_BALANCING_MEMORY_TIERING += ================================= + +Or NUMA_BALANCING_NORMAL to optimize page placement among different +NUMA nodes to reduce remote accessing. On NUMA machines, there is a +performance penalty if remote memory is accessed by a CPU. When this +feature is enabled the kernel samples what task thread is accessing +memory by periodically unmapping pages and later trapping a page +fault. At the time of the page fault, it is determined if the data +being accessed should be migrated to a local memory node. The unmapping of pages and trapping faults incur additional overhead that ideally is offset by improved memory locality but there is no universal @@ -615,6 +622,10 @@ faults may be controlled by the `numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls. +Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among +different types of memory (represented as different NUMA nodes) to +place the hot pages in the fast memory. This is implemented based on +unmapping and page fault too. 
numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb =============================================================================================================================== diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c19dd5a2c05c6..b5eec8854c5a5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -23,6 +23,16 @@ enum sched_tunable_scaling { SCHED_TUNABLESCALING_END, }; +#define NUMA_BALANCING_DISABLED 0x0 +#define NUMA_BALANCING_NORMAL 0x1 +#define NUMA_BALANCING_MEMORY_TIERING 0x2 + +#ifdef CONFIG_NUMA_BALANCING +extern int sysctl_numa_balancing_mode; +#else +#define sysctl_numa_balancing_mode 0 +#endif + /* * control realtime throttling: * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcf0c180617c2..c25348e9ae3ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4280,7 +4280,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4288,13 +4290,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4304,8 +4315,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = 
proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 156e5b6d485d3..c39cd550cb7c8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1687,7 +1687,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ { diff --git a/mm/migrate.c b/mm/migrate.c index 9adc6a4cd4899..97048d4e1270e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -2046,16 +2047,30 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; int nr_pages = thp_nr_pages(page); + int order = compound_order(page); - VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, nr_pages)) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) { + int z; + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) || + !numa_demotion_enabled) + return 0; + if (next_demotion_node(pgdat->node_id) == NUMA_NO_NODE) + return 0; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + if (populated_zone(pgdat->node_zones + z)) + break; + } + wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; + } if (isolate_lru_page(page)) return 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 2443ebaf17671..8a77de250cd60 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include #include +#include #include "internal.h" @@ -3940,6 +3941,13 @@ static bool 
pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) return false; } +/* + * Keep the free pages on fast memory node a little more than the high + * watermark to accommodate the promoted pages. + */ +#define NUMA_BALANCING_PROMOTE_WATERMARK_DIV 4 +#define NUMA_BALANCING_PROMOTE_WATERMARK_MIN (10UL * 1024 * 1024 >> PAGE_SHIFT) + /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx @@ -3961,6 +3969,15 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) continue; mark = high_wmark_pages(zone); + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + numa_demotion_enabled && + next_demotion_node(pgdat->node_id) != NUMA_NO_NODE) { + unsigned long promote_mark; + + promote_mark = max(NUMA_BALANCING_PROMOTE_WATERMARK_MIN, + mark / NUMA_BALANCING_PROMOTE_WATERMARK_DIV); + mark += promote_mark; + } if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } From b6ad83e401ff2c2903fabad8fe911d8c7ba47187 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:28 +1100 Subject: [PATCH 198/334] numa-balancing-optimize-page-placement-for-memory-tiering-system-fix Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ea32ba0c5d3cc..348ba646906ab 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -595,14 +595,14 @@ Documentation/admin-guide/kernel-parameters.rst). numa_balancing ============== -Enables/disables and configure automatic page fault based NUMA memory -balancing. Memory is moved automatically to nodes that access it -often. 
The value to set can be the result to OR the following, +Enables/disables and configures automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes that access it often. +The value to set can be the result of ORing the following, = ================================= -0x0 NUMA_BALANCING_DISABLED -0x1 NUMA_BALANCING_NORMAL -0x2 NUMA_BALANCING_MEMORY_TIERING +0 NUMA_BALANCING_DISABLED +1 NUMA_BALANCING_NORMAL +2 NUMA_BALANCING_MEMORY_TIERING = ================================= Or NUMA_BALANCING_NORMAL to optimize page placement among different From 1cc070fdb28e246058566dd63284e2becaf36cd4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:28 +1100 Subject: [PATCH 199/334] numa-balancing-optimize-page-placement-for-memory-tiering-system-fix-fix s/,/::/ per Randy Cc: "Huang, Ying" Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 348ba646906ab..6f2b32a0af8fb 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -597,7 +597,7 @@ numa_balancing Enables/disables and configures automatic page fault based NUMA memory balancing. Memory is moved automatically to nodes that access it often. -The value to set can be the result of ORing the following, +The value to set can be the result of ORing the following:: = ================================= 0 NUMA_BALANCING_DISABLED From 990b9a30a5a46c3c5353f76023c0352dce0060b8 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 16 Feb 2022 15:31:28 +1100 Subject: [PATCH 200/334] numa-balancing-optimize-page-placement-for-memory-tiering-system-fix-fix-fix fix the following warnings of `make htmldocs`, Documentation/admin-guide/sysctl/kernel.rst:603: WARNING: Inconsistent literal block quoting. 
Link: https://lkml.kernel.org/r/87r18cjwbe.fsf@yhuang6-desk2.ccr.corp.intel.com Reported-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 6f2b32a0af8fb..77c3541039407 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -597,7 +597,7 @@ numa_balancing Enables/disables and configures automatic page fault based NUMA memory balancing. Memory is moved automatically to nodes that access it often. -The value to set can be the result of ORing the following:: +The value to set can be the result of ORing the following: = ================================= 0 NUMA_BALANCING_DISABLED From a17d5393ce656e05ac6e99c2bc469d9d1d9661a2 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 16 Feb 2022 15:31:28 +1100 Subject: [PATCH 201/334] memory tiering: skip to scan fast memory If NUMA balancing isn't used to optimize the page placement among sockets but only among memory types, the hot pages in the fast memory node cannot be migrated (promoted) anywhere. So it's unnecessary to scan the pages in the fast memory node by changing their PTE/PMD mapping to PROT_NONE, and the resulting page faults can be avoided too. In the test, if only the memory tiering NUMA balancing mode is enabled, the number of the NUMA balancing hint faults for the DRAM node is reduced to almost 0 with the patch, while the benchmark score doesn't change visibly.
Link: https://lkml.kernel.org/r/20220128082751.593478-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Dave Hansen Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Yang Shi Cc: Shakeel Butt Cc: zhongjiang-ali Cc: Hasan Al Maruf Cc: Oscar Salvador Cc: Feng Tang Cc: Zi Yan Cc: Wei Xu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/huge_memory.c | 30 +++++++++++++++++++++--------- mm/mprotect.c | 13 ++++++++++++- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 097b1dfa81eb5..09fb65a80e636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1741,17 +1742,28 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } #endif - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) - goto unlock; + if (prot_numa) { + struct page *page; + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (is_huge_zero_pmd(*pmd)) + goto unlock; - if (prot_numa && pmd_protnone(*pmd)) - goto unlock; + if (pmd_protnone(*pmd)) + goto unlock; + page = pmd_page(*pmd); + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(page_to_nid(page))) + goto unlock; + } /* * In case prot_numa, we are under mmap_read_lock(mm). 
It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED diff --git a/mm/mprotect.c b/mm/mprotect.c index 0138dfcdb1d80..2fe03e695c81c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ if (prot_numa) { struct page *page; + int nid; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) @@ -109,7 +111,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. */ - if (target_node == page_to_nid(page)) + nid = page_to_nid(page); + if (target_node == nid) + continue; + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(nid)) continue; } From dd81bc9686ace11081c7dc29c99c0762837bf379 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Feb 2022 15:31:29 +1100 Subject: [PATCH 202/334] mm: page_io: fix psi memory pressure error on cold swapins Once upon a time, all swapins counted toward memory pressure[1]. Then Joonsoo introduced workingset detection for anonymous pages and we gained the ability to distinguish hot from cold swapins[2][3]. But we failed to update swap_readpage() accordingly, and now we account partial memory pressure in the swapin path of cold memory. Not for all situations - which adds more inconsistency: paths using the conventional submit_bio() and lock_page() route will not see much pressure - unless storage itself is heavily congested and the bio submissions stall. ZRAM and ZSWAP do most of the work directly from swap_readpage() and will see all swapins reflected as pressure. Restore consistency by making all swapin stall accounting conditional on the page actually being part of the workingset. 
[1] commit 937790699be9 ("mm/page_io.c: annotate refault stalls from swap_readpage") [2] commit aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU") [3] commit cad8320b4b39 ("mm/swap: don't SetPageWorkingset unconditionally during swapin") Link: https://lkml.kernel.org/r/20220214214921.419687-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: CGEL Cc: Minchan Kim Cc: Joonsoo Kim Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 0bf8e40f4e573..5dd4dc2e28645 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -359,6 +359,7 @@ int swap_readpage(struct page *page, bool synchronous) struct bio *bio; int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + bool workingset = PageWorkingset(page); unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); @@ -370,7 +371,8 @@ int swap_readpage(struct page *page, bool synchronous) * or the submitting cgroup IO-throttled, submission can be a * significant part of overall IO time. */ - psi_memstall_enter(&pflags); + if (workingset) + psi_memstall_enter(&pflags); delayacct_swapin_start(); if (frontswap_load(page) == 0) { @@ -433,7 +435,8 @@ int swap_readpage(struct page *page, bool synchronous) bio_put(bio); out: - psi_memstall_leave(&pflags); + if (workingset) + psi_memstall_leave(&pflags); delayacct_swapin_end(); return ret; } From 8728660a83e343bb752e6e471ba92825bc7e474f Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 16 Feb 2022 15:31:29 +1100 Subject: [PATCH 203/334] mm/vmstat: add event for ksm swapping in copy When a fault brings in from swap what used to be a KSM page, and that page had been swapped in before, the system has to make a copy and leaves remerging the pages to a later pass of ksmd. That is not good for performance, so we had better reduce this kind of copy.
There are some ways to reduce it, for example lessen swappiness or madvise(, , MADV_MERGEABLE) range. So add this event to support doing this tuning. Just like this patch: "mm, THP, swap: add THP swapping out fallback counting". Link: https://lkml.kernel.org/r/20220113023839.758845-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: Ran Xiaokai Cc: Hugh Dickins Cc: Yang Shi Cc: Dave Hansen Cc: Saravanan D Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/vm_event_item.h | 3 +++ mm/ksm.c | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 7b2363388bfa2..16a0a4fd000be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,6 +129,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#ifdef CONFIG_KSM + KSM_SWPIN_COPY, +#endif #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/mm/ksm.c b/mm/ksm.c index c5a4403b5dc9d..e246d650266ac 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2585,6 +2585,9 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); +#ifdef CONFIG_SWAP + count_vm_event(KSM_SWPIN_COPY); +#endif } return new_page; diff --git a/mm/vmstat.c b/mm/vmstat.c index 846b670dd346a..d5cc8d739fac1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1388,6 +1388,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", +#ifdef CONFIG_KSM + "ksm_swpin_copy", +#endif #endif #ifdef CONFIG_X86 "direct_map_level2_splits", From b8e3308d58fa56bac6e31756b7ebfe29e7cd63f8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Feb 2022 15:31:29 +1100 Subject: [PATCH 204/334] mm/hwpoison: check the subpage, not the head page Hardware poison is tracked on a per-page basis, not on the head page. 
Link: https://lkml.kernel.org/r/20220130013042.1906881-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: David Rientjes Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index a239a68cbd257..393640d64e952 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1545,7 +1545,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { hugetlb_count_sub(compound_nr(page), mm); @@ -1869,7 +1869,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * memory are supported. */ subpage = page; - } else if (PageHWPoison(page)) { + } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { hugetlb_count_sub(compound_nr(page), mm); From 8ab00b008419e5b4d002b801dce7c0cf45545769 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:29 +1100 Subject: [PATCH 205/334] mm/balloon_compaction: make balloon page compaction callbacks static Since commit b1123ea6d3b3 ("mm: balloon: use general non-lru movable page feature"), these functions are called via balloon_aops callbacks. They're not called directly outside this file. So make them static and clean up the relevant code. Link: https://lkml.kernel.org/r/20220125132221.2220-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Rafael Aquini Acked-by: Michael S. 
Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/balloon_compaction.h | 22 ---------------------- mm/balloon_compaction.c | 6 +++--- 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 338aa27e4773b..edb7f6d41faa0 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -80,12 +80,6 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) #ifdef CONFIG_BALLOON_COMPACTION extern const struct address_space_operations balloon_aops; -extern bool balloon_page_isolate(struct page *page, - isolate_mode_t mode); -extern void balloon_page_putback(struct page *page); -extern int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, - struct page *page, enum migrate_mode mode); /* * balloon_page_insert - insert a page into the balloon's page list and make @@ -155,22 +149,6 @@ static inline void balloon_page_delete(struct page *page) list_del(&page->lru); } -static inline bool balloon_page_isolate(struct page *page) -{ - return false; -} - -static inline void balloon_page_putback(struct page *page) -{ - return; -} - -static inline int balloon_page_migrate(struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - return 0; -} - static inline gfp_t balloon_mapping_gfp_mask(void) { return GFP_HIGHUSER; diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 907fefde25728..4b8eab4b3f456 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -bool balloon_page_isolate(struct page *page, isolate_mode_t mode) +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); @@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode) return true; } -void 
balloon_page_putback(struct page *page) +static void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; @@ -230,7 +230,7 @@ void balloon_page_putback(struct page *page) /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct address_space *mapping, +static int balloon_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { From 01842d5c53ac8b3e3037ccbeda15aff07c72f809 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 16 Feb 2022 15:31:29 +1100 Subject: [PATCH 206/334] mm: fix race between MADV_FREE reclaim and blkdev direct IO read Problem: ======= Userspace might read the zero-page instead of actual data from a direct IO read on a block device if the buffers have been called madvise(MADV_FREE) on earlier (this is discussed below) due to a race between page reclaim on MADV_FREE and blkdev direct IO read. - Race condition: ============== During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks if the page is not dirty, then discards its rmap PTE(s) (vs. remap back if the page is dirty). However, after try_to_unmap_one() returns to shrink_page_list(), it might keep the page _anyway_ if page_ref_freeze() fails (it expects exactly _one_ page reference, from the isolation for page reclaim). Well, blkdev_direct_IO() gets references for all pages, and on READ operations it only sets them dirty _later_. So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct IO read from block devices, and page reclaim happens during __blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages() returns, but BEFORE the pages are set dirty, the situation happens. The direct IO read eventually completes. Now, when userspace reads the buffers, the PTE is no longer there and the page fault handler do_anonymous_page() services that with the zero-page, NOT the data! 
A synthetic reproducer is provided. - Page faults: =========== If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't happen, because that faults-in all pages as writeable, so do_anonymous_page() sets up a new page/rmap/PTE, and that is used by direct IO. The userspace reads don't fault as the PTE is there (thus zero-page is not used/setup). But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE is no longer there; the subsequent page faults can't help: The data-read from the block device probably won't generate faults due to DMA (no MMU) but even in the case it wouldn't use DMA, that happens on different virtual addresses (not user-mapped addresses) because `struct bio_vec` stores `struct page` to figure addresses out (which are different from user-mapped addresses) for the read. Thus userspace reads (to user-mapped addresses) still fault, then do_anonymous_page() gets another `struct page` that would address/ map to other memory than the `struct page` used by `struct bio_vec` for the read. (The original `struct page` is not available, since it wasn't freed, as page_ref_freeze() failed due to more page refs. And even if it were available, its data cannot be trusted anymore.) Solution: ======== One solution is to check for the expected page reference count in try_to_unmap_one(). There should be one reference from the isolation (that is also checked in shrink_page_list() with page_ref_freeze()) plus one or more references from page mapping(s) (put in discard: label). Further references mean that rmap/PTE cannot be unmapped/nuked. (Note: there might be more than one reference from mapping due to fork()/clone() without CLONE_VM, which use the same `struct page` for references, until the copy-on-write page gets copied.) So, additional page references (e.g., from direct IO read) now prevent the rmap/PTE from being unmapped/dropped; similarly to the page is not freed per shrink_page_list()/page_ref_freeze()). 
- Races and Barriers: ================== The new check in try_to_unmap_one() should be safe in races with bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's done under the PTE lock. The fast path doesn't take the lock, but it checks if the PTE has changed and if so, it drops the reference and leaves the page for the slow path (which does take that lock). The fast path requires synchronization w/ full memory barrier: it writes the page reference count first then it reads the PTE later, while try_to_unmap() writes PTE first then it reads page refcount. And a second barrier is needed, as the page dirty flag should not be read before the page reference count (as in __remove_mapping()). (This can be a load memory barrier only; no writes are involved.) Call stack/comments: - try_to_unmap_one() - page_vma_mapped_walk() - map_pte() # see pte_offset_map_lock(): pte_offset_map() spin_lock() - ptep_get_and_clear() # write PTE - smp_mb() # (new barrier) GUP fast path - page_ref_count() # (new check) read refcount - page_vma_mapped_walk_done() # see pte_unmap_unlock(): pte_unmap() spin_unlock() - bio_iov_iter_get_pages() - __bio_iov_iter_get_pages() - iov_iter_get_pages() - get_user_pages_fast() - internal_get_user_pages_fast() # fast path - lockless_pages_from_mm() - gup_{pgd,p4d,pud,pmd,pte}_range() ptep = pte_offset_map() # not _lock() pte = ptep_get_lockless(ptep) page = pte_page(pte) try_grab_compound_head(page) # inc refcount # (RMW/barrier # on success) if (pte_val(pte) != pte_val(*ptep)) # read PTE put_compound_head(page) # dec refcount # go slow path # slow path - __gup_longterm_unlocked() - get_user_pages_unlocked() - __get_user_pages_locked() - __get_user_pages() - follow_{page,p4d,pud,pmd}_mask() - follow_page_pte() ptep = pte_offset_map_lock() pte = *ptep page = vm_normal_page(pte) try_grab_page(page) # inc refcount pte_unmap_unlock() - Huge Pages: ========== Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE (aka 
lazyfree) pages are PageAnon() && !PageSwapBacked() (madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn()) thus should reach shrink_page_list() -> split_huge_page_to_list() before try_to_unmap[_one](), so it deals with normal pages only. (And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens, which should not or be rare, the page refcount should be greater than mapcount: the head page is referenced by tail pages. That also prevents checking the head `page` then incorrectly call page_remove_rmap(subpage) for a tail page, that isn't even in the shrink_page_list()'s page_list (an effect of split huge pmd/pmvw), as it might happen today in this unlikely scenario.) MADV_FREE'd buffers: =================== So, back to the "if MADV_FREE pages are used as buffers" note. The case is arguable, and subject to multiple interpretations. The madvise(2) manual page on the MADV_FREE advice value says: 1) 'After a successful MADV_FREE ... data will be lost when the kernel frees the pages.' 2) 'the free operation will be canceled if the caller writes into the page' / 'subsequent writes ... will succeed and then [the] kernel cannot free those dirtied pages' 3) 'If there is no subsequent write, the kernel can free the pages at any time.' Thoughts, questions, considerations... respectively: 1) Since the kernel didn't actually free the page (page_ref_freeze() failed), should the data not have been lost? (on userspace read.) 2) Should writes performed by the direct IO read be able to cancel the free operation? - Should the direct IO read be considered as 'the caller' too, as it's been requested by 'the caller'? - Should the bio technique to dirty pages on return to userspace (bio_check_pages_dirty() is called/used by __blkdev_direct_IO()) be considered in another/special way here? 3) Should an upcoming write from a previously requested direct IO read be considered as a subsequent write, so the kernel should not free the pages? 
(as it's known at the time of page reclaim.) At last: Technically, the last point would seem a reasonable consideration and balance, as the madvise(2) manual page apparently (and fairly) seem to assume that 'writes' are memory access from the userspace process (not explicitly considering writes from the kernel or its corner cases; again, fairly).. plus the kernel fix implementation for the corner case of the largely 'non-atomic write' encompassed by a direct IO read operation, is relatively simple; and it helps. Reproducer: ========== @ test.c (simplified, but works) #define _GNU_SOURCE #include #include #include #include int main() { int fd, i; char *buf; fd = open(DEV, O_RDONLY | O_DIRECT); buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) buf[i] = 1; // init to non-zero madvise(buf, BUF_SIZE, MADV_FREE); read(fd, buf, BUF_SIZE); for (i = 0; i < BUF_SIZE; i += PAGE_SIZE) printf("%p: 0x%x ", &buf[i], buf[i]); return 0; } @ block/fops.c (formerly fs/block_dev.c) +#include ... ... __blkdev_direct_IO[_simple](...) { ... + if (!strcmp(current->comm, "good")) + shrink_all_memory(ULONG_MAX); + ret = bio_iov_iter_get_pages(...); + + if (!strcmp(current->comm, "bad")) + shrink_all_memory(ULONG_MAX); ... 
} @ shell # NUM_PAGES=4 # PAGE_SIZE=$(getconf PAGE_SIZE) # yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES} # DEV=$(losetup -f --show test.img) # gcc -DDEV=\"$DEV\" \ -DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \ -DPAGE_SIZE=${PAGE_SIZE} \ test.c -o test # od -tx1 $DEV 0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a * 0040000 # mv test good # ./good 0x7f7c10418000: 0x79 0x7f7c10419000: 0x79 0x7f7c1041a000: 0x79 0x7f7c1041b000: 0x79 # mv good bad # ./bad 0x7fa1b8050000: 0x0 0x7fa1b8051000: 0x0 0x7fa1b8052000: 0x0 0x7fa1b8053000: 0x0 Ceph/TCMalloc: ============= For documentation purposes, the use case driving the analysis/fix is Ceph on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to release unused memory to the system from the mmap'ed page heap (might be committed back/used again; it's not munmap'ed.) - PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise() - PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing. Note: TCMalloc switched back to MADV_DONTNEED a few commits after the release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue just 'disappeared' on Ceph on later Ubuntu releases but is still present in the kernel, and can be hit by other use cases. The observed issue seems to be the old Ceph bug #22464 [1], where checksum mismatches are observed (and instrumentation with buffer dumps shows zero-pages read from mmap'ed/MADV_FREE'd page ranges). The issue in Ceph was reasonably deemed a kernel bug (comment #50) and mostly worked around with a retry mechanism, but other parts of Ceph could still hit that (rocksdb). Anyway, it's less likely to be hit again as TCMalloc switched out of MADV_FREE by default. (Some kernel versions/reports from the Ceph bug, and relation with the MADV_FREE introduction/changes; TCMalloc versions not checked.) - 4.4 good - 4.5 (madv_free: introduction) - 4.9 bad - 4.10 good? 
maybe a swapless system - 4.12 (madv_free: no longer free instantly on swapless systems) - 4.13 bad [1] https://tracker.ceph.com/issues/22464 Thanks: ====== Several people contributed to analysis/discussions/tests/reproducers in the first stages when drilling down on ceph/tcmalloc/linux kernel: - Dan Hill - Dan Streetman - Dongdong Tao - Gavin Guo - Gerald Yang - Heitor Alves de Siqueira - Ioanna Alifieraki - Jay Vosburgh - Matthew Ruffell - Ponnuvel Palaniyappan Link: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: "Huang, Ying" Cc: Minchan Kim Cc: Yu Zhao Cc: Yang Shi Cc: Miaohe Lin Cc: Dan Hill Cc: Dan Streetman Cc: Dongdong Tao Cc: Gavin Guo Cc: Gerald Yang Cc: Heitor Alves de Siqueira Cc: Ioanna Alifieraki Cc: Jay Vosburgh Cc: Matthew Ruffell Cc: Ponnuvel Palaniyappan Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/rmap.c | 25 ++++++++++++++++++++++++- mm/vmscan.c | 2 +- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 393640d64e952..5df685da578d3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1591,7 +1591,30 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { - if (!PageDirty(page)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = page_count(page); + map_count = page_mapcount(page); + + /* + * Order reads for page refcount and dirty flag; + * see __remove_mapping(). + */ + smp_rmb(); + + /* + * The only page refs must be from the isolation + * plus one or more rmap's (dropped by discard:). 
+ */ + if ((ref_count == 1 + map_count) && + !PageDirty(page)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); diff --git a/mm/vmscan.c b/mm/vmscan.c index 8a77de250cd60..5f471c1e279fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1717,7 +1717,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, mapping = page_mapping(page); } } else if (unlikely(PageTransHuge(page))) { - /* Split file THP */ + /* Split file/lazyfree THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } From 39b049ba5f0231d50ba45d8d283ce464a2e608ad Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 16 Feb 2022 15:31:30 +1100 Subject: [PATCH 207/334] mm, memory_hotplug: make arch_alloc_nodedata independent on CONFIG_MEMORY_HOTPLUG Patch series "mm, memory_hotplug: handle unitialized numa node gracefully". The core of the fix is patch 2 which also links existing bug reports. The high level goal is to have all possible numa nodes have their pgdat allocated and initialized so for_each_possible_node(nid) NODE_DATA(nid) will never return garbage. This has proven to be a problem in several places when an offline numa node is used for an allocation just to realize that node_data and therefore allocation fallback zonelists are not initialized and such an allocation request blows up. There were attempts to address that by checking node_online in several places including the page allocator. This patchset approaches the problem from a different perspective and instead of special casing, which just adds a runtime overhead, it allocates pglist_data for each possible node. This can add some memory overhead for platforms with a high number of possible nodes if they do not contain any memory. This should be a rather rare configuration though. How to test this? David has provided an excellent howto: http://lkml.kernel.org/r/6e5ebc19-890c-b6dd-1924-9f25c441010d@redhat.com Patches 1 and 3-6 are mostly cleanups.
The patchset has been reviewed by Rafael (thanks!) and the core fix tested by Rafael and Alexey (thanks to both). David has tested as per instructions above and hasn't found any fallouts in the memory hotplug scenarios. This patch (of 6): This is a preparatory patch and it doesn't introduce any functional change. It merely pulls out arch_alloc_nodedata (and co) outside of CONFIG_MEMORY_HOTPLUG because the following patch will need to call this from the generic MM code. Link: https://lkml.kernel.org/r/20220127085305.20890-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20220127085305.20890-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/mm/discontig.c | 2 - include/linux/memory_hotplug.h | 119 ++++++++++++++++----------------- 2 files changed, 59 insertions(+), 62 deletions(-) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 791d4176e4a6b..8dc8a554f7742 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -608,7 +608,6 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } -#ifdef CONFIG_MEMORY_HOTPLUG pg_data_t *arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid); @@ -626,7 +625,6 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) pgdat_list[update_node] = update_pgdat; scatter_node_data(); } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index be48e003a5183..4355983b364d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,6 +16,65 @@ struct 
memory_group; struct resource; struct vmem_altmap; +#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION +/* + * For supporting node-hotadd, we have to allocate a new pgdat. + * + * If an arch has generic style NODE_DATA(), + * node_data[nid] = kzalloc() works well. But it depends on the architecture. + * + * In general, generic_alloc_nodedata() is used. + * Now, arch_free_nodedata() is just defined for error path of node_hot_add. + * + */ +extern pg_data_t *arch_alloc_nodedata(int nid); +extern void arch_free_nodedata(pg_data_t *pgdat); +extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); + +#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + +#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) +#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) + +#ifdef CONFIG_NUMA +/* + * XXX: node aware allocation can't work well to get new node's memory at this time. + * Because, pgdat for the new node is not allocated/initialized yet itself. + * To use new node's memory, more consideration will be necessary. + */ +#define generic_alloc_nodedata(nid) \ +({ \ + kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ +}) +/* + * This definition is just for error path in node hotadd. + * For node hotremove, we have to replace this. 
+ */ +#define generic_free_nodedata(pgdat) kfree(pgdat) + +extern pg_data_t *node_data[]; +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ + node_data[nid] = pgdat; +} + +#else /* !CONFIG_NUMA */ + +/* never called */ +static inline pg_data_t *generic_alloc_nodedata(int nid) +{ + BUG(); + return NULL; +} +static inline void generic_free_nodedata(pg_data_t *pgdat) +{ +} +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); @@ -154,66 +213,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * Now, arch_free_nodedata() is just defined for error path of node_hot_add. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_free_nodedata(pg_data_t *pgdat); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) -#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) - -#ifdef CONFIG_NUMA -/* - * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat. - * XXX: kmalloc_node() can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ -}) -/* - * This definition is just for error path in node hotadd. 
- * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - void get_online_mems(void); void put_online_mems(void); From da4490c958ade162fdc11bbb850fafea92e9ce09 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 16 Feb 2022 15:31:30 +1100 Subject: [PATCH 208/334] mm: handle uninitialized numa nodes gracefully We have had several reports [1][2][3] that the page allocator blows up when an allocation from a possible node is requested. The underlying reason is that NODE_DATA for the specific node is not allocated. NUMA specific initialization is arch specific and it can vary a lot. E.g. x86 tries to initialize all nodes that have some cpu affinity (see init_cpu_to_node) but this can be insufficient because the node might be cpuless for example. One way to address this problem would be to check for !node_online nodes when trying to get a zonelist and silently fall back to another node. That is unfortunately adding a branch into allocator hot path and it doesn't handle any other potential NODE_DATA users. This patch takes a different approach (following a lead of [3]) and it preallocates pgdat for all possible nodes in arch independent code - free_area_init. All uninitialized nodes are treated as memoryless nodes. node_state of the node is not changed because that would lead to other side effects - e.g. sysfs representation of such a node and from past discussions [4] it is known that some tools might have problems digesting that.
Newly allocated pgdat only gets a minimal initialization and the rest of the work is expected to be done by the memory hotplug - hotadd_new_pgdat (renamed to hotadd_init_pgdat). generic_alloc_nodedata is changed to use the memblock allocator because neither page nor slab allocators are available at the stage when all pgdats are allocated. Hotplug doesn't allocate pgdat anymore so we can use the early boot allocator. The only arch specific implementation is ia64 and that is changed to use the early allocator as well. [1] http://lkml.kernel.org/r/20211101201312.11589-1-amakhalov@vmware.com [2] http://lkml.kernel.org/r/20211207224013.880775-1-npache@redhat.com [3] http://lkml.kernel.org/r/20190114082416.30939-1-mhocko@kernel.org [4] http://lkml.kernel.org/r/20200428093836.27190-1-srikar@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/Yfe7RBeLCijnWBON@dhcp22.suse.cz Reported-by: Alexey Makhalov Tested-by: Alexey Makhalov Reported-by: Nico Pache Acked-by: Rafael Aquini Tested-by: Rafael Aquini Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Mike Rapoport Signed-off-by: Michal Hocko Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/mm/discontig.c | 4 ++-- include/linux/memory_hotplug.h | 2 +- mm/internal.h | 2 ++ mm/memory_hotplug.c | 21 +++++++++----------- mm/page_alloc.c | 36 ++++++++++++++++++++++++++++++---- 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 8dc8a554f7742..dd0cf4834eaa8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -608,11 +608,11 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } -pg_data_t *arch_alloc_nodedata(int nid) +pg_data_t * __init arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid); - return kzalloc(size, GFP_KERNEL); + return 
memblock_alloc(size, SMP_CACHE_BYTES); } void arch_free_nodedata(pg_data_t *pgdat) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4355983b364d3..cdd66bfdf855b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -44,7 +44,7 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); */ #define generic_alloc_nodedata(nid) \ ({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ + memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) /* * This definition is just for error path in node hotadd. diff --git a/mm/internal.h b/mm/internal.h index 827a2e4133c18..9a5674bd0a742 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -717,4 +717,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0139b77c51d5d..11f39d0e76ec3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1162,19 +1162,21 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid) +static pg_data_t __ref *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; pgdat = NODE_DATA(nid); - if (!pgdat) { - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; + /* + * NODE_DATA is preallocated (free_area_init) but its internal + * state is not allocated completely. Add missing pieces. + * Completely offline nodes stay around and they just need + * reintialization. 
+ */ + if (pgdat->per_cpu_nodestats == &boot_nodestats) { pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); - arch_refresh_nodedata(nid, pgdat); } else { int cpu; /* @@ -1193,8 +1195,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid) } } - /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ @@ -1246,7 +1246,7 @@ static int __try_online_node(int nid, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid); + pgdat = hotadd_init_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -1445,9 +1445,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: - /* rollback pgdat allocation and others */ - if (new_node) - rollback_node_hotadd(nid); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79d4ad7a4f051..66243e63a4c46 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6407,7 +6407,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { @@ -6429,7 +6429,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); @@ -8129,8 +8133,32 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ 
mminit_verify_pageflags_layout(); setup_nr_node_ids(); - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); + for_each_node(nid) { + pg_data_t *pgdat; + + if (!node_online(nid)) { + pr_info("Initializing node %d as memoryless\n", nid); + + /* Allocator not initialized yet */ + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) { + pr_err("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + continue; + } + arch_refresh_nodedata(nid, pgdat); + free_area_init_memoryless_node(nid); + /* + * not marking this node online because we do not want to + * confuse userspace by sysfs files/directories for node + * without any memory attached to it (see topology_init) + * The pgdat will get fully initialized when a memory is + * hotpluged into it by hotadd_init_pgdat + */ + continue; + } + + pgdat = NODE_DATA(nid); free_area_init_node(nid); /* Any memory on that node */ From bf76447b2c5597095832773f92d45bb8e7cac5d9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:30 +1100 Subject: [PATCH 209/334] mm-handle-uninitialized-numa-nodes-gracefully-fix replace comment, per Mike Cc: Alexey Makhalov Cc: Christoph Lameter Cc: David Hildenbrand Cc: Dennis Zhou Cc: Eric Dumazet Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Oscar Salvador Cc: Rafael Aquini Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 66243e63a4c46..2129fad576639 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8148,12 +8148,16 @@ void __init free_area_init(unsigned long *max_zone_pfn) } arch_refresh_nodedata(nid, pgdat); free_area_init_memoryless_node(nid); + /* - * not marking this node online because we do not want to - * confuse userspace by sysfs files/directories for node - * without any memory attached to it (see topology_init) - * The pgdat will get fully initialized when a 
memory is - * hotpluged into it by hotadd_init_pgdat + * We do not want to confuse userspace by sysfs + * files/directories for node without any memory + * attached to it, so this node is not marked as + * N_MEMORY and not marked online so that no sysfs + * hierarchy will be created via register_one_node for + * it. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into + * this node. */ continue; } From 20fae44c82fc01608721dfcce73a32909b70f524 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 16 Feb 2022 15:31:30 +1100 Subject: [PATCH 210/334] mm, memory_hotplug: drop arch_free_nodedata Prior to "mm: handle uninitialized numa nodes gracefully" memory hotplug used to allocate pgdat when memory has been added to a node (hotadd_init_pgdat). arch_free_nodedata has only been used in the failure path because once the pgdat is exported (to be visible by NODE_DATA(nid)) it cannot really be freed because there is no synchronization available for that. pgdat is allocated for each possible node now, so memory hotplug never needs to use arch_free_nodedata; drop it. This patch doesn't introduce any functional change.
Link: https://lkml.kernel.org/r/20220127085305.20890-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/mm/discontig.c | 5 ----- include/linux/memory_hotplug.h | 3 --- mm/memory_hotplug.c | 10 ---------- 3 files changed, 18 deletions(-) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index dd0cf4834eaa8..73d0db36edb60 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -615,11 +615,6 @@ pg_data_t * __init arch_alloc_nodedata(int nid) return memblock_alloc(size, SMP_CACHE_BYTES); } -void arch_free_nodedata(pg_data_t *pgdat) -{ - kfree(pgdat); -} - void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) { pgdat_list[update_node] = update_pgdat; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cdd66bfdf855b..60f09d3ebb3d4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -24,17 +24,14 @@ struct vmem_altmap; * node_data[nid] = kzalloc() works well. But it depends on the architecture. * * In general, generic_alloc_nodedata() is used. - * Now, arch_free_nodedata() is just defined for error path of node_hot_add. 
* */ extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_free_nodedata(pg_data_t *pgdat); extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); #else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ #define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) -#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) #ifdef CONFIG_NUMA /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 11f39d0e76ec3..55c3e53090885 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1217,16 +1217,6 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) return pgdat; } -static void rollback_node_hotadd(int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - - arch_refresh_nodedata(nid, NULL); - free_percpu(pgdat->per_cpu_nodestats); - arch_free_nodedata(pgdat); -} - - /* * __try_online_node - online a node if offlined * @nid: the node ID From 9f2b2b89742e71cddd0e10e06cea54303c2fe2e4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 16 Feb 2022 15:31:31 +1100 Subject: [PATCH 211/334] mm, memory_hotplug: reorganize new pgdat initialization When a !node_online node is brought up it needs a hotplug specific initialization because the node could be either uninitialized yet or it could have been recycled after previous hotremove. hotadd_init_pgdat is responsible for that. Internal pgdat state is initialized at two places currently - hotadd_init_pgdat - free_area_init_core_hotplug There is no real clear cut what should go where but this patch's chosen to move the whole internal state initialization into free_area_init_core_hotplug. hotadd_init_pgdat is still responsible to pull all the parts together - most notably to initialize zonelists because those depend on the overall topology. This patch doesn't introduce any functional change. 
Link: https://lkml.kernel.org/r/20220127085305.20890-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Mike Rapoport Cc: Nico Pache Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 28 +++------------------------- mm/page_alloc.c | 25 +++++++++++++++++++++++-- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 60f09d3ebb3d4..76bf2de86defc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,7 +319,7 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #ifdef CONFIG_MEMORY_HOTPLUG -extern void __ref free_area_init_core_hotplug(int nid); +extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory_resource(int nid, struct resource *resource, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 55c3e53090885..a4f69d399929c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1166,39 +1166,16 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; - pgdat = NODE_DATA(nid); - /* * NODE_DATA is preallocated (free_area_init) but its internal * state is not allocated completely. Add missing pieces. * Completely offline nodes stay around and they just need * reintialization. */ - if (pgdat->per_cpu_nodestats == &boot_nodestats) { - pgdat->per_cpu_nodestats = - alloc_percpu(struct per_cpu_nodestat); - } else { - int cpu; - /* - * Reset the nr_zones, order and highest_zoneidx before reuse. 
- * Note that kswapd will init kswapd_highest_zoneidx properly - * when it starts in the near future. - */ - pgdat->nr_zones = 0; - pgdat->kswapd_order = 0; - pgdat->kswapd_highest_zoneidx = 0; - for_each_online_cpu(cpu) { - struct per_cpu_nodestat *p; - - p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); - memset(p, 0, sizeof(*p)); - } - } - - pgdat->node_start_pfn = 0; + pgdat = NODE_DATA(nid); /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_core_hotplug(nid); + free_area_init_core_hotplug(pgdat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -1210,6 +1187,7 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). + * TODO: should be in free_area_init_core_hotplug? */ reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2129fad576639..fb563032865ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7532,12 +7532,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, * NOTE: this function is only called during memory hotplug */ #ifdef CONFIG_MEMORY_HOTPLUG -void __ref free_area_init_core_hotplug(int nid) +void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) { + int nid = pgdat->node_id; enum zone_type z; - pg_data_t *pgdat = NODE_DATA(nid); + int cpu; pgdat_init_internals(pgdat); + + if (pgdat->per_cpu_nodestats == &boot_nodestats) + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); + + /* + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly + * when it starts in the near future. 
+ */ + pgdat->nr_zones = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_highest_zoneidx = 0; + pgdat->node_start_pfn = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } + for (z = 0; z < MAX_NR_ZONES; z++) zone_init_internals(&pgdat->node_zones[z], z, nid, 0); } From c65e5b7389f082abd178790de72c7ac227cf7ff6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 16 Feb 2022 15:31:31 +1100 Subject: [PATCH 212/334] mm: make free_area_init_node aware of memory less nodes free_area_init_node is also called from memory less node initialization path (free_area_init_memoryless_node). It doesn't really make much sense to display the physical memory range for those nodes: Initmem setup node XX [mem 0x0000000000000000-0x0000000000000000] Instead be explicit that the node is memoryless: Initmem setup node XX as memoryless Link: https://lkml.kernel.org/r/20220127085305.20890-6-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Reviewed-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_alloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb563032865ca..e0c1e6bb09dd1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7708,9 +7708,14 @@ static void __init free_area_init_node(int nid) pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + if (start_pfn != end_pfn) { + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? 
((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + } else { + pr_info("Initmem setup node %d as memoryless\n", nid); + } + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); From 06248201928f3dc3783682e87b54955d2d810804 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 16 Feb 2022 15:31:31 +1100 Subject: [PATCH 213/334] memcg: do not tweak node in alloc_mem_cgroup_per_node_info alloc_mem_cgroup_per_node_info is allocated for each possible node and this used to be a problem because !node_online nodes didn't have appropriate data structure allocated. This has changed by "mm: handle uninitialized numa nodes gracefully" so we can drop the special casing here. Link: https://lkml.kernel.org/r/20220127085305.20890-7-mhocko@kernel.org Signed-off-by: Wei Yang Signed-off-by: Michal Hocko Cc: David Hildenbrand Cc: Alexey Makhalov Cc: Dennis Zhou Cc: Eric Dumazet Cc: Tejun Heo Cc: Christoph Lameter Cc: Nico Pache Cc: Wei Yang Cc: Mike Rapoport Cc: Oscar Salvador Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memcontrol.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a03959f9881f6..760be043c3bfb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5073,18 +5073,8 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - int tmp = node; - /* - * This routine is called against possible nodes. - * But it's BUG to call kmalloc() against offline node. - * - * TODO: this routine can waste much memory for nodes which will - * never be onlined. It's better to use memory hotplug callback - * function. 
- */ - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); if (!pn) return 1; From ba235ed61614468b13556b046fedf50b44045bbf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:31 +1100 Subject: [PATCH 214/334] drivers/base/memory: add memory block to memory group after registration succeeded If register_memory() fails, we freed the memory block but already added the memory block to the group list, not good. Let's defer adding the block to the memory group to after registering the memory block device. We do handle it properly during unregister_memory(), but that's not called when the registration fails. Link: https://lkml.kernel.org/r/20220128144540.153902-1-david@redhat.com Fixes: 028fc57a1c36 ("drivers/base/memory: introduce "memory groups" to logically group memory blocks") Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 365cd4a7f2397..60c38f9cf1a75 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -663,14 +663,16 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->nr_vmemmap_pages = nr_vmemmap_pages; INIT_LIST_HEAD(&mem->group_next); + ret = register_memory(mem); + if (ret) + return ret; + if (group) { mem->group = group; list_add(&mem->group_next, &group->memory_blocks); } - ret = register_memory(mem); - - return ret; + return 0; } static int add_memory_block(unsigned long base_section_nr) From 0be5adb775e11f5e9843c16fb68031b961e0d7cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:31 +1100 Subject: [PATCH 215/334] drivers/base/node: consolidate node device subsystem initialization in node_dev_init() ... and call node_dev_init() after memory_dev_init() from driver_init(), so before any of the existing arch/subsys calls. All online nodes should be known at that point: early during boot, arch code determines node and zone ranges and sets the relevant nodes online; usually this happens in setup_arch(). This is in line with memory_dev_init(), which initializes the memory device subsystem and creates all memory block devices. Similar to memory_dev_init(), panic() if anything goes wrong, we don't want to continue with such basic initialization errors. The important part is that node_dev_init() gets called after memory_dev_init() and after cpu_dev_init(), but before any of the relevant archs call register_cpu() to register the new cpu device under the node device. The latter should be the case for the current users of topology_init(). 
Link: https://lkml.kernel.org/r/20220203105212.30385-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Tested-by: Anatoly Pugachev (sparc64) Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Oscar Salvador Cc: Mike Rapoport Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kernel/setup.c | 3 --- arch/ia64/kernel/topology.c | 10 ---------- arch/mips/kernel/topology.c | 5 ----- arch/powerpc/kernel/sysfs.c | 17 ----------------- arch/riscv/kernel/setup.c | 3 --- arch/s390/kernel/numa.c | 7 ------- arch/sh/kernel/topology.c | 5 ----- arch/sparc/kernel/sysfs.c | 12 ------------ arch/x86/kernel/topology.c | 5 ----- drivers/base/init.c | 1 + drivers/base/node.c | 30 +++++++++++++++++------------- include/linux/node.h | 4 ++++ 12 files changed, 22 insertions(+), 80 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f70573928f1bf..3505789cf4bd9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -406,9 +406,6 @@ static int __init topology_init(void) { int i; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); cpu->hotpluggable = cpu_can_disable(i); diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index e4992917a24b7..94a848b06f15a 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -70,16 +70,6 @@ static int __init topology_init(void) { int i, err = 0; -#ifdef CONFIG_NUMA - /* - * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? 
- */ - for_each_online_node(i) { - if ((err = register_one_node(i))) - goto out; - } -#endif - sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL); if (!sysfs_cpus) panic("kzalloc in topology_init failed - NR_CPUS too big?"); diff --git a/arch/mips/kernel/topology.c b/arch/mips/kernel/topology.c index 08ad6371fbe08..9429d85a4703c 100644 --- a/arch/mips/kernel/topology.c +++ b/arch/mips/kernel/topology.c @@ -12,11 +12,6 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif /* CONFIG_NUMA */ - for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index d45a415d5374b..2069bbb90a9a3 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -1110,14 +1110,6 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group); /* NUMA stuff */ #ifdef CONFIG_NUMA -static void __init register_nodes(void) -{ - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -} - int sysfs_add_device_to_node(struct device *dev, int nid) { struct node *node = node_devices[nid]; @@ -1132,13 +1124,6 @@ void sysfs_remove_device_from_node(struct device *dev, int nid) sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); } EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); - -#else -static void __init register_nodes(void) -{ - return; -} - #endif /* Only valid if CPU is present. 
*/ @@ -1155,8 +1140,6 @@ static int __init topology_init(void) { int cpu, r; - register_nodes(); - for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index b42bfdc674823..834eb652a7b9d 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -301,9 +301,6 @@ static int __init topology_init(void) { int i, ret; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_devices, i); diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 51c5a9f6e5257..23ab9f02f2787 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -33,10 +33,3 @@ void __init numa_setup(void) NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } - -static int __init numa_init_late(void) -{ - register_one_node(0); - return 0; -} -arch_initcall(numa_init_late); diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c index 76af6db9daa23..2d2a7509b565a 100644 --- a/arch/sh/kernel/topology.c +++ b/arch/sh/kernel/topology.c @@ -46,11 +46,6 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index 6d60d416f0dd7..f19487e4cc71e 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -244,22 +244,10 @@ static void __init check_mmu_stats(void) mmu_stats_supported = 1; } -static void register_nodes(void) -{ -#ifdef CONFIG_NUMA - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -#endif -} - static int __init topology_init(void) { int cpu, ret; - register_nodes(); - check_mmu_stats(); for_each_possible_cpu(cpu) { diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index bd83748e2bde3..8617d1ed9d31b 
100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -154,11 +154,6 @@ static int __init topology_init(void) { int i; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) arch_register_cpu(i); diff --git a/drivers/base/init.c b/drivers/base/init.c index a9f57c22fb9e2..d8d0fe687111a 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -35,5 +35,6 @@ void __init driver_init(void) auxiliary_bus_init(); cpu_dev_init(); memory_dev_init(); + node_dev_init(); container_dev_init(); } diff --git a/drivers/base/node.c b/drivers/base/node.c index 87acc47e89515..a133981a12fc6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -1065,26 +1065,30 @@ static const struct attribute_group *cpu_root_attr_groups[] = { }; #define NODE_CALLBACK_PRI 2 /* lower than SLAB */ -static int __init register_node_type(void) +void __init node_dev_init(void) { - int ret; + static struct notifier_block node_memory_callback_nb = { + .notifier_call = node_memory_callback, + .priority = NODE_CALLBACK_PRI, + }; + int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES); ret = subsys_system_register(&node_subsys, cpu_root_attr_groups); - if (!ret) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; - register_hotmemory_notifier(&node_memory_callback_nb); - } + if (ret) + panic("%s() failed to register subsystem: %d\n", __func__, ret); + + register_hotmemory_notifier(&node_memory_callback_nb); /* - * Note: we're not going to unregister the node class if we fail - * to register the node state class attribute files. + * Create all node devices, which will properly link the node + * to applicable memory block devices and already created cpu devices. 
*/ - return ret; + for_each_online_node(i) { + ret = register_one_node(i); + if (ret) + panic("%s() failed to add node: %d\n", __func__, ret); + } } -postcore_initcall(register_node_type); diff --git a/include/linux/node.h b/include/linux/node.h index 81bbf1c0afd37..7f876d48af11f 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -112,6 +112,7 @@ static inline void link_mem_sections(int nid, unsigned long start_pfn, extern void unregister_node(struct node *node); #ifdef CONFIG_NUMA +extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ extern int __register_one_node(int nid); @@ -149,6 +150,9 @@ extern void register_hugetlbfs_with_node(node_registration_func_t doregister, node_registration_func_t unregister); #endif #else +static inline void node_dev_init(void) +{ +} static inline int __register_one_node(int nid) { return 0; From cf42afad1abf0d7a18ed8d809ea885a25aa39fc4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:32 +1100 Subject: [PATCH 216/334] mm/memory_hotplug: remove obsolete comment of __add_pages Patch series "A few cleanup patches around memory_hotplug". This series contains a few patches to fix obsolete and misplaced comments, clean up the try_offline_node function and so on. This patch (of 4): Since commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online"), there is no need to pass in the zone. 
Link: https://lkml.kernel.org/r/20220207133643.23427-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220207133643.23427-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a4f69d399929c..cbc67c27e0dd9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -296,10 +296,7 @@ struct page *pfn_to_online_page(unsigned long pfn) EXPORT_SYMBOL_GPL(pfn_to_online_page); /* - * Reasonably generic function for adding memory. It is - * expected that archs that support memory hotplug will - * call this function after deciding the zone to which to - * add the new pages. + * Reasonably generic function for adding memory. */ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) From cb8f416b647065ee5b048a9a01363f87cee9c252 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:32 +1100 Subject: [PATCH 217/334] mm-memory_hotplug-remove-obsolete-comment-of-__add_pages-fix remove the comment altogether, per David Cc: David Hildenbrand Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cbc67c27e0dd9..39da3812bf476 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -295,9 +295,6 @@ struct page *pfn_to_online_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(pfn_to_online_page); -/* - * Reasonably generic function for adding memory. 
- */ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { From 8eeb155213950af4a52ad3e1011d40ac8da2e0dc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:32 +1100 Subject: [PATCH 218/334] mm/memory_hotplug: avoid calling zone_intersects() for ZONE_NORMAL If zid reaches ZONE_NORMAL, the caller will always get the NORMAL zone no matter what zone_intersects() returns. So we can save some possible cpu cycles by avoiding calling zone_intersects() for ZONE_NORMAL. Link: https://lkml.kernel.org/r/20220207133643.23427-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 39da3812bf476..947ba4fc8124c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -823,7 +823,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn struct pglist_data *pgdat = NODE_DATA(nid); int zid; - for (zid = 0; zid <= ZONE_NORMAL; zid++) { + for (zid = 0; zid < ZONE_NORMAL; zid++) { struct zone *zone = &pgdat->node_zones[zid]; if (zone_intersects(zone, start_pfn, nr_pages)) From 41b37ef315901ae5dcfac97f9b4845925c301184 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:32 +1100 Subject: [PATCH 219/334] mm/memory_hotplug: clean up try_offline_node We can use helper macro node_spanned_pages to check whether node spans pages. And we can change the parameter of check_cpu_on_node to nid as that's what it really cares about. Thus we can further get rid of the local variable pgdat and improve the readability a bit.
Link: https://lkml.kernel.org/r/20220207133643.23427-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 947ba4fc8124c..bb1893f10f0f4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -2005,12 +2005,12 @@ static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) return mem->nr_vmemmap_pages; } -static int check_cpu_on_node(pg_data_t *pgdat) +static int check_cpu_on_node(int nid) { int cpu; for_each_present_cpu(cpu) { - if (cpu_to_node(cpu) == pgdat->node_id) + if (cpu_to_node(cpu) == nid) /* * the cpu on this node isn't removed, and we can't * offline this node. @@ -2044,7 +2044,6 @@ static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) */ void try_offline_node(int nid) { - pg_data_t *pgdat = NODE_DATA(nid); int rc; /* @@ -2052,7 +2051,7 @@ void try_offline_node(int nid) * offline it. A node spans memory after move_pfn_range_to_zone(), * e.g., after the memory block was onlined. */ - if (pgdat->node_spanned_pages) + if (node_spanned_pages(nid)) return; /* @@ -2064,7 +2063,7 @@ void try_offline_node(int nid) if (rc) return; - if (check_cpu_on_node(pgdat)) + if (check_cpu_on_node(nid)) return; /* From 1b30f9f96f448ef4b890d04cfeb967f4a12ca4f2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:32 +1100 Subject: [PATCH 220/334] mm/memory_hotplug: fix misplaced comment in offline_pages It's misplaced since commit 7960509329c2 ("mm, memory_hotplug: print reason for the offlining failure"). Move it to the right place. 
Link: https://lkml.kernel.org/r/20220207133643.23427-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bb1893f10f0f4..ce68098832aa9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1963,6 +1963,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, return 0; failed_removal_isolated: + /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: @@ -1973,7 +1974,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); - /* pushback to free area */ mem_hotplug_done(); return ret; } From b608031eff6a67b2255f7f4be7fe3b93f35f21f0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:33 +1100 Subject: [PATCH 221/334] drivers/base/node: rename link_mem_sections() to register_memory_block_under_node() Patch series "drivers/base/memory: determine and store zone for single-zone memory blocks", v2. I remember talking to Michal in the past about removing test_pages_in_a_zone(), which we use for: * verifying that a memory block we intend to offline is really only managed by a single zone. We don't support offlining of memory blocks that are managed by multiple zones (e.g., multiple nodes, DMA and DMA32) * exposing that zone to user space via /sys/devices/system/memory/memory*/valid_zones Now that I identified some more cases where test_pages_in_a_zone() might go wrong, and we received an UBSAN report (see patch #3), let's get rid of this PFN walker. 
So instead of detecting the zone at runtime with test_pages_in_a_zone() by scanning the memmap, let's determine and remember for each memory block if it's managed by a single zone. The stored zone can then be used for the above two cases, avoiding a manual lookup using test_pages_in_a_zone(). This avoids possibly stumbling over uninitialized memmaps in corner cases, especially when ZONE_DEVICE ranges partly fall into memory blocks (that are responsible for managing System RAM). Handling memory onlining is easy, because we online to exactly one zone. Handling boot memory is more tricky, because we want to avoid scanning all zones of all nodes to detect possible zones that overlap with the physical memory region of interest. Fortunately, we already have code that determines the applicable nodes for a memory block, to create sysfs links -- we'll hook into that. Patch #1 is a simple cleanup I had lying around for a long time. Patch #2 contains the main logic to remove test_pages_in_a_zone() and further details. [1] https://lkml.kernel.org/r/20220128144540.153902-1-david@redhat.com [2] https://lkml.kernel.org/r/20220203105212.30385-1-david@redhat.com This patch (of 2): Let's adjust the stale terminology, making it match unregister_memory_block_under_nodes() and do_register_memory_block_under_node(). We're dealing with memory block devices, which span 1..X memory sections. Link: https://lkml.kernel.org/r/20220210184359.235565-1-david@redhat.com Link: https://lkml.kernel.org/r/20220210184359.235565-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: "Rafael J.
Wysocki" Cc: Rafael Parra Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/node.c | 5 +++-- include/linux/node.h | 16 ++++++++-------- mm/memory_hotplug.c | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index a133981a12fc6..5d75341413ce3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -892,8 +892,9 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } -void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, - enum meminit_context context) +void register_memory_blocks_under_node(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { walk_memory_blocks_func_t func; diff --git a/include/linux/node.h b/include/linux/node.h index 7f876d48af11f..40d641a8bfb0d 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,13 +99,13 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) -void link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context); +void register_memory_blocks_under_node(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else -static inline void link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context) +static inline void register_memory_blocks_under_node(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { } #endif @@ -129,8 +129,8 @@ static inline int register_one_node(int nid) error = __register_one_node(nid); if (error) return error; - /* link memory sections under this node */ - link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY); + register_memory_blocks_under_node(nid, start_pfn, end_pfn, + MEMINIT_EARLY); } 
return error; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ce68098832aa9..ed1a5dac67978 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1383,9 +1383,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) BUG_ON(ret); } - /* link memory sections under this node.*/ - link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); + register_memory_blocks_under_node(nid, PFN_DOWN(start), + PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) From 47c94c15200ea6d5772d9219bc54d1961074a82b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:33 +1100 Subject: [PATCH 222/334] drivers/base/memory: determine and store zone for single-zone memory blocks test_pages_in_a_zone() is just another nasty PFN walker that can easily stumble over ZONE_DEVICE memory ranges falling into the same memory block as ordinary system RAM: the memmap of parts of these ranges might possibly be uninitialized. In fact, we observed (on an older kernel) with UBSAN: [ 7691.855626] UBSAN: Undefined behaviour in ./include/linux/mm.h:1133:50 [ 7691.862155] index 7 is out of range for type 'zone [5]' [ 7691.867393] CPU: 121 PID: 35603 Comm: read_all Kdump: loaded Tainted: [...] [ 7691.879990] Hardware name: Dell Inc. PowerEdge R7425/08V001, BIOS 1.12.2 11/15/2019 [ 7691.887643] Call Trace: [ 7691.890107] dump_stack+0x9a/0xf0 [ 7691.893438] ubsan_epilogue+0x9/0x7a [ 7691.897025] __ubsan_handle_out_of_bounds+0x13a/0x181 [ 7691.902086] ? __ubsan_handle_shift_out_of_bounds+0x289/0x289 [ 7691.907841] ? sched_clock_cpu+0x18/0x1e0 [ 7691.911867] ? __lock_acquire+0x610/0x38d0 [ 7691.915979] test_pages_in_a_zone+0x3c4/0x500 [ 7691.920357] show_valid_zones+0x1fa/0x380 [ 7691.924375] ? print_allowed_zone+0x80/0x80 [ 7691.928571] ? __lock_is_held+0xb4/0x140 [ 7691.932509] ? __lock_is_held+0xb4/0x140 [ 7691.936447] ? 
dev_attr_store+0x70/0x70 [ 7691.940296] dev_attr_show+0x43/0xb0 [ 7691.943884] ? memset+0x1f/0x40 [ 7691.947042] sysfs_kf_seq_show+0x1c5/0x440 [ 7691.951153] seq_read+0x49d/0x1190 [ 7691.954574] ? seq_escape+0x1f0/0x1f0 [ 7691.958249] ? fsnotify_first_mark+0x150/0x150 [ 7691.962713] vfs_read+0xff/0x300 [ 7691.965952] ksys_read+0xb8/0x170 [ 7691.969279] ? kernel_write+0x130/0x130 [ 7691.973126] ? entry_SYSCALL_64_after_hwframe+0x7a/0xdf [ 7691.978365] ? do_syscall_64+0x22/0x4b0 [ 7691.982212] do_syscall_64+0xa5/0x4b0 [ 7691.985887] entry_SYSCALL_64_after_hwframe+0x6a/0xdf [ 7691.990947] RIP: 0033:0x7f01f4439b52 We seem to stumble over a memmap that contains a garbage zone id. While we could try inserting pfn_to_online_page() calls, it will just make memory offlining slower, because we use test_pages_in_a_zone() to make sure we're offlining pages that all belong to the same zone. Let's just get rid of this PFN walker and determine the single zone of a memory block -- if any -- for early memory blocks during boot. For memory onlining, we know the single zone already. Let's avoid any additional memmap scanning and just rely on the zone information available during boot. For memory hot(un)plug, we only really care about memory blocks that: * span a single zone (and, thereby, a single node) * are completely System RAM (IOW, no holes, no ZONE_DEVICE) If one of these conditions is not met, we reject memory offlining. Hotplugged memory blocks (starting out offline), always meet both conditions. There are three scenarios to handle: (1) Memory hot(un)plug A memory block with zone == NULL cannot be offlined, corresponding to our previous test_pages_in_a_zone() check. After successful memory onlining/offlining, we simply set the zone accordingly. * Memory onlining: set the zone we just used for onlining * Memory offlining: set zone = NULL So a hotplugged memory block starts with zone = NULL. Once memory onlining is done, we set the proper zone. 
(2) Boot memory with !CONFIG_NUMA We know that there is just a single pgdat, so we simply scan all zones of that pgdat for an intersection with our memory block PFN range when adding the memory block. If more than one zone intersects (e.g., DMA and DMA32 on x86 for the first memory block) we set zone = NULL and consequently mimic what test_pages_in_a_zone() used to do. (3) Boot memory with CONFIG_NUMA At the point in time we create the memory block devices during boot, we don't know yet which nodes *actually* span a memory block. While we could scan all zones of all nodes for intersections, overlapping nodes complicate the situation and scanning all nodes is possibly expensive. But that problem has already been solved by the code that sets the node of a memory block and creates the link in the sysfs -- do_register_memory_block_under_node(). So, we hook into the code that sets the node id for a memory block. If we already have a different node id set for the memory block, we know that multiple nodes *actually* have PFNs falling into our memory block: we set zone = NULL and consequently mimic what test_pages_in_a_zone() used to do. If there is no node id set, we do the same as (2) for the given node. Note that the call order in driver_init() is: -> memory_dev_init(): create memory block devices -> node_dev_init(): link memory block devices to the node and set the node id So in summary, we detect if there is a single zone responsible for this memory block and we consequently store the zone in that case in the memory block, updating it during memory onlining/offlining. Link: https://lkml.kernel.org/r/20220210184359.235565-3-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Rafael Parra Cc: "Rafael J. 
Wysocki" Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Oscar Salvador Cc: Rafael Parra Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/memory.c | 101 +++++++++++++++++++++++++++++++-- drivers/base/node.c | 13 ++--- include/linux/memory.h | 12 ++++ include/linux/memory_hotplug.h | 6 +- mm/memory_hotplug.c | 50 ++++------------ 5 files changed, 125 insertions(+), 57 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 60c38f9cf1a75..5297c8a84428d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -215,6 +215,7 @@ static int memory_block_online(struct memory_block *mem) adjust_present_page_count(pfn_to_page(start_pfn), mem->group, nr_vmemmap_pages); + mem->zone = zone; return ret; } @@ -225,6 +226,9 @@ static int memory_block_offline(struct memory_block *mem) unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; int ret; + if (!mem->zone) + return -EINVAL; + /* * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). @@ -234,7 +238,7 @@ static int memory_block_offline(struct memory_block *mem) -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages, mem->group); + nr_pages - nr_vmemmap_pages, mem->zone, mem->group); if (ret) { /* offline_pages() failed. Account back. */ if (nr_vmemmap_pages) @@ -246,6 +250,7 @@ static int memory_block_offline(struct memory_block *mem) if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + mem->zone = NULL; return ret; } @@ -411,11 +416,10 @@ static ssize_t valid_zones_show(struct device *dev, */ if (mem->state == MEM_ONLINE) { /* - * The block contains more than one zone can not be offlined. - * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + * If !mem->zone, the memory block spans multiple zones and + * cannot get offlined. 
*/ - default_zone = test_pages_in_a_zone(start_pfn, - start_pfn + nr_pages); + default_zone = mem->zone; if (!default_zone) return sysfs_emit(buf, "%s\n", "none"); len += sysfs_emit_at(buf, len, "%s", default_zone->name); @@ -641,6 +645,82 @@ int register_memory(struct memory_block *memory) return ret; } +static struct zone *early_node_zone_for_memory_block(struct memory_block *mem, + int nid) +{ + const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + struct zone *zone, *matching_zone = NULL; + pg_data_t *pgdat = NODE_DATA(nid); + int i; + + /* + * This logic only works for early memory, when the applicable zones + * already span the memory block. We don't expect overlapping zones on + * a single node for early memory. So if we're told that some PFNs + * of a node fall into this memory block, we can assume that all node + * zones that intersect with the memory block are actually applicable. + * No need to look at the memmap. + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + if (!zone_intersects(zone, start_pfn, nr_pages)) + continue; + if (!matching_zone) { + matching_zone = zone; + continue; + } + /* Spans multiple zones ... */ + matching_zone = NULL; + break; + } + return matching_zone; +} + +#ifdef CONFIG_NUMA +/** + * memory_block_add_nid() - Indicate that system RAM falling into this memory + * block device (partially) belongs to the given node. + * @mem: The memory block device. + * @nid: The node id. + * @context: The memory initialization context. + * + * Indicate that system RAM falling into this memory block (partially) belongs + * to the given node. If the context indicates ("early") that we are adding the + * node during node device subsystem initialization, this will also properly + * set/adjust mem->zone based on the zone ranges of the given node. 
+ */ +void memory_block_add_nid(struct memory_block *mem, int nid, + enum meminit_context context) +{ + if (context == MEMINIT_EARLY && mem->nid != nid) { + /* + * For early memory we have to determine the zone when setting + * the node id and handle multiple nodes spanning a single + * memory block by indicating via zone == NULL that we're not + * dealing with a single zone. So if we're setting the node id + * the first time, determine if there is a single zone. If we're + * setting the node id a second time to a different node, + * invalidate the single detected zone. + */ + if (mem->nid == NUMA_NO_NODE) + mem->zone = early_node_zone_for_memory_block(mem, nid); + else + mem->zone = NULL; + } + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. If we span multiple nodes (not applicable + * to hotplugged memory), zone == NULL will prohibit memory offlining + * and consequently unplug. + */ + mem->nid = nid; +} +#endif + static int init_memory_block(unsigned long block_id, unsigned long state, unsigned long nr_vmemmap_pages, struct memory_group *group) @@ -663,6 +743,17 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->nr_vmemmap_pages = nr_vmemmap_pages; INIT_LIST_HEAD(&mem->group_next); +#ifndef CONFIG_NUMA + if (state == MEM_ONLINE) + /* + * MEM_ONLINE at this point implies early memory. With NUMA, + * we'll determine the zone when setting the node id via + * memory_block_add_nid(). Memory hotplug updates the zone + * manually when memory onlining/offlining succeeds.
+ */ + mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE); +#endif /* CONFIG_NUMA */ + ret = register_memory(mem); if (ret) return ret; diff --git a/drivers/base/node.c b/drivers/base/node.c index 5d75341413ce3..ec8bb24a5a227 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -796,15 +796,12 @@ static int __ref get_nid_for_pfn(unsigned long pfn) } static void do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk) + struct memory_block *mem_blk, + enum meminit_context context) { int ret; - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. - */ - mem_blk->nid = nid; + memory_block_add_nid(mem_blk, nid, context); ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, @@ -857,7 +854,7 @@ static int register_mem_block_under_node_early(struct memory_block *mem_blk, if (page_nid != nid) continue; - do_register_memory_block_under_node(nid, mem_blk); + do_register_memory_block_under_node(nid, mem_blk, MEMINIT_EARLY); return 0; } /* mem section does not span the specified node */ @@ -873,7 +870,7 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, { int nid = *(int *)arg; - do_register_memory_block_under_node(nid, mem_blk); + do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG); return 0; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 88eb587b51438..aa619464a1df0 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -70,6 +70,13 @@ struct memory_block { unsigned long state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ + /* + * The single zone of this memory block if all PFNs of this memory block + * that are System RAM (not a memory hole, not ZONE_DEVICE ranges) are + * managed by a single zone. NULL if multiple zones (including nodes) + * apply. 
+ */ + struct zone *zone; struct device dev; /* * Number of vmemmap pages. These pages @@ -161,6 +168,11 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, }) #define register_hotmemory_notifier(nb) register_memory_notifier(nb) #define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) + +#ifdef CONFIG_NUMA +void memory_block_add_nid(struct memory_block *mem, int nid, + enum meminit_context context); +#endif /* CONFIG_NUMA */ #endif /* CONFIG_MEMORY_HOTPLUG */ /* diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 76bf2de86defc..1ce6f8044f1eb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -163,8 +163,6 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); -extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, - unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn); @@ -293,7 +291,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, - struct memory_group *group); + struct zone *zone, struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -302,7 +300,7 @@ extern int offline_and_remove_memory(u64 start, u64 size); static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, - struct memory_group *group) + struct zone *zone, struct memory_group *group) { return -EINVAL; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 
ed1a5dac67978..aee69281dad68 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1548,38 +1548,6 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) } #ifdef CONFIG_MEMORY_HOTREMOVE -/* - * Confirm all pages in a range [start, end) belong to the same zone (skipping - * memory holes). When true, return the zone. - */ -struct zone *test_pages_in_a_zone(unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long pfn, sec_end_pfn; - struct zone *zone = NULL; - struct page *page; - - for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); - pfn < end_pfn; - pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { - /* Make sure the memory section is present first */ - if (!present_section_nr(pfn_to_section_nr(pfn))) - continue; - for (; pfn < sec_end_pfn && pfn < end_pfn; - pfn += MAX_ORDER_NR_PAGES) { - /* Check if we got outside of the zone */ - if (zone && !zone_spans_pfn(zone, pfn)) - return NULL; - page = pfn_to_page(pfn); - if (zone && page_zone(page) != zone) - return NULL; - zone = page_zone(page); - } - } - - return zone; -} - /* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, * non-lru movable pages and hugepages). Will skip over most unmovable @@ -1803,15 +1771,15 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, } int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, - struct memory_group *group) + struct zone *zone, struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; + const int node = zone_to_nid(zone); unsigned long flags; - struct zone *zone; struct memory_notify arg; - int ret, node; char *reason; + int ret; /* * {on,off}lining is constrained to full memory sections (or more @@ -1843,15 +1811,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, goto failed_removal; } - /* This makes hotplug much easier...and readable. - we assume this for now. 
.*/ - zone = test_pages_in_a_zone(start_pfn, end_pfn); - if (!zone) { + /* + * We only support offlining of memory blocks managed by a single zone, + * checked by calling code. This is just a sanity check that we might + * want to remove in the future. + */ + if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || + page_zone(pfn_to_page(end_pfn - 1)) != zone)) { ret = -EINVAL; reason = "multizone range"; goto failed_removal; } - node = zone_to_nid(zone); /* * Disable pcplists so that page isolation cannot race with freeing From bac309ca0eeea11408e30b5a80432e52d8cd332a Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Wed, 16 Feb 2022 15:31:33 +1100 Subject: [PATCH 223/334] mm/rmap: convert from atomic_t to refcount_t on anon_vma->refcount refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Link: https://lkml.kernel.org/r/1626665029-49104-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Cc: Alistair Popple Cc: Yang Shi Cc: Shakeel Butt Cc: Hugh Dickins Cc: Xiyu Yang Cc: Miaohe Lin Cc: Cc: Xin Tan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/rmap.h | 8 +++++--- mm/rmap.c | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index ac29b076082b7..73cce292d32c0 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -12,6 +12,8 @@ #include #include +#include + /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: @@ -36,7 +38,7 @@ struct anon_vma { * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ - atomic_t refcount; + refcount_t refcount; /* * Count of child anon_vmas and VMAs which points to this anon_vma. 
@@ -100,14 +102,14 @@ enum ttu_flags { #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { - atomic_inc(&anon_vma->refcount); + refcount_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { - if (atomic_dec_and_test(&anon_vma->refcount)) + if (refcount_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 5df685da578d3..bf323a515d5f7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -91,7 +91,7 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { - atomic_set(&anon_vma->refcount, 1); + refcount_set(&anon_vma->refcount, 1); anon_vma->degree = 1; /* Reference for first vma */ anon_vma->parent = anon_vma; /* @@ -106,7 +106,7 @@ static inline struct anon_vma *anon_vma_alloc(void) static inline void anon_vma_free(struct anon_vma *anon_vma) { - VM_BUG_ON(atomic_read(&anon_vma->refcount)); + VM_BUG_ON(refcount_read(&anon_vma->refcount)); /* * Synchronize against page_lock_anon_vma_read() such that @@ -448,7 +448,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); - atomic_set(&anon_vma->refcount, 0); + refcount_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } @@ -498,7 +498,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -557,7 +557,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } /* trylock failed, we got to sleep */ - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -572,7 +572,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) 
rcu_read_unlock(); anon_vma_lock_read(anon_vma); - if (atomic_dec_and_test(&anon_vma->refcount)) { + if (refcount_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because @@ -2210,7 +2210,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); - if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + if (root != anon_vma && refcount_dec_and_test(&root->refcount)) anon_vma_free(root); } From 2b4df9588038ada6ad855fec8eaae7a591f55f8d Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Wed, 16 Feb 2022 15:31:33 +1100 Subject: [PATCH 224/334] mm/zswap.c: allow handling just same-value filled pages Zswap has an ability to efficiently store same-value filled pages, which can be turned on and off using the "same_filled_pages_enabled" parameter. However, there is currently no way to enable just this (lightweight) functionality, while not making use of the whole compressed page storage machinery. Add a "non_same_filled_pages_enabled" parameter which allows disabling handling of pages that aren't same-value filled. This way zswap can be run in such lightweight same-value filled pages only mode. Link: https://lkml.kernel.org/r/7dbafa963e8bab43608189abbe2067f4b9287831.1641247624.git.maciej.szmigiero@oracle.com Signed-off-by: Maciej S. 
Szmigiero Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/mm/zswap.rst | 22 +++++++++++++++++++--- mm/zswap.c | 15 ++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 8edb8d578caf7..6e6f7b0d6562b 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -130,9 +130,25 @@ attribute, e.g.:: echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled When zswap same-filled page identification is disabled at runtime, it will stop -checking for the same-value filled pages during store operation. However, the -existing pages which are marked as same-value filled pages remain stored -unchanged in zswap until they are either loaded or invalidated. +checking for the same-value filled pages during store operation. +In other words, every page will be then considered non-same-value filled. +However, the existing pages which are marked as same-value filled pages remain +stored unchanged in zswap until they are either loaded or invalidated. + +In some circumstances it might be advantageous to make use of just the zswap +ability to efficiently store same-filled pages without enabling the whole +compressed page storage. +In this case the handling of non-same-value pages by zswap (enabled by default) +can be disabled by setting the ``non_same_filled_pages_enabled`` attribute +to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``. +It can also be enabled and disabled at runtime using the sysfs +``non_same_filled_pages_enabled`` attribute, e.g.:: + + echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled + +Disabling both ``zswap.same_filled_pages_enabled`` and +``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new +pages by zswap. 
To prevent zswap from shrinking pool when zswap is full and there's a high pressure on swap (this will result in flipping pages in and out zswap pool diff --git a/mm/zswap.c b/mm/zswap.c index cdf6950fcb2e3..3efd8cae315e7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -120,11 +120,19 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* Enable/disable handling same-value filled pages (enabled by default) */ +/* + * Enable/disable handling same-value filled pages (enabled by default). + * If disabled every page is considered non-same-value filled. + */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); +/* Enable/disable handling non-same-value filled pages (enabled by default) */ +static bool zswap_non_same_filled_pages_enabled = true; +module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -1147,6 +1155,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, kunmap_atomic(src); } + if (!zswap_non_same_filled_pages_enabled) { + ret = -EINVAL; + goto freepage; + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { From cfe5a0139effcc15c6a9c4ab190863ee9773d8df Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 16 Feb 2022 15:31:33 +1100 Subject: [PATCH 225/334] mm: remove usercopy_warn() Users of usercopy_warn() were removed by commit 53944f171a89 ("mm: remove HARDENED_USERCOPY_FALLBACK") Remove it. 
Link: https://lkml.kernel.org/r/5f26643fc70b05f8455b60b99c30c17d635fa640.1644231910.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Miaohe Lin Reviewed-by: Stephen Kitt Reviewed-by: Muchun Song Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/uaccess.h | 2 -- mm/usercopy.c | 11 ----------- 2 files changed, 13 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ac0394087f7d4..bca27b4e5eb2d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -401,8 +401,6 @@ static inline void user_access_restore(unsigned long flags) { } #endif #ifdef CONFIG_HARDENED_USERCOPY -void usercopy_warn(const char *name, const char *detail, bool to_user, - unsigned long offset, unsigned long len); void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); diff --git a/mm/usercopy.c b/mm/usercopy.c index d0d268135d96d..e7b0cb49daa1b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -70,17 +70,6 @@ static noinline int check_stack_object(const void *obj, unsigned long len) * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ -void usercopy_warn(const char *name, const char *detail, bool to_user, - unsigned long offset, unsigned long len) -{ - WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", - to_user ? "exposure" : "overwrite", - to_user ? "from" : "to", - name ? : "unknown?!", - detail ? " '" : "", detail ? : "", detail ? 
"'" : "", - offset, len); -} - void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) From 94ceef3bc9d0b5b4d2ed625868ea76bcc642d277 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 16 Feb 2022 15:31:34 +1100 Subject: [PATCH 226/334] mm: uninline copy_overflow() While building a small config with CONFIG_CC_OPTIMISE_FOR_SIZE, I ended up with more than 50 times the following function in vmlinux because GCC doesn't honor the 'inline' keyword: c00243bc : c00243bc: 94 21 ff f0 stwu r1,-16(r1) c00243c0: 7c 85 23 78 mr r5,r4 c00243c4: 7c 64 1b 78 mr r4,r3 c00243c8: 3c 60 c0 62 lis r3,-16286 c00243cc: 7c 08 02 a6 mflr r0 c00243d0: 38 63 5e e5 addi r3,r3,24293 c00243d4: 90 01 00 14 stw r0,20(r1) c00243d8: 4b ff 82 45 bl c001c61c <__warn_printk> c00243dc: 0f e0 00 00 twui r0,0 c00243e0: 80 01 00 14 lwz r0,20(r1) c00243e4: 38 21 00 10 addi r1,r1,16 c00243e8: 7c 08 03 a6 mtlr r0 c00243ec: 4e 80 00 20 blr With -Winline, GCC tells: /include/linux/thread_info.h:212:20: warning: inlining failed in call to 'copy_overflow': call is unlikely and code size would grow [-Winline] copy_overflow() is a non conditional warning called by check_copy_size() on an error path. check_copy_size() have to remain inlined in order to benefit from constant folding, but copy_overflow() is not worth inlining. Uninline the warning when CONFIG_BUG is selected. When CONFIG_BUG is not selected, WARN() does nothing so skip it. This reduces the size of vmlinux by almost 4kbytes. 
Link: https://lkml.kernel.org/r/e1723b9cfa924bcefcd41f69d0025b38e4c9364e.1644819985.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: David Laight Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/thread_info.h | 5 ++++- mm/maccess.c | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 73a6f34b3847a..9f392ec76f2bb 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -209,9 +209,12 @@ __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); +void __copy_overflow(int size, unsigned long count); + static inline void copy_overflow(int size, unsigned long count) { - WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); + if (IS_ENABLED(CONFIG_BUG)) + __copy_overflow(size, count); } static __always_inline __must_check bool diff --git a/mm/maccess.c b/mm/maccess.c index d3f1a1f0b1c1a..3fed2b876539d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -335,3 +335,9 @@ long strnlen_user_nofault(const void __user *unsafe_addr, long count) return ret; } + +void __copy_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} +EXPORT_SYMBOL(__copy_overflow); From a0b8dfeb52577c3c5bab27bb4e9622fd31acf084 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 16 Feb 2022 15:31:34 +1100 Subject: [PATCH 227/334] highmem: document kunmap_local() Some users of kmap() add an offset to the kmap() address to be used during the mapping. When converting to kmap_local_page() the base address does not need to be stored because any address within the page can be used in kunmap_local(). However, this was not clear from the documentation and caused some questions.[1] Document that any address in the page can be used in kunmap_local() to clarify this for future users.
[1] https://lore.kernel.org/lkml/20211213154543.GM3538886@iweiny-DESK2.sc.intel.com/ Link: https://lkml.kernel.org/r/20220124013045.806718-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem-internal.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 0a0b2b09b1b8d..fb2d3e033c013 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -246,6 +246,17 @@ do { \ __kunmap_atomic(__addr); \ } while (0) +/** + * kunmap_local - Unmap a page mapped via kmap_local_page(). + * @__addr: An address within the page mapped + * + * __addr is often an address returned from kmap_local_page(). However, + * this address can be any address within the mapped page. It does not need to + * be the exact address returned from kmap_local_page() + * + * Unmapping should be done in the reverse order of the mapping. See + * kmap_local_page() for details. + */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ From e593ffdc2b667e851d7af9f02f78eccf0d0fbe65 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 16 Feb 2022 15:31:34 +1100 Subject: [PATCH 228/334] highmem-document-kunmap_local-v2 updates per Christoph Link: https://lkml.kernel.org/r/20220124182138.816693-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/highmem-internal.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index fb2d3e033c013..a77be56302094 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -250,9 +250,8 @@ do { \ * kunmap_local - Unmap a page mapped via kmap_local_page(). 
* @__addr: An address within the page mapped * - * __addr is often an address returned from kmap_local_page(). However, - * this address can be any address within the mapped page. It does not need to - * be the exact address returned from kmap_local_page() + * @__addr can be any address within the mapped page. Commonly it is the + * address return from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. From 76b3778250cd54ebd9a04741f3079fbaa744a5af Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:34 +1100 Subject: [PATCH 229/334] mm/highmem: remove unnecessary done label Remove unnecessary done label to simplify the code. Link: https://lkml.kernel.org/r/20220126092542.64659-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/highmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 762679050c9a0..0cc0c4da7ed9f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -736,11 +736,11 @@ void *page_address(const struct page *page) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + spin_unlock_irqrestore(&pas->lock, flags); return ret; } @@ -773,13 +773,12 @@ void set_page_address(struct page *page, void *virtual) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - goto done; + break; } } spin_unlock_irqrestore(&pas->lock, flags); } -done: + return; } From 89c322af9a3743c838f70bd34bd70654bcf3cfd1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 16 Feb 2022 15:31:34 +1100 Subject: [PATCH 230/334] mm/hmm.c: remove unneeded local variable ret The local variable ret is always 0. 
Remove it to make the code tighter. Link: https://lkml.kernel.org/r/20220125124833.39718-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/hmm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index bd56641c79d4e..af71aac3140e4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -417,7 +417,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned long addr = start; pud_t pud; - int ret = 0; spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); if (!ptl) @@ -466,7 +465,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, out_unlock: spin_unlock(ptl); - return ret; + return 0; } #else #define hmm_vma_walk_pud NULL From c0cf454718a535acb2327fb838a0cfda62fbf0a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: [PATCH 231/334] mm: remove a pointless CONFIG_ZONE_DEVICE check in memremap_pages Patch series "start sorting out the ZONE_DEVICE refcount mess", v2. This series removes the offset by one refcount for ZONE_DEVICE pages that are freed back to the driver owning them, which is just device private ones for now, but also the planned device coherent pages and the enhanced p2p ones pending. It does not address the fsdax pages yet, which will be attacked in a follow on series. This patch (of 27): memremap.c is only built when CONFIG_ZONE_DEVICE is set, so remove the superfluous extra check.
Link: https://lkml.kernel.org/r/20220210072828.2930359-1-hch@lst.de Link: https://lkml.kernel.org/r/20220210072828.2930359-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Reviewed-by: Dan Williams Reviewed-by: Miaohe Lin Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Felix Kuehling Cc: Alex Deucher Cc: Christian König Cc: "Pan, Xinhui" Cc: Ben Skeggs Cc: Karol Herbst Cc: Lyude Paul Cc: Alistair Popple Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memremap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index d9e05952fff66..6c5c9a10970e6 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -349,8 +349,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_FS_DAX: - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { WARN(1, "File system DAX not supported\n"); return ERR_PTR(-EINVAL); } From d2a157420c4f575ba76daa11538a87230557545c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: [PATCH 232/334] mm: remove the __KERNEL__ guard from <linux/mm.h> __KERNEL__ ifdefs don't make sense outside of include/uapi/.
Link: https://lkml.kernel.org/r/20220210072828.2930359-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian König Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ac15f116e95b1..fe0e9e73fe0a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3,9 +3,6 @@ #define _LINUX_MM_H #include - -#ifdef __KERNEL__ - #include #include #include @@ -3372,5 +3369,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif -#endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ From 2f72efd5a5d1f7ec689b773bef685b48446db90a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: [PATCH 233/334] mm: remove pointless includes from <linux/hmm.h> hmm.h pulls in the world for no good reason at all. Remove the includes and push a few ones into the users instead.
Link: https://lkml.kernel.org/r/20220210072828.2930359-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 + drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + include/linux/hmm.h | 9 ++------- lib/test_hmm.c | 2 ++ 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index ed5385137f483..cb835f95a76e6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "amdgpu_sync.h" #include "amdgpu_object.h" #include "amdgpu_vm.h" diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 3828aafd3ac46..e886a3b9e08c7 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -39,6 +39,7 @@ #include #include +#include /* * FIXME: this is ugly right now we are using TTM to allocate vram and we pin diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2fd2e91d5107c..d5a6f101f843e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -9,14 +9,9 @@ #ifndef LINUX_HMM_H #define LINUX_HMM_H -#include -#include +#include -#include -#include -#include -#include -#include +struct mmu_interval_notifier; /* * On output: diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 767538089a62e..396beee6b061d 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "test_hmm_uapi.h" From 
9e3c681d0096e2c9437a57f7ad8f58ccd469e4ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: [PATCH 234/334] mm: move free_devmap_managed_page to memremap.c free_devmap_managed_page has nothing to do with the code in swap.c, move it to live with the rest of the code for devmap handling. Link: https://lkml.kernel.org/r/20220210072828.2930359-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian Knig Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 1 - mm/memremap.c | 21 +++++++++++++++++++++ mm/swap.c | 23 ----------------------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fe0e9e73fe0a0..c5907d52f3404 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1087,7 +1087,6 @@ static inline bool is_zone_movable_page(const struct page *page) } #ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool page_is_devmap_managed(struct page *page) diff --git a/mm/memremap.c b/mm/memremap.c index 6c5c9a10970e6..49ce9ac52f97f 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -522,4 +522,25 @@ void free_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } + +void put_devmap_managed_page(struct page *page) +{ + int count; + + if (WARN_ON_ONCE(!page_is_devmap_managed(page))) + return; + + count = page_ref_dec_return(page); + + /* + * devmap page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable 
because nobody holds a reference on the page. + */ + if (count == 1) + free_devmap_managed_page(page); + else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index 842d5cd92cf64..e499df864ef78 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1154,26 +1154,3 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void put_devmap_managed_page(struct page *page) -{ - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. - */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); -} -EXPORT_SYMBOL(put_devmap_managed_page); -#endif From fcabdff100a691474e25b001f26fb22a48bb4761 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: [PATCH 235/334] mm: simplify freeing of devmap managed pages Make put_devmap_managed_page return if it took charge of the page or not and remove the separate page_is_devmap_managed helper. 
Link: https://lkml.kernel.org/r/20220210072828.2930359-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian Knig Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 34 ++++++++++------------------------ mm/memremap.c | 20 +++++++++----------- mm/swap.c | 10 +--------- 3 files changed, 20 insertions(+), 44 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c5907d52f3404..7c78b6eedefe2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1089,33 +1089,24 @@ static inline bool is_zone_movable_page(const struct page *page) #ifdef CONFIG_DEV_PAGEMAP_OPS DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool page_is_devmap_managed(struct page *page) +bool __put_devmap_managed_page(struct page *page); +static inline bool put_devmap_managed_page(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; - switch (page->pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_FS_DAX: - return true; - default: - break; - } - return false; + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + return __put_devmap_managed_page(page); } -void put_devmap_managed_page(struct page *page); - #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool page_is_devmap_managed(struct page *page) +static inline bool put_devmap_managed_page(struct page *page) { return false; } - -static inline void put_devmap_managed_page(struct page *page) -{ -} #endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) @@ 
-1215,16 +1206,11 @@ static inline void put_page(struct page *page) struct folio *folio = page_folio(page); /* - * For devmap managed pages we need to catch refcount transition from - * 2 to 1, when refcount reach one it means the page is free and we - * need to inform the device driver through callback. See - * include/linux/memremap.h and HMM for details. + * For some devmap managed pages we need to catch refcount transition + * from 2 to 1: */ - if (page_is_devmap_managed(&folio->page)) { - put_devmap_managed_page(&folio->page); + if (put_devmap_managed_page(&folio->page)) return; - } - folio_put(folio); } diff --git a/mm/memremap.c b/mm/memremap.c index 49ce9ac52f97f..c5df70271ae6e 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -523,24 +523,22 @@ void free_devmap_managed_page(struct page *page) page->pgmap->ops->page_free(page); } -void put_devmap_managed_page(struct page *page) +bool __put_devmap_managed_page(struct page *page) { - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - /* * devmap page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - if (count == 1) + switch (page_ref_dec_return(page)) { + case 1: free_devmap_managed_page(page); - else if (!count) + break; + case 0: __put_page(page); + break; + } + return true; } -EXPORT_SYMBOL(put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index e499df864ef78..db8d0eea13d72 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -930,16 +930,8 @@ void release_pages(struct page **pages, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - /* - * ZONE_DEVICE pages that return 'false' from - * page_is_devmap_managed() do not require special - * processing, and instead, expect a call to - * put_page_testzero(). 
- */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (put_devmap_managed_page(page)) continue; - } if (put_page_testzero(page)) put_dev_pagemap(page->pgmap); continue; From aca251725252ec16ca25a6db6e6138cabc7e504e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: [PATCH 236/334] mm: don't include <linux/memremap.h> in <linux/mm.h> Move the check for the actual pgmap types that need the free at refcount one behavior into the out of line helper, and thus avoid the need to pull memremap.h into mm.h. Link: https://lkml.kernel.org/r/20220210072828.2930359-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/mm/mmu.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/drm_cache.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + drivers/gpu/drm/nouveau/nouveau_svm.c | 1 + drivers/infiniband/core/rw.c | 1 + drivers/nvdimm/pmem.h | 1 + drivers/nvme/host/pci.c | 1 + drivers/nvme/target/io-cmd-bdev.c | 1 + fs/fuse/virtio_fs.c | 1 + include/linux/memremap.h | 18 ++++++++++++++++++ include/linux/mm.h | 20 -------------------- lib/test_hmm.c | 1 + mm/memcontrol.c | 1 + mm/memremap.c | 6 +++++- 15 files changed, 35 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acfae9b41cc8c..580abae6c0b93 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ea68f3b3a4e9c..6d643b4b791d8 100644 ---
a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index f19d9acbe9593..50b8a088f763a 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -27,11 +27,11 @@ /* * Authors: Thomas Hellström */ - #include #include #include #include +#include #include #include diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index e886a3b9e08c7..a5cdfbe32b5e5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -39,6 +39,7 @@ #include #include +#include #include /* diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 266809e511e2c..090b9b47708cc 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -35,6 +35,7 @@ #include #include #include +#include #include struct nouveau_svm { diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 5a3bd41b331c9..4d98f931a13dd 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2016 HGST, a Western Digital Company. 
*/ +#include #include #include #include diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a85..1f51a23614299 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -3,6 +3,7 @@ #define __NVDIMM_PMEM_H__ #include #include +#include #include #include #include diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6a99ed6809158..ab15bc72710db 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 70ca9dfc1771a..a141446db1bea 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include "nvmet.h" diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 9d737904d07c0..86b7dbb6a0d43 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/memremap.h b/include/linux/memremap.h index eea1b5cf25716..844409e83d0f7 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ + +#include #include #include #include @@ -129,6 +131,22 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } +static inline bool is_device_private_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; +} + +static inline bool is_pci_p2pdma_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_PCI_P2PDMA) && + is_zone_device_page(page) && + page->pgmap->type == 
MEMORY_DEVICE_PCI_P2PDMA; +} + #ifdef CONFIG_ZONE_DEVICE bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c78b6eedefe2..e1bb29c4b15be 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -1096,9 +1095,6 @@ static inline bool put_devmap_managed_page(struct page *page) return false; if (!is_zone_device_page(page)) return false; - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) - return false; return __put_devmap_managed_page(page); } @@ -1109,22 +1105,6 @@ static inline bool put_devmap_managed_page(struct page *page) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool is_device_private_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool is_pci_p2pdma_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; -} - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 396beee6b061d..e5fc14ba71f33 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 760be043c3bfb..331c72a0a57ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memremap.c b/mm/memremap.c index c5df70271ae6e..3218d0acd2b85 100644 --- a/mm/memremap.c +++ 
b/mm/memremap.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -525,6 +525,10 @@ void free_devmap_managed_page(struct page *page) bool __put_devmap_managed_page(struct page *page) { + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + /* * devmap page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is From 597dfdbec94a7a68cf272c3ae4905824338362ee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: [PATCH 237/334] mm-dont-include-linux-memremaph-in-linux-mmh-fix fix fs/proc/page.c fs/proc/page.c: In function 'stable_page_flags': fs/proc/page.c:120:13: error: implicit declaration of function 'pfn_zone_device_reserved' [-Werror=implicit-function-declaration] 120 | if (pfn_zone_device_reserved(page_to_pfn(page))) | ^~~~~~~~~~~~~~~~~~~~~~~~ Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Christoph Hellwig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Cc: "Sierra Guiza, Alejandro (Alex)" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/page.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/page.c b/fs/proc/page.c index 4dcbcd506cb6e..f7d65b89c3128 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include From 93e92c5b2daca8c6e0c0b1e24731250afcfb623c Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: [PATCH 238/334] fix for "mm: don't include <linux/memremap.h> in <linux/mm.h>" fix arch/powerpc/kvm/book3s_hv_uvmem.c Link: https://lkml.kernel.org/r/20220214180040.44f8316a@canb.auug.org.au Signed-off-by: Stephen Rothwell Cc: Christoph Hellwig Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e414ca44839fd..881951604227a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include From ae16148321e5c1db4971bfca05af3d89734eb17c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: [PATCH 239/334] mm: remove the extra ZONE_DEVICE struct page refcount ZONE_DEVICE struct pages have an extra reference count that complicates the code for put_page() and several places in the kernel that need to check the reference count to see that a page is not being used (gup, compaction, migration, etc.). Clean up the code so the reference count doesn't need to be treated specially for ZONE_DEVICE pages. Note that this excludes the special idle page wakeup for fsdax pages, which still happens at refcount 1. This is a separate issue and will be sorted out later. Given that only fsdax pages require the notification when the refcount hits 1 now, the PAGEMAP_OPS Kconfig symbol can go away and be replaced with a FS_DAX check for this hook in the put_page fastpath. Based on an earlier patch from Ralph Campbell.
Link: https://lkml.kernel.org/r/20220210072828.2930359-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 - drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 - fs/Kconfig | 1 - include/linux/memremap.h | 12 +++-- include/linux/mm.h | 6 +-- lib/test_hmm.c | 1 - mm/Kconfig | 4 -- mm/internal.h | 2 + mm/memcontrol.c | 11 ++--- mm/memremap.c | 57 ++++++++---------------- mm/migrate.c | 6 --- mm/swap.c | 16 ++----- 13 files changed, 36 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 881951604227a..8cabdb39cbbca 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -713,7 +713,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - get_page(dpage); lock_page(dpage); return dpage; out_clear: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index cb835f95a76e6..e27ca37587623 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -225,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - get_page(page); lock_page(page); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a5cdfbe32b5e5..7ba66ad68a8a1 100644 --- 
a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - get_page(page); lock_page(page); return page; } diff --git a/fs/Kconfig b/fs/Kconfig index 6c7dc1387beb0..e9433bbc48010 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,7 +48,6 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 844409e83d0f7..e2b1d2f08380a 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -68,9 +68,9 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never - * reach 0 refcount unless there is a refcount bug. This allows the - * device driver to implement its own memory management.) + * Called once the page refcount reaches 0. The reference count will be + * reset to one by the core code after the method is called to prepare + * for handing out the page again. 
*/ void (*page_free)(struct page *page); @@ -133,16 +133,14 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) static inline bool is_device_private_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && + return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } diff --git a/include/linux/mm.h b/include/linux/mm.h index e1bb29c4b15be..49692a64d6454 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1085,7 +1085,7 @@ static inline bool is_zone_movable_page(const struct page *page) return page_zonenum(page) == ZONE_MOVABLE; } -#ifdef CONFIG_DEV_PAGEMAP_OPS +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_page(struct page *page); @@ -1098,12 +1098,12 @@ static inline bool put_devmap_managed_page(struct page *page) return __put_devmap_managed_page(page); } -#else /* CONFIG_DEV_PAGEMAP_OPS */ +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page(struct page *page) { return false; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e5fc14ba71f33..cfe6320478391 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -566,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) } dpage->zone_device_data = rpage; - get_page(dpage); lock_page(dpage); return dpage; diff --git a/mm/Kconfig b/mm/Kconfig index 
67998bd3352e2..26d9f5dd316c1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -785,9 +785,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config DEV_PAGEMAP_OPS - bool - # # Helpers to mirror range of the CPU page tables of a process into device page # tables. @@ -799,7 +796,6 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE - select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device diff --git a/mm/internal.h b/mm/internal.h index 9a5674bd0a742..bbea49756ef7f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -719,4 +719,6 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +void free_zone_device_page(struct page *page); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 331c72a0a57ca..c1bc5d18d8eab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5498,17 +5498,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to - * a device and because they are not accessible by CPU they are store - * as special swap entry in the CPU page table. + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. 
*/ if (is_device_private_entry(ent)) { page = pfn_swap_entry_to_page(ent); - /* - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have - * a refcount of 1 when free (unlike normal page) - */ - if (!page_ref_add_unless(page, 1, 1)) + if (!get_page_unless_zero(page)) return NULL; return page; } diff --git a/mm/memremap.c b/mm/memremap.c index 3218d0acd2b85..f7ad74f5026bb 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" static DEFINE_XARRAY(pgmap_array); @@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_DEV_PAGEMAP_OPS +#ifdef CONFIG_FS_DAX DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } #else @@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap) static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ static void pgmap_array_delete(struct range *range) { @@ -102,13 +101,6 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) -{ - if (pfn % (1024 << pgmap->vmemmap_shift)) - cond_resched(); - return pfn + pgmap_vmemmap_nr(pgmap); -} - static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { return (pfn_end(pgmap, range_id) - @@ -135,10 +127,6 @@ 
bool pfn_zone_device_reserved(unsigned long pfn) return ret; } -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ - pfn = pfn_next(map, pfn)) - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@ -167,13 +155,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) void memunmap_pages(struct dev_pagemap *pgmap) { - unsigned long pfn; int i; percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) - for_each_device_pfn(pfn, pgmap, i) - put_page(pfn_to_page(pfn)); + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); wait_for_completion(&pgmap->done); percpu_ref_exit(&pgmap->ref); @@ -485,14 +471,10 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page) +void free_zone_device_page(struct page *page) { - /* notify page idle for dax */ - if (!is_device_private_page(page)) { - wake_up_var(&page->_refcount); + if (WARN_ON_ONCE(!is_device_private_page(page))) return; - } __ClearPageWaiters(page); @@ -521,28 +503,27 @@ void free_devmap_managed_page(struct page *page) */ page->mapping = NULL; page->pgmap->ops->page_free(page); + + /* + * Reset the page count to 1 to prepare for handing out the page again. + */ + set_page_count(page, 1); } +#ifdef CONFIG_FS_DAX bool __put_devmap_managed_page(struct page *page) { - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* - * devmap page refcounts are 1-based, rather than 0-based: if + * fsdax page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. 
*/ - switch (page_ref_dec_return(page)) { - case 1: - free_devmap_managed_page(page); - break; - case 0: - __put_page(page); - break; - } + if (page_ref_dec_return(page) == 1) + wake_up_var(&page->_refcount); return true; } EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index 97048d4e1270e..42be56500d629 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -341,14 +341,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; - /* - * Device private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); - return expected_count; } diff --git a/mm/swap.c b/mm/swap.c index db8d0eea13d72..fc3b7989f5b20 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -122,17 +122,9 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { - if (is_zone_device_page(page)) { - put_dev_pagemap(page->pgmap); - - /* - * The page belongs to the device that created pgmap. Do - * not return it to page allocator. 
- */ - return; - } - - if (unlikely(PageCompound(page))) + if (unlikely(is_zone_device_page(page))) + free_zone_device_page(page); + else if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); @@ -933,7 +925,7 @@ void release_pages(struct page **pages, int nr) if (put_devmap_managed_page(page)) continue; if (put_page_testzero(page)) - put_dev_pagemap(page->pgmap); + free_zone_device_page(page); continue; } From 0d0719350df9c7884a955cc756897880a9229df1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: [PATCH 240/334] fsdax: depend on ZONE_DEVICE || FS_DAX_LIMITED Add a depends on ZONE_DEVICE support or the s390-specific limited DAX support, as one of the two is required at runtime for fsdax code to actually work. Link: https://lkml.kernel.org/r/20220210072828.2930359-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/Kconfig b/fs/Kconfig index e9433bbc48010..7f2455e8e18ae 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,6 +48,7 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) + depends on ZONE_DEVICE || FS_DAX_LIMITED select FS_IOMAP select DAX help From b7c61e8210fc7830a5ab5114adc4b24402b3ef44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: [PATCH 241/334] mm: generalize the pgmap based page_free infrastructure Key off on the existence of ->page_free to prepare for adding support for more pgmap types that are device 
managed and thus need the free callback. Link: https://lkml.kernel.org/r/20220210072828.2930359-10-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index f7ad74f5026bb..77922404b0bc4 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -473,7 +473,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); void free_zone_device_page(struct page *page) { - if (WARN_ON_ONCE(!is_device_private_page(page))) + if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) return; __ClearPageWaiters(page); @@ -481,7 +481,7 @@ void free_zone_device_page(struct page *page) mem_cgroup_uncharge(page_folio(page)); /* - * When a device_private page is freed, the page->mapping field + * When a device managed page is freed, the page->mapping field * may still contain a (stale) mapping value. For example, the * lower bits of page->mapping may still identify the page as an * anonymous page. Ultimately, this entire field is just stale From 6d20b64684fa5ee0422c145c2173b4e9e62f047e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: [PATCH 242/334] mm: refactor check_and_migrate_movable_pages Remove up to two levels of indentation by using continue statements and move variables to local scope where possible. 
Link: https://lkml.kernel.org/r/20220210072828.2930359-11-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 81 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 45f828be6b1d1..c8ac8792d6e36 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1844,72 +1844,79 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - unsigned long i; - unsigned long isolation_error_count = 0; - bool drain_allow = true; - LIST_HEAD(movable_page_list); - long ret = 0; + unsigned long isolation_error_count = 0, i; struct page *prev_head = NULL; - struct page *head; - struct migration_target_control mtc = { - .nid = NUMA_NO_NODE, - .gfp_mask = GFP_USER | __GFP_NOWARN, - }; + LIST_HEAD(movable_page_list); + bool drain_allow = true; + int ret = 0; for (i = 0; i < nr_pages; i++) { - head = compound_head(pages[i]); + struct page *head = compound_head(pages[i]); + if (head == prev_head) continue; prev_head = head; + + if (is_pinnable_page(head)) + continue; + /* - * If we get a movable page, since we are going to be pinning - * these entries, try to move them out if possible. + * Try to move out any movable page before pinning the range. 
*/ - if (!is_pinnable_page(head)) { - if (PageHuge(head)) { - if (!isolate_huge_page(head, &movable_page_list)) - isolation_error_count++; - } else { - if (!PageLRU(head) && drain_allow) { - lru_add_drain_all(); - drain_allow = false; - } + if (PageHuge(head)) { + if (!isolate_huge_page(head, &movable_page_list)) + isolation_error_count++; + continue; + } - if (isolate_lru_page(head)) { - isolation_error_count++; - continue; - } - list_add_tail(&head->lru, &movable_page_list); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + - page_is_file_lru(head), - thp_nr_pages(head)); - } + if (!PageLRU(head) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (isolate_lru_page(head)) { + isolation_error_count++; + continue; } + list_add_tail(&head->lru, &movable_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_lru(head), + thp_nr_pages(head)); } + if (!list_empty(&movable_page_list) || isolation_error_count) + goto unpin_pages; + /* * If list is empty, and no isolation errors, means that all pages are * in the correct zone. */ - if (list_empty(&movable_page_list) && !isolation_error_count) - return nr_pages; + return nr_pages; +unpin_pages: if (gup_flags & FOLL_PIN) { unpin_user_pages(pages, nr_pages); } else { for (i = 0; i < nr_pages; i++) put_page(pages[i]); } + if (!list_empty(&movable_page_list)) { + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_NOWARN, + }; + ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL); - if (ret && !list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); + if (ret > 0) /* number of pages not migrated */ + ret = -ENOMEM; } - return ret > 0 ? 
-ENOMEM : ret; + if (ret && !list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); + return ret; } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, From 43028c7c3fdcf1543b3503ecfb32c93f0c4f5a4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: [PATCH 243/334] mm: refactor the ZONE_DEVICE handling in migrate_vma_insert_page Make the flow a little more clear and prepare for adding a new ZONE_DEVICE memory type. Link: https://lkml.kernel.org/r/20220210072828.2930359-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 42be56500d629..9764608aec10b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2704,26 +2704,25 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page)) { - if (is_device_private_page(page)) { - swp_entry_t swp_entry; + if (is_device_private_page(page)) { + swp_entry_t swp_entry; - if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); - else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); - entry = swp_entry_to_pte(swp_entry); - } else { - /* - * For now we only support migrating to un-addressable - * device memory. 
- */ + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable device + * memory. + */ + if (is_zone_device_page(page)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } - } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); From 4661d743191746cfe3ede8202655e9fc34201863 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:37 +1100 Subject: [PATCH 244/334] mm: refactor the ZONE_DEVICE handling in migrate_vma_pages Make the flow a little more clear and prepare for adding a new ZONE_DEVICE memory type. Link: https://lkml.kernel.org/r/20220210072828.2930359-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 9764608aec10b..5efe7875bd829 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2822,24 +2822,21 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); - if (is_zone_device_page(newpage)) { - if (is_device_private_page(newpage)) { - /* - * For now only support private anonymous when - * migrating to un-addressable device memory. 
- */ - if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - } else { - /* - * Other types of ZONE_DEVICE page are not - * supported. - */ + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when migrating + * to un-addressable device memory. + */ + if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } + } else if (is_zone_device_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; } r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); From 71c03bd68bdc455144c605ee7c24649d9802b899 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:38 +1100 Subject: [PATCH 245/334] mm: move the migrate_vma_* device migration code into its own file Split the code used to migrate to and from ZONE_DEVICE memory from migrate.c into a new file. Link: https://lkml.kernel.org/r/20220210072828.2930359-14-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 3 + mm/Makefile | 1 + mm/migrate.c | 753 ------------------------------------------- mm/migrate_device.c | 765 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 769 insertions(+), 753 deletions(-) create mode 100644 mm/migrate_device.c diff --git a/mm/Kconfig b/mm/Kconfig index 26d9f5dd316c1..f77bb9b25056f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -249,6 +249,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. 
+config DEVICE_MIGRATION + def_bool MIGRATION && DEVICE_PRIVATE + config ARCH_ENABLE_HUGEPAGE_MIGRATION bool diff --git a/mm/Makefile b/mm/Makefile index 70d4309c9ce33..4cc13f3179a51 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o diff --git a/mm/migrate.c b/mm/migrate.c index 5efe7875bd829..88b59f9f8d29b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,12 +38,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -2159,757 +2157,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ -#ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_skip(unsigned long start, - unsigned long end, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = 0; - } - - return 0; -} - -static int migrate_vma_collect_hole(unsigned long start, - unsigned long end, - __always_unused int depth, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - /* Only allow populating anonymous memory. 
*/ - if (!vma_is_anonymous(walk->vma)) - return migrate_vma_collect_skip(start, end, walk); - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; - } - - return 0; -} - -static int migrate_vma_collect_pmd(pmd_t *pmdp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - struct vm_area_struct *vma = walk->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start, unmapped = 0; - spinlock_t *ptl; - pte_t *ptep; - -again: - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, walk); - - if (pmd_trans_huge(*pmdp)) { - struct page *page; - - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { - spin_unlock(ptl); - goto again; - } - - page = pmd_page(*pmdp); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - if (pmd_trans_unstable(pmdp)) - return migrate_vma_collect_skip(start, end, - walk); - } else { - int ret; - - get_page(page); - spin_unlock(ptl); - if (unlikely(!trylock_page(page))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, - walk); - } - } - - if (unlikely(pmd_bad(*pmdp))) - return migrate_vma_collect_skip(start, end, walk); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - arch_enter_lazy_mmu_mode(); - - for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn = 0, pfn; - struct page *page; - swp_entry_t entry; - pte_t pte; - - pte = *ptep; - - if (pte_none(pte)) { - if (vma_is_anonymous(vma)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - } - goto next; - } - - if (!pte_present(pte)) { - /* - * Only care about unaddressable device page 
special - * page table entry. Other special swap entries are not - * migratable, and we ignore regular swapped page. - */ - entry = pte_to_swp_entry(pte); - if (!is_device_private_entry(entry)) - goto next; - - page = pfn_swap_entry_to_page(entry); - if (!(migrate->flags & - MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || - page->pgmap->owner != migrate->pgmap_owner) - goto next; - - mpfn = migrate_pfn(page_to_pfn(page)) | - MIGRATE_PFN_MIGRATE; - if (is_writable_device_private_entry(entry)) - mpfn |= MIGRATE_PFN_WRITE; - } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; - pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - goto next; - } - page = vm_normal_page(migrate->vma, addr, pte); - mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; - } - - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = 0; - goto next; - } - - /* - * By getting a reference on the page we pin it and that blocks - * any kind of migration. Side effect is that it "freezes" the - * pte. - * - * We drop this reference after isolating the page from the lru - * for non device page (device page are not on the lru and thus - * can't be dropped from it). - */ - get_page(page); - - /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. 
- */ - if (trylock_page(page)) { - pte_t swp_pte; - - migrate->cpages++; - ptep_get_and_clear(mm, addr, ptep); - - /* Setup special migration page table entry */ - if (mpfn & MIGRATE_PFN_WRITE) - entry = make_writable_migration_entry( - page_to_pfn(page)); - else - entry = make_readable_migration_entry( - page_to_pfn(page)); - swp_pte = swp_entry_to_pte(entry); - if (pte_present(pte)) { - if (pte_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - } else { - if (pte_swp_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_swp_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - } - set_pte_at(mm, addr, ptep, swp_pte); - - /* - * This is like regular unmap: we remove the rmap and - * drop page refcount. Page won't be freed, as we took - * a reference just above. - */ - page_remove_rmap(page, vma, false); - put_page(page); - - if (pte_present(pte)) - unmapped++; - } else { - put_page(page); - mpfn = 0; - } - -next: - migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = mpfn; - } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); - - /* Only flush the TLB if we actually modified any entries */ - if (unmapped) - flush_tlb_range(walk->vma, start, end); - - return 0; -} - -static const struct mm_walk_ops migrate_vma_walk_ops = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, -}; - -/* - * migrate_vma_collect() - collect pages over a range of virtual addresses - * @migrate: migrate struct containing all migration information - * - * This will walk the CPU page table. For each virtual address backed by a - * valid page, it updates the src array and takes a reference on the page, in - * order to pin the page until we lock it and unmap it. 
- */ -static void migrate_vma_collect(struct migrate_vma *migrate) -{ - struct mmu_notifier_range range; - - /* - * Note that the pgmap_owner is passed to the mmu notifier callback so - * that the registered device driver can skip invalidating device - * private page mappings that won't be migrated. - */ - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, - migrate->pgmap_owner); - mmu_notifier_invalidate_range_start(&range); - - walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, - &migrate_vma_walk_ops, migrate); - - mmu_notifier_invalidate_range_end(&range); - migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); -} - -/* - * migrate_vma_check_page() - check if page is pinned or not - * @page: struct page to check - * - * Pinned pages cannot be migrated. This is the same test as in - * folio_migrate_mapping(), except that here we allow migration of a - * ZONE_DEVICE page. - */ -static bool migrate_vma_check_page(struct page *page) -{ - /* - * One extra ref because caller holds an extra reference, either from - * isolate_lru_page() for a regular page, or migrate_vma_collect() for - * a device page. - */ - int extra = 1; - - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). 
- */ - if (PageCompound(page)) - return false; - - /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) - extra++; - - /* For file back page */ - if (page_mapping(page)) - extra += 1 + page_has_private(page); - - if ((page_count(page) - extra) > page_mapcount(page)) - return false; - - return true; -} - -/* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. - */ -static void migrate_vma_unmap(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - unsigned long i, restore = 0; - bool allow_drain = true; - - lru_add_drain(); - - for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page) - continue; - - /* ZONE_DEVICE pages are not on LRU */ - if (!is_zone_device_page(page)) { - if (!PageLRU(page) && allow_drain) { - /* Drain CPU's pagevec */ - lru_add_drain_all(); - allow_drain = false; - } - - if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - continue; - } - - /* Drop the reference we took in collect */ - put_page(page); - } - - if (page_mapped(page)) - try_to_migrate(page, 0); - - if (page_mapped(page) || !migrate_vma_check_page(page)) { - if (!is_zone_device_page(page)) { - get_page(page); - putback_lru_page(page); - } - - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - continue; - } - } - - for (i = 0; i < npages && restore; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page || 
(migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; - - remove_migration_ptes(page, page, false); - - migrate->src[i] = 0; - unlock_page(page); - put_page(page); - restore--; - } -} - -/** - * migrate_vma_setup() - prepare to migrate a range of memory - * @args: contains the vma, start, and pfns arrays for the migration - * - * Returns: negative errno on failures, 0 when 0 or more pages were migrated - * without an error. - * - * Prepare to migrate a range of memory virtual address range by collecting all - * the pages backing each virtual address in the range, saving them inside the - * src array. Then lock those pages and unmap them. Once the pages are locked - * and unmapped, check whether each page is pinned or not. Pages that aren't - * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the - * corresponding src array entry. Then restores any pages that are pinned, by - * remapping and unlocking those pages. - * - * The caller should then allocate destination memory and copy source memory to - * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE - * flag set). Once these are allocated and copied, the caller must update each - * corresponding entry in the dst array with the pfn value of the destination - * page and with MIGRATE_PFN_VALID. Destination pages must be locked via - * lock_page(). - * - * Note that the caller does not have to migrate all the pages that are marked - * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from - * device memory to system memory. If the caller cannot migrate a device page - * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe - * consequences for the userspace process, so it must be avoided if at all - * possible. 
- * - * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing the caller to allocate device memory for those unbacked virtual - * addresses. For this the caller simply has to allocate device memory and - * properly set the destination entry like for regular migration. Note that - * this can still fail, and thus inside the device driver you must check if the - * migration was successful for those entries after calling migrate_vma_pages(), - * just like for regular migration. - * - * After that, the callers must call migrate_vma_pages() to go over each entry - * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, - * then migrate_vma_pages() to migrate struct page information from the source - * struct page to the destination struct page. If it fails to migrate the - * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the - * src array. - * - * At this point all successfully migrated pages have an entry in the src - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst - * array entry with MIGRATE_PFN_VALID flag set. - * - * Once migrate_vma_pages() returns the caller may inspect which pages were - * successfully migrated, and which were not. Successfully migrated pages will - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. - * - * It is safe to update device page table after migrate_vma_pages() because - * both destination and source page are still locked, and the mmap_lock is held - * in read mode (hence no one can unmap the range being migrated). 
- * - * Once the caller is done cleaning up things and updating its page table (if it - * chose to do so, this is not an obligation) it finally calls - * migrate_vma_finalize() to update the CPU page table to point to new pages - * for successfully migrated pages or otherwise restore the CPU page table to - * point to the original source pages. - */ -int migrate_vma_setup(struct migrate_vma *args) -{ - long nr_pages = (args->end - args->start) >> PAGE_SHIFT; - - args->start &= PAGE_MASK; - args->end &= PAGE_MASK; - if (!args->vma || is_vm_hugetlb_page(args->vma) || - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) - return -EINVAL; - if (nr_pages <= 0) - return -EINVAL; - if (args->start < args->vma->vm_start || - args->start >= args->vma->vm_end) - return -EINVAL; - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) - return -EINVAL; - if (!args->src || !args->dst) - return -EINVAL; - - memset(args->src, 0, sizeof(*args->src) * nr_pages); - args->cpages = 0; - args->npages = 0; - - migrate_vma_collect(args); - - if (args->cpages) - migrate_vma_unmap(args); - - /* - * At this point pages are locked and unmapped, and thus they have - * stable content and can safely be copied to destination memory that - * is allocated by the drivers. - */ - return 0; - -} -EXPORT_SYMBOL(migrate_vma_setup); - -/* - * This code closely matches the code in: - * __handle_mm_fault() - * handle_pte_fault() - * do_anonymous_page() - * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. 
- */ -static void migrate_vma_insert_page(struct migrate_vma *migrate, - unsigned long addr, - struct page *page, - unsigned long *src) -{ - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - bool flush = false; - spinlock_t *ptl; - pte_t entry; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - /* Only allow populating anonymous memory */ - if (!vma_is_anonymous(vma)) - goto abort; - - pgdp = pgd_offset(mm, addr); - p4dp = p4d_alloc(mm, pgdp, addr); - if (!p4dp) - goto abort; - pudp = pud_alloc(mm, p4dp, addr); - if (!pudp) - goto abort; - pmdp = pmd_alloc(mm, pudp, addr); - if (!pmdp) - goto abort; - - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) - goto abort; - - /* - * Use pte_alloc() instead of pte_alloc_map(). We can't run - * pte_offset_map() on pmds where a huge pmd might be created - * from a different thread. - * - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when - * parallel threads are excluded by other means. - * - * Here we only have mmap_read_lock(mm). - */ - if (pte_alloc(mm, pmdp)) - goto abort; - - /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(pmdp))) - goto abort; - - if (unlikely(anon_vma_prepare(vma))) - goto abort; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) - goto abort; - - /* - * The memory barrier inside __SetPageUptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. - */ - __SetPageUptodate(page); - - if (is_device_private_page(page)) { - swp_entry_t swp_entry; - - if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); - else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); - entry = swp_entry_to_pte(swp_entry); - } else { - /* - * For now we only support migrating to un-addressable device - * memory. 
- */ - if (is_zone_device_page(page)) { - pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); - goto abort; - } - entry = mk_pte(page, vma->vm_page_prot); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - } - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - - if (check_stable_address_space(mm)) - goto unlock_abort; - - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); - - if (!is_zero_pfn(pfn)) - goto unlock_abort; - flush = true; - } else if (!pte_none(*ptep)) - goto unlock_abort; - - /* - * Check for userfaultfd but do not deliver the fault. Instead, - * just back off. - */ - if (userfaultfd_missing(vma)) - goto unlock_abort; - - inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); - if (!is_zone_device_page(page)) - lru_cache_add_inactive_or_unevictable(page, vma); - get_page(page); - - if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } - - pte_unmap_unlock(ptep, ptl); - *src = MIGRATE_PFN_MIGRATE; - return; - -unlock_abort: - pte_unmap_unlock(ptep, ptl); -abort: - *src &= ~MIGRATE_PFN_MIGRATE; -} - -/** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information - * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. 
- */ -void migrate_vma_pages(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - struct mmu_notifier_range range; - unsigned long addr, i; - bool notified = false; - - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); - struct address_space *mapping; - int r; - - if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - - if (!page) { - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; - if (!notified) { - notified = true; - - mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, - migrate->vma->vm_mm, addr, migrate->end, - migrate->pgmap_owner); - mmu_notifier_invalidate_range_start(&range); - } - migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); - continue; - } - - mapping = page_mapping(page); - - if (is_device_private_page(newpage)) { - /* - * For now only support private anonymous when migrating - * to un-addressable device memory. - */ - if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - } else if (is_zone_device_page(newpage)) { - /* - * Other types of ZONE_DEVICE page are not supported. - */ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - - r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); - if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - } - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() - * did already call it. 
- */ - if (notified) - mmu_notifier_invalidate_range_only_end(&range); -} -EXPORT_SYMBOL(migrate_vma_pages); - -/** - * migrate_vma_finalize() - restore CPU page table entry - * @migrate: migrate struct containing all migration information - * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. - * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. - */ -void migrate_vma_finalize(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - unsigned long i; - - for (i = 0; i < npages; i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); - } - continue; - } - - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); - } - newpage = page; - } - - remove_migration_ptes(page, newpage, false); - unlock_page(page); - - if (is_zone_device_page(page)) - put_page(page); - else - putback_lru_page(page); - - if (newpage != page) { - unlock_page(newpage); - if (is_zone_device_page(newpage)) - put_page(newpage); - else - putback_lru_page(newpage); - } - } -} -EXPORT_SYMBOL(migrate_vma_finalize); -#endif /* CONFIG_DEVICE_PRIVATE */ - /* * node_demotion[] example: * diff --git a/mm/migrate_device.c b/mm/migrate_device.c new file mode 100644 index 0000000000000..a65d58d0b3bc2 --- /dev/null +++ b/mm/migrate_device.c @@ -0,0 +1,765 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device Memory Migration functionality. + * + * Originally written by Jérôme Glisse. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + __always_unused int depth, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + 
put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn = 0, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + + if (pte_none(pte)) { + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } + goto next; + } + + if (!pte_present(pte)) { + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = pfn_swap_entry_to_page(entry); + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + page->pgmap->owner != migrate->pgmap_owner) + goto next; + + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; + if (is_writable_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + pfn = pte_pfn(pte); + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + goto next; + } + page = vm_normal_page(migrate->vma, addr, pte); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). 
+ */ + get_page(page); + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. + */ + if (trylock_page(page)) { + pte_t swp_pte; + + migrate->cpages++; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); + swp_pte = swp_entry_to_pte(entry); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. + */ + page_remove_rmap(page, vma, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } else { + put_page(page); + mpfn = 0; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; +} + +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. 
For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mmu_notifier_range range; + + /* + * Note that the pgmap_owner is passed to the mmu notifier callback so + * that the registered device driver can skip invalidating device + * private page mappings that won't be migrated. + */ + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * folio_migrate_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). 
+ */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) + extra++; + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) + continue; + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + if (page_mapped(page)) + try_to_migrate(page, 0); + + if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + } + + for (i = 0; i < npages && restore; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || 
(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + put_page(page); + restore--; + } +} + +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via + * lock_page(). + * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. 
+ * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_lock is held + * in read mode (hence no one can unmap the range being migrated). 
+ * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. + */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. 
+ */ +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when + * parallel threads are excluded by other means. + * + * Here we only have mmap_read_lock(mm). + */ + if (pte_alloc(mm, pmdp)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable device + * memory. 
+ */ + if (is_zone_device_page(page)) { + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; + } + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (check_stable_address_space(mm)) + goto unlock_abort; + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) + goto unlock_abort; + flush = true; + } else if (!pte_none(*ptep)) + goto unlock_abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + if (!is_zone_device_page(page)) + lru_cache_add_inactive_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +unlock_abort: + pte_unmap_unlock(ptep, ptl); +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +/** + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. 
+ */ +void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct mmu_notifier_range range; + unsigned long addr, i; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + if (!notified) { + notified = true; + + mmu_notifier_range_init_owner(&range, + MMU_NOTIFY_MIGRATE, 0, migrate->vma, + migrate->vma->vm_mm, addr, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when migrating + * to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (is_zone_device_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. 
+ */ + if (notified) + mmu_notifier_invalidate_range_only_end(&range); +} +EXPORT_SYMBOL(migrate_vma_pages); + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} +EXPORT_SYMBOL(migrate_vma_finalize); From 024ca44dc862cb1293b27d049303eab58c8bf9fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:38 +1100 Subject: [PATCH 246/334] mm: include in migrate_device.c Fixup the split of migrate.c by adding a missing include in the new file. 
Link: https://lkml.kernel.org/r/20220214072429.3302759-1-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a65d58d0b3bc2..0326b901d2fdf 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" static int migrate_vma_collect_skip(unsigned long start, From 2b5180fef629b5d66260235f0b71acfcfed9e6c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:38 +1100 Subject: [PATCH 247/334] mm: build migrate_vma_* for all configs with ZONE_DEVICE support This code will be used for device coherent memory as well in a bit, so relax the ifdef a bit. Link: https://lkml.kernel.org/r/20220210072828.2930359-15-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index f77bb9b25056f..c313bad5167a3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -250,7 +250,7 @@ config MIGRATION allocation instead of reclaiming. config DEVICE_MIGRATION - def_bool MIGRATION && DEVICE_PRIVATE + def_bool MIGRATION && ZONE_DEVICE config ARCH_ENABLE_HUGEPAGE_MIGRATION bool From b78bc7a9d062ac543a6db879432bd9533e1df49b Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:38 +1100 Subject: [PATCH 248/334] mm: add zone device coherent type memory support Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI or CXL). Any page of a process can be migrated to such memory. However, no one should be allowed to pin such memory so that it can always be evicted. [hch@lst.de: rebased ontop of the refcount changes, removed is_dev_private_or_coherent_page] Link: https://lkml.kernel.org/r/20220210072828.2930359-16-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memremap.h | 14 ++++++++++++++ mm/memcontrol.c | 7 ++++--- mm/memory-failure.c | 8 ++++++-- mm/memremap.c | 10 ++++++++++ mm/migrate_device.c | 16 +++++++--------- mm/rmap.c | 5 +++-- 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e2b1d2f08380a..c1713cd8bfba3 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -41,6 +41,13 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. * + * MEMORY_DEVICE_COHERENT: + * Device memory that is cache coherent from device and CPU point of view. This + * is used on platforms that have an advanced system bus (like CAPI or CXL). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allowed to pin such memory so that it can always be evicted. + * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. 
In support of coordinating page @@ -61,6 +68,7 @@ struct vmem_altmap { enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, + MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, @@ -138,6 +146,12 @@ static inline bool is_device_private_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PRIVATE; } +static inline bool is_device_coherent_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_COHERENT; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1bc5d18d8eab..3c4816147273a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5681,8 +5681,8 @@ static int mem_cgroup_move_account(struct page *page, * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. 
@@ -5716,7 +5716,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f092013c7f5df..3e404b06efdc2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1607,12 +1607,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto unlock; } - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: /* - * TODO: Handle HMM pages which may need coordination + * TODO: Handle device pages which may need coordination * with device-side memory. */ goto unlock; + default: + break; } /* diff --git a/mm/memremap.c b/mm/memremap.c index 77922404b0bc4..2e9148a3421ae 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -334,6 +334,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; + case MEMORY_DEVICE_COHERENT: + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } + break; case MEMORY_DEVICE_FS_DAX: if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { WARN(1, "File system DAX not supported\n"); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 0326b901d2fdf..5ed4d25f22777 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -495,7 +495,7 @@ EXPORT_SYMBOL(migrate_vma_setup); * handle_pte_fault() * do_anonymous_page() * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * private or coherent page. 
*/ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, @@ -571,11 +571,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { - /* - * For now we only support migrating to un-addressable device - * memory. - */ - if (is_zone_device_page(page)) { + if (is_zone_device_page(page) && + !is_device_coherent_page(page)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } @@ -678,10 +675,11 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); - if (is_device_private_page(newpage)) { + if (is_device_private_page(newpage) || + is_device_coherent_page(newpage)) { /* - * For now only support private anonymous when migrating - * to un-addressable device memory. + * For now only support anonymous memory migrating to + * device private or coherent memory. */ if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; diff --git a/mm/rmap.c b/mm/rmap.c index bf323a515d5f7..a13487385820b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1852,7 +1852,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { unsigned long pfn = page_to_pfn(page); swp_entry_t entry; pte_t swp_pte; @@ -1999,7 +1999,8 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) TTU_SYNC))) return; - if (is_zone_device_page(page) && !is_device_private_page(page)) + if (is_zone_device_page(page) && + (!is_device_private_page(page) && !is_device_coherent_page(page))) return; /* From de4b9c36129ce660704cd4d30d417142295bf4fa Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:38 +1100 Subject: [PATCH 249/334] mm: add device coherent vma selection for memory migration This case is used to migrate pages from device memory, back to system memory. 
Device coherent type memory is cache coherent from device and CPU point of view. Link: https://lkml.kernel.org/r/20220210072828.2930359-17-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Reviewed-by: Alistair Popple Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/migrate.h | 1 + mm/migrate_device.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index db96e10eb8da2..66a34eae8cb63 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -130,6 +130,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, + MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, }; struct migrate_vma { diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5ed4d25f22777..f27486b501fee 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -148,15 +148,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { + if (is_zero_pfn(pfn) && + (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; goto next; } page = vm_normal_page(migrate->vma, addr, pte); + if (page && !is_zone_device_page(page) && + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + else if (page && is_device_coherent_page(page) && + (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + page->pgmap->owner !=
migrate->pgmap_owner)) + goto next; mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } From 74645abb2ba95beef27c9f4e4f57679381f6bd3d Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:39 +1100 Subject: [PATCH 250/334] mm/gup: fail get_user_pages for LONGTERM dev coherent type Avoid long term pinning for Coherent device type pages. This could interfere with their own device memory manager. For now, we are just returning error for PIN_LONGTERM Coherent device type pages. Eventually, these types of pages will get migrated to system memory, once the device migration pages support is added. [hch@lst.de: rebased on previous cleanups, split the two checks] Link: https://lkml.kernel.org/r/20220210072828.2930359-18-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index c8ac8792d6e36..d0845d97cb857 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1857,6 +1857,19 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_head = head; + /* + * Device private pages will get faulted in during gup so it + * shouldn't be possible to see one here.
+ */ + if (WARN_ON_ONCE(is_device_private_page(head))) { + ret = -EFAULT; + goto unpin_pages; + } + if (is_device_coherent_page(head)) { + ret = -EFAULT; + goto unpin_pages; + } + if (is_pinnable_page(head)) continue; @@ -1901,7 +1914,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, put_page(pages[i]); } - if (!list_empty(&movable_page_list)) { + if (!ret && !list_empty(&movable_page_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, From 56372b1e8890ff408e5654e3d8f71eeb0f29e3d8 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:39 +1100 Subject: [PATCH 251/334] drm/amdkfd: add SPM support for SVM When CPU is connected through XGMI, it has coherent access to VRAM resource. In this case that resource is taken from a table in the device gmc aperture base. This resource is used along with the device type, which could be DEVICE_PRIVATE or DEVICE_COHERENT to create the device page map region. Link: https://lkml.kernel.org/r/20220210072828.2930359-19-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Reviewed-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 28 ++++++++++++++---------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index e27ca37587623..2c51f2ac3b46a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -933,7 +933,7 @@ int svm_migrate_init(struct amdgpu_device *adev) { struct kfd_dev *kfddev = adev->kfd.dev; struct
dev_pagemap *pgmap; - struct resource *res; + struct resource *res = NULL; unsigned long size; void *r; @@ -948,28 +948,34 @@ int svm_migrate_init(struct amdgpu_device *adev) * should remove reserved size */ size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20); - res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); - if (IS_ERR(res)) - return -ENOMEM; + if (adev->gmc.xgmi.connected_to_cpu) { + pgmap->range.start = adev->gmc.aper_base; + pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1; + pgmap->type = MEMORY_DEVICE_COHERENT; + } else { + res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); + if (IS_ERR(res)) + return -ENOMEM; + pgmap->range.start = res->start; + pgmap->range.end = res->end; + pgmap->type = MEMORY_DEVICE_PRIVATE; + } - pgmap->type = MEMORY_DEVICE_PRIVATE; pgmap->nr_range = 1; - pgmap->range.start = res->start; - pgmap->range.end = res->end; pgmap->ops = &svm_migrate_pgmap_ops; pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev); - pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; - + pgmap->flags = 0; /* Device manager releases device-specific resources, memory region and * pgmap when driver disconnects from device. */ r = devm_memremap_pages(adev->dev, pgmap); if (IS_ERR(r)) { pr_err("failed to register HMM device memory\n"); - /* Disable SVM support capability */ pgmap->type = 0; - devm_release_mem_region(adev->dev, res->start, resource_size(res)); + if (pgmap->type == MEMORY_DEVICE_PRIVATE) + devm_release_mem_region(adev->dev, res->start, + res->end - res->start + 1); return PTR_ERR(r); } From 18a1da69bb83ab84207b270b06c5a3178ef412b6 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:39 +1100 Subject: [PATCH 252/334] drm/amdkfd: coherent type as sys mem on migration to ram Coherent device type memory on VRAM to RAM migration, has similar access as System RAM from the CPU. This flag sets the source from the sender. 
Which in Coherent type case, should be set as MIGRATE_VMA_SELECT_DEVICE_COHERENT. Link: https://lkml.kernel.org/r/20220210072828.2930359-20-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Reviewed-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 2c51f2ac3b46a..6646291d75d57 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -659,9 +659,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.vma = vma; migrate.start = start; migrate.end = end; - migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev); + if (adev->gmc.xgmi.connected_to_cpu) + migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT; + else + migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; size = 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t); size *= npages; buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO); From 70fff360c14f03b8398aca1295b5239a8118a3d7 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:39 +1100 Subject: [PATCH 253/334] lib: test_hmm add ioctl to get zone device type new ioctl cmd added to query zone device type. This will be used once the test_hmm adds zone device coherent type.
Link: https://lkml.kernel.org/r/20220210072828.2930359-21-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Poppple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_hmm.c | 23 +++++++++++++++++++++-- lib/test_hmm_uapi.h | 8 ++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index cfe6320478391..7a27584484ce0 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -87,6 +87,7 @@ struct dmirror_chunk { struct dmirror_device { struct cdev cdevice; struct hmm_devmem *devmem; + unsigned int zone_device_type; unsigned int devmem_capacity; unsigned int devmem_count; @@ -1026,6 +1027,15 @@ static int dmirror_snapshot(struct dmirror *dmirror, return ret; } +static int dmirror_get_device_type(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + mutex_lock(&dmirror->mutex); + cmd->zone_device_type = dmirror->mdevice->zone_device_type; + mutex_unlock(&dmirror->mutex); + + return 0; +} static long dmirror_fops_unlocked_ioctl(struct file *filp, unsigned int command, unsigned long arg) @@ -1076,6 +1086,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_snapshot(dmirror, &cmd); break; + case HMM_DMIRROR_GET_MEM_DEV_TYPE: + ret = dmirror_get_device_type(dmirror, &cmd); + break; default: return -EINVAL; } @@ -1260,14 +1273,20 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) static int __init hmm_dmirror_init(void) { int ret; - int id; + int id = 0; + int ndevices = 0; ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, "HMM_DMIRROR"); if (ret) goto err_unreg; - for (id = 0; id < DMIRROR_NDEVICES; id++) { + 
memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0])); + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + for (id = 0; id < ndevices; id++) { ret = dmirror_device_init(dmirror_devices + id, id); if (ret) goto err_chrdev; diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index f14dea5dcd062..17f842f1aa02c 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -19,6 +19,7 @@ * @npages: (in) number of pages to read/write * @cpages: (out) number of pages copied * @faults: (out) number of device page faults seen + * @zone_device_type: (out) zone device memory type */ struct hmm_dmirror_cmd { __u64 addr; @@ -26,6 +27,7 @@ struct hmm_dmirror_cmd { __u64 npages; __u64 cpages; __u64 faults; + __u64 zone_device_type; }; /* Expose the address space of the calling process through hmm device file */ @@ -35,6 +37,7 @@ struct hmm_dmirror_cmd { #define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd) #define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) #define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. @@ -62,4 +65,9 @@ enum { HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, }; +enum { + /* 0 is reserved to catch uninitialized type fields */ + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1, +}; + #endif /* _LIB_TEST_HMM_UAPI_H */ From 32f39914b50def63347788105bc074d55043539f Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:39 +1100 Subject: [PATCH 254/334] lib: test_hmm add module param for zone device type In order to configure device coherent in test_hmm, two module parameters should be passed, which correspond to the SP start address of each device (2) spm_addr_dev0 & spm_addr_dev1. 
If no parameters are passed, private device type is configured. Link: https://lkml.kernel.org/r/20220210072828.2930359-22-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Poppple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_hmm.c | 73 ++++++++++++++++++++++++++++++++------------- lib/test_hmm_uapi.h | 1 + 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 7a27584484ce0..15747f70c5bc9 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -37,6 +37,16 @@ #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) #define DEVMEM_CHUNKS_RESERVE 16 +static unsigned long spm_addr_dev0; +module_param(spm_addr_dev0, long, 0644); +MODULE_PARM_DESC(spm_addr_dev0, + "Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE."); + +static unsigned long spm_addr_dev1; +module_param(spm_addr_dev1, long, 0644); +MODULE_PARM_DESC(spm_addr_dev1, + "Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. 
Minimum SPM size should be DEVMEM_CHUNK_SIZE."); + static const struct dev_pagemap_ops dmirror_devmem_ops; static const struct mmu_interval_notifier_ops dmirror_min_ops; static dev_t dmirror_dev; @@ -455,28 +465,44 @@ static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) return ret; } -static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, +static int dmirror_allocate_chunk(struct dmirror_device *mdevice, struct page **ppage) { struct dmirror_chunk *devmem; - struct resource *res; + struct resource *res = NULL; unsigned long pfn; unsigned long pfn_first; unsigned long pfn_last; void *ptr; + int ret = -ENOMEM; devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); if (!devmem) - return false; + return ret; - res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, - "hmm_dmirror"); - if (IS_ERR(res)) + switch (mdevice->zone_device_type) { + case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE: + res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, + "hmm_dmirror"); + if (IS_ERR_OR_NULL(res)) + goto err_devmem; + devmem->pagemap.range.start = res->start; + devmem->pagemap.range.end = res->end; + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + break; + case HMM_DMIRROR_MEMORY_DEVICE_COHERENT: + devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ? 
+ spm_addr_dev0 : + spm_addr_dev1; + devmem->pagemap.range.end = devmem->pagemap.range.start + + DEVMEM_CHUNK_SIZE - 1; + devmem->pagemap.type = MEMORY_DEVICE_COHERENT; + break; + default: + ret = -EINVAL; goto err_devmem; + } - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.range.start = res->start; - devmem->pagemap.range.end = res->end; devmem->pagemap.nr_range = 1; devmem->pagemap.ops = &dmirror_devmem_ops; devmem->pagemap.owner = mdevice; @@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, mdevice->devmem_capacity = new_capacity; mdevice->devmem_chunks = new_chunks; } - ptr = memremap_pages(&devmem->pagemap, numa_node_id()); - if (IS_ERR(ptr)) + if (IS_ERR_OR_NULL(ptr)) { + if (ptr) + ret = PTR_ERR(ptr); + else + ret = -EFAULT; goto err_release; + } devmem->mdevice = mdevice; pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; @@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, } spin_unlock(&mdevice->lock); - return true; + return 0; err_release: mutex_unlock(&mdevice->devmem_lock); - release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); + if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); err_devmem: kfree(devmem); - return false; + return ret; } static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) @@ -562,7 +594,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) spin_unlock(&mdevice->lock); } else { spin_unlock(&mdevice->lock); - if (!dmirror_allocate_chunk(mdevice, &dpage)) + if (dmirror_allocate_chunk(mdevice, &dpage)) goto error; } @@ -1244,10 +1276,8 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) if (ret) return ret; - /* Build a list of free ZONE_DEVICE private struct pages */ - dmirror_allocate_chunk(mdevice, NULL); - - return 0; + /* Build a list of 
free ZONE_DEVICE struct pages */ + return dmirror_allocate_chunk(mdevice, NULL); } static void dmirror_device_remove(struct dmirror_device *mdevice) @@ -1260,8 +1290,9 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) mdevice->devmem_chunks[i]; memunmap_pages(&devmem->pagemap); - release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); + if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); kfree(devmem); } kfree(mdevice->devmem_chunks); diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index 17f842f1aa02c..625f3690d086b 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -68,6 +68,7 @@ enum { enum { /* 0 is reserved to catch uninitialized type fields */ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1, + HMM_DMIRROR_MEMORY_DEVICE_COHERENT, }; #endif /* _LIB_TEST_HMM_UAPI_H */ From fc1a6ce3670c600321f754df5b2cff594b51be17 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:40 +1100 Subject: [PATCH 255/334] lib: add support for device coherent type in test_hmm Device Coherent type uses device memory that is coherently accessible by the CPU. This could be shown as SP (special purpose) memory range at the BIOS-e820 memory enumeration. If no SP memory is supported in the system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP. Currently, test_hmm only supports two different SP ranges of at least 256MB size. This could be specified in the kernel parameter variable efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 & 0x140000000 physical address. Ex. efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000 Private and coherent device mirror instances can be created in the same probe. This is done by passing the module parameters spm_addr_dev0 & spm_addr_dev1. In this case, it will create four instances of device_mirror. The first two correspond to private device type, the last two to coherent type. 
Then, they can be easily accessed from user space through /dev/hmm_mirror. Usually num_device 0 and 1 are for private, and 2 and 3 for coherent types. If no module parameters are passed, two instances of private type device_mirror will be created only. Link: https://lkml.kernel.org/r/20220210072828.2930359-23-hch@lst.de Signed-off-by: Alex Sierra Reviewed-by: Alistair Poppple Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_hmm.c | 253 +++++++++++++++++++++++++++++++++----------- lib/test_hmm_uapi.h | 15 ++- 2 files changed, 202 insertions(+), 66 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 15747f70c5bc9..361a026c5d212 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -32,11 +32,22 @@ #include "test_hmm_uapi.h" -#define DMIRROR_NDEVICES 2 +#define DMIRROR_NDEVICES 4 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) #define DEVMEM_CHUNKS_RESERVE 16 +/* + * For device_private pages, dpage is just a dummy struct page + * representing a piece of device memory. dmirror_devmem_alloc_page + * allocates a real system memory page as backing storage to fake a + * real device. zone_device_data points to that backing page. But + * for device_coherent memory, the struct page represents real + * physical CPU-accessible memory that we can use directly. + */ +#define BACKING_PAGE(page) (is_device_private_page((page)) ? 
\ + (page)->zone_device_data : (page)) + static unsigned long spm_addr_dev0; module_param(spm_addr_dev0, long, 0644); MODULE_PARM_DESC(spm_addr_dev0, @@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce, return 0; } +static bool dmirror_is_private_zone(struct dmirror_device *mdevice) +{ + return (mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false; +} + +static enum migrate_vma_direction +dmirror_select_device(struct dmirror *dmirror) +{ + return (dmirror->mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : + MIGRATE_VMA_SELECT_DEVICE_COHERENT; +} + static void dmirror_bounce_fini(struct dmirror_bounce *bounce) { vfree(bounce->ptr); @@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) { struct page *dpage = NULL; - struct page *rpage; + struct page *rpage = NULL; /* - * This is a fake device so we alloc real system memory to store - * our device memory. + * For ZONE_DEVICE private type, this is a fake device so we allocate + * real system memory to store our device memory. + * For ZONE_DEVICE coherent type we use the actual dpage to store the + * data and ignore rpage. */ - rpage = alloc_page(GFP_HIGHUSER); - if (!rpage) - return NULL; - + if (dmirror_is_private_zone(mdevice)) { + rpage = alloc_page(GFP_HIGHUSER); + if (!rpage) + return NULL; + } spin_lock(&mdevice->lock); if (mdevice->free_pages) { @@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) return dpage; error: - __free_page(rpage); + if (rpage) + __free_page(rpage); return NULL; } @@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, * unallocated pte_none() or read-only zero page. 
*/ spage = migrate_pfn_to_page(*src); + if (WARN(spage && is_zone_device_page(spage), + "page already in device spage pfn: 0x%lx\n", + page_to_pfn(spage))) + continue; dpage = dmirror_devmem_alloc_page(mdevice); if (!dpage) continue; - rpage = dpage->zone_device_data; + rpage = BACKING_PAGE(dpage); if (spage) copy_highpage(rpage, spage); else @@ -648,6 +682,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, */ rpage->zone_device_data = dmirror; + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); *dst = migrate_pfn(page_to_pfn(dpage)); if ((*src & MIGRATE_PFN_WRITE) || (!spage && args->vma->vm_flags & VM_WRITE)) @@ -725,11 +761,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, if (!dpage) continue; - /* - * Store the page that holds the data so the page table - * doesn't have to deal with ZONE_DEVICE private pages. - */ - entry = dpage->zone_device_data; + entry = BACKING_PAGE(dpage); if (*dst & MIGRATE_PFN_WRITE) entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); @@ -809,15 +841,126 @@ static int dmirror_exclusive(struct dmirror *dmirror, return ret; } -static int dmirror_migrate(struct dmirror *dmirror, - struct hmm_dmirror_cmd *cmd) +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, + struct dmirror *dmirror) +{ + const unsigned long *src = args->src; + unsigned long *dst = args->dst; + unsigned long start = args->start; + unsigned long end = args->end; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE, + src++, dst++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(*src); + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) + continue; + + if (WARN_ON(!is_device_private_page(spage) && + !is_device_coherent_page(spage))) + continue; + spage = BACKING_PAGE(spage); + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if 
(!dpage) + continue; + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); + + lock_page(dpage); + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dpage, spage); + *dst = migrate_pfn(page_to_pfn(dpage)); + if (*src & MIGRATE_PFN_WRITE) + *dst |= MIGRATE_PFN_WRITE; + } + return 0; +} + +static unsigned long +dmirror_successful_migrated_pages(struct migrate_vma *migrate) +{ + unsigned long cpages = 0; + unsigned long i; + + for (i = 0; i < migrate->npages; i++) { + if (migrate->src[i] & MIGRATE_PFN_VALID && + migrate->src[i] & MIGRATE_PFN_MIGRATE) + cpages++; + } + return cpages; +} + +static int dmirror_migrate_to_system(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) { unsigned long start, end, addr; unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[64]; - unsigned long dst_pfns[64]; + unsigned long src_pfns[64] = { 0 }; + unsigned long dst_pfns[64] = { 0 }; + struct migrate_vma args; + unsigned long next; + int ret; + + start = cmd->addr; + end = start + size; + if (end < start) + return -EINVAL; + + /* Since the mm is for the mirrored process, get a reference first. 
*/ + if (!mmget_not_zero(mm)) + return -EINVAL; + + cmd->cpages = 0; + mmap_read_lock(mm); + for (addr = start; addr < end; addr = next) { + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_READ)) { + ret = -EINVAL; + goto out; + } + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + if (next > vma->vm_end) + next = vma->vm_end; + + args.vma = vma; + args.src = src_pfns; + args.dst = dst_pfns; + args.start = addr; + args.end = next; + args.pgmap_owner = dmirror->mdevice; + args.flags = dmirror_select_device(dmirror); + + ret = migrate_vma_setup(&args); + if (ret) + goto out; + + pr_debug("Migrating from device mem to sys mem\n"); + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); + + migrate_vma_pages(&args); + cmd->cpages += dmirror_successful_migrated_pages(&args); + migrate_vma_finalize(&args); + } +out: + mmap_read_unlock(mm); + mmput(mm); + + return ret; +} + +static int dmirror_migrate_to_device(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + unsigned long start, end, addr; + unsigned long size = cmd->npages << PAGE_SHIFT; + struct mm_struct *mm = dmirror->notifier.mm; + struct vm_area_struct *vma; + unsigned long src_pfns[64] = { 0 }; + unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; struct migrate_vma args; unsigned long next; @@ -854,6 +997,7 @@ static int dmirror_migrate(struct dmirror *dmirror, if (ret) goto out; + pr_debug("Migrating from sys mem to device mem\n"); dmirror_migrate_alloc_and_copy(&args, dmirror); migrate_vma_pages(&args); dmirror_migrate_finalize_and_map(&args, dmirror); @@ -862,7 +1006,10 @@ static int dmirror_migrate(struct dmirror *dmirror, mmap_read_unlock(mm); mmput(mm); - /* Return the migrated data for verification. */ + /* + * Return the migrated data for verification. 
+ * Only for pages in device zone + */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) return ret; @@ -905,6 +1052,12 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; else *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; + } else if (is_device_coherent_page(page)) { + /* Is the page migrated to this device or some other? */ + if (dmirror->mdevice == dmirror_page_to_device(page)) + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL; + else + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE; } else if (is_zero_pfn(page_to_pfn(page))) *perm = HMM_DMIRROR_PROT_ZERO; else @@ -1101,8 +1254,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_write(dmirror, &cmd); break; - case HMM_DMIRROR_MIGRATE: - ret = dmirror_migrate(dmirror, &cmd); + case HMM_DMIRROR_MIGRATE_TO_DEV: + ret = dmirror_migrate_to_device(dmirror, &cmd); + break; + + case HMM_DMIRROR_MIGRATE_TO_SYS: + ret = dmirror_migrate_to_system(dmirror, &cmd); break; case HMM_DMIRROR_EXCLUSIVE: @@ -1167,14 +1324,13 @@ static const struct file_operations dmirror_fops = { static void dmirror_devmem_free(struct page *page) { - struct page *rpage = page->zone_device_data; + struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; - if (rpage) + if (rpage != page) __free_page(rpage); mdevice = dmirror_page_to_device(page); - spin_lock(&mdevice->lock); mdevice->cfree++; page->zone_device_data = mdevice->free_pages; @@ -1182,43 +1338,11 @@ static void dmirror_devmem_free(struct page *page) spin_unlock(&mdevice->lock); } -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, - struct dmirror *dmirror) -{ - const unsigned long *src = args->src; - unsigned long *dst = args->dst; - unsigned long start = args->start; - unsigned long end = args->end; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { - struct page *dpage, *spage; - - spage = 
migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; - spage = spage->zone_device_data; - - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - - lock_page(dpage); - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); - copy_highpage(dpage, spage); - *dst = migrate_pfn(page_to_pfn(dpage)); - if (*src & MIGRATE_PFN_WRITE) - *dst |= MIGRATE_PFN_WRITE; - } - return 0; -} - static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args; - unsigned long src_pfns; - unsigned long dst_pfns; + unsigned long src_pfns = 0; + unsigned long dst_pfns = 0; struct page *rpage; struct dmirror *dmirror; vm_fault_t ret; @@ -1238,7 +1362,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.src = &src_pfns; args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; - args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + args.flags = dmirror_select_device(dmirror); if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; @@ -1317,6 +1441,12 @@ static int __init hmm_dmirror_init(void) HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; dmirror_devices[ndevices++].zone_device_type = HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + if (spm_addr_dev0 && spm_addr_dev1) { + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + } for (id = 0; id < ndevices; id++) { ret = dmirror_device_init(dmirror_devices + id, id); if (ret) @@ -1339,7 +1469,8 @@ static void __exit hmm_dmirror_exit(void) int id; for (id = 0; id < DMIRROR_NDEVICES; id++) - dmirror_device_remove(dmirror_devices + id); + if (dmirror_devices[id].zone_device_type) + dmirror_device_remove(dmirror_devices + id); unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); } diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index 625f3690d086b..e190b2ab6f199 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -33,11 +33,12 @@ struct 
hmm_dmirror_cmd { /* Expose the address space of the calling process through hmm device file */ #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd) #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. 
@@ -52,6 +53,8 @@ struct hmm_dmirror_cmd { * device the ioctl() is made * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some * other device + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device + * the ioctl() is made */ enum { HMM_DMIRROR_PROT_ERROR = 0xFF, @@ -63,6 +66,8 @@ enum { HMM_DMIRROR_PROT_ZERO = 0x10, HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, + HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40, + HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50, }; enum { From f12426f808b25c658a12ed00dc0733fb350d9716 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:40 +1100 Subject: [PATCH 256/334] tools: update hmm-test to support device coherent type Test cases such as migrate_fault and migrate_multiple were modified to explicitly migrate from device to sys memory without the need of page faults, when using device coherent type. Snapshot test case updated to read memory device type first and based on that, get the proper returned results. migrate_ping_pong test case added to test explicit migration from device to sys memory for both private and coherent zone types. Helpers to migrate from device to sys memory and vice versa were also added. 
Link: https://lkml.kernel.org/r/20220210072828.2930359-24-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/hmm-tests.c | 123 ++++++++++++++++++++----- 1 file changed, 102 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 203323967b507..84ec8c4a1dc7b 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -44,6 +44,14 @@ struct hmm_buffer { int fd; uint64_t cpages; uint64_t faults; + int zone_device_type; +}; + +enum { + HMM_PRIVATE_DEVICE_ONE, + HMM_PRIVATE_DEVICE_TWO, + HMM_COHERENCE_DEVICE_ONE, + HMM_COHERENCE_DEVICE_TWO, }; #define TWOMEG (1 << 21) @@ -60,6 +68,21 @@ FIXTURE(hmm) unsigned int page_shift; }; +FIXTURE_VARIANT(hmm) +{ + int device_number; +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_private) +{ + .device_number = HMM_PRIVATE_DEVICE_ONE, +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) +{ + .device_number = HMM_COHERENCE_DEVICE_ONE, +}; + FIXTURE(hmm2) { int fd0; @@ -68,6 +91,24 @@ FIXTURE(hmm2) unsigned int page_shift; }; +FIXTURE_VARIANT(hmm2) +{ + int device_number0; + int device_number1; +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) +{ + .device_number0 = HMM_PRIVATE_DEVICE_ONE, + .device_number1 = HMM_PRIVATE_DEVICE_TWO, +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) +{ + .device_number0 = HMM_COHERENCE_DEVICE_ONE, + .device_number1 = HMM_COHERENCE_DEVICE_TWO, +}; + static int hmm_open(int unit) { char pathname[HMM_PATH_MAX]; @@ -81,12 +122,19 @@ static int hmm_open(int 
unit) return fd; } +static bool hmm_is_coherent_type(int dev_num) +{ + return (dev_num >= HMM_COHERENCE_DEVICE_ONE); +} + FIXTURE_SETUP(hmm) { self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd = hmm_open(0); + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd, 0); } @@ -95,9 +143,11 @@ FIXTURE_SETUP(hmm2) self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd0 = hmm_open(0); + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(1); + self->fd1 = hmm_open(variant->device_number1); ASSERT_GE(self->fd1, 0); } @@ -144,6 +194,7 @@ static int hmm_dmirror_cmd(int fd, } buffer->cpages = cmd.cpages; buffer->faults = cmd.faults; + buffer->zone_device_type = cmd.zone_device_type; return 0; } @@ -211,6 +262,20 @@ static void hmm_nanosleep(unsigned int n) nanosleep(&t, NULL); } +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + /* * Simple NULL test of device open/close. */ @@ -875,7 +940,7 @@ TEST_F(hmm, migrate) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -923,7 +988,7 @@ TEST_F(hmm, migrate_fault) ptr[i] = i; /* Migrate memory to device. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -936,7 +1001,7 @@ TEST_F(hmm, migrate_fault) ASSERT_EQ(ptr[i], i); /* Migrate memory to the device again. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -976,7 +1041,7 @@ TEST_F(hmm, migrate_shared) ASSERT_NE(buffer->ptr, MAP_FAILED); /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, -ENOENT); hmm_buffer_free(buffer); @@ -1015,7 +1080,7 @@ TEST_F(hmm2, migrate_mixed) p = buffer->ptr; /* Migrating a protected area should be an error. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); ASSERT_EQ(ret, -EINVAL); /* Punch a hole after the first page address. */ @@ -1023,7 +1088,7 @@ TEST_F(hmm2, migrate_mixed) ASSERT_EQ(ret, 0); /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); ASSERT_EQ(ret, -EINVAL); /* Page 2 will be a read-only zero page. */ @@ -1055,13 +1120,13 @@ TEST_F(hmm2, migrate_mixed) /* Now try to migrate pages 2-5 to device 1. */ buffer->ptr = p + 2 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 4); /* Page 5 won't be migrated to device 0 because it's on device 1. 
*/ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, -ENOENT); buffer->ptr = p; @@ -1070,8 +1135,12 @@ TEST_F(hmm2, migrate_mixed) } /* - * Migrate anonymous memory to device private memory and fault it back to system - * memory multiple times. + * Migrate anonymous memory to device memory and back to system memory + * multiple times. In case of private zone configuration, this is done + * through fault pages accessed by CPU. In case of coherent zone configuration, + * the pages from the device should be explicitly migrated back to system memory. + * The reason is Coherent device zone has coherent access by CPU, therefore + * it will not generate any page fault. */ TEST_F(hmm, migrate_multiple) { @@ -1107,8 +1176,7 @@ TEST_F(hmm, migrate_multiple) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, - npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -1116,7 +1184,13 @@ TEST_F(hmm, migrate_multiple) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - /* Fault pages back to system memory and check them. */ + /* Migrate back to system memory and check them. */ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); @@ -1354,13 +1428,13 @@ TEST_F(hmm2, snapshot) /* Page 5 will be migrated to device 0. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); /* Page 6 will be migrated to device 1. 
*/ buffer->ptr = p + 6 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); @@ -1377,9 +1451,16 @@ TEST_F(hmm2, snapshot) ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } hmm_buffer_free(buffer); } From b9040c00e0c53b30c4050eb7f61441e71ed34fe5 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Wed, 16 Feb 2022 15:31:40 +1100 Subject: [PATCH 257/334] tools: update test_hmm script to support SP config Add two more parameters to set spm_addr_dev0 & spm_addr_dev1 addresses. These two parameters configure the start SP addresses for each device in test_hmm driver. Consequently, this configures zone device type as coherent. 
Link: https://lkml.kernel.org/r/20220210072828.2930359-25-hch@lst.de Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/test_hmm.sh | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 0647b525a6256..539c9371e592a 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -40,11 +40,26 @@ check_test_requirements() load_driver() { - modprobe $DRIVER > /dev/null 2>&1 + if [ $# -eq 0 ]; then + modprobe $DRIVER > /dev/null 2>&1 + else + if [ $# -eq 2 ]; then + modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 + > /dev/null 2>&1 + else + echo "Missing module parameters. Make sure pass"\ + "spm_addr_dev0 and spm_addr_dev1" + usage + fi + fi if [ $? == 0 ]; then major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) mknod /dev/hmm_dmirror0 c $major 0 mknod /dev/hmm_dmirror1 c $major 1 + if [ $# -eq 2 ]; then + mknod /dev/hmm_dmirror2 c $major 2 + mknod /dev/hmm_dmirror3 c $major 3 + fi fi } @@ -58,7 +73,7 @@ run_smoke() { echo "Running smoke test. Note, this test provides basic coverage."
- load_driver + load_driver $1 $2 $(dirname "${BASH_SOURCE[0]}")/hmm-tests unload_driver } @@ -75,6 +90,9 @@ usage() echo "# Smoke testing" echo "./${TEST_NAME}.sh smoke" echo + echo "# Smoke testing with SPM enabled" + echo "./${TEST_NAME}.sh smoke " + echo exit 0 } @@ -84,7 +102,7 @@ function run_test() usage else if [ "$1" = "smoke" ]; then - run_smoke + run_smoke $2 $3 else usage fi From d24f7c6b85194dff8217ae7a97d015a2615db9b2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 16 Feb 2022 15:31:40 +1100 Subject: [PATCH 258/334] mm: remove the vma check in migrate_vma_setup() migrate_vma_setup() checks that a valid vma is passed so that the page tables can be walked to find the pfns associated with a given address range. However in some cases the pfns are already known, such as when migrating device coherent pages during pin_user_pages() meaning a valid vma isn't required. Link: https://lkml.kernel.org/r/20220210072828.2930359-26-hch@lst.de Signed-off-by: Alistair Popple Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/migrate_device.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index f27486b501fee..41112660ed19a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -463,24 +463,24 @@ int migrate_vma_setup(struct migrate_vma *args) args->start &= PAGE_MASK; args->end &= PAGE_MASK; - if (!args->vma || is_vm_hugetlb_page(args->vma) || - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) - return -EINVAL; - if (nr_pages <= 0) - return -EINVAL; - if (args->start < args->vma->vm_start || -
args->start >= args->vma->vm_end) - return -EINVAL; - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) - return -EINVAL; if (!args->src || !args->dst) return -EINVAL; - - memset(args->src, 0, sizeof(*args->src) * nr_pages); - args->cpages = 0; - args->npages = 0; - - migrate_vma_collect(args); + if (args->vma) { + if (is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || + args->end > args->vma->vm_end) + return -EINVAL; + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + } if (args->cpages) migrate_vma_unmap(args); @@ -662,7 +662,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - if (!page) { + if (!page && migrate->vma) { if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { From 7a960c82a4e74e8a983365881cd391b001419964 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 16 Feb 2022 15:31:40 +1100 Subject: [PATCH 259/334] mm/gup: migrate device coherent pages when pinning instead of failing Currently any attempts to pin a device coherent page will fail. This is because device coherent pages need to be managed by a device driver, and pinning them would prevent a driver from migrating them off the device. However this is no reason to fail pinning of these pages. These are coherent and accessible from the CPU so can be migrated just like pinning ZONE_MOVABLE pages. So instead of failing all attempts to pin them first try migrating them out of ZONE_DEVICE. 
[hch@lst.de: rebased to the split device memory checks, moved migrate_device_page to migrate_device.c] Link: https://lkml.kernel.org/r/20220210072828.2930359-27-hch@lst.de Signed-off-by: Alistair Popple Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Dan Williams Cc: Jason Gunthorpe Cc: Karol Herbst Cc: Logan Gunthorpe Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 37 ++++++++++++++++++++++++++----- mm/internal.h | 1 + mm/migrate_device.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index d0845d97cb857..4ab43b4fc9bc5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1865,9 +1865,31 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, ret = -EFAULT; goto unpin_pages; } + + /* + * Device coherent pages are managed by a driver and should not + * be pinned indefinitely as it prevents the driver moving the + * page. So when trying to pin with FOLL_LONGTERM instead try + * to migrate the page out of device memory. + */ if (is_device_coherent_page(head)) { - ret = -EFAULT; - goto unpin_pages; + WARN_ON_ONCE(PageCompound(head)); + + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference.
+ */ + if (gup_flags & FOLL_PIN) { + get_page(head); + unpin_user_page(head); + } + + pages[i] = migrate_device_page(head, gup_flags); + if (!pages[i]) { + ret = -EBUSY; + goto unpin_pages; + } + continue; } if (is_pinnable_page(head)) @@ -1907,10 +1929,13 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, return nr_pages; unpin_pages: - if (gup_flags & FOLL_PIN) { - unpin_user_pages(pages, nr_pages); - } else { - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_pages; i++) { + if (!pages[i]) + continue; + + if (gup_flags & FOLL_PIN) + unpin_user_page(pages[i]); + else put_page(pages[i]); } diff --git a/mm/internal.h b/mm/internal.h index bbea49756ef7f..7ed98955c8f46 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -720,5 +720,6 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); void free_zone_device_page(struct page *page); +struct page *migrate_device_page(struct page *page, unsigned int gup_flags); #endif /* __MM_INTERNAL_H */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 41112660ed19a..cd2c97e547e25 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -768,3 +768,56 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } EXPORT_SYMBOL(migrate_vma_finalize); + +/* + * Migrate a device coherent page back to normal memory. The caller should have + * a reference on page which will be copied to the new page if migration is + * successful or dropped on failure. 
+ */ +struct page *migrate_device_page(struct page *page, unsigned int gup_flags) +{ + unsigned long src_pfn, dst_pfn = 0; + struct migrate_vma args; + struct page *dpage; + + lock_page(page); + src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + args.src = &src_pfn; + args.dst = &dst_pfn; + args.cpages = 1; + args.npages = 1; + args.vma = NULL; + migrate_vma_setup(&args); + if (!(src_pfn & MIGRATE_PFN_MIGRATE)) + return NULL; + + dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0); + + /* + * get/pin the new page now so we don't have to retry gup after + * migrating. We already have a reference so this should never fail. + */ + if (dpage && WARN_ON_ONCE(!try_grab_page(dpage, gup_flags))) { + __free_pages(dpage, 0); + dpage = NULL; + } + + if (dpage) { + lock_page(dpage); + dst_pfn = migrate_pfn(page_to_pfn(dpage)); + } + + migrate_vma_pages(&args); + if (src_pfn & MIGRATE_PFN_MIGRATE) + copy_highpage(dpage, page); + migrate_vma_finalize(&args); + if (dpage && !(src_pfn & MIGRATE_PFN_MIGRATE)) { + if (gup_flags & FOLL_PIN) + unpin_user_page(dpage); + else + put_page(dpage); + dpage = NULL; + } + + return dpage; +} From 8d5a5e4b791240573ded78c6a61c98fb4f8d9861 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:41 +1100 Subject: [PATCH 260/334] mm/damon/dbgfs/init_regions: use target index instead of target id Patch series "Remove the type-unclear target id concept". DAMON asks each monitoring target ('struct damon_target') to have one 'unsigned long' integer called 'id', which should be unique among the targets of same monitoring context. Meaning of it is, however, totally up to the monitoring primitives that registered to the monitoring context. For example, the virtual address spaces monitoring primitives treats the id as a 'struct pid' pointer. This makes the code flexible but ugly, not well-documented, and type-unsafe[1]. Also, identification of each target can be done via its index. 
For the reason, this patchset removes the concept and uses clear type definition. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ This patch (of 4): Target id is a 'unsigned long' data, which can be interpreted differently by each monitoring primitives. For example, it means 'struct pid *' for the virtual address spaces monitoring, while it means nothing but an integer to be displayed to debugfs interface users for the physical address space monitoring. It's flexible but makes code ugly and type-unsafe[1]. To be prepared for eventual removal of the concept, this commit removes a use case of the concept in 'init_regions' debugfs file handling. In detail, this commit replaces use of the id with the index of each target in the context's targets list. [1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211230100723.2238-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211230100723.2238-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs-test.h | 20 ++++++++++---------- mm/damon/dbgfs.c | 25 ++++++++++++------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 86b9f9528231e..00bff058fe08f 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -113,19 +113,19 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); unsigned long ids[] = {1, 2, 3}; - /* Each line represents one region in `` `` */ - char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", - "2 10 20\n", - "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", ""}; /* Reading the file again will show sorted, 
clean output */ - char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", - "2 10 20\n", - "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", ""}; - char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ - "2 10 20\n 2 14 26\n", /* regions overlap */ - "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ char *input, *expect; int i, rc; char buf[256]; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 5b899601e56c3..3f65af04e4e60 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -440,18 +440,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) { struct damon_target *t; struct damon_region *r; + int target_idx = 0; int written = 0; int rc; damon_for_each_target(t, c) { damon_for_each_region(r, t) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %lu\n", - t->id, r->ar.start, r->ar.end); + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); if (!rc) return -ENOMEM; written += rc; } + target_idx++; } return written; } @@ -485,22 +487,19 @@ static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, return len; } -static int add_init_region(struct damon_ctx *c, - unsigned long target_id, struct damon_addr_range *ar) +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) { struct damon_target *t; struct damon_region *r, *prev; - unsigned long id; + unsigned long idx = 0; int rc = -EINVAL; if (ar->start >= ar->end) return -EINVAL; damon_for_each_target(t, c) { - id = t->id; - if (targetid_is_pid(c)) - id = (unsigned long)pid_vnr((struct pid *)id); - if (id == target_id) { + if (idx++ == target_idx) { r = damon_new_region(ar->start, ar->end); if (!r) 
return -ENOMEM; @@ -523,7 +522,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) struct damon_target *t; struct damon_region *r, *next; int pos = 0, parsed, ret; - unsigned long target_id; + int target_idx; struct damon_addr_range ar; int err; @@ -533,11 +532,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) } while (pos < len) { - ret = sscanf(&str[pos], "%lu %lu %lu%n", - &target_id, &ar.start, &ar.end, &parsed); + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); if (ret != 3) break; - err = add_init_region(c, target_id, &ar); + err = add_init_region(c, target_idx, &ar); if (err) goto fail; pos += parsed; From c229ef6dbd138cb4875be048ce9278cfbffe24e8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:41 +1100 Subject: [PATCH 261/334] Docs/admin-guide/mm/damon/usage: update for changed initail_regions file input A previous commit made init_regions debugfs file to use target index instead of target id for specifying the target of the init regions. This commit updates the usage document to reflect the change. Link: https://lkml.kernel.org/r/20211230100723.2238-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/mm/damon/usage.rst | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 59b84904a8543..1e06435b8ff67 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -108,19 +108,23 @@ In such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the ``init_regions`` file. 
Each line of the input should represent one region in below form.:: - <target id> <start address> <end address> + <target idx> <start address> <end address> -The ``target id`` should already in ``target_ids`` file, and the regions should -be passed in address order. For example, below commands will set a couple of -address ranges, ``1-100`` and ``100-200`` as the initial monitoring target -region of process 42, and another couple of address ranges, ``20-40`` and -``50-100`` as that of process 4242.:: +The ``target idx`` should be the index of the target in ``target_ids`` file, +starting from ``0``, and the regions should be passed in address order. For +example, below commands will set a couple of address ranges, ``1-100`` and +``100-200`` as the initial monitoring target region of pid 42, which is the +first one (index ``0``) in ``target_ids``, and another couple of address +ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one +(index ``1``) in ``target_ids``.:: # cd <debugfs>/damon - # echo "42 1 100 - 42 100 200 - 4242 20 40 - 4242 50 100" > init_regions + # cat target_ids + 42 4242 + # echo "0 1 100 + 0 100 200 + 1 20 40 + 1 50 100" > init_regions Note that this sets the initial monitoring target regions only. In case of virtual memory monitoring, DAMON will automatically updates the boundary of the From d5b00c3ca1da448e227997763ae41414913c9096 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:41 +1100 Subject: [PATCH 262/334] mm/damon/core: move damon_set_targets() into dbgfs damon_set_targets() function is defined in the core for general use cases, but called from only dbgfs. Also, because the function is for general use cases, dbgfs does additional handling of pid type target id case. To make the situation simpler, this commit moves the function into dbgfs and makes it to do the pid type case handling on its own.
Link: https://lkml.kernel.org/r/20211230100723.2238-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 2 -- mm/damon/core-test.h | 5 +++- mm/damon/core.c | 32 -------------------------- mm/damon/dbgfs-test.h | 14 ++++++------ mm/damon/dbgfs.c | 53 +++++++++++++++++++++++++++++++++---------- 5 files changed, 52 insertions(+), 54 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e1e3a128b77a..bd021af5db3d1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,8 +484,6 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 7008c3735e99f..4a6141ddd6fcf 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -86,7 +86,10 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; - damon_set_targets(ctx, target_ids, 3); + for (it = 0; it < 3; it++) { + t = damon_new_target(target_ids[it]); + damon_add_target(ctx, t); + } it = 0; damon_for_each_target(t, ctx) { diff --git a/mm/damon/core.c b/mm/damon/core.c index 1dd153c31c9e2..3fef5c667a31d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -245,38 +245,6 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } -/** - * damon_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids - * - * This function should not be called while the kdamond is running. - * - * Return: 0 on success, negative error code otherwise. 
- */ -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_destroy_targets(ctx); - - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); - if (!t) { - /* The caller should do cleanup of the ids itself */ - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - return -ENOMEM; - } - damon_add_target(ctx, t); - } - - return 0; -} - /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 00bff058fe08f..c1c988b607bc9 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -86,23 +86,23 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) ctx->primitive.target_valid = NULL; ctx->primitive.cleanup = NULL; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, ids, 3); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + dbgfs_set_targets(ctx, (unsigned long []){1, 2}, 2); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - damon_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, (unsigned long []){2}, 1); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -130,7 +130,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, ids, 3); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,7 +158,7 @@ static void 
damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, NULL, 0); damon_destroy_ctx(ctx); } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 3f65af04e4e60..58867b9666350 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -358,11 +358,48 @@ static void dbgfs_put_pids(unsigned long *ids, int nr_ids) put_pid((struct pid *)ids[i]); } +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. + */ +static int dbgfs_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (targetid_is_pid(ctx)) + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (targetid_is_pid(ctx)) + dbgfs_put_pids(ids, nr_ids); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - struct damon_target *t, *next_t; bool id_is_pid = true; char *kbuf; unsigned long *targets; @@ -407,11 +444,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, } /* remove previously set targets */ - damon_for_each_target_safe(t, next_t, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); - damon_destroy_target(t); - } + dbgfs_set_targets(ctx, NULL, 0); /* Configure the context for the address space type */ if (id_is_pid) @@ -419,13 +452,9 @@ static ssize_t 
dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = damon_set_targets(ctx, targets, nr_targets); - if (ret) { - if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); - } else { + ret = dbgfs_set_targets(ctx, targets, nr_targets); + if (!ret) ret = count; - } unlock_out: mutex_unlock(&ctx->kdamond_lock); From 2f0f7bd066969b82902cd6488c6b1bbb7a2498e6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:41 +1100 Subject: [PATCH 263/334] mm/damon: remove the target id concept DAMON asks each monitoring target ('struct damon_target') to have one 'unsigned long' integer called 'id', which should be unique among the targets of same monitoring context. Meaning of it is, however, totally up to the monitoring primitives that registered to the monitoring context. For example, the virtual address spaces monitoring primitives treats the id as a 'struct pid' pointer. This makes the code flexible, but ugly, not well-documented, and type-unsafe[1]. Also, identification of each target can be done via its index. For the reason, this commit removes the concept and uses clear type definition. For now, only 'struct pid' pointer is used for the virtual address spaces monitoring. If DAMON is extended in future so that we need to put another identifier field in the struct, we will use a union for such primitives-dependent fields and document which primitives are using which type. 
[1] https://lore.kernel.org/linux-mm/20211013154535.4aaeaaf9d0182922e405dd1e@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211230100723.2238-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 11 ++- mm/damon/core-test.h | 18 +++-- mm/damon/core.c | 4 +- mm/damon/dbgfs-test.h | 63 ++++++----------- mm/damon/dbgfs.c | 152 +++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 3 +- mm/damon/vaddr-test.h | 6 +- mm/damon/vaddr.c | 4 +- 8 files changed, 133 insertions(+), 128 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index bd021af5db3d1..7c1d915b35875 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -60,19 +60,18 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. - * @id: Unique identifier for this target. + * @pid: The PID of the virtual address space to monitor. * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The - * @id of each target should be unique among the targets of the context. For - * example, in the virtual address monitoring context, it could be a pidfd or - * an address of an mm_struct. + * @pid should be set for appropriate address space monitoring primitives + * including the virtual address spaces monitoring primitives. 
*/ struct damon_target { - unsigned long id; + struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; @@ -475,7 +474,7 @@ struct damos *damon_new_scheme( void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -struct damon_target *damon_new_target(unsigned long id); +struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 4a6141ddd6fcf..b4085deb9fa05 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); - t = damon_new_target(42); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); @@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; - t = damon_new_target(42); - KUNIT_EXPECT_EQ(test, 42ul, t->id); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); @@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test) static void damon_test_aggregate(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long target_ids[] = {1, 2, 3}; unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; @@ -87,7 +85,7 @@ static void damon_test_aggregate(struct kunit *test) int it, ir; for (it = 0; it < 3; it++) { - t = damon_new_target(target_ids[it]); + t = damon_new_target(); damon_add_target(ctx, t); } @@ -125,7 +123,7 @@ static void damon_test_split_at(struct kunit *test) 
struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); damon_split_region_at(c, t, r, 25); @@ -146,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test) struct damon_region *r, *r2, *r3; int i; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; damon_add_region(r, t); @@ -194,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test) unsigned long eaddrs[] = {112, 130, 156, 170, 230}; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; @@ -218,14 +216,14 @@ static void damon_test_split_regions_of(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); damon_split_regions_of(c, t, 4); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3fef5c667a31d..bf495236d741b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -144,7 +144,7 @@ void damon_destroy_scheme(struct damos *s) * * Returns the pointer to the new struct if success, or NULL otherwise */ -struct damon_target *damon_new_target(unsigned long id) +struct damon_target *damon_new_target(void) { struct damon_target *t; @@ -152,7 +152,7 @@ struct damon_target *damon_new_target(unsigned long id) if (!t) return NULL; - t->id = id; + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index c1c988b607bc9..0d3a14c00acfb 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -12,66 +12,58 @@ #include -static void 
damon_dbgfs_test_str_to_target_ids(struct kunit *test) +static void damon_dbgfs_test_str_to_ints(struct kunit *test) { char *question; - unsigned long *answers; - unsigned long expected[] = {12, 35, 46}; + int *answers; + int expected[] = {12, 35, 46}; ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) 
KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = ""; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); } @@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) static void damon_dbgfs_test_set_targets(struct kunit *test) { struct damon_ctx *ctx = dbgfs_new_ctx(); - unsigned long ids[] = {1, 2, 3}; char buf[64]; - /* Make DAMON consider target id as plain number */ - ctx->primitive.target_valid = NULL; - ctx->primitive.cleanup = NULL; + /* Make DAMON consider target has no pid */ + ctx->primitive = (struct damon_primitive){}; - dbgfs_set_targets(ctx, ids, 3); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - dbgfs_set_targets(ctx, (unsigned long []){1, 2}, 2); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - - dbgfs_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, 1, NULL); sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -112,7 +94,6 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long ids[] = {1, 2, 3}; /* Each line represents one 
region in `` `` */ char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", "1 10 20\n", @@ -130,7 +111,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; - dbgfs_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,12 +139,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); damon_destroy_ctx(ctx); } static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_str_to_ints), KUNIT_CASE(damon_dbgfs_test_set_targets), KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 58867b9666350..78ff645433c64 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,7 +275,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, return ret; } -static inline bool targetid_is_pid(const struct damon_ctx *ctx) +static inline bool target_has_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid; } @@ -283,17 +283,19 @@ static inline bool targetid_is_pid(const struct damon_ctx *ctx) static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; - unsigned long id; + int id; int written = 0; int rc; damon_for_each_target(t, ctx) { - id = t->id; - if (targetid_is_pid(ctx)) + if (target_has_pid(ctx)) /* Show pid numbers to debugfs users */ - id = (unsigned long)pid_vnr((struct pid *)id); + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; - rc = scnprintf(&buf[written], len - written, "%lu ", id); + rc = scnprintf(&buf[written], len - written, "%d ", id); if (!rc) return -ENOMEM; written += rc; @@ -321,75 +323,114 @@ static 
ssize_t dbgfs_target_ids_read(struct file *file, } /* - * Converts a string into an array of unsigned long integers + * Converts a string into an integers array * - * Returns an array of unsigned long integers if the conversion success, or - * NULL otherwise. + * Returns an array of integers array if the conversion success, or NULL + * otherwise. */ -static unsigned long *str_to_target_ids(const char *str, ssize_t len, - ssize_t *nr_ids) +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) { - unsigned long *ids; - const int max_nr_ids = 32; - unsigned long id; + int *array; + const int max_nr_ints = 32; + int nr; int pos = 0, parsed, ret; - *nr_ids = 0; - ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); - if (!ids) + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) return NULL; - while (*nr_ids < max_nr_ids && pos < len) { - ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); pos += parsed; if (ret != 1) break; - ids[*nr_ids] = id; - *nr_ids += 1; + array[*nr_ints] = nr; + *nr_ints += 1; } - return ids; + return array; } -static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +static void dbgfs_put_pids(struct pid **pids, int nr_pids) { int i; - for (i = 0; i < nr_ids; i++) - put_pid((struct pid *)ids[i]); + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. 
+ */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; } /* * dbgfs_set_targets() - Set monitoring targets. * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) * - * This function should not be called while the kdamond is running. + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. On + * failure, reference counts of all pids in @pids are decremented. * * Return: 0 on success, negative error code otherwise. 
*/ -static int dbgfs_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) { ssize_t i; struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); + if (target_has_pid(ctx)) + put_pid(t->pid); damon_destroy_target(t); } - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); if (!t) { - /* The caller should do cleanup of the ids itself */ damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); - if (targetid_is_pid(ctx)) - dbgfs_put_pids(ids, nr_ids); + if (target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } + if (target_has_pid(ctx)) + t->pid = pids[i]; damon_add_target(ctx, t); } @@ -402,10 +443,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; bool id_is_pid = true; char *kbuf; - unsigned long *targets; + struct pid **target_pids = NULL; ssize_t nr_targets; ssize_t ret; - int i; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -413,38 +453,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; - /* target id is meaningless here, but we set it just for fun */ - scnprintf(kbuf, count, "42 "); - } - - targets = str_to_target_ids(kbuf, count, &nr_targets); - if (!targets) { - ret = -ENOMEM; - goto out; + nr_targets = 1; } if (id_is_pid) { - for (i = 0; i < nr_targets; i++) { - targets[i] = (unsigned long)find_get_pid( - (int)targets[i]); - if (!targets[i]) { - dbgfs_put_pids(targets, i); - ret = -EINVAL; - goto free_targets_out; - } + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; } } mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); 
+ dbgfs_put_pids(target_pids, nr_targets); ret = -EBUSY; goto unlock_out; } /* remove previously set targets */ - dbgfs_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); /* Configure the context for the address space type */ if (id_is_pid) @@ -452,14 +481,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = dbgfs_set_targets(ctx, targets, nr_targets); + ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); -free_targets_out: - kfree(targets); + kfree(target_pids); out: kfree(kbuf); return ret; @@ -688,12 +716,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!targetid_is_pid(ctx)) + if (!target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { - put_pid((struct pid *)t->id); + put_pid(t->pid); damon_destroy_target(t); } mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc476cef688e8..29da37192e4a0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -387,8 +387,7 @@ static int __init damon_reclaim_init(void) damon_pa_set_primitives(ctx); ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - /* 4242 means nothing but fun */ - target = damon_new_target(4242); + target = damon_new_target(); if (!target) { damon_destroy_ctx(ctx); return -ENOMEM; diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 6a1b9272ea123..f0d0ba591792c 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_region *r; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); damon_add_region(r, t); @@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test) static 
void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); @@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test, static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 89b6468da2b9b..f98edb90a873c 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -23,12 +23,12 @@ #endif /* - * 't->id' should be the pointer to the relevant 'struct pid' having reference + * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ static inline struct task_struct *damon_get_task_struct(struct damon_target *t) { - return get_pid_task((struct pid *)t->id, PIDTYPE_PID); + return get_pid_task(t->pid, PIDTYPE_PID); } /* From 7f6303487d981221eab9597d05d3e4950cffd96a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 16 Feb 2022 15:31:41 +1100 Subject: [PATCH 264/334] mm/damon: remove redundant page validation It will never get a NULL page by pte_page() as discussed in thread [1], thus remove the redundant page validation to fix below Smatch static checker warning. mm/damon/vaddr.c:405 damon_hugetlb_mkold() warn: 'page' can't be NULL. 
[1] https://lore.kernel.org/linux-mm/20220106091200.GA14564@kili/ Link: https://lkml.kernel.org/r/6d32f7d201b8970d53f51b6c5717d472aed2987c.1642386715.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reported-by: Dan Carpenter Reviewed-by: SeongJae Park Acked-by: David Rientjes Acked-by: Souptick Joarder Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/vaddr.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f98edb90a873c..6d3454dd3204b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -402,9 +402,6 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, pte_t entry = huge_ptep_get(pte); struct page *page = pte_page(entry); - if (!page) - return; - get_page(page); if (pte_young(entry)) { @@ -564,9 +561,6 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, goto out; page = pte_page(entry); - if (!page) - goto out; - get_page(page); if (pte_young(entry) || !page_is_idle(page) || From f7e42c0f4b718f0e6b985038b5182b22b937a703 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:42 +1100 Subject: [PATCH 265/334] mm/damon: rename damon_primitives to damon_operations Patch series "Allow DAMON user code independent of monitoring primitives". In-kernel DAMON user code is required to configure the monitoring context (struct damon_ctx) with proper monitoring primitives (struct damon_primitive). This makes the user code dependent on all supporting monitoring primitives. For example, DAMON debugfs interface depends on both DAMON_VADDR and DAMON_PADDR, though some users have interest in only one use case. As more monitoring primitives are introduced, the problem will be bigger. To minimize such unnecessary dependency, this patchset allows monitoring primitives to be registered by the implementing code and later dynamically searched and selected by the user code. 
In addition to that, this patchset renames monitoring primitives to monitoring operations, which makes it easier to intuitively understand what it means and how it would be structured. This patch (of 8): DAMON has a set of callback functions called monitoring primitives, which lets it be configured with various implementations for easy extension for different address spaces and usages. However, the word 'primitive' is not so explicit. Meanwhile, many other structs that serve a similar purpose call themselves 'operations'. To make the code easier to understand, this commit renames 'damon_primitives' to 'damon_operations' before it is too late to rename. Link: https://lkml.kernel.org/r/20220215184603.1479-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220215184603.1479-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Xin Hao Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 48 ++++++++--------- mm/damon/Kconfig | 12 ++--- mm/damon/Makefile | 4 +- mm/damon/core.c | 65 ++++++++++++----------- mm/damon/dbgfs-test.h | 2 +- mm/damon/dbgfs.c | 10 ++-- mm/damon/{prmtv-common.c => ops-common.c} | 2 +- mm/damon/{prmtv-common.h => ops-common.h} | 0 mm/damon/paddr.c | 22 ++++---- mm/damon/reclaim.c | 2 +- mm/damon/vaddr-test.h | 2 +- mm/damon/vaddr.c | 22 ++++---- 12 files changed, 96 insertions(+), 95 deletions(-) rename mm/damon/{prmtv-common.c => ops-common.c} (99%) rename mm/damon/{prmtv-common.h => ops-common.h} (100%) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7c1d915b35875..00baeb42c18e2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -67,8 +67,8 @@ struct damon_region { * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The - * @pid should be set for appropriate address space monitoring primitives - * including the virtual address spaces monitoring primitives. 
+ * @pid should be set for appropriate &struct damon_operations including the + * virtual address spaces monitoring operations. */ struct damon_target { struct pid *pid; @@ -120,9 +120,9 @@ enum damos_action { * uses smaller one as the effective quota. * * For selecting regions within the quota, DAMON prioritizes current scheme's - * target memory regions using the &struct damon_primitive->get_scheme_score. + * target memory regions using the &struct damon_operations->get_scheme_score. * You could customize the prioritization logic by setting &weight_sz, - * &weight_nr_accesses, and &weight_age, because monitoring primitives are + * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. */ struct damos_quota { @@ -256,10 +256,10 @@ struct damos { struct damon_ctx; /** - * struct damon_primitive - Monitoring primitives for given use cases. + * struct damon_operations - Monitoring operations for given use cases. * - * @init: Initialize primitive-internal data structures. - * @update: Update primitive-internal data structures. + * @init: Initialize operations-related data structures. + * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. @@ -269,18 +269,18 @@ struct damon_ctx; * @cleanup: Clean up the context. * * DAMON can be extended for various address spaces and usages. For this, - * users should register the low level primitives for their target address - * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * users should register the low level operations for their target address + * space and usecase via the &damon_ctx.ops. 
Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * the monitoring, @update after each &damon_ctx.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each * &damon_ctx.aggr_interval. * - * @init should initialize primitive-internal data structures. For example, + * @init should initialize operations-related data structures. For example, * this could be used to construct proper monitoring target regions and link * those to @damon_ctx.adaptive_targets. - * @update should update the primitive-internal data structures. For example, + * @update should update the operations-related data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be * prepared for the next access check. @@ -300,7 +300,7 @@ struct damon_ctx; * monitoring. * @cleanup is called from @kdamond just before its termination. */ -struct damon_primitive { +struct damon_operations { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); @@ -354,15 +354,15 @@ struct damon_callback { * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. - * @primitive_update_interval: The time between monitoring primitive updates. + * @ops_update_interval: The time between monitoring operations updates. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to * each region) for @aggr_interval time. 
DAMON also checks whether the target * memory regions need update (e.g., by ``mmap()`` calls from the application, * in case of virtual memory monitoring) and applies the changes for each - * @primitive_update_interval. All time intervals are in micro-seconds. - * Please refer to &struct damon_primitive and &struct damon_callback for more + * @ops_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_operations and &struct damon_callback for more * detail. * * @kdamond: Kernel thread who does the monitoring. @@ -374,7 +374,7 @@ struct damon_callback { * * Once started, the monitoring thread runs until explicitly required to be * terminated or every monitoring target is invalid. The validity of the - * targets is checked via the &damon_primitive.target_valid of @primitive. The + * targets is checked via the &damon_operations.target_valid of @ops. The * termination can also be explicitly requested by writing non-zero to * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. * Therefore, users can know whether the monitoring is ongoing or terminated by @@ -384,7 +384,7 @@ struct damon_callback { * Note that the monitoring thread protects only @kdamond and @kdamond_stop via * @kdamond_lock. Accesses to other fields must be protected by themselves. * - * @primitive: Set of monitoring primitives for given use cases. + * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * * @min_nr_regions: The minimum number of adaptive monitoring regions. 
@@ -395,17 +395,17 @@ struct damon_callback { struct damon_ctx { unsigned long sample_interval; unsigned long aggr_interval; - unsigned long primitive_update_interval; + unsigned long ops_update_interval; /* private: internal use only */ struct timespec64 last_aggregation; - struct timespec64 last_primitive_update; + struct timespec64 last_ops_update; /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; - struct damon_primitive primitive; + struct damon_operations ops; struct damon_callback callback; unsigned long min_nr_regions; @@ -484,7 +484,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long aggr_int, unsigned long ops_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); @@ -497,12 +497,12 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #ifdef CONFIG_DAMON_VADDR bool damon_va_target_valid(void *t); -void damon_va_set_primitives(struct damon_ctx *ctx); +void damon_va_set_operations(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR bool damon_pa_target_valid(void *t); -void damon_pa_set_primitives(struct damon_ctx *ctx); +void damon_pa_set_operations(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5bcf05851ad07..01bad77ad7ae6 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -25,27 +25,27 @@ config DAMON_KUNIT_TEST If unsure, say N. 
config DAMON_VADDR - bool "Data access monitoring primitives for virtual address spaces" + bool "Data access monitoring operations for virtual address spaces" depends on DAMON && MMU select PAGE_IDLE_FLAG help - This builds the default data access monitoring primitives for DAMON + This builds the default data access monitoring operations for DAMON that work for virtual address spaces. config DAMON_PADDR - bool "Data access monitoring primitives for the physical address space" + bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU select PAGE_IDLE_FLAG help - This builds the default data access monitoring primitives for DAMON + This builds the default data access monitoring operations for DAMON that works for the physical address space. config DAMON_VADDR_KUNIT_TEST - bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + bool "Test for DAMON operations" if !KUNIT_ALL_TESTS depends on DAMON_VADDR && KUNIT=y default KUNIT_ALL_TESTS help - This builds the DAMON virtual addresses primitives Kunit test suite. + This builds the DAMON virtual addresses operations Kunit test suite. For more information on KUnit and unit tests in general, please refer to the KUnit documentation. 
diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7d5ac377a2bb..03931472991a4 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o -obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o +obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/core.c b/mm/damon/core.c index bf495236d741b..be93fb1c34735 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -204,10 +204,10 @@ struct damon_ctx *damon_new_ctx(void) ctx->sample_interval = 5 * 1000; ctx->aggr_interval = 100 * 1000; - ctx->primitive_update_interval = 60 * 1000 * 1000; + ctx->ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); - ctx->last_primitive_update = ctx->last_aggregation; + ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); @@ -224,8 +224,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next_t; - if (ctx->primitive.cleanup) { - ctx->primitive.cleanup(ctx); + if (ctx->ops.cleanup) { + ctx->ops.cleanup(ctx); return; } @@ -250,7 +250,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * @ctx: monitoring context * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations - * @primitive_upd_int: time interval between monitoring primitive updates + * @ops_upd_int: time interval between monitoring operations updates * @min_nr_reg: minimal number of regions * @max_nr_reg: maximum number of regions * @@ -260,7 +260,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * Return: 0 on success, negative error code otherwise. 
*/ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long aggr_int, unsigned long ops_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg) { if (min_nr_reg < 3) @@ -270,7 +270,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; - ctx->primitive_update_interval = primitive_upd_int; + ctx->ops_update_interval = ops_upd_int; ctx->min_nr_regions = min_nr_reg; ctx->max_nr_regions = max_nr_reg; @@ -516,10 +516,10 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, { bool ret = __damos_valid_target(r, s); - if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + if (!ret || !s->quota.esz || !c->ops.get_scheme_score) return ret; - return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; + return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -576,7 +576,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Apply the scheme */ - if (c->primitive.apply_scheme) { + if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, @@ -586,7 +586,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -660,7 +660,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damos_set_effective_quota(quota); } - if (!c->primitive.get_scheme_score) + if (!c->ops.get_scheme_score) continue; /* Fill up the score histogram */ @@ -669,7 +669,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) 
damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; - score = c->primitive.get_scheme_score( + score = c->ops.get_scheme_score( c, t, r, s); quota->histogram[score] += r->ar.end - r->ar.start; @@ -848,14 +848,15 @@ static void kdamond_split_regions(struct damon_ctx *ctx) } /* - * Check whether it is time to check and apply the target monitoring regions + * Check whether it is time to check and apply the operations-related data + * structures. * * Returns true if it is. */ -static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +static bool kdamond_need_update_operations(struct damon_ctx *ctx) { - return damon_check_reset_time_interval(&ctx->last_primitive_update, - ctx->primitive_update_interval); + return damon_check_reset_time_interval(&ctx->last_ops_update, + ctx->ops_update_interval); } /* @@ -873,11 +874,11 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (kthread_should_stop()) return true; - if (!ctx->primitive.target_valid) + if (!ctx->ops.target_valid) return false; damon_for_each_target(t, ctx) { - if (ctx->primitive.target_valid(t)) + if (ctx->ops.target_valid(t)) return false; } @@ -976,8 +977,8 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); - if (ctx->primitive.init) - ctx->primitive.init(ctx); + if (ctx->ops.init) + ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) done = true; @@ -987,16 +988,16 @@ static int kdamond_fn(void *data) if (kdamond_wait_activation(ctx)) continue; - if (ctx->primitive.prepare_access_checks) - ctx->primitive.prepare_access_checks(ctx); + if (ctx->ops.prepare_access_checks) + ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) done = true; kdamond_usleep(ctx->sample_interval); - if (ctx->primitive.check_accesses) - max_nr_accesses = ctx->primitive.check_accesses(ctx); + if (ctx->ops.check_accesses) + max_nr_accesses = ctx->ops.check_accesses(ctx); if 
(kdamond_aggregate_interval_passed(ctx)) { kdamond_merge_regions(ctx, @@ -1008,13 +1009,13 @@ static int kdamond_fn(void *data) kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); - if (ctx->primitive.reset_aggregated) - ctx->primitive.reset_aggregated(ctx); + if (ctx->ops.reset_aggregated) + ctx->ops.reset_aggregated(ctx); } - if (kdamond_need_update_primitive(ctx)) { - if (ctx->primitive.update) - ctx->primitive.update(ctx); + if (kdamond_need_update_operations(ctx)) { + if (ctx->ops.update) + ctx->ops.update(ctx); sz_limit = damon_region_sz_limit(ctx); } } @@ -1025,8 +1026,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_terminate) ctx->callback.before_terminate(ctx); - if (ctx->primitive.cleanup) - ctx->primitive.cleanup(ctx); + if (ctx->ops.cleanup) + ctx->ops.cleanup(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 0d3a14c00acfb..8f7f325950559 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -74,7 +74,7 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) char buf[64]; /* Make DAMON consider target has no pid */ - ctx->primitive = (struct damon_primitive){}; + ctx->ops = (struct damon_operations){}; dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 78ff645433c64..719278a8cc5eb 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -56,7 +56,7 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", ctx->sample_interval, ctx->aggr_interval, - ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->ops_update_interval, ctx->min_nr_regions, ctx->max_nr_regions); mutex_unlock(&ctx->kdamond_lock); @@ -277,7 +277,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, static inline bool 
target_has_pid(const struct damon_ctx *ctx) { - return ctx->primitive.target_valid == damon_va_target_valid; + return ctx->ops.target_valid == damon_va_target_valid; } static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) @@ -477,9 +477,9 @@ static ssize_t dbgfs_target_ids_write(struct file *file, /* Configure the context for the address space type */ if (id_is_pid) - damon_va_set_primitives(ctx); + damon_va_set_operations(ctx); else - damon_pa_set_primitives(ctx); + damon_pa_set_operations(ctx); ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) @@ -735,7 +735,7 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - damon_va_set_primitives(ctx); + damon_va_set_operations(ctx); ctx->callback.before_terminate = dbgfs_before_terminate; return ctx; } diff --git a/mm/damon/prmtv-common.c b/mm/damon/ops-common.c similarity index 99% rename from mm/damon/prmtv-common.c rename to mm/damon/ops-common.c index 92a04f5831d6b..e346cc10d1439 100644 --- a/mm/damon/prmtv-common.c +++ b/mm/damon/ops-common.c @@ -10,7 +10,7 @@ #include #include -#include "prmtv-common.h" +#include "ops-common.h" /* * Get an online page for a pfn if it's in the LRU list. 
Otherwise, returns diff --git a/mm/damon/prmtv-common.h b/mm/damon/ops-common.h similarity index 100% rename from mm/damon/prmtv-common.h rename to mm/damon/ops-common.h diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5e8244f65a1a2..9f0abd0369bc8 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -14,7 +14,7 @@ #include #include "../internal.h" -#include "prmtv-common.h" +#include "ops-common.h" static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) @@ -261,15 +261,15 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_pa_set_primitives(struct damon_ctx *ctx) +void damon_pa_set_operations(struct damon_ctx *ctx) { - ctx->primitive.init = NULL; - ctx->primitive.update = NULL; - ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks; - ctx->primitive.check_accesses = damon_pa_check_accesses; - ctx->primitive.reset_aggregated = NULL; - ctx->primitive.target_valid = damon_pa_target_valid; - ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = damon_pa_apply_scheme; - ctx->primitive.get_scheme_score = damon_pa_scheme_score; + ctx->ops.init = NULL; + ctx->ops.update = NULL; + ctx->ops.prepare_access_checks = damon_pa_prepare_access_checks; + ctx->ops.check_accesses = damon_pa_check_accesses; + ctx->ops.reset_aggregated = NULL; + ctx->ops.target_valid = damon_pa_target_valid; + ctx->ops.cleanup = NULL; + ctx->ops.apply_scheme = damon_pa_apply_scheme; + ctx->ops.get_scheme_score = damon_pa_scheme_score; } diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 29da37192e4a0..3c93095c793c4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -384,7 +384,7 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - damon_pa_set_primitives(ctx); + damon_pa_set_operations(ctx); ctx->callback.after_aggregation = damon_reclaim_after_aggregation; target = damon_new_target(); diff --git a/mm/damon/vaddr-test.h 
b/mm/damon/vaddr-test.h index f0d0ba591792c..1a55bb6c36c3d 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -314,7 +314,7 @@ static struct kunit_case damon_test_cases[] = { }; static struct kunit_suite damon_test_suite = { - .name = "damon-primitives", + .name = "damon-operations", .test_cases = damon_test_cases, }; kunit_test_suite(damon_test_suite); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 6d3454dd3204b..c0eb32025f9ba 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -15,7 +15,7 @@ #include #include -#include "prmtv-common.h" +#include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION @@ -739,17 +739,17 @@ static int damon_va_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_va_set_primitives(struct damon_ctx *ctx) +void damon_va_set_operations(struct damon_ctx *ctx) { - ctx->primitive.init = damon_va_init; - ctx->primitive.update = damon_va_update; - ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; - ctx->primitive.check_accesses = damon_va_check_accesses; - ctx->primitive.reset_aggregated = NULL; - ctx->primitive.target_valid = damon_va_target_valid; - ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = damon_va_apply_scheme; - ctx->primitive.get_scheme_score = damon_va_scheme_score; + ctx->ops.init = damon_va_init; + ctx->ops.update = damon_va_update; + ctx->ops.prepare_access_checks = damon_va_prepare_access_checks; + ctx->ops.check_accesses = damon_va_check_accesses; + ctx->ops.reset_aggregated = NULL; + ctx->ops.target_valid = damon_va_target_valid; + ctx->ops.cleanup = NULL; + ctx->ops.apply_scheme = damon_va_apply_scheme; + ctx->ops.get_scheme_score = damon_va_scheme_score; } #include "vaddr-test.h" From 3fa25d108d6aea1dc898f0c12812a570a8820e4b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:42 +1100 Subject: [PATCH 266/334] mm/damon: let monitoring operations can be registered and selected In-kernel DAMON 
user code like the DAMON debugfs interface has to set 'struct damon_operations' of its 'struct damon_ctx' on its own. Therefore, the client code has to depend on every monitoring operations implementation that it could use. For example, the DAMON debugfs interface depends on both vaddr and paddr, while some of its users are not always interested in both. To minimize such unnecessary dependencies, this commit lets the implementing code register its monitoring operations, so that user code can then select them dynamically without a build-time dependency. Link: https://lkml.kernel.org/r/20220215184603.1479-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 18 ++++++++++++ mm/damon/core.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 00baeb42c18e2..076da277b249e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -253,11 +253,24 @@ struct damos { struct list_head list; }; +/** + * enum damon_ops_id - Identifier for each monitoring operations implementation + * + * @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces + * @DAMON_OPS_PADDR: Monitoring operations for the physical address space + */ +enum damon_ops_id { + DAMON_OPS_VADDR, + DAMON_OPS_PADDR, + NR_DAMON_OPS, +}; + struct damon_ctx; /** * struct damon_operations - Monitoring operations for given use cases. * + * @id: Identifier of this operations set. * @init: Initialize operations-related data structures. * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. @@ -277,6 +290,8 @@ struct damon_ctx; * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each * &damon_ctx.aggr_interval. 
* + * Each &struct damon_operations instance having valid @id can be registered + * via damon_register_ops() and selected by damon_select_ops() later. * @init should initialize operations-related data structures. For example, * this could be used to construct proper monitoring target regions and link * those to @damon_ctx.adaptive_targets. @@ -301,6 +316,7 @@ struct damon_ctx; * @cleanup is called from @kdamond just before its termination. */ struct damon_operations { + enum damon_ops_id id; void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); @@ -489,6 +505,8 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); +int damon_register_ops(struct damon_operations *ops); +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index be93fb1c34735..82e0a4620c4fe 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -25,6 +25,72 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +static DEFINE_MUTEX(damon_ops_lock); +static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; + +/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ +static bool damon_registered_ops_id(enum damon_ops_id id) +{ + struct damon_operations empty_ops = {}; + + if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops))) + return false; + return true; +} + +/** + * damon_register_ops() - Register a monitoring operations set to DAMON. + * @ops: monitoring operations set to register. + * + * This function registers a monitoring operations set of valid &struct + * damon_operations->id so that others can find and use them later. 
+ * + * Return: 0 on success, negative error code otherwise. + */ +int damon_register_ops(struct damon_operations *ops) +{ + int err = 0; + + if (ops->id >= NR_DAMON_OPS) + return -EINVAL; + mutex_lock(&damon_ops_lock); + /* Fail for already registered ops */ + if (damon_registered_ops_id(ops->id)) { + err = -EINVAL; + goto out; + } + damon_registered_ops[ops->id] = *ops; +out: + mutex_unlock(&damon_ops_lock); + return err; +} + +/** + * damon_select_ops() - Select a monitoring operations to use with the context. + * @ctx: monitoring context to use the operations. + * @id: id of the registered monitoring operations to select. + * + * This function finds registered monitoring operations set of @id and make + * @ctx to use it. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) +{ + int err = 0; + + if (id >= NR_DAMON_OPS) + return -EINVAL; + + mutex_lock(&damon_ops_lock); + if (!damon_registered_ops_id(id)) + err = -EINVAL; + else + ctx->ops = damon_registered_ops[id]; + mutex_unlock(&damon_ops_lock); + return err; +} + /* * Construct a damon_region struct * From 7cf2dd7a5509b8961d0084b0c553666a4195da64 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:42 +1100 Subject: [PATCH 267/334] mm/damon/paddr,vaddr: register themselves to DAMON in subsys_initcall This commit makes the monitoring operations for the physical address space and virtual address spaces register themselves to DAMON in the subsys_initcall step. Later, in-kernel DAMON user code can use them via damon_select_ops() without having to unnecessarily depend on all possible monitoring operations implementations. 
Link: https://lkml.kernel.org/r/20220215184603.1479-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/vaddr.c | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 9f0abd0369bc8..d968bb38bd5da 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -273,3 +273,23 @@ void damon_pa_set_operations(struct damon_ctx *ctx) ctx->ops.apply_scheme = damon_pa_apply_scheme; ctx->ops.get_scheme_score = damon_pa_scheme_score; } + +static int __init damon_pa_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_PADDR, + .init = NULL, + .update = NULL, + .prepare_access_checks = damon_pa_prepare_access_checks, + .check_accesses = damon_pa_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_pa_target_valid, + .cleanup = NULL, + .apply_scheme = damon_pa_apply_scheme, + .get_scheme_score = damon_pa_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_pa_initcall); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c0eb32025f9ba..87475ba37bec9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -752,4 +752,24 @@ void damon_va_set_operations(struct damon_ctx *ctx) ctx->ops.get_scheme_score = damon_va_scheme_score; } +static int __init damon_va_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_VADDR, + .init = damon_va_init, + .update = damon_va_update, + .prepare_access_checks = damon_va_prepare_access_checks, + .check_accesses = damon_va_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_va_target_valid, + .cleanup = NULL, + .apply_scheme = damon_va_apply_scheme, + .get_scheme_score = damon_va_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_va_initcall); + #include "vaddr-test.h" From 1358e309cb1d895e54241580e0df743348aa138b Mon Sep 17 
00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:42 +1100 Subject: [PATCH 268/334] mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations() This commit makes DAMON_RECLAIM select the registered monitoring operations for the physical address space instead of setting them on its own. This allows DAMON_RECLAIM to be independent of DAMON_PADDR, but the dependency is left as is, because that is the only monitoring operations set it uses, and therefore it makes no sense to build DAMON_RECLAIM without DAMON_PADDR. Link: https://lkml.kernel.org/r/20220215184603.1479-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c93095c793c4..b53d9c22fad15 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -384,7 +384,9 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - damon_pa_set_operations(ctx); + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + return -EINVAL; + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; target = damon_new_target(); From 1a4bc89b2e6fad34d299e42b7b4399069668bbfc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:42 +1100 Subject: [PATCH 269/334] mm/damon/dbgfs: use damon_select_ops() instead of damon_{v,p}a_set_operations() This commit makes the DAMON debugfs interface select the registered monitoring operations for the physical address space or virtual address spaces, depending on user requests, instead of setting them on its own. Note that the DAMON debugfs interface still depends on DAMON_VADDR with this change, because it also uses one of its symbols, 'damon_va_target_valid'. 
Link: https://lkml.kernel.org/r/20220215184603.1479-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 719278a8cc5eb..8bf9e38b60f47 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -474,12 +474,18 @@ static ssize_t dbgfs_target_ids_write(struct file *file, /* remove previously set targets */ dbgfs_set_targets(ctx, 0, NULL); + if (!nr_targets) { + ret = count; + goto unlock_out; + } /* Configure the context for the address space type */ if (id_is_pid) - damon_va_set_operations(ctx); + ret = damon_select_ops(ctx, DAMON_OPS_VADDR); else - damon_pa_set_operations(ctx); + ret = damon_select_ops(ctx, DAMON_OPS_PADDR); + if (ret) + goto unlock_out; ret = dbgfs_set_targets(ctx, nr_targets, target_pids); if (!ret) @@ -735,7 +741,11 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - damon_va_set_operations(ctx); + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && damon_select_ops(ctx, + DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return NULL; + } ctx->callback.before_terminate = dbgfs_before_terminate; return ctx; } From 0fd55d4b6605b3ae0771db3d197d023770ea4ee5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:43 +1100 Subject: [PATCH 270/334] mm/damon/dbgfs: use operations id for knowing if the target has pid DAMON debugfs interface depends on monitoring operations for virtual address spaces because it knows if the target has pid or not by seeing if the context is configured to use one of the virtual address space monitoring operation functions. We can replace that check with 'enum damon_ops_id' now, to make it independent. This commit makes the change. 
Link: https://lkml.kernel.org/r/20220215184603.1479-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 8bf9e38b60f47..05b574cbcea81 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -277,7 +277,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, static inline bool target_has_pid(const struct damon_ctx *ctx) { - return ctx->ops.target_valid == damon_va_target_valid; + return ctx->ops.id == DAMON_OPS_VADDR; } static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) @@ -741,8 +741,8 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - if (damon_select_ops(ctx, DAMON_OPS_VADDR) && damon_select_ops(ctx, - DAMON_OPS_PADDR)) { + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && + damon_select_ops(ctx, DAMON_OPS_PADDR)) { damon_destroy_ctx(ctx); return NULL; } From 5904b385cbb29e0f00f87d0ef054da9f59be38b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:43 +1100 Subject: [PATCH 271/334] mm/damon/dbgfs-test: fix is_target_id() change The DAMON kunit tests for the DAMON debugfs interface fail because they still assume that setting empty monitoring operations makes the DAMON debugfs interface believe the target of the context doesn't have a pid. This commit fixes the kunit test failures by explicitly setting the context's monitoring operations to the operations for the physical address space, which lets debugfs know the target will not have a pid. 
Link: https://lkml.kernel.org/r/20220215184603.1479-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/damon/dbgfs-test.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 8f7f325950559..0bb0d532b1590 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -74,7 +74,7 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) char buf[64]; /* Make DAMON consider target has no pid */ - ctx->ops = (struct damon_operations){}; + damon_select_ops(ctx, DAMON_OPS_PADDR); dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); @@ -111,6 +111,8 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; + damon_select_ops(ctx, DAMON_OPS_PADDR); + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ From 46b7513be782ee4b8eccba5917eb474c2dc1e683 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 16 Feb 2022 15:31:43 +1100 Subject: [PATCH 272/334] mm/damon/paddr,vaddr: remove damon_{p,v}a_{target_valid,set_operations}() Because the DAMON debugfs interface and DAMON-based proactive reclaim now use monitoring operations via the registration mechanism, the damon_{p,v}a_{target_valid,set_operations}() functions have no users. This commit cleans them up. 
Link: https://lkml.kernel.org/r/20220215184603.1479-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Rientjes Cc: Xin Hao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/damon.h | 10 ---------- mm/damon/paddr.c | 20 +------------------- mm/damon/vaddr.c | 15 +-------------- 3 files changed, 2 insertions(+), 43 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 076da277b249e..49c4a11ecf200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -513,14 +513,4 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ -#ifdef CONFIG_DAMON_VADDR -bool damon_va_target_valid(void *t); -void damon_va_set_operations(struct damon_ctx *ctx); -#endif /* CONFIG_DAMON_VADDR */ - -#ifdef CONFIG_DAMON_PADDR -bool damon_pa_target_valid(void *t); -void damon_pa_set_operations(struct damon_ctx *ctx); -#endif /* CONFIG_DAMON_PADDR */ - #endif /* _DAMON_H */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index d968bb38bd5da..7c263797a9a9c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -208,11 +208,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -bool damon_pa_target_valid(void *t) -{ - return true; -} - static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -261,19 +256,6 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_pa_set_operations(struct damon_ctx *ctx) -{ - ctx->ops.init = NULL; - ctx->ops.update = NULL; - ctx->ops.prepare_access_checks = damon_pa_prepare_access_checks; - ctx->ops.check_accesses = damon_pa_check_accesses; - ctx->ops.reset_aggregated = NULL; - ctx->ops.target_valid = damon_pa_target_valid; - ctx->ops.cleanup = NULL; - ctx->ops.apply_scheme = damon_pa_apply_scheme; - ctx->ops.get_scheme_score = damon_pa_scheme_score; -} - static int __init damon_pa_initcall(void) { struct 
damon_operations ops = { @@ -283,7 +265,7 @@ static int __init damon_pa_initcall(void) .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, .reset_aggregated = NULL, - .target_valid = damon_pa_target_valid, + .target_valid = NULL, .cleanup = NULL, .apply_scheme = damon_pa_apply_scheme, .get_scheme_score = damon_pa_scheme_score, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 87475ba37bec9..b2ec0aa1ff451 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -653,7 +653,7 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(void *target) { struct damon_target *t = target; struct task_struct *task; @@ -739,19 +739,6 @@ static int damon_va_scheme_score(struct damon_ctx *context, return DAMOS_MAX_SCORE; } -void damon_va_set_operations(struct damon_ctx *ctx) -{ - ctx->ops.init = damon_va_init; - ctx->ops.update = damon_va_update; - ctx->ops.prepare_access_checks = damon_va_prepare_access_checks; - ctx->ops.check_accesses = damon_va_check_accesses; - ctx->ops.reset_aggregated = NULL; - ctx->ops.target_valid = damon_va_target_valid; - ctx->ops.cleanup = NULL; - ctx->ops.apply_scheme = damon_va_apply_scheme; - ctx->ops.get_scheme_score = damon_va_scheme_score; -} - static int __init damon_va_initcall(void) { struct damon_operations ops = { From bf43fc6b751b02d148551bdaf2ff26f30fe56994 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 16 Feb 2022 15:31:43 +1100 Subject: [PATCH 273/334] fs/buffer.c: add debug print for __getblk_gfp() stall problem Among syzbot's unresolved hung task reports, 18 out of 65 reports contain __getblk_gfp() line in the backtrace. Since there is a comment block that says that __getblk_gfp() will lock up the machine if try_to_free_buffers() attempt from grow_dev_page() is failing, let's start from checking whether syzbot is hitting that case. 
This change will be removed after the bug is fixed. Link: http://lkml.kernel.org/r/9b9fcdda-c347-53ee-fdbb-8a7d11cf430e@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Al Viro Cc: Mel Gorman Cc: Michal Hocko Cc: Andi Kleen Cc: Jan Kara Cc: Jeff Layton Cc: Cc: Tim Chen Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 50 +++++++++++++++++++++++++++++++++++++++++-- include/linux/sched.h | 7 ++++++ lib/Kconfig.debug | 6 ++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd3719..a427edf2664c6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 
1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1041,6 +1060,18 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -3192,6 +3223,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3230,11 +3266,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen? 
*/ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3258,6 +3301,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248b..0074277d0429c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1492,6 +1492,13 @@ struct task_struct { struct callback_head l1d_flush_kill; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 14b89aa37c5c9..f8319dbd76283 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1789,6 +1789,12 @@ config IO_STRICT_DEVMEM menu "$(SRCARCH) Debugging" +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu From 26e151ccc175db19be19ded09de3a1d27ec5bddd Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 16 Feb 2022 15:31:43 +1100 Subject: [PATCH 274/334] fs/buffer.c: dump more info for __getblk_gfp() stall problem We need to dump more variables on top of "fs/buffer.c: add debug print for __getblk_gfp() stall problem". 
Link: http://lkml.kernel.org/r/12239545-7d8a-820f-48ba-952e2e98a05c@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/buffer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a427edf2664c6..54fd7c94fdb24 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1064,9 +1064,15 @@ __getblk_slow(struct block_device *bdev, sector_t block, #ifdef CONFIG_DEBUG_AID_FOR_SYZBOT if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) continue; - printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n", + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n", current->comm, current->pid, current->getblk_executed, - current->getblk_bh_count, current->getblk_bh_state); + current->getblk_bh_count, current->getblk_bh_state, + IS_ERR_OR_NULL(bdev->bd_super) ? -1L : + bdev->bd_super->s_blocksize, size, + IS_ERR_OR_NULL(bdev->bd_super) ? -1 : + bdev->bd_super->s_blocksize_bits, + IS_ERR_OR_NULL(bdev->bd_inode) ? -1 : + bdev->bd_inode->i_blkbits); current->getblk_executed = 0; current->getblk_bh_count = 0; current->getblk_bh_state = 0; From 63a9847da5f7952add3212654840af2f3c066eaf Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 16 Feb 2022 15:31:44 +1100 Subject: [PATCH 275/334] kernel/hung_task.c: Monitor killed tasks. syzbot's current top report is "no output from test machine" where the userspace process failed to spawn a new test process for 300 seconds for some reason. One of reasons which can result in this report is that an already spawned test process was unable to terminate (e.g. trapped at an unkillable retry loop due to some bug) after SIGKILL was sent to that process. Therefore, reporting when a thread is failing to terminate despite a fatal signal is pending would give us more useful information. 
In the context of syzbot's testing where there are only 2 CPUs in the target VM (which means only a small number of threads and not so much memory) and threads get SIGKILL after 5 seconds from fork(), being unable to reach do_exit() within 10 seconds is likely a sign that something went wrong. Therefore, I would like to try this patch in linux-next.git for feasibility testing of whether this patch helps find more bugs and reproducers for such bugs, by bringing "unable to terminate threads" reports out of "no output from test machine" reports. A potential bad effect of this patch is that kernel code becomes killable without addressing the root cause of being unable to terminate, because use of a killable wait will bypass both the TASK_UNINTERRUPTIBLE stall test and the SIGKILL-after-5-seconds behavior, which will result in failing to detect the problem on real systems where SIGKILL won't be sent after 5 seconds when something went wrong. This version shares the existing sysctl settings (e.g. check interval, timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE threads. We will likely want to use different sysctl settings for monitoring killed threads. But let's start as a linux-next.git patch without introducing new sysctl settings. We can add sysctl settings before sending to linux.git. Link: http://lkml.kernel.org/r/60d1d7f6-b201-3dcb-a51b-76a31bcfa919@i-love.sakura.ne.jp Signed-off-by: Tetsuo Handa Cc: Dmitry Vyukov Cc: Petr Mladek Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Paul E.
McKenney" Cc: Vitaly Kuznetsov Cc: Liu Chuansheng Cc: Valdis Kletnieks Cc: linux-kernel@vger.kernel.org Cc: Dmitry Vyukov Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/sched.h | 1 + kernel/hung_task.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0074277d0429c..d7d232e7e654d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1065,6 +1065,7 @@ struct task_struct { #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; + unsigned long killed_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f76554..40220dfd6fa93 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -147,6 +147,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); } +static void check_killed_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long stamp = t->killed_time; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. + */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + /* + * Skip threads which are already inside do_exit(), for exit_mm() etc. + * might take many seconds. 
+ */ + if (t->flags & PF_EXITING) + return; + if (!stamp) { + stamp = jiffies; + if (!stamp) + stamp++; + t->killed_time = stamp; + return; + } + if (time_is_after_jiffies(stamp + timeout * HZ)) + return; + trace_sched_process_hang(t); + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_call_panic = true; + } + /* + * This thread failed to terminate for more than + * sysctl_hung_task_timeout_secs seconds, complain: + */ + pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n", + t->comm, t->pid, (jiffies - stamp) / HZ); + sched_show_task(t); + hung_task_show_lock = true; + touch_nmi_watchdog(); +} + /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. @@ -198,6 +239,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } + /* Check threads which are about to terminate. */ + if (unlikely(fatal_signal_pending(t))) + check_killed_task(t, timeout); /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); From f3311f265826a5dbeb84c32765d2adbb59ef3966 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Wed, 16 Feb 2022 15:31:45 +1100 Subject: [PATCH 276/334] proc: alloc PATH_MAX bytes for /proc/${pid}/fd/ symlinks It's not a standard approach that use __get_free_page() to alloc path buffer directly. We'd better use kmalloc and PATH_MAX. PAGE_SIZE is different on different archs. An unlinked file with very long canonical pathname will readlink differently because "(deleted)" eats into a buffer. 
--adobriyan Link: https://lkml.kernel.org/r/Ye1fCxyZZ0I5lgOL@localhost.localdomain Signed-off-by: Hao Lee Signed-off-by: Alexey Dobriyan Cc: Christian Brauner Cc: Kees Cook Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index d654ce7150fdd..9e4d1e0e3ad79 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1764,25 +1764,25 @@ static const char *proc_pid_get_link(struct dentry *dentry, static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char *)__get_free_page(GFP_KERNEL); + char *tmp = (char *)kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; - pathname = d_path(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; - len = tmp + PAGE_SIZE - 1 - pathname; + len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: - free_page((unsigned long)tmp); + kfree(tmp); return len; } From 8565f9082d5b14ba73ad6436b206c7acd490abe5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Feb 2022 15:31:45 +1100 Subject: [PATCH 277/334] proc-alloc-path_max-bytes-for-proc-pid-fd-symlinks-fix remove now-unneeded cast Reported-by: kernel test robot Cc: Alexey Dobriyan Cc: Christian Brauner Cc: Hao Lee Cc: James Morris Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e4d1e0e3ad79..76bf1aa3cfe88 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1764,7 +1764,7 @@ static const char *proc_pid_get_link(struct dentry *dentry, static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char *)kmalloc(PATH_MAX, GFP_KERNEL); + char *tmp = 
kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; From f863de9446094f0b839982c33758313b90fef229 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Feb 2022 15:31:46 +1100 Subject: [PATCH 278/334] proc/vmcore: fix possible deadlock on concurrent mmap and read Lockdep noticed that there is chance for a deadlock if we have concurrent mmap, concurrent read, and the addition/removal of a callback. As nicely explained by Boqun: " Lockdep warned about the above sequences because rw_semaphore is a fair read-write lock, and the following can cause a deadlock: TASK 1 TASK 2 TASK 3 ====== ====== ====== down_write(mmap_lock); down_read(vmcore_cb_rwsem) down_write(vmcore_cb_rwsem); // blocked down_read(vmcore_cb_rwsem); // cannot get the lock because of the fairness down_read(mmap_lock); // blocked IOW, a reader can block another read if there is a writer queued by the second reader and the lock is fair. " To fix, convert to srcu to make this deadlock impossible. We need srcu as our callbacks can sleep. With this change, I cannot trigger any lockdep warnings. [ 6.386519] ====================================================== [ 6.387203] WARNING: possible circular locking dependency detected [ 6.387965] 5.17.0-0.rc0.20220117git0c947b893d69.68.test.fc36.x86_64 #1 Not tainted [ 6.388899] ------------------------------------------------------ [ 6.389657] makedumpfile/542 is trying to acquire lock: [ 6.390308] ffffffff832d2eb8 (vmcore_cb_rwsem){.+.+}-{3:3}, at: mmap_vmcore+0x340/0x580 [ 6.391290] [ 6.391290] but task is already holding lock: [ 6.391978] ffff8880af226438 (&mm->mmap_lock#2){++++}-{3:3}, at: vm_mmap_pgoff+0x84/0x150 [ 6.392898] [ 6.392898] which lock already depends on the new lock. 
[ 6.392898] [ 6.393866] [ 6.393866] the existing dependency chain (in reverse order) is: [ 6.394762] [ 6.394762] -> #1 (&mm->mmap_lock#2){++++}-{3:3}: [ 6.395530] lock_acquire+0xc3/0x1a0 [ 6.396047] __might_fault+0x4e/0x70 [ 6.396562] _copy_to_user+0x1f/0x90 [ 6.397093] __copy_oldmem_page+0x72/0xc0 [ 6.397663] read_from_oldmem+0x77/0x1e0 [ 6.398229] read_vmcore+0x2c2/0x310 [ 6.398742] proc_reg_read+0x47/0xa0 [ 6.399265] vfs_read+0x101/0x340 [ 6.399751] __x64_sys_pread64+0x5d/0xa0 [ 6.400314] do_syscall_64+0x43/0x90 [ 6.400778] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 6.401390] [ 6.401390] -> #0 (vmcore_cb_rwsem){.+.+}-{3:3}: [ 6.402063] validate_chain+0x9f4/0x2670 [ 6.402560] __lock_acquire+0x8f7/0xbc0 [ 6.403054] lock_acquire+0xc3/0x1a0 [ 6.403509] down_read+0x4a/0x140 [ 6.403948] mmap_vmcore+0x340/0x580 [ 6.404403] proc_reg_mmap+0x3e/0x90 [ 6.404866] mmap_region+0x504/0x880 [ 6.405322] do_mmap+0x38a/0x520 [ 6.405744] vm_mmap_pgoff+0xc1/0x150 [ 6.406258] ksys_mmap_pgoff+0x178/0x200 [ 6.406823] do_syscall_64+0x43/0x90 [ 6.407339] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 6.407975] [ 6.407975] other info that might help us debug this: [ 6.407975] [ 6.408945] Possible unsafe locking scenario: [ 6.408945] [ 6.409684] CPU0 CPU1 [ 6.410196] ---- ---- [ 6.410703] lock(&mm->mmap_lock#2); [ 6.411121] lock(vmcore_cb_rwsem); [ 6.411792] lock(&mm->mmap_lock#2); [ 6.412465] lock(vmcore_cb_rwsem); [ 6.412873] [ 6.412873] *** DEADLOCK *** [ 6.412873] [ 6.413522] 1 lock held by makedumpfile/542: [ 6.414006] #0: ffff8880af226438 (&mm->mmap_lock#2){++++}-{3:3}, at: vm_mmap_pgoff+0x84/0x150 [ 6.414944] [ 6.414944] stack backtrace: [ 6.415432] CPU: 0 PID: 542 Comm: makedumpfile Not tainted 5.17.0-0.rc0.20220117git0c947b893d69.68.test.fc36.x86_64 #1 [ 6.416581] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 6.417272] Call Trace: [ 6.417593] [ 6.417882] dump_stack_lvl+0x5d/0x78 [ 6.418346] print_circular_bug+0x5d7/0x5f0 [ 6.418821] ? stack_trace_save+0x3a/0x50 [ 6.419273] ? 
save_trace+0x3d/0x330 [ 6.419681] check_noncircular+0xd1/0xe0 [ 6.420217] validate_chain+0x9f4/0x2670 [ 6.420715] ? __lock_acquire+0x8f7/0xbc0 [ 6.421234] ? __lock_acquire+0x8f7/0xbc0 [ 6.421685] __lock_acquire+0x8f7/0xbc0 [ 6.422127] lock_acquire+0xc3/0x1a0 [ 6.422535] ? mmap_vmcore+0x340/0x580 [ 6.422965] ? lock_is_held_type+0xe2/0x140 [ 6.423432] ? mmap_vmcore+0x340/0x580 [ 6.423893] down_read+0x4a/0x140 [ 6.424321] ? mmap_vmcore+0x340/0x580 [ 6.424800] mmap_vmcore+0x340/0x580 [ 6.425237] ? vm_area_alloc+0x1c/0x60 [ 6.425661] ? trace_kmem_cache_alloc+0x30/0xe0 [ 6.426174] ? kmem_cache_alloc+0x1e0/0x2f0 [ 6.426641] proc_reg_mmap+0x3e/0x90 [ 6.427052] mmap_region+0x504/0x880 [ 6.427462] do_mmap+0x38a/0x520 [ 6.427842] vm_mmap_pgoff+0xc1/0x150 [ 6.428260] ksys_mmap_pgoff+0x178/0x200 [ 6.428701] do_syscall_64+0x43/0x90 [ 6.429126] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 6.429745] RIP: 0033:0x7fc7359b8fc7 [ 6.430157] Code: 00 00 00 89 ef e8 69 b3 ff ff eb e4 e8 c2 64 01 00 66 90 f3 0f 1e fa 41 89 ca 41 f7 c1 ff 0f 00 00 75 10 b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 21 c3 48 8b 05 21 7e 0e 00 64 c7 00 16 00 00 [ 6.432147] RSP: 002b:00007fff35b4c208 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 [ 6.432970] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc7359b8fc7 [ 6.433746] RDX: 0000000000000001 RSI: 0000000000400000 RDI: 0000000000000000 [ 6.434529] RBP: 000055a1125ecf10 R08: 0000000000000003 R09: 0000000000002000 [ 6.435310] R10: 0000000000000002 R11: 0000000000000246 R12: 0000000000002000 [ 6.436093] R13: 0000000000400000 R14: 000055a1124269e2 R15: 0000000000000000 [ 6.436887] Link: https://lkml.kernel.org/r/20220119193417.100385-1-david@redhat.com Fixes: cc5f2704c934 ("proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks") Signed-off-by: David Hildenbrand Reported-by: Baoquan He Acked-by: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Peter Zijlstra Cc: Boqun Feng Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/vmcore.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 702754dd1daff..edeb01dfe05d3 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DECLARE_RWSEM(vmcore_cb_rwsem); +static DEFINE_SPINLOCK(vmcore_cb_lock); +DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ @@ -70,8 +71,8 @@ static bool vmcore_opened; void register_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); INIT_LIST_HEAD(&cb->next); + spin_lock(&vmcore_cb_lock); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); - list_del(&cb->next); + spin_lock(&vmcore_cb_lock); + list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is * very unusual (e.g., forced driver removal), but we cannot stop @@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); + + synchronize_srcu(&vmcore_cb_srcu); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn) struct vmcore_cb *cb; bool ret = true; - 
lockdep_assert_held_read(&vmcore_cb_rwsem); - - list_for_each_entry(cb, &vmcore_cb_list, next) { + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { if (unlikely(!cb->pfn_is_ram)) continue; ret = cb->pfn_is_ram(cb, pfn); @@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - down_read(&vmcore_cb_rwsem); + spin_lock(&vmcore_cb_lock); vmcore_opened = true; - up_read(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); return 0; } @@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, unsigned long pfn, offset; size_t nr_bytes; ssize_t read = 0, tmp; + int idx; if (!count) return 0; @@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset, userbuf); } if (tmp < 0) { - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return tmp; } @@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count, ++pfn; offset = 0; } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); - up_read(&vmcore_cb_rwsem); return read; } @@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - int ret; + int ret, idx; /* - * Check if oldmem_pfn_is_ram was registered to avoid - * looping over all pages without a reason. + * Check if a callback was registered to avoid looping over all + * pages without a reason. 
*/ - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return ret; } From e95da70b6484a6030147acaef26f696324536f73 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 16 Feb 2022 15:31:47 +1100 Subject: [PATCH 279/334] proc/vmcore: fix vmcore_alloc_buf() kernel-doc comment Fix a spelling problem to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/proc/vmcore.c:492: warning: Function parameter or member 'size' not described in 'vmcore_alloc_buf' fs/proc/vmcore.c:492: warning: Excess function parameter 'sizez' description in 'vmcore_alloc_buf' Link: https://lkml.kernel.org/r/20220129011449.105278-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index edeb01dfe05d3..6f1b8ddc6f7a4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -480,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = { /** * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @sizez: size of buffer + * @size: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). From 7b71985822c2be59eb638ab9aaa4c48e284a3db2 Mon Sep 17 00:00:00 2001 From: Julius Hemanth Pitti Date: Wed, 16 Feb 2022 15:31:48 +1100 Subject: [PATCH 280/334] proc/sysctl: make protected_* world readable protected_* files have 600 permissions which prevents non-superuser from reading them. Container like "AWS greengrass" refuse to launch unless protected_hardlinks and protected_symlinks are set. 
When containers like these run with "userns-remap" or "--user", mapping the container's root to a non-superuser on the host, they fail to run due to denied read access to these files. As these protections are hardly a secret and do not pose any security risk, make them world readable. Though the above greengrass use case needs read access only to the protected_hardlinks and protected_symlinks files, set all other protected_* files to 644 as well for consistency. Link: http://lkml.kernel.org/r/20200709235115.56954-1-jpitti@cisco.com Fixes: 800179c9b8a1 ("fs: add link restrictions") Signed-off-by: Julius Hemanth Pitti Acked-by: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Cc: Ingo Molnar Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 3f1829b3ab5b7..e596aabd6dc5c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1031,7 +1031,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1040,7 +1040,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1049,7 +1049,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1058,7 +1058,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1
= SYSCTL_ZERO, .extra2 = SYSCTL_TWO, From 2d65121f1bacf3f147a368b770263f2868787993 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 15:31:48 +1100 Subject: [PATCH 281/334] Kconfig.debug: make DEBUG_INFO selectable from a choice Currently it's not possible to enable DEBUG_INFO for an all*config build, since it is marked as "depends on !COMPILE_TEST". This generally makes sense because a debug build of an all*config target ends up taking much longer and the output is much larger. Having this be "default off" makes sense. However, there are cases where enabling DEBUG_INFO for such builds is useful for doing treewide A/B comparisons of build options, etc. Make DEBUG_INFO selectable from any of the DWARF version choice options, with DEBUG_INFO_NONE being the default for COMPILE_TEST. The mutually exclusive relationship between DWARF5 and BTF must be inverted, but the result remains the same. Additionally moves DEBUG_KERNEL and DEBUG_MISC up to the top of the menu because they were enabling features _above_ it, making it weird to navigate menuconfig. Link: https://lkml.kernel.org/r/20220125075126.891825-1-keescook@chromium.org Signed-off-by: Kees Cook Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reviewed-by: Masahiro Yamada Cc: Nick Desaulniers Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.debug | 140 +++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f8319dbd76283..4807637ec89eb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -208,20 +208,88 @@ config DEBUG_BUGVERBOSE endmenu # "printk and dmesg options" +config DEBUG_KERNEL + bool "Kernel debugging" + help + Say Y here if you are developing drivers or trying to debug and + identify kernel problems. 
+ +config DEBUG_MISC + bool "Miscellaneous debug code" + default DEBUG_KERNEL + depends on DEBUG_KERNEL + help + Say Y here if you need to enable miscellaneous debug code that should + be under a more specific debug option but isn't. + menu "Compile-time checks and compiler options" config DEBUG_INFO - bool "Compile the kernel with debug info" - depends on DEBUG_KERNEL && !COMPILE_TEST + bool help - If you say Y here the resulting kernel image will include - debugging info resulting in a larger kernel image. + A kernel debug info option other than "None" has been selected + in the "Debug information" choice below, indicating that debug + information will be generated for build targets. + +choice + prompt "Debug information" + depends on DEBUG_KERNEL + default DEBUG_INFO_NONE if COMPILE_TEST + help + Selecting something other than "None" results in a kernel image + that will include debugging info resulting in a larger kernel image. This adds debug symbols to the kernel and modules (gcc -g), and is needed if you intend to use kernel crashdump or binary object tools like crash, kgdb, LKCD, gdb, etc on the kernel. - Say Y here only if you plan to debug the kernel. - If unsure, say N. + Choose which version of DWARF debug info to emit. If unsure, + select "Toolchain default". + +config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT + bool "Rely on the toolchain's implicit default DWARF version" + select DEBUG_INFO + help + The implicit default version of DWARF debug info produced by a + toolchain changes over time. + + This can break consumers of the debug info that haven't upgraded to + support newer revisions, and prevent testing newer versions, but + those should be less common scenarios. + +config DEBUG_INFO_DWARF4 + bool "Generate DWARF Version 4 debuginfo" + select DEBUG_INFO + help + Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. 
+ + If you have consumers of DWARF debug info that are not ready for + newer revisions of DWARF, you may wish to choose this or have your + config select this. + +config DEBUG_INFO_DWARF5 + bool "Generate DWARF Version 5 debuginfo" + select DEBUG_INFO + depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) + help + Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc + 5.0+ accepts the -gdwarf-5 flag but only had partial support for some + draft features until 7.0), and gdb 8.0+. + + Changes to the structure of debug info in Version 5 allow for around + 15-18% savings in resulting image and debug info section sizes as + compared to DWARF Version 4. DWARF Version 5 standardizes previous + extensions such as accelerators for symbol indexing and the format + for fission (.dwo/.dwp) files. Users may not want to select this + config if they rely on tooling that has not yet been updated to + support DWARF Version 5. + +config DEBUG_INFO_NONE + bool "Disable debug information" + help + Do not build the kernel with debugging information, which will + result in a faster and smaller build. + +endchoice # "Debug information" if DEBUG_INFO @@ -267,56 +335,12 @@ config DEBUG_INFO_SPLIT to know about the .dwo files and include them. Incompatible with older versions of ccache. -choice - prompt "DWARF version" - help - Which version of DWARF debug info to emit. - -config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT - bool "Rely on the toolchain's implicit default DWARF version" - help - The implicit default version of DWARF debug info produced by a - toolchain changes over time. - - This can break consumers of the debug info that haven't upgraded to - support newer revisions, and prevent testing newer versions, but - those should be less common scenarios. - - If unsure, say Y. - -config DEBUG_INFO_DWARF4 - bool "Generate DWARF Version 4 debuginfo" - help - Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. 
- - If you have consumers of DWARF debug info that are not ready for - newer revisions of DWARF, you may wish to choose this or have your - config select this. - -config DEBUG_INFO_DWARF5 - bool "Generate DWARF Version 5 debuginfo" - depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) - depends on !DEBUG_INFO_BTF - help - Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc - 5.0+ accepts the -gdwarf-5 flag but only had partial support for some - draft features until 7.0), and gdb 8.0+. - - Changes to the structure of debug info in Version 5 allow for around - 15-18% savings in resulting image and debug info section sizes as - compared to DWARF Version 4. DWARF Version 5 standardizes previous - extensions such as accelerators for symbol indexing and the format - for fission (.dwo/.dwp) files. Users may not want to select this - config if they rely on tooling that has not yet been updated to - support DWARF Version 5. - -endchoice # "DWARF version" - config DEBUG_INFO_BTF bool "Generate BTF typeinfo" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL + depends on !DEBUG_INFO_DWARF5 help Generate deduplicated BTF type information from DWARF debug info. Turning this on expects presence of pahole tool, which will convert @@ -585,20 +609,6 @@ source "lib/Kconfig.kcsan" endmenu -config DEBUG_KERNEL - bool "Kernel debugging" - help - Say Y here if you are developing drivers or trying to debug and - identify kernel problems. - -config DEBUG_MISC - bool "Miscellaneous debug code" - default DEBUG_KERNEL - depends on DEBUG_KERNEL - help - Say Y here if you need to enable miscellaneous debug code that should - be under a more specific debug option but isn't. 
- menu "Networking Debugging" source "net/Kconfig.debug" From 99af7fa7230afd2cc457a0ecc0b41e133eee6839 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 15:31:49 +1100 Subject: [PATCH 282/334] Kconfig.debug: make DEBUG_INFO always default=n While trying to make sure CONFIG_DEBUG_INFO wasn't set for COMPILE_TEST, I ordered the choices incorrectly to retain the prior default=n state. Move DEBUG_INFO_NONE to the top so that the default choice is disabled, and remove the "if COMPILE_TEST" as it is now redundant. Link: https://lkml.kernel.org/r/20220128214131.580131-1-keescook@chromium.org Link: https://lore.kernel.org/lkml/YfRY6+CaQxX7O8vF@dev-arch.archlinux-ax161 Reported-by: Nathan Chancellor Signed-off-by: Kees Cook Reviewed-by: Nathan Chancellor Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.debug | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4807637ec89eb..efc1a1908e041 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -234,7 +234,6 @@ config DEBUG_INFO choice prompt "Debug information" depends on DEBUG_KERNEL - default DEBUG_INFO_NONE if COMPILE_TEST help Selecting something other than "None" results in a kernel image that will include debugging info resulting in a larger kernel image. @@ -245,6 +244,12 @@ choice Choose which version of DWARF debug info to emit. If unsure, select "Toolchain default". +config DEBUG_INFO_NONE + bool "Disable debug information" + help + Do not build the kernel with debugging information, which will + result in a faster and smaller build. + config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT bool "Rely on the toolchain's implicit default DWARF version" select DEBUG_INFO @@ -283,12 +288,6 @@ config DEBUG_INFO_DWARF5 config if they rely on tooling that has not yet been updated to support DWARF Version 5. 
-config DEBUG_INFO_NONE - bool "Disable debug information" - help - Do not build the kernel with debugging information, which will - result in a faster and smaller build. - endchoice # "Debug information" if DEBUG_INFO From a2cc81d910a8008081fa04703887d547cfb1eee1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 16 Feb 2022 15:31:50 +1100 Subject: [PATCH 283/334] include: drop pointless __compiler_offsetof indirection (1) compiler_types.h is unconditionally included via an -include flag (see scripts/Makefile.lib), and it defines __compiler_offsetof unconditionally. So testing for definedness of __compiler_offsetof is mostly pointless. (2) Every relevant compiler provides __builtin_offsetof (even sparse has had that for 14 years), and if for whatever reason one would end up picking up the poor man's fallback definition (C file compiler with completely custom CFLAGS?), newer clang versions won't treat the result as an Integer Constant Expression, so if used in place where such is required (static initializer or static_assert), one would get errors like t.c:11:16: error: static_assert expression is not an integral constant expression t.c:11:16: note: cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression t.c:4:33: note: expanded from macro 'offsetof' #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) So just define offsetof unconditionally and directly in terms of __builtin_offsetof. 
Link: https://lkml.kernel.org/r/20220202102147.326672-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Reviewed-by: Kees Cook Acked-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/compiler_types.h | 2 -- include/linux/stddef.h | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3c1795fdb5686..83ee7f7ada5d6 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -137,8 +137,6 @@ struct ftrace_likely_data { */ #define __naked __attribute__((__naked__)) notrace -#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) - /* * Prefer gnu_inline, so that extern inline functions do not emit an * externally visible function. This makes extern inline behave as per gnu89 diff --git a/include/linux/stddef.h b/include/linux/stddef.h index ca507bd5f8082..929d67710cc51 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -13,11 +13,7 @@ enum { }; #undef offsetof -#ifdef __compiler_offsetof -#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER) -#else -#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) -#endif +#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) /** * sizeof_field() - Report the size of a struct field in bytes From 502ea4f6d3937875766d97c4fbeb5b5a580fd421 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 16 Feb 2022 15:31:50 +1100 Subject: [PATCH 284/334] ilog2: force inlining of __ilog2_u32() and __ilog2_u64() Building a kernel with CONFIG_CC_OPTIMIZE_FOR_SIZE leads to __ilog2_u32() being duplicated 50 times and __ilog2_u64() 3 times in vmlinux on a tiny powerpc32 config. __ilog2_u32() being 2 instructions it is not worth being kept out of line, so force inlining. Although the u64 version is a bit bigger, there is still a small benefit in keeping it inlined.
On a 64 bits config there's a real benefit. With this change the size of vmlinux text is reduced by 1 kbyte, which is approx 50% more than the size of the removed functions. Before the patch there is for instance: c00d2a94 <__ilog2_u32>: c00d2a94: 7c 63 00 34 cntlzw r3,r3 c00d2a98: 20 63 00 1f subfic r3,r3,31 c00d2a9c: 4e 80 00 20 blr c00d36d8 <__order_base_2>: c00d36d8: 28 03 00 01 cmplwi r3,1 c00d36dc: 40 81 00 2c ble c00d3708 <__order_base_2+0x30> c00d36e0: 94 21 ff f0 stwu r1,-16(r1) c00d36e4: 7c 08 02 a6 mflr r0 c00d36e8: 38 63 ff ff addi r3,r3,-1 c00d36ec: 90 01 00 14 stw r0,20(r1) c00d36f0: 4b ff f3 a5 bl c00d2a94 <__ilog2_u32> c00d36f4: 80 01 00 14 lwz r0,20(r1) c00d36f8: 38 63 00 01 addi r3,r3,1 c00d36fc: 7c 08 03 a6 mtlr r0 c00d3700: 38 21 00 10 addi r1,r1,16 c00d3704: 4e 80 00 20 blr c00d3708: 38 60 00 00 li r3,0 c00d370c: 4e 80 00 20 blr With the patch it has become: c00d356c <__order_base_2>: c00d356c: 28 03 00 01 cmplwi r3,1 c00d3570: 40 81 00 14 ble c00d3584 <__order_base_2+0x18> c00d3574: 38 63 ff ff addi r3,r3,-1 c00d3578: 7c 63 00 34 cntlzw r3,r3 c00d357c: 20 63 00 20 subfic r3,r3,32 c00d3580: 4e 80 00 20 blr c00d3584: 38 60 00 00 li r3,0 c00d3588: 4e 80 00 20 blr No more need for __order_base_2() to set up a stack frame and save/restore caller address. And the following 'add 1' is merged in the subtract. Another typical use of it: c080ff28 : ... c080fff8: 7f c3 f3 78 mr r3,r30 c080fffc: 4b 8f 81 f1 bl c01081ec <__ilog2_u32> c0810000: 38 63 ff f2 addi r3,r3,-14 ... Becomes c080ff1c : ... c080ffec: 7f c3 00 34 cntlzw r3,r30 c080fff0: 20 63 00 11 subfic r3,r3,17 ... Here no need to move r30 argument to r3 then subtract 14 from the result. Just work on r30 and merge the 'sub 14' with the 'sub from 31'.
Link: https://lkml.kernel.org/r/803a2ac3d923ebcfd0dd40f5886b05cae7bb0aba.1644243860.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/log2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/log2.h b/include/linux/log2.h index df0b155c21417..9f30d087a1281 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -18,7 +18,7 @@ * - the arch is not required to handle n==0 if implementing the fallback */ #ifndef CONFIG_ARCH_HAS_ILOG2_U32 -static inline __attribute__((const)) +static __always_inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; @@ -26,7 +26,7 @@ int __ilog2_u32(u32 n) #endif #ifndef CONFIG_ARCH_HAS_ILOG2_U64 -static inline __attribute__((const)) +static __always_inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; From 2d25ec866de36a5225b6e6310089c69e1c030785 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Feb 2022 15:31:51 +1100 Subject: [PATCH 285/334] bitfield: add explicit inclusions to the example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not obvious that bitfield.h doesn't guarantee the bits.h inclusion and the example in the former is confusing. Some developers think that it's okay to just include bitfield.h to get it working. Change example to explicitly include necessary headers in order to avoid confusion. 
Link: https://lkml.kernel.org/r/20220207123341.47533-1-andriy.shevchenko@linux.intel.com Fixes: 3e9b3112ec74 ("add basic register-field manipulation macros") Depends-on: 8bd9cb51daac ("locking/atomics, asm-generic: Move some macros from <linux/bitops.h> to a new <linux/bits.h> file") Signed-off-by: Andy Shevchenko Reported-by: Jan Dąbroś Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/bitfield.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 6093fa6db2600..c9be1657f03d9 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -19,6 +19,9 @@ * * Example: * + * #include <linux/bitfield.h> + * #include <linux/bits.h> + * * #define REG_FIELD_A GENMASK(6, 0) * #define REG_FIELD_B BIT(7) * #define REG_FIELD_C GENMASK(15, 8) From 57b104674a86bd4b1c8584560f6bd66bb496a512 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 16 Feb 2022 15:31:52 +1100 Subject: [PATCH 286/334] lz4: fix LZ4_decompress_safe_partial read out of bound When partialDecoding, it is EOF if we've either filled the output buffer or can't proceed with reading an offset for following match. In some extreme corner cases when compressed data is corrupted, UAF will occur. As reported by KASAN [1], LZ4_decompress_safe_partial may lead to read out of bound problem during decoding. lz4 upstream has fixed it [2] and this issue has been discussed here [3] before. current decompression routine was ported from lz4 v1.8.3, bumping lib/lz4 to v1.9.+ is certainly a huge work to be done later, so, we'd better fix it first.
[1] https://lore.kernel.org/all/000000000000830d1205cf7f0477@google.com/ [2] https://github.com/lz4/lz4/commit/c5d6f8a8be3927c0bec91bcc58667a6cfad244ad# [3] https://lore.kernel.org/all/CC666AE8-4CA4-4951-B6FB-A2EFDE3AC03B@fb.com/ Link: https://lkml.kernel.org/r/20211111105048.2006070-1-guoxuenan@huawei.com Reported-by: syzbot+63d688f1d899c588fb71@syzkaller.appspotmail.com Signed-off-by: Guo Xuenan Reviewed-by: Nick Terrell Cc: Gao Xiang Cc: Yann Collet Cc: Chengyang Fan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/lz4/lz4_decompress.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 926f4823d5eac..fd1728d94babb 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -271,8 +271,12 @@ static FORCE_INLINE int LZ4_decompress_generic( ip += length; op += length; - /* Necessarily EOF, due to parsing restrictions */ - if (!partialDecoding || (cpy == oend)) + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. + */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) break; } else { /* may overwrite up to WILDCOPYLENGTH beyond cpy */ From 078d8c5d5b0c7de9332fd5fae64de38b3ca00f36 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 16 Feb 2022 15:31:53 +1100 Subject: [PATCH 287/334] checkpatch: prefer MODULE_LICENSE("GPL") over MODULE_LICENSE("GPL v2") There is no effective difference. Given the large number of uses of "GPL v2", emit this message only for patches as a trivial treewide sed could be done one day. Ref: commit bf7fbeeae6db ("module: Cure the MODULE_LICENSE "GPL" vs. 
"GPL v2" bogosity") Link: https://lkml.kernel.org/r/20220128185924.80137-1-joe@perches.com Signed-off-by: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b01c36a15d9dd..b7c181ea0ac56 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -7418,6 +7418,13 @@ sub process { WARN("MODULE_LICENSE", "unknown module license " . $extracted_string . "\n" . $herecurr); } + if (!$file && $extracted_string eq '"GPL v2"') { + if (WARN("MODULE_LICENSE", + "Prefer \"GPL\" over \"GPL v2\" - see commit bf7fbeeae6db (\"module: Cure the MODULE_LICENSE \"GPL\" vs. \"GPL v2\" bogosity\")\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\bMODULE_LICENSE\s*\(\s*"GPL v2"\s*\)/MODULE_LICENSE("GPL")/; + } + } } # check for sysctl duplicate constants From 01587a2f1ff9b34275553bf2d08c44b62c9256ae Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 16 Feb 2022 15:31:53 +1100 Subject: [PATCH 288/334] checkpatch: add --fix option for some TRAILING_STATEMENTS Single line code like: if (foo) bar; should generally be written: if (foo) bar; Add a --fix test to do so. This fix is not done when an ASSIGN_IN_IF in the same line exists. 
Link: https://lkml.kernel.org/r/20220128185924.80137-2-joe@perches.com Signed-off-by: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b7c181ea0ac56..046a018093a7b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5551,6 +5551,7 @@ sub process { defined($stat) && defined($cond) && $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) { my ($s, $c) = ($stat, $cond); + my $fixed_assign_in_if = 0; if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/s) { if (ERROR("ASSIGN_IN_IF", @@ -5575,6 +5576,7 @@ sub process { $newline .= ')'; $newline .= " {" if (defined($brace)); fix_insert_line($fixlinenr + 1, $newline); + $fixed_assign_in_if = 1; } } } @@ -5598,8 +5600,20 @@ sub process { $stat_real = "[...]\n$stat_real"; } - ERROR("TRAILING_STATEMENTS", - "trailing statements should be on next line\n" . $herecurr . $stat_real); + if (ERROR("TRAILING_STATEMENTS", + "trailing statements should be on next line\n" . $herecurr . $stat_real) && + !$fixed_assign_in_if && + $cond_lines == 0 && + $fix && $perl_version_ok && + $fixed[$fixlinenr] =~ /^\+(\s*)((?:if|while|for)\s*$balanced_parens)\s*(.*)$/) { + my $indent = $1; + my $test = $2; + my $rest = rtrim($4); + if ($rest =~ /;$/) { + $fixed[$fixlinenr] = "\+$indent$test"; + fix_insert_line($fixlinenr + 1, "$indent\t$rest"); + } + } } } From 9fcbd52aaaad3b726b4d8fb4576ad67678bbf19a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 16 Feb 2022 15:31:54 +1100 Subject: [PATCH 289/334] checkpatch: add early_param exception to blank line after struct/function test Add early_param as another exception to the blank line preferred after function/struct/union declaration or definition test. 
Link: https://lkml.kernel.org/r/3bd6ada59f411a7685d7e64eeb670540d4bfdcde.camel@perches.com Signed-off-by: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 046a018093a7b..2653177f52d90 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3926,7 +3926,7 @@ sub process { if ($prevline =~ /^[\+ ]};?\s*$/ && $line =~ /^\+/ && !($line =~ /^\+\s*$/ || - $line =~ /^\+\s*EXPORT_SYMBOL/ || + $line =~ /^\+\s*(?:EXPORT_SYMBOL|early_param)/ || $line =~ /^\+\s*MODULE_/i || $line =~ /^\+\s*\#\s*(?:end|elif|else)/ || $line =~ /^\+[a-z_]*init/ || From 6db28f9cf7e1776ab9d65ddcbe238a0c40f4fc76 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 16 Feb 2022 15:31:55 +1100 Subject: [PATCH 290/334] scripts/checkpatch.pl: remove _deferred and _deferred_once false warning With commit 98e35f5894cf ("printk: git rid of [sched_delayed] message for printk_deferred") printk_deferred and printk_deferred_once require LOGLEVEL in argument, but checkpatch.pl was not fixed and still reports it as warning: WARNING: Possible unnecessary KERN_ALERT printk_deferred(KERN_ALERT "checking deferred "); As suggested by Andy, made 2 functions from logFunction. 1. logFunction: with all checks 2. logFunctionCore: without printk(?:_ratelimited|_once|_deferred) checking and call logFunctionCore instead of logFunction for checking of loglevel, which will exclude checking of printk(?:_ratelimited|_once|_deferred). This way, there is no need to maintain same stanza at multiple places for removing printk flavours. 
Link: https://lkml.kernel.org/r/20220202103309.1914992-1-maninder1.s@samsung.com Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: Markus Trippelsdorf Cc: Jan Kara Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- scripts/checkpatch.pl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2653177f52d90..727d23b934da2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -584,8 +584,7 @@ sub hash_show_words { our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; -our $logFunctions = qr{(?x: - printk(?:_ratelimited|_once|_deferred_once|_deferred|)| +our $logFunctionsCore = qr{(?x: (?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| TP_printk| WARN(?:_RATELIMIT|_ONCE|)| @@ -594,6 +593,11 @@ sub hash_show_words { seq_vprintf|seq_printf|seq_puts )}; +our $logFunctions = qr{(?x: + printk(?:_ratelimited|_once|_deferred_once|_deferred|)| + $logFunctionsCore +)}; + our $allocFunctions = qr{(?x: (?:(?:devm_)? (?:kv|k|v)[czm]alloc(?:_array)?(?:_node)? | @@ -6312,8 +6316,7 @@ sub process { } # check for logging functions with KERN_ - if ($line !~ /printk(?:_ratelimited|_once)?\s*\(/ && - $line =~ /\b$logFunctions\s*\(.*\b(KERN_[A-Z]+)\b/) { + if ($line =~ /\b$logFunctionsCore\s*\(.*\b(KERN_[A-Z]+)\b/) { my $level = $1; if (WARN("UNNECESSARY_KERN_LEVEL", "Possible unnecessary $level\n" . $herecurr) && From 1cec8d8bcb8dcc7748a2c914cd60d503d043f06f Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Wed, 16 Feb 2022 15:31:55 +1100 Subject: [PATCH 291/334] fs/binfmt_elf: fix AT_PHDR for unusual ELF files Patch series "fs/binfmt_elf: Fix AT_PHDR for unusual ELF files", v4. These patches fix a bug in AT_PHDR calculation. 
We cannot calculate AT_PHDR as the sum of load_addr and exec->e_phoff. This is because exec->e_phoff is the offset of PHDRs in the file and the address of PHDRs in the memory may differ from it. These patches fix the bug by calculating the address of program headers from PT_LOADs directly. This patch (of 2): As pointed out in the bugzilla discussion, we cannot calculate AT_PHDR as the sum of load_addr and exec->e_phoff. : The AT_PHDR of ELF auxiliary vectors should point to the memory address : of program header. But binfmt_elf.c calculates this address as follows: : : NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); : : which is wrong since e_phoff is the file offset of program header and : load_addr is the memory base address from PT_LOAD entry. : : The ld.so uses AT_PHDR as the memory address of program header. In normal : case, since the e_phoff is usually 64 and in the first PT_LOAD region, it : is the correct program header address. : : But if the address of program header isn't equal to the first PT_LOAD : address + e_phoff (e.g. Put the program header in other non-consecutive : PT_LOAD region), ld.so will try to read program header from wrong address : then crash or use incorrect program header. This is because exec->e_phoff is the offset of PHDRs in the file and the address of PHDRs in the memory may differ from it. This patch fixes the bug by calculating the address of program headers from PT_LOADs directly. 
Link: https://lkml.kernel.org/r/20211212232414.1402199-1-akirakawata1@gmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=197921 Link: https://lkml.kernel.org/r/20211212232414.1402199-2-akirakawata1@gmail.com Signed-off-by: Akira Kawata Reported-by: kernel test robot Acked-by: Kees Cook Cc: Alexey Dobriyan Cc: Al Viro Cc: Lukas Bulwahn Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9e11e6f13e83a..db989b6084191 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -170,8 +170,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -823,7 +823,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; + unsigned long load_addr, load_bias = 0, phdr_addr = 0; int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; @@ -1169,6 +1169,13 @@ static int load_elf_binary(struct linux_binprm *bprm) reloc_func_desc = load_bias; } } + + if 
(elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1204,6 +1211,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1267,8 +1275,8 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; From bb8c880fd7e37930856685a150be925c88299bea Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Wed, 16 Feb 2022 15:31:56 +1100 Subject: [PATCH 292/334] fs-binfmt_elf-fix-at_phdr-for-unusual-elf-files-v5 add comment per Kees Link: https://lkml.kernel.org/r/20220127124014.338760-2-akirakawata1@gmail.com Signed-off-by: Akira Kawata Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index db989b6084191..d6a50f05b1c04 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1170,6 +1170,10 @@ static int load_elf_binary(struct linux_binprm *bprm) } } + /* + * Figure out which segment in the file contains the Program + * Header table, and map to the associated memory address. 
+ */ if (elf_ppnt->p_offset <= elf_ex->e_phoff && elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + From 424807d5b4b901342ac37c80e057ef9e5475f002 Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Wed, 16 Feb 2022 15:31:57 +1100 Subject: [PATCH 293/334] fs/binfmt_elf: refactor load_elf_binary function I delete load_addr because it is not used anymore. And I rename load_addr_set to first_pt_load because it is used only to capture the first iteration of the loop. Link: https://lkml.kernel.org/r/20211212232414.1402199-3-akirakawata1@gmail.com Signed-off-by: Akira Kawata Acked-by: Kees Cook Cc: Alexey Dobriyan Cc: Al Viro Cc: Eric Biederman Cc: kernel test robot Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d6a50f05b1c04..4c02ff026d9cd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -823,8 +823,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr, load_bias = 0, phdr_addr = 0; - int load_addr_set = 0; + unsigned long load_bias = 0, phdr_addr = 0; + int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; @@ -1074,12 +1074,12 @@ static int load_elf_binary(struct linux_binprm *bprm) vaddr = elf_ppnt->p_vaddr; /* - * The first time through the loop, load_addr_set is false: + * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. 
*/ - if (load_addr_set) { + if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* @@ -1139,10 +1139,10 @@ static int load_elf_binary(struct linux_binprm *bprm) /* * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the + * (Note that first_pt_load is set to false later once the * initial mapping is performed.) */ - if (!load_addr_set) { + if (first_pt_load) { total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { @@ -1159,13 +1159,11 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out_free_dentry; } - if (!load_addr_set) { - load_addr_set = 1; - load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (first_pt_load) { + first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += load_bias; reloc_func_desc = load_bias; } } From 5fab2bc891e354cc34c0a6a87e598ef102dd34c3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 16 Feb 2022 15:31:58 +1100 Subject: [PATCH 294/334] ELF: fix overflow in total mapping size calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel assumes that ELF program headers are ordered by mapping address, but doesn't enforce it. It is possible to make mapping size extremely huge by simply shuffling first and last PT_LOAD segments. As long as PT_LOAD segments do not overlap, it is silly to require sorting by v_addr anyway because mmap() doesn't care. Don't assume PT_LOAD segments are sorted and calculate min and max addresses correctly. 
Link: https://lore.kernel.org/all/YVmd7D0M6G/DcP4O@localhost.localdomain/ Signed-off-by: Alexey Dobriyan Tested-by: Magnus Groß Cc: Kees Cook Cc: Alexander Viro Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4c02ff026d9cd..e297682e2c713 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) @@ -399,22 +399,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { - int i, first_idx = -1, last_idx = -1; + elf_addr_t min_addr = -1; + elf_addr_t max_addr = 0; + bool pt_load = false; + int i; for (i = 0; i < nr; i++) { - if (cmds[i].p_type == PT_LOAD) { - last_idx = i; - if (first_idx == -1) - first_idx = i; + if (phdr[i].p_type == PT_LOAD) { + min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); + max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); + pt_load = true; } } - if (first_idx == -1) - return 0; - - return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - - ELF_PAGESTART(cmds[first_idx].p_vaddr); + return pt_load ? 
(max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) From 8f436766b29987fe9b7617696b54464d323ffc09 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 16 Feb 2022 15:31:58 +1100 Subject: [PATCH 295/334] binfmt: move more stuff undef CONFIG_COREDUMP struct linux_binfmt::core_dump and struct min_coredump::min_coredump are used under CONFIG_COREDUMP only. Shrink those embedded configs a bit. Link: https://lkml.kernel.org/r/YglbIFyN+OtwVyjW@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/binfmt_elf.c | 2 ++ fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 2 ++ include/linux/binfmts.h | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e297682e2c713..4628a928e4ee7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, .load_shlib = load_elf_library, +#ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c6f588dc4a9db..7fa6e6632d9df 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = { .load_binary = load_elf_fdpic_binary, #ifdef CONFIG_ELF_CORE .core_dump = elf_fdpic_core_dump, -#endif .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; static int __init init_elf_fdpic_binfmt(void) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d776f80ee50c..5f0bf24bb3b85 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -102,8 +102,10 @@ static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, +#ifdef CONFIG_COREDUMP .core_dump = flat_core_dump, 
.min_coredump = PAGE_SIZE +#endif }; /****************************************************************************/ diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 049cf9421d831..5d651c219c99b 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -98,8 +98,10 @@ struct linux_binfmt { struct module *module; int (*load_binary)(struct linux_binprm *); int (*load_shlib)(struct file *); +#ifdef CONFIG_COREDUMP int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ +#endif } __randomize_layout; extern void __register_binfmt(struct linux_binfmt *fmt, int insert); From e989d93ae77ef7685f286936a7a0ef5f89d68f7e Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 16 Feb 2022 15:31:59 +1100 Subject: [PATCH 296/334] kallsyms: print module name in %ps/S case when KALLSYMS is disabled original: With KALLSYMS %pS %ps [16.4200] hello_init+0x0/0x24 [crash] hello_init [crash] Without KALLSYMS: [16.2200] 0xbe200040 0xbe200040 With Patch (Without KALLSYMS:) load address + current offset [Module Name] [13.5993] 0xbe200000+0x40 [crash] 0xbe200000+0x40 [crash] It will help in better debugging and checking when KALLSYMS is disabled, user will get information about module name and load address of module. verified for arm64: / # insmod /crash.ko [ 19.263556] 0xffff800000ec0000+0x38 [crash] .. 
[ 19.276023] Call trace: [ 19.276277] 0xffff800000ec0000+0x28 [crash] [ 19.276567] 0xffff800000ec0000+0x58 [crash] [ 19.276727] 0xffff800000ec0000+0x74 [crash] [ 19.276866] 0xffff8000080127d0 [ 19.276978] 0xffff80000812d95c [ 19.277085] 0xffff80000812f554 Link: https://lkml.kernel.org/r/20220201040044.1528568-1-maninder1.s@samsung.com Signed-off-by: Vaneet Narang Co-developed-by: Vaneet Narang Signed-off-by: Maninder Singh Cc: Petr Mladek Cc: Steven Rostedt (Google) Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Kefeng Wang Cc: Miroslav Benes Cc: Stephen Boyd Cc: Miguel Ojeda Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kallsyms.h | 27 +++++++++++++++++++++++++++ lib/vsprintf.c | 5 +++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 4176c7eca7b5a..b1308678c26c5 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -163,6 +163,33 @@ static inline bool kallsyms_show_value(const struct cred *cred) return false; } +#ifdef CONFIG_MODULES +static inline int fill_minimal_module_info(char *sym, int size, unsigned long value) +{ + struct module *mod; + unsigned long offset; + int ret = 0; + + preempt_disable(); + mod = __module_address(value); + if (mod) { + offset = value - (unsigned long)mod->core_layout.base; + snprintf(sym, size - 1, "0x%lx+0x%lx [%s]", + (unsigned long)mod->core_layout.base, offset, mod->name); + + sym[size - 1] = '\0'; + ret = 1; + } + + preempt_enable(); + return ret; +} +#else +static inline int fill_minimal_module_info(char *sym, int size, unsigned long value) +{ + return 0; +} +#endif /*CONFIG_MODULES*/ #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d419154b47bb8..d271172d6ed54 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -984,9 +984,7 @@ char 
*symbol_string(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { unsigned long value; -#ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; -#endif if (fmt[1] == 'R') ptr = __builtin_extract_return_addr(ptr); @@ -1006,6 +1004,9 @@ char *symbol_string(char *buf, char *end, void *ptr, return string_nocheck(buf, end, sym, spec); #else + if (fill_minimal_module_info(sym, KSYM_SYMBOL_LEN, value)) + return string_nocheck(buf, end, sym, spec); + return special_hex_number(buf, end, value, sizeof(void *)); #endif } From c3964fd3bbbf955aab10ad0a8817ad0152b20d06 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Wed, 16 Feb 2022 15:31:59 +1100 Subject: [PATCH 297/334] init: use ktime_us_delta() to make initcall_debug log more precise Use ktime_us_delta() to make the initcall_debug log more precise than right shifting the result of ktime_to_ns() by 10 bits. Link: https://lkml.kernel.org/r/20220209053350.15771-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Reviewed-by: Andrew Halaney Tested-by: Andrew Halaney Cc: Steven Rostedt Cc: Matthias Brugger Cc: Masami Hiramatsu Cc: Vlastimil Babka Cc: Kefeng Wang Cc: Rasmus Villemoes Cc: Kees Cook Cc: Valentin Schneider Cc: Peter Zijlstra Cc: YJ Chiang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- init/main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/init/main.c b/init/main.c index 65fa2e41a9c09..c8edcc3029b18 100644 --- a/init/main.c +++ b/init/main.c @@ -1246,15 +1246,11 @@ trace_initcall_start_cb(void *data, initcall_t fn) static __init_or_module void trace_initcall_finish_cb(void *data, initcall_t fn, int ret) { - ktime_t *calltime = (ktime_t *)data; - ktime_t delta, rettime; - unsigned long long duration; + ktime_t rettime, *calltime = (ktime_t *)data; rettime = ktime_get(); - delta = ktime_sub(rettime, *calltime); - duration = (unsigned long long) ktime_to_ns(delta) >> 10; printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", - fn, ret, 
duration); + fn, ret, (unsigned long long)ktime_us_delta(rettime, *calltime)); } static ktime_t initcall_calltime; From 20839a2a6a01479f4c5ad4549be03b89d476e4d0 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Wed, 16 Feb 2022 15:32:00 +1100 Subject: [PATCH 298/334] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index c8edcc3029b18..852a3f09d21a7 100644 --- a/init/main.c +++ b/init/main.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -230,13 +230,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -473,7 +473,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL, NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but 
CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -502,7 +502,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -527,7 +528,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -727,7 +729,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1348,8 +1351,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1486,7 +1491,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From eb5ec433aabde7580bcde0cb4c73152cf0f5dbe9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 16 Feb 2022 15:32:01 +1100 Subject: [PATCH 299/334] fs/pipe: use kvcalloc to allocate a pipe_buffer array Right now, kcalloc is used to allocate a pipe_buffer array. The size of the pipe_buffer struct is 40 bytes. kcalloc allows allocating reliably chunks with sizes less or equal to PAGE_ALLOC_COSTLY_ORDER (3). 
It means that the maximum pipe size is 3.2MB in this case. In CRIU, we use pipes to dump process memory. CRIU freezes a target process, injects parasite code into it and then this code splices memory into pipes. If the maximum pipe size is small, we need to do many iterations or create many pipes. kvcalloc attempts to allocate physically contiguous memory, but upon failure, falls back to non-contiguous (vmalloc) allocation and so it isn't limited by PAGE_ALLOC_COSTLY_ORDER. The maximum pipe size for non-root users is limited by the /proc/sys/fs/pipe-max-size sysctl that is 1MB by default, so only the root user will be able to trigger vmalloc allocations. Link: https://lkml.kernel.org/r/20220104171058.22580-1-avagin@gmail.com Signed-off-by: Andrei Vagin Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/pipe.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index cc28623a67b61..3e3413a4ccc29 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -803,7 +803,7 @@ struct pipe_inode_info *alloc_pipe_info(void) if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; - pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { @@ -846,7 +846,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) } if (pipe->tmp_page) __free_page(pipe->tmp_page); - kfree(pipe->bufs); + kvfree(pipe->bufs); kfree(pipe); } @@ -1261,8 +1261,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) if (nr_slots < n) return -EBUSY; - bufs = kcalloc(nr_slots, sizeof(*bufs), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT); if (unlikely(!bufs)) return -ENOMEM; @@ -1289,7 +1288,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) head = n;
tail = 0; - kfree(pipe->bufs); + kvfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) From d4ff19f718ec57301bd4cd438359a04d4daad124 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 16 Feb 2022 15:32:01 +1100 Subject: [PATCH 300/334] fs/pipe.c: local vars have to match types of proper pipe_inode_info fields head, tail, ring_size are declared as unsigned int, so all local variables that operate with these fields have to be unsigned to avoid signed integer overflow. Right now, it isn't an issue because the maximum pipe size is limited by 1U<<31. Link: https://lkml.kernel.org/r/20220106171946.36128-1-avagin@gmail.com Signed-off-by: Andrei Vagin Suggested-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/pipe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 3e3413a4ccc29..71946832e33f9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -606,7 +606,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, head, tail, mask; + unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: @@ -828,7 +828,7 @@ struct pipe_inode_info *alloc_pipe_info(void) void free_pipe_info(struct pipe_inode_info *pipe) { - int i; + unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) { From 5af1a2f58f2160912d06863f3e09593c22cbd111 Mon Sep 17 00:00:00 2001 From: Qinghua Jin Date: Wed, 16 Feb 2022 15:32:02 +1100 Subject: [PATCH 301/334] minix: fix bug when opening a file with O_DIRECT Testcase: 1. create a minix file system and mount it 2. open a file on the file system with O_RDWR|O_CREAT|O_TRUNC|O_DIRECT 3. open fails with -EINVAL but leaves an empty file behind. All other open() failures don't leave the failed open files behind. 
It is hard to check the direct_IO op before creating the inode. Just as ext4 and btrfs do, this patch resolves the issue by allowing the file to be created with O_DIRECT but returning an error when writing to it. Link: https://lkml.kernel.org/r/20220107133626.413379-1-qhjin.dev@gmail.com Signed-off-by: Qinghua Jin Reported-by: Colin Ian King Reviewed-by: Jan Kara Acked-by: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/minix/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a71f1cf894b9f..d4bd94234ef73 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { From 198aec7283725010cc09bc567852bc17b62a7f0d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 16 Feb 2022 15:32:02 +1100 Subject: [PATCH 302/334] fat: use pointer to simple type in put_user() The put_user(val,ptr) macro wants a pointer to a simple type, but in fat_ioctl_filldir() the d_name field references an "array of chars". Be more accurate and explicitly give the pointer to the first character of the d_name[] array. I noticed that issue while trying to optimize the parisc put_user() macro and used an intermediate variable to store the pointer.
In that case I got this error: In file included from include/linux/uaccess.h:11, from include/linux/compat.h:17, from fs/fat/dir.c:18: fs/fat/dir.c: In function `fat_ioctl_filldir': fs/fat/dir.c:725:33: error: invalid initializer 725 | if (put_user(0, d2->d_name) || \ | ^~ include/asm/uaccess.h:152:33: note: in definition of macro `__put_user' 152 | __typeof__(ptr) __ptr = ptr; \ | ^~~ fs/fat/dir.c:759:1: note: in expansion of macro `FAT_IOCTL_FILLDIR_FUNC' 759 | FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) Andreas Schwab suggested to use __typeof__(&*(ptr)) __ptr = ptr; instead. This works, but nevertheless it's probably reasonable to fix the original caller too. Link: https://lkml.kernel.org/r/Ygo+A9MREmC1H3kr@p100 Signed-off-by: Helge Deller Acked-by: OGAWA Hirofumi Cc: David Laight Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/fat/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index c4a2742858587..249825017da75 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -722,7 +722,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ if (name_len >= sizeof(d1->d_name)) \ name_len = sizeof(d1->d_name) - 1; \ \ - if (put_user(0, d2->d_name) || \ + if (put_user(0, &d2->d_name[0]) || \ put_user(0, &d2->d_reclen) || \ copy_to_user(d1->d_name, name, name_len) || \ put_user(0, d1->d_name + name_len) || \ From 6d55a8934343946a4b044f249791b1a0060954d2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 15:32:03 +1100 Subject: [PATCH 303/334] exec: force single empty string when argv is empty Quoting[1] Ariadne Conill: "In several other operating systems, it is a hard requirement that the second argument to execve(2) be the name of a program, thus prohibiting a scenario where argc < 1. 
POSIX 2017 also recommends this behaviour, but it is not an explicit requirement[2]: The argument arg0 should point to a filename string that is associated with the process being started by one of the exec functions. ... Interestingly, Michael Kerrisk opened an issue about this in 2008[3], but there was no consensus to support fixing this issue then. Hopefully now that CVE-2021-4034 shows practical exploitative use[4] of this bug in a shellcode, we can reconsider. This issue is being tracked in the KSPP issue tracker[5]." While the initial code searches[6][7] turned up what appeared to be mostly corner case tests, trying to just reject argv == NULL (or an immediately terminated pointer list) quickly started tripping[8] existing userspace programs. The next best approach is forcing a single empty string into argv and adjusting argc to match. The number of programs depending on argc == 0 seems a smaller set than those calling execve with a NULL argv. Account for the additional stack space in bprm_stack_limits(). Inject an empty string when argc == 0 (and set argc = 1). Warn about the case so userspace has some notice about the change: process './argc0' launched './argc0' with NULL argv: empty string added Additionally WARN() and reject NULL argv usage for kernel threads.
[1] https://lore.kernel.org/lkml/20220127000724.15106-1-ariadne@dereferenced.org/ [2] https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html [3] https://bugzilla.kernel.org/show_bug.cgi?id=8408 [4] https://www.qualys.com/2022/01/25/cve-2021-4034/pwnkit.txt [5] https://github.com/KSPP/linux/issues/176 [6] https://codesearch.debian.net/search?q=execve%5C+*%5C%28%5B%5E%2C%5D%2B%2C+*NULL&literal=0 [7] https://codesearch.debian.net/search?q=execlp%3F%5Cs*%5C%28%5B%5E%2C%5D%2B%2C%5Cs*NULL&literal=0 [8] https://lore.kernel.org/lkml/20220131144352.GE16385@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20220201000947.2453721-1-keescook@chromium.org Signed-off-by: Kees Cook Reported-by: Ariadne Conill Reported-by: Michael Kerrisk Acked-by: Ariadne Conill Acked-by: Andy Lutomirski Acked-by: Christian Brauner Cc: Matthew Wilcox Cc: Rich Felker Cc: Eric Biederman Cc: Alexander Viro Cc: Shuah Khan Cc: Yang Yingliang Cc: Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/exec.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 79f2c9483302d..bbf3aadf7ce1d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -495,8 +495,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. + * + * In the case of argc = 0, make sure there is space for adding a + * empty string (which will bump argc to 1), to ensure confused + * userspace programs don't start processing from argv[1], thinking + * argc can never be 0, to keep them from walking envp by accident. + * See do_execveat_common(). 
*/ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + ptr_size = (min(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; @@ -1897,6 +1903,9 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; @@ -1923,6 +1932,19 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out_free; + /* + * When argv is empty, add an empty string ("") as argv[0] to + * ensure confused userspace programs that start processing + * from argv[1] won't end up walking envp. See also + * bprm_stack_limits(). + */ + if (bprm->argc == 0) { + retval = copy_string_kernel("", bprm); + if (retval < 0) + goto out_free; + bprm->argc = 1; + } + retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); @@ -1951,6 +1973,8 @@ int kernel_execve(const char *kernel_filename, } retval = count_strings_kernel(argv); + if (WARN_ON_ONCE(retval == 0)) + retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; From f5e4c540708a2c4c7c13bb8b0f6ae8ee1dfa121c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 15:32:04 +1100 Subject: [PATCH 304/334] exec: Fix min/max typo in stack space calculation When handling the argc == 0 case, the stack space calculation should be using max() not min(). 
Link: https://lkml.kernel.org/r/20220201190700.3147041-1-keescook@chromium.org Signed-off-by: Kees Cook Cc: Michael Kerrisk Cc: Ariadne Conill Cc: Andy Lutomirski Cc: Matthew Wilcox Cc: Christian Brauner Cc: Rich Felker Cc: Eric Biederman Cc: Alexander Viro Cc: Shuah Khan Cc: Yang Yingliang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index bbf3aadf7ce1d..40b1008fb0f79 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -502,7 +502,7 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ - ptr_size = (min(bprm->argc, 1) + bprm->envc) * sizeof(void *); + ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; From 7bc30d556cbc3db4660840d59d0e2c0bcd37992b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 15:32:05 +1100 Subject: [PATCH 305/334] selftests/exec: test for empty string on NULL argv Test for the NULL argv argument producing a single empty string on exec. 
Link: https://lkml.kernel.org/r/20220201011637.2457646-1-keescook@chromium.org Signed-off-by: Kees Cook Cc: Eric Biederman Cc: Shuah Khan Cc: Yang Yingliang Cc: Alexander Viro Cc: Ariadne Conill Cc: Christian Brauner Cc: Matthew Wilcox Cc: Michael Kerrisk Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/exec/Makefile | 1 + tools/testing/selftests/exec/null-argv.c | 78 ++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 tools/testing/selftests/exec/null-argv.c diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 12c5e27d32c16..551affb437fe1 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -10,6 +10,7 @@ TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir TEST_FILES := Makefile TEST_GEN_PROGS += recursion-depth +TEST_GEN_PROGS += null-argv EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* \ $(OUTPUT)/S_I*.test diff --git a/tools/testing/selftests/exec/null-argv.c b/tools/testing/selftests/exec/null-argv.c new file mode 100644 index 0000000000000..c19726e710d19 --- /dev/null +++ b/tools/testing/selftests/exec/null-argv.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test that empty argvs are swapped out for a single empty string. 
*/ +#include +#include +#include +#include + +#include "../kselftest.h" + +#define FORK(exec) \ +do { \ + pid = fork(); \ + if (pid == 0) { \ + /* Child */ \ + exec; /* Some kind of exec */ \ + perror("# " #exec); \ + return 1; \ + } \ + check_result(pid, #exec); \ +} while (0) + +void check_result(pid_t pid, const char *msg) +{ + int wstatus; + + if (pid == (pid_t)-1) { + perror("# fork"); + ksft_test_result_fail("fork failed: %s\n", msg); + return; + } + if (waitpid(pid, &wstatus, 0) < 0) { + perror("# waitpid"); + ksft_test_result_fail("waitpid failed: %s\n", msg); + return; + } + if (!WIFEXITED(wstatus)) { + ksft_test_result_fail("child did not exit: %s\n", msg); + return; + } + if (WEXITSTATUS(wstatus) != 0) { + ksft_test_result_fail("non-zero exit: %s\n", msg); + return; + } + ksft_test_result_pass("%s\n", msg); +} + +int main(int argc, char *argv[], char *envp[]) +{ + pid_t pid; + static char * const args[] = { NULL }; + static char * const str[] = { "", NULL }; + + /* argc counting checks */ + if (argc < 1) { + fprintf(stderr, "# FAIL: saw argc == 0 (old kernel?)\n"); + return 1; + } + if (argc != 1) { + fprintf(stderr, "# FAIL: unknown argc (%d)\n", argc); + return 1; + } + if (argv[0][0] == '\0') { + /* Good, we found a NULL terminated string at argv[0]! */ + return 0; + } + + /* Test runner. */ + ksft_print_header(); + ksft_set_plan(5); + + FORK(execve(argv[0], str, NULL)); + FORK(execve(argv[0], NULL, NULL)); + FORK(execve(argv[0], NULL, envp)); + FORK(execve(argv[0], args, NULL)); + FORK(execve(argv[0], args, envp)); + + ksft_exit(ksft_cnt.ksft_pass == ksft_plan); +} From 1b4b859ac0fe63f19302ba94d321ea74732a08b0 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 16 Feb 2022 15:32:06 +1100 Subject: [PATCH 306/334] kexec: make crashk_res, crashk_low_res and crash_notes symbols always visible Patch series "kexec: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef", v2. 
Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. I only modified x86, arm, arm64 and riscv, other architectures such as sh, powerpc and s390 are better to be kept kexec code as-is so they are not touched. This patch (of 5): Make the forward declarations of crashk_res, crashk_low_res and crash_notes always visible. Code referring to these symbols can then just check for IS_ENABLED(CONFIG_KEXEC_CORE), instead of requiring conditional compilation using an #ifdef, thus preparing to increase compile coverage and simplify the code. Link: https://lkml.kernel.org/r/20211206160514.2000-1-jszhang@kernel.org Link: https://lkml.kernel.org/r/20211206160514.2000-2-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Baoquan He Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Eric W. Biederman Cc: Alexandre Ghiti Cc: Palmer Dabbelt Cc: Russell King (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/kexec.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0c994ae37729e..58d1b58a971e3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -20,6 +20,12 @@ #include +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; +extern note_buf_t __percpu *crash_notes; + #ifdef CONFIG_KEXEC_CORE #include #include @@ -350,12 +356,6 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -/* Location of a reserved region to hold the crash kernel. 
- */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; -extern note_buf_t __percpu *crash_notes; - /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; From 1d8b2a77e6ce6fcf49ef891ef6427ef344c553cc Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 16 Feb 2022 15:32:06 +1100 Subject: [PATCH 307/334] riscv: mm: init: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. Link: https://lkml.kernel.org/r/20211206160514.2000-3-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Palmer Dabbelt Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Russell King (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/riscv/mm/init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c27294128e182..ff2f41b3b558d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -957,7 +957,6 @@ static inline void setup_vm_final(void) } #endif /* CONFIG_MMU */ -#ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -974,6 +973,8 @@ static void __init reserve_crashkernel(void) int ret = 0; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; /* * Don't reserve a region for a crash kernel on a crash kernel * since it doesn't make much sense and we have limited memory @@ -1023,7 +1024,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } -#endif /* CONFIG_KEXEC_CORE */ void __init paging_init(void) { @@ -1037,9 +1037,7 @@ void __init 
misc_mem_init(void) arch_numa_init(); sparse_init(); zone_sizes_init(); -#ifdef CONFIG_KEXEC_CORE reserve_crashkernel(); -#endif memblock_dump_all(); } From f26c72d8517a9d5ee113d5014120f730aa51eacd Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 16 Feb 2022 15:32:07 +1100 Subject: [PATCH 308/334] x86/setup: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. Link: https://lkml.kernel.org/r/20211206160514.2000-4-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Russell King (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/kernel/setup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7a132eb794d8..af2d2dc438a20 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -391,8 +391,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC_CORE - /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -470,6 +468,9 @@ static void __init reserve_crashkernel(void) bool high = false; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ @@ -535,11 +536,6 @@ static void __init reserve_crashkernel(void) crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif static struct resource standard_io_resources[] = { { 
.name = "dma1", .start = 0x00, .end = 0x1f, From f76a2ff84cd10e5cd1efd8eca0bf7cab6a20a33e Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 16 Feb 2022 15:32:08 +1100 Subject: [PATCH 309/334] arm64: mm: use IS_ENABLED(CONFIG_KEXEC_CORE) instead of #ifdef Replace the conditional compilation using "#ifdef CONFIG_KEXEC_CORE" by a check for "IS_ENABLED(CONFIG_KEXEC_CORE)", to simplify the code and increase compile coverage. Link: https://lkml.kernel.org/r/20211206160514.2000-5-jszhang@kernel.org Signed-off-by: Jisheng Zhang Acked-by: Catalin Marinas Acked-by: Baoquan He Cc: Albert Ou Cc: Alexandre Ghiti Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric W. Biederman Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Russell King (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/mm/init.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index db63cc885771a..3973e305adc89 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -64,7 +64,6 @@ EXPORT_SYMBOL(memstart_addr); */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -#ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -78,6 +77,9 @@ static void __init reserve_crashkernel(void) unsigned long long crash_max = arm64_dma_phys_limit; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); /* no crashkernel= or invalid value specified */ @@ -110,11 +112,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif /* CONFIG_KEXEC_CORE */ /* * Return the maximum physical address for a zone accessible by the given bits From 
1abcb37e1123bc610d2ad89b6d862d2e3572b3e5 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 16 Feb 2022 15:32:09 +1100 Subject: [PATCH 310/334] docs: kdump: update description about sysfs file system support Patch series "Update doc and fix some issues about kdump", v2. This patch (of 5): After commit 6a108a14fa35 ("kconfig: rename CONFIG_EMBEDDED to CONFIG_EXPERT"), "Configure standard kernel features (for small systems)" no longer exists; we should use "Configure standard kernel features (expert users)" now. Link: https://lkml.kernel.org/r/1644324666-15947-1-git-send-email-yangtiezhu@loongson.cn Link: https://lkml.kernel.org/r/1644324666-15947-2-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Baoquan He Cc: Baoquan He Cc: Jonathan Corbet Cc: Marco Elver Cc: Andrey Ryabinin Cc: Xuefeng Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kdump/kdump.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index cb30ca3df27c9..d187df2f76aec 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -146,9 +146,9 @@ System kernel config options CONFIG_SYSFS=y Note that "sysfs file system support" might not appear in the "Pseudo - filesystems" menu if "Configure standard kernel features (for small - systems)" is not enabled in "General Setup." In this case, check the - .config file itself to ensure that sysfs is turned on, as follows:: + filesystems" menu if "Configure standard kernel features (expert users)" + is not enabled in "General Setup."
In this case, check the .config file + itself to ensure that sysfs is turned on, as follows:: grep 'CONFIG_SYSFS' .config From a772ce982e470902d2ceda19dabe315235d8dbf3 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 16 Feb 2022 15:32:10 +1100 Subject: [PATCH 311/334] docs: kdump: add scp example to write out the dump file In addition to cp and makedumpfile, add an scp example to write out the dump file. Link: https://lkml.kernel.org/r/1644324666-15947-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Baoquan He Cc: Andrey Ryabinin Cc: Jonathan Corbet Cc: Marco Elver Cc: Xuefeng Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kdump/kdump.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index d187df2f76aec..a748e7eb4429b 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -533,6 +533,10 @@ the following command:: cp /proc/vmcore +or use scp to write out the dump file between hosts on a network, e.g:: + + scp /proc/vmcore remote_username@remote_ip: + You can also use makedumpfile utility to write out the dump file with specified options to filter out unwanted contents, e.g:: From a03a26e9476c978c07f82b2752400a88f4c47328 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 16 Feb 2022 15:32:12 +1100 Subject: [PATCH 312/334] panic: unset panic_on_warn inside panic() In the current code, the following three places need to unset panic_on_warn before calling panic() to avoid recursive panics: kernel/kcsan/report.c: print_report() kernel/sched/core.c: __schedule_bug() mm/kfence/report.c: kfence_report_error() In order to avoid copy-pasting "panic_on_warn = 0" all over the place, it is better to move it inside panic() and then remove it from the other places.
Link: https://lkml.kernel.org/r/1644324666-15947-4-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Jonathan Corbet Cc: Xuefeng Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/panic.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 55b50e052ec3a..95ba825522dd4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -185,6 +185,16 @@ void panic(const char *fmt, ...) int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + } + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -576,16 +586,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn) panic("panic_on_warn set ...\n"); - } if (!regs) dump_stack(); From 460380983c8efdf37242165f24237dfbf036731f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 16 Feb 2022 15:32:13 +1100 Subject: [PATCH 313/334] ubsan: no need to unset panic_on_warn in ubsan_epilogue() panic_on_warn is unset inside panic(), so no need to unset it before calling panic() in ubsan_epilogue(). 
Link: https://lkml.kernel.org/r/1644324666-15947-5-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Jonathan Corbet Cc: Xuefeng Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/ubsan.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lib/ubsan.c b/lib/ubsan.c index bdc380ff5d5c7..36bd75e334263 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -154,16 +154,8 @@ static void ubsan_epilogue(void) current->in_ubsan--; - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn) panic("panic_on_warn set ...\n"); - } } void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) From 02de7a3f119de1c52d1016d9080c8a09388ce03b Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 16 Feb 2022 15:32:14 +1100 Subject: [PATCH 314/334] kasan: no need to unset panic_on_warn in end_report() panic_on_warn is unset inside panic(), so no need to unset it before calling panic() in end_report(). 
Link: https://lkml.kernel.org/r/1644324666-15947-6-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Jonathan Corbet Cc: Xuefeng Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kasan/report.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3ad9624dcc561..f14146563d412 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -117,16 +117,8 @@ static void end_report(unsigned long *flags, unsigned long addr) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) panic("panic_on_warn set ...\n"); - } if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); kasan_enable_current(); From 8d88813c721f0f16b0bd8c25977d912fadcc5545 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 16 Feb 2022 15:32:16 +1100 Subject: [PATCH 315/334] docs: sysctl/kernel: add missing bit to panic_print Patch series "Some improvements on panic_print". This is a mix of a documentation fix with some additions to the "panic_print" sysctl / parameter. The goal here is being able to collect all CPUs backtraces during a panic event and also to enable "panic_print" in a kdump event - details of the reasoning and design choices are in the patches.
This patch (of 3): Commit de6da1e8bcf0 ("panic: add an option to replay all the printk message in buffer") added a new bit to the sysctl/kernel parameter "panic_print", but the documentation was added only in kernel-parameters.txt, not in the sysctl guide. Fix it here by adding bit 5 to sysctl admin-guide documentation. Link: https://lkml.kernel.org/r/20211109202848.610874-1-gpiccoli@igalia.com Link: https://lkml.kernel.org/r/20211109202848.610874-2-gpiccoli@igalia.com Fixes: de6da1e8bcf0 ("panic: add an option to replay all the printk message in buffer") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Feng Tang Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Samuel Iglesias Gonsalvez Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 77c3541039407..32db7947947c2 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -806,6 +806,7 @@ bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer +bit 5: print all printk messages in buffer ===== ============================================ So for example to print tasks and memory info on panic, user can:: From 7221f2a7d19182eaaf3a024bc17dd2d5fcf7db9e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 16 Feb 2022 15:32:17 +1100 Subject: [PATCH 316/334] sysctl: documentation: fix table format warning Fix malformed table warning in sysctl documentation: (don't use ':'s) Documentation/admin-guide/sysctl/kernel.rst:798: WARNING: Malformed table. Text in column margin in table line 7. 
===== ============================================ bit 0 print all tasks info bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer bit 5: print all printk messages in buffer bit 6: print all CPUs backtrace (if available in the arch) Link: https://lkml.kernel.org/r/20220109055635.6999-1-rdunlap@infradead.org Fixes: 934d51cad60c ("docs: sysctl/kernel: add missing bit to panic_print") Fixes: addc64999934 ("panic: add option to dump all CPUs backtraces in panic_print") Signed-off-by: Randy Dunlap Reported-by: Stephen Rothwell Reviewed-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 32db7947947c2..ed98e790d9711 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -806,7 +806,7 @@ bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer -bit 5: print all printk messages in buffer +bit 5 print all printk messages in buffer ===== ============================================ So for example to print tasks and memory info on panic, user can:: From 7be57473febfa658ac636932b2c0e9aee50ca7f9 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 16 Feb 2022 15:32:18 +1100 Subject: [PATCH 317/334] panic: add option to dump all CPUs backtraces in panic_print Currently the "panic_print" parameter/sysctl allows some interesting debug information to be printed during a panic event. This is useful for example in cases the user cannot kdump due to resource limits, or if the user collects panic logs in a serial output (or pstore) and prefers a fast reboot instead of a kdump. 
Happens that currently there's no way to see all CPUs backtraces in a panic using "panic_print" on architectures that support that. We do have "oops_all_cpu_backtrace" sysctl, but although partially overlapping in the functionality, they are orthogonal in nature: "panic_print" is a panic tuning (and we have panics without oopses, like direct calls to panic() or maybe other paths that don't go through oops_enter() function), and the original purpose of "oops_all_cpu_backtrace" is to provide more information on oopses for cases in which the users desire to continue running the kernel even after an oops, i.e., used in non-panic scenarios. So, we hereby introduce an additional bit for "panic_print" to allow dumping the CPUs backtraces during a panic event. Link: https://lkml.kernel.org/r/20211109202848.610874-3-gpiccoli@igalia.com Signed-off-by: Guilherme G. Piccoli Reviewed-by: Feng Tang Cc: Iurii Zaikin Cc: Kees Cook Cc: Luis Chamberlain Cc: Samuel Iglesias Gonsalvez Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/admin-guide/sysctl/kernel.rst | 1 + kernel/panic.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 85f096fddad9d..a069d8fe2fee1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3726,6 +3726,7 @@ bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer bit 5: print all printk messages in buffer + bit 6: print all CPUs backtrace (if available in the arch) panic_on_taint= Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ed98e790d9711..59c3b4ce37cde 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ 
b/Documentation/admin-guide/sysctl/kernel.rst @@ -807,6 +807,7 @@ bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer bit 5 print all printk messages in buffer +bit 6 print all CPUs backtrace (if available in the arch) ===== ============================================ So for example to print tasks and memory info on panic, user can:: diff --git a/kernel/panic.c b/kernel/panic.c index 95ba825522dd4..3c3fb36d8d414 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 +#define PANIC_PRINT_ALL_CPU_BT 0x00000040 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -152,6 +153,9 @@ static void panic_print_sys_info(void) if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_ALL_CPU_BT) + trigger_all_cpu_backtrace(); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); From ca9f7117a29b1c84c1ae47f9568c41e5a209aa9a Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 16 Feb 2022 15:32:19 +1100 Subject: [PATCH 318/334] panic: allow printing extra panic information on kdump Currently we have the "panic_print" parameter/sysctl to allow some extra information to be printed in a panic event. On the other hand, the kdump mechanism allows to kexec a new kernel to collect a memory dump for the running kernel in case of panic. Right now these options are incompatible: the user either sets the kdump or makes use of "panic_print". The code path of "panic_print" isn't reached when kdump is configured. There are situations though in which this would be interesting: for example, in systems that are very memory constrained, a handcrafted tiny kernel/initrd for kdump might be used in order to only collect the dmesg in kdump kernel. 
Even more common, systems with no disk space for the full (compressed) memory dump might very well rely on this functionality too, dumping only the dmesg with the additional information provided by "panic_print". So, this is what the patch does: it allows both features to co-exist; if "panic_print" is set and the system performs a kdump, the extra information is printed on dmesg before the kexec. Some notes about the design choices here: (a) We could have introduced a sysctl or an extra bit on "panic_print" to allow enabling the co-existence of kdump and "panic_print", but it seems that would be over-engineering; we have 3 cases, let's check how this patch changes things: - if the user has kdump set and not "panic_print", nothing changes; - if the user has "panic_print" set and not kdump, nothing changes; - if both are enabled, now we print the extra information before kdump, which is exactly the goal of the patch (and should be the goal of the user, since they enabled both options). (b) We assume that the code path won't return from __crash_kexec() so we didn't guard against double execution of panic_print_sys_info(). Link: https://lkml.kernel.org/r/20211109202848.610874-4-gpiccoli@igalia.com Signed-off-by: Guilherme G. Piccoli Cc: Feng Tang Cc: Iurii Zaikin Cc: Kees Cook Cc: Luis Chamberlain Cc: Samuel Iglesias Gonsalvez Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/panic.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/panic.c b/kernel/panic.c index 3c3fb36d8d414..25ff5d815a02d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -254,6 +254,13 @@ void panic(const char *fmt, ...) */ kgdb_panic(buf); + /* + * If we have a kdump kernel loaded, give a chance to panic_print + * show some extra information on kernel log if it was set... + */ + if (kexec_crash_loaded()) + panic_print_sys_info(); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else.
From 668bb9899176f45c1c1e5aea06a02ffb20c2a887 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Wed, 16 Feb 2022 15:32:20 +1100 Subject: [PATCH 319/334] kcov: split ioctl handling into locked and unlocked parts Patch series "kcov: improve mmap processing", v3. Subsequent mmaps of the same kcov descriptor currently do not update the virtual memory of the task and yet return 0 (success). This is counter-intuitive and may lead to unexpected memory access errors. Also, this unnecessarily limits the functionality of kcov to only the simplest usage scenarios. Kcov instances are effectively forever attached to their first address spaces and it becomes impossible to e.g. reuse the same kcov handle in forked child processes without mmapping the memory first. This is exactly what we tried to do in syzkaller and inadvertently came upon this behavior. This patch series addresses the problem described above. This patch (of 3): Currently all ioctls are de facto processed under a spinlock in order to serialise them. This, however, prohibits the use of vmalloc and other memory management functions in the implementations of those ioctls, unnecessarily complicating any further changes to the code. Let all ioctls first be processed inside the kcov_ioctl() function which should execute the ones that are not compatible with a spinlock and then pass control to kcov_ioctl_locked() for all other ones. KCOV_REMOTE_ENABLE is processed both in kcov_ioctl() and kcov_ioctl_locked() as the steps are easily separable. Although it is still compatible with a spinlock, move KCOV_INIT_TRACE handling to kcov_ioctl(), so that the changes from the next commit are easier to follow.
Link: https://lkml.kernel.org/r/20220117153634.150357-1-nogikh@google.com Link: https://lkml.kernel.org/r/20220117153634.150357-2-nogikh@google.com Signed-off-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Cc: Marco Elver Cc: Alexander Potapenko Cc: Taras Madan Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/kcov.c | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 36ca640c4f8e7..e1be7301500bd 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -564,31 +564,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; - unsigned long size, unused; + unsigned long flags, unused; int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; - unsigned long flags; switch (cmd) { - case KCOV_INIT_TRACE: - /* - * Enable kcov in trace mode and setup buffer size. - * Must happen before anything else. - */ - if (kcov->mode != KCOV_MODE_DISABLED) - return -EBUSY; - /* - * Size must be at least 2 to hold current position and one PC. - * Later we allocate size * sizeof(unsigned long) memory, - * that must not overflow. - */ - size = arg; - if (size < 2 || size > INT_MAX / sizeof(unsigned long)) - return -EINVAL; - kcov->size = size; - kcov->mode = KCOV_MODE_INIT; - return 0; case KCOV_ENABLE: /* * Enable coverage for the current task. @@ -692,9 +673,32 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; - unsigned long flags; + unsigned long size, flags; - if (cmd == KCOV_REMOTE_ENABLE) { + kcov = filep->private_data; + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. 
+ * + * First check the size argument - it must be at least 2 + * to hold the current position and one PC. Later we allocate + * size * sizeof(unsigned long) memory, that must not overflow. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + spin_lock_irqsave(&kcov->lock, flags); + if (kcov->mode != KCOV_MODE_DISABLED) { + spin_unlock_irqrestore(&kcov->lock, flags); + return -EBUSY; + } + kcov->size = size; + kcov->mode = KCOV_MODE_INIT; + spin_unlock_irqrestore(&kcov->lock, flags); + return 0; + case KCOV_REMOTE_ENABLE: if (get_user(remote_num_handles, (unsigned __user *)(arg + offsetof(struct kcov_remote_arg, num_handles)))) return -EFAULT; @@ -710,16 +714,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) return -EINVAL; } arg = (unsigned long)remote_arg; + fallthrough; + default: + /* + * All other commands can be normally executed under a spin lock, so we + * obtain and release it here in order to simplify kcov_ioctl_locked(). + */ + spin_lock_irqsave(&kcov->lock, flags); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock_irqrestore(&kcov->lock, flags); + kfree(remote_arg); + return res; } - - kcov = filep->private_data; - spin_lock_irqsave(&kcov->lock, flags); - res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock_irqrestore(&kcov->lock, flags); - - kfree(remote_arg); - - return res; } static const struct file_operations kcov_fops = { From a1e17ac18e20ea71ebb83fd30656b3fef64d2d04 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Wed, 16 Feb 2022 15:32:21 +1100 Subject: [PATCH 320/334] kcov: properly handle subsequent mmap calls Allocate the kcov buffer during KCOV_MODE_INIT in order to untie mmapping of a kcov instance and the actual coverage collection process. Modify kcov_mmap, so that it can be reliably used any number of times once KCOV_MODE_INIT has succeeded. 
These changes to the user-facing interface of the tool only weaken the preconditions, so all existing user space code should remain compatible with the new version. Link: https://lkml.kernel.org/r/20220117153634.150357-3-nogikh@google.com Signed-off-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Marco Elver Cc: Sebastian Andrzej Siewior Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/kcov.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index e1be7301500bd..475524bd900ab 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t) static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) { int res = 0; - void *area; struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; unsigned long flags; - area = vmalloc_user(vma->vm_end - vma->vm_start); - if (!area) - return -ENOMEM; - spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); - if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || + if (kcov->area == NULL || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; } - if (!kcov->area) { - kcov->area = area; - vma->vm_flags |= VM_DONTEXPAND; - spin_unlock_irqrestore(&kcov->lock, flags); - for (off = 0; off < size; off += PAGE_SIZE) { - page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); - } - return 0; + spin_unlock_irqrestore(&kcov->lock, flags); + vma->vm_flags |= VM_DONTEXPAND; + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); } + return 0; exit: spin_unlock_irqrestore(&kcov->lock, flags); - 
vfree(area); return res; } @@ -674,6 +665,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) unsigned int remote_num_handles; unsigned long remote_arg_size; unsigned long size, flags; + void *area; kcov = filep->private_data; switch (cmd) { @@ -683,17 +675,21 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) * Must happen before anything else. * * First check the size argument - it must be at least 2 - * to hold the current position and one PC. Later we allocate - * size * sizeof(unsigned long) memory, that must not overflow. + * to hold the current position and one PC. */ size = arg; if (size < 2 || size > INT_MAX / sizeof(unsigned long)) return -EINVAL; + area = vmalloc_user(size * sizeof(unsigned long)); + if (area == NULL) + return -ENOMEM; spin_lock_irqsave(&kcov->lock, flags); if (kcov->mode != KCOV_MODE_DISABLED) { spin_unlock_irqrestore(&kcov->lock, flags); + vfree(area); return -EBUSY; } + kcov->area = area; kcov->size = size; kcov->mode = KCOV_MODE_INIT; spin_unlock_irqrestore(&kcov->lock, flags); From 4be5b82ce8c05982b1d6c0786a92bb8efc2b7a52 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:23 +1100 Subject: [PATCH 321/334] selftests: set the BUILD variable to absolute path Patch series "selftests: Fix separate output directory builds", v2. Build of several selftests fail if separate output directory is specified by the following methods: 1) make -C tools/testing/selftests O= 2) export KBUILD_OUTPUT="build_dir"; make -C tools/testing/selftests Build fails because of several reasons: 1) The kernel headers aren't found. 2) The path of output objects is wrong and hence unaccessible. 
These problems can be solved by: 1) Including the correct path of uapi header files 2) Setting the BUILD variable correctly inside the Makefile The following build scenarios have been tested after making these changes to verify that nothing gets broken: make -C tools/testing/selftests make -C tools/testing/selftests/futex make -C tools/testing/selftests/kvm make -C tools/testing/selftests/landlock make -C tools/testing/selftests/net make -C tools/testing/selftests/net/mptcp make -C tools/testing/selftests/vm make -C tools/testing/selftests O=build make -C tools/testing/selftests O=/opt/build export KBUILD_OUTPUT="/opt/build"; make -C tools/testing/selftests export KBUILD_OUTPUT="build"; make -C tools/testing/selftests cd <any_dir>; make -C <src_path>/tools/testing/selftests cd <any_dir>; make -C <src_path>/tools/testing/selftests O=build This patch (of 10): The build of kselftests fails if a relative path is specified through KBUILD_OUTPUT or the O=<path> method. The BUILD variable is used to determine the path of the output objects. When make is run from other directories with relative paths, the exact path of the build objects is ambiguous and the build fails. make[1]: Entering directory '/home/usama/repos/kernel/linux_mainline2/tools/testing/selftests/alsa' gcc mixer-test.c -L/usr/lib/x86_64-linux-gnu -lasound -o build/kselftest/alsa/mixer-test /usr/bin/ld: cannot open output file build/kselftest/alsa/mixer-test Set the BUILD variable to the absolute path of the output directory. Make the logic readable and easy to follow. Use spaces instead of tabs for indentation, as a tab-indented line is treated as a recipe by make. Link: https://lkml.kernel.org/r/20220119101531.2850400-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20220119101531.2850400-2-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: André Almeida Cc: Paolo Bonzini Cc: Mickaël Salaün Cc: "David S.
Miller" Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Minghao Chi Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/Makefile | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d08fe4cfe8115..a7b63860b7bce 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -114,19 +114,27 @@ ifdef building_out_of_srctree override LDFLAGS = endif -ifneq ($(O),) - BUILD := $(O)/kselftest +top_srcdir ?= ../../.. + +ifeq ("$(origin O)", "command line") + KBUILD_OUTPUT := $(O) +endif + +ifneq ($(KBUILD_OUTPUT),) + # Make's built-in functions such as $(abspath ...), $(realpath ...) cannot + # expand a shell special character '~'. We use a somewhat tedious way here. + abs_objtree := $(shell cd $(top_srcdir) && mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) && pwd) + $(if $(abs_objtree),, \ + $(error failed to create output directory "$(KBUILD_OUTPUT)")) + # $(realpath ...) resolves symlinks + abs_objtree := $(realpath $(abs_objtree)) + BUILD := $(abs_objtree)/kselftest else - ifneq ($(KBUILD_OUTPUT),) - BUILD := $(KBUILD_OUTPUT)/kselftest - else - BUILD := $(shell pwd) - DEFAULT_INSTALL_HDR_PATH := 1 - endif + BUILD := $(CURDIR) + DEFAULT_INSTALL_HDR_PATH := 1 endif # Prepare for headers install -top_srcdir ?= ../../.. include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) export KSFT_KHDR_INSTALL_DONE := 1 From 195ce63263f54bdf7328de309d5512f266ef105a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:24 +1100 Subject: [PATCH 322/334] selftests: add and export a kernel uapi headers path Kernel uapi headers can be present at different paths depending upon how the build was invoked. It becomes impossible for the tests to include the correct headers directory. 
Set and export KHDR_INCLUDES variable to make it possible for sub make files to include the header files. Link: https://lkml.kernel.org/r/20220119101531.2850400-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alistair Popple Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickaël Salaün Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index a7b63860b7bce..21f983dfd047b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -129,8 +129,11 @@ ifneq ($(KBUILD_OUTPUT),) # $(realpath ...) resolves symlinks abs_objtree := $(realpath $(abs_objtree)) BUILD := $(abs_objtree)/kselftest + KHDR_INCLUDES := -I${abs_objtree}/usr/include else BUILD := $(CURDIR) + abs_srctree := $(shell cd $(top_srcdir) && pwd) + KHDR_INCLUDES := -I${abs_srctree}/usr/include DEFAULT_INSTALL_HDR_PATH := 1 endif @@ -139,6 +142,7 @@ include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) export KSFT_KHDR_INSTALL_DONE := 1 export BUILD +export KHDR_INCLUDES # set default goal to all, so make without a target runs all, even when # all isn't the first target in the file. From aae1bd2b1cf1bf661808af4af8e259748b933980 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:25 +1100 Subject: [PATCH 323/334] selftests: correct the headers install path uapi headers should be installed at the top of the object tree, "<obj_tree>/usr/include". There is no need for kernel headers to be present at kselftest build directory, "<obj_tree>/kselftest/usr/include" as well. This duplication can be avoided by correctly specifying the INSTALL_HDR_PATH.
Link: https://lkml.kernel.org/r/20220119101531.2850400-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alistair Popple Cc: Andr Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickal Salan Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 21f983dfd047b..80e5498eab92a 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -167,7 +167,7 @@ khdr: ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install else - $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \ + $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$(abs_objtree)/usr \ ARCH=$(ARCH) -C $(top_srcdir) headers_install endif From e370e06f69cbb615c6be3719b6d9885c1bfac2a4 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:26 +1100 Subject: [PATCH 324/334] selftests: futex: add the uapi headers include variable Out of tree build of this test fails if relative path of the output directory is specified. KBUILD_OUTPUT also doesn't point to the correct directory when relative path is used. Thus out of tree builds fail. Remove the un-needed include paths and use KHDR_INCLUDES to correctly reach the headers. Link: https://lkml.kernel.org/r/20220119101531.2850400-5-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alistair Popple Cc: Andr Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. 
Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickal Salan Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/futex/functional/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 5cc38de9d8ea1..2a12b174cb04f 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -INCLUDES := -I../include -I../../ -I../../../../../usr/include/ \ - -I$(KBUILD_OUTPUT)/kselftest/usr/include -CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) +INCLUDES := -I../include -I../../ -I../../../../../usr/include/ +CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) $(KHDR_INCLUDES) LDLIBS := -lpthread -lrt HEADERS := \ From 023dc6e830bb4c84d443e714d28acbb6c5035d38 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:27 +1100 Subject: [PATCH 325/334] selftests: kvm: add the uapi headers include variable Out of tree build of this test fails if relative path of the output directory is specified. Add KHDR_INCLUDES to correctly reach the headers. Link: https://lkml.kernel.org/r/20220119101531.2850400-6-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alistair Popple Cc: Andr Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. 
Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickal Salan Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/kvm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 0e4926bc9a58d..d61286208e242 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -150,7 +150,7 @@ endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ - -I$( Date: Wed, 16 Feb 2022 15:32:28 +1100 Subject: [PATCH 326/334] selftests: landlock: add the uapi headers include variable Out of tree build of this test fails if relative path of the output directory is specified. Add the KHDR_INCLUDES to correctly reach the headers. Link: https://lkml.kernel.org/r/20220119101531.2850400-7-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alistair Popple Cc: Andr Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. 
Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickaël Salaün Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/landlock/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile index a99596ca9882b..0b0049e133bba 100644 --- a/tools/testing/selftests/landlock/Makefile +++ b/tools/testing/selftests/landlock/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -Wall -O2 +CFLAGS += -Wall -O2 $(KHDR_INCLUDES) src_test := $(wildcard *_test.c) From fdadd1164c2743e5b89a137702b8abfb69e98a0f Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:29 +1100 Subject: [PATCH 327/334] selftests: net: add the uapi headers include variable Out of tree build of this test fails if relative path of the output directory is specified. Add the KHDR_INCLUDES to correctly reach the headers. Link: https://lkml.kernel.org/r/20220119101531.2850400-8-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alistair Popple Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. 
Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickaël Salaün Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/net/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 9897fa9ab9537..0b1488616c551 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -2,7 +2,7 @@ # Makefile for net selftests CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES) TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh From 3e3075bfc7a9b14070f7d82867972228f5722460 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:30 +1100 Subject: [PATCH 328/334] selftests: mptcp: add the uapi headers include variable Out of tree build of this test fails if relative path of the output directory is specified. Add the KHDR_INCLUDES to correctly reach the headers. Link: https://lkml.kernel.org/r/20220119101531.2850400-9-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Reviewed-by: Matthieu Baerts Cc: Alistair Popple Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. 
Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Mickaël Salaün Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/net/mptcp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 0356c4501c990..f905d5358e681 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -3,7 +3,7 @@ top_srcdir = ../../../../.. KSFT_KHDR_INSTALL := 1 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ simult_flows.sh mptcp_sockopt.sh From 8b53560ec894c2a76cbdbd3eb5a85affeecd8e9e Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:31 +1100 Subject: [PATCH 329/334] selftests: vm: add the uapi headers include variable Out of tree build of this test fails if relative path of the output directory is specified. Add the KHDR_INCLUDES to correctly reach the headers. Link: https://lkml.kernel.org/r/20220119101531.2850400-10-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Acked-by: Paolo Bonzini Tested-by: Alistair Popple Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. 
Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickaël Salaün Cc: Minghao Chi Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c2f2f99be5182..5e43f072f5b76 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -23,7 +23,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) +CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) LDLIBS = -lrt -lpthread TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_test From 01dd141b8818ab016b9079afa081f54b2cba14ec Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:32 +1100 Subject: [PATCH 330/334] selftests: vm: remove dependency from internal kernel macros The definition of swap() is used from kernel's internal header when this test is built in source tree. The build fails when this test is built out of source tree as definition of swap() isn't found. Selftests shouldn't depend on kernel's internal header files. They can only depend on uapi header files. Add the definition of swap() to fix the build error: gcc -Wall -I/linux_mainline2/build/usr/include -no-pie userfaultfd.c -lrt -lpthread -o /linux_mainline2/build/kselftest/vm/userfaultfd userfaultfd.c: In function `userfaultfd_stress': userfaultfd.c:1530:3: warning: implicit declaration of function `swap'; did you mean `swab'? 
[-Wimplicit-function-declaration] 1530 | swap(area_src, area_dst); | ^~~~ | swab /usr/bin/ld: /tmp/cclUUH7V.o: in function `userfaultfd_stress': userfaultfd.c:(.text+0x4d64): undefined reference to `swap' /usr/bin/ld: userfaultfd.c:(.text+0x4d82): undefined reference to `swap' collect2: error: ld returned 1 exit status Link: https://lkml.kernel.org/r/20220119101531.2850400-11-usama.anjum@collabora.com Fixes: 2c769ed7137a ("tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Alistair Popple Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: "David S. Miller" Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Mat Martineau Cc: Matthieu Baerts Cc: Mickaël Salaün Cc: Minghao Chi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/vm/userfaultfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 96bf54fbca5c6..388bf4fc9dc6e 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -118,6 +118,9 @@ struct uffd_stats { ~(unsigned long)(sizeof(unsigned long long) \ - 1))) +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" From 94edc8a3482e640f263b0223ebdccbe2745381f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 15:32:34 +1100 Subject: [PATCH 331/334] selftests: kselftest framework: provide "finished" helper Instead of having each test that wants to use ksft_exit() have to figure out the internals of kselftest.h, add the helper ksft_finished() that makes sure the passes, xfails, and skips are equal to the test plan count. 
Link: https://lkml.kernel.org/r/20220201013717.2464392-1-keescook@chromium.org Signed-off-by: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/kselftest.h | 10 ++++++++++ tools/testing/selftests/vm/memfd_secret.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index f1180987492c9..b8f248018174d 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -28,6 +28,7 @@ * * When all tests are finished, clean up and exit the program with one of: * + * ksft_finished(); * ksft_exit(condition); * ksft_exit_pass(); * ksft_exit_fail(); @@ -235,6 +236,15 @@ static inline int ksft_exit_fail(void) ksft_exit_fail(); \ } while (0) +/** + * ksft_finished() - Exit selftest with success if all tests passed + */ +#define ksft_finished() \ + ksft_exit(ksft_plan == \ + ksft_cnt.ksft_pass + \ + ksft_cnt.ksft_xfail + \ + ksft_cnt.ksft_xskip) + static inline int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c index 93e7e7ffed337..957b9e18c7295 100644 --- a/tools/testing/selftests/vm/memfd_secret.c +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -282,7 +282,7 @@ int main(int argc, char *argv[]) close(fd); - ksft_exit(!ksft_get_fail_cnt()); + ksft_finished(); } #else /* __NR_memfd_secret */ From 9320eb79d984b9e513423407a1e7f197166304b8 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 16 Feb 2022 15:32:35 +1100 Subject: [PATCH 332/334] selftests: use -isystem instead of -I to include headers Selftests need kernel headers and glibc for compilation. In compilation of selftests, uapi headers from kernel source are used instead of default ones while glibc has already been compiled with different header files installed in the operating system. 
So there can be redefinition warnings from compiler. These warnings can be suppressed by using -isystem to include the uapi headers. Link: https://lkml.kernel.org/r/20220214160756.3543590-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Reviewed-by: Kees Cook Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 80e5498eab92a..5d9d4ddccccb1 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -129,11 +129,11 @@ ifneq ($(KBUILD_OUTPUT),) # $(realpath ...) resolves symlinks abs_objtree := $(realpath $(abs_objtree)) BUILD := $(abs_objtree)/kselftest - KHDR_INCLUDES := -I${abs_objtree}/usr/include + KHDR_INCLUDES := -isystem ${abs_objtree}/usr/include else BUILD := $(CURDIR) abs_srctree := $(shell cd $(top_srcdir) && pwd) - KHDR_INCLUDES := -I${abs_srctree}/usr/include + KHDR_INCLUDES := -isystem ${abs_srctree}/usr/include DEFAULT_INSTALL_HDR_PATH := 1 endif From ceeb777f477940a8907d7daa2cd5a980f27930b7 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 16 Feb 2022 15:32:36 +1100 Subject: [PATCH 333/334] Revert "ubsan, kcsan: Don't combine sanitizer with kcov on clang" This reverts commit ea91a1d45d19469001a4955583187b0d75915759. Since df05c0e9496c ("Documentation: Raise the minimum supported version of LLVM to 11.0.0") the minimum Clang version is now 11.0, which fixed the UBSAN/KCSAN vs. KCOV incompatibilities. 
Link: https://bugs.llvm.org/show_bug.cgi?id=45831 Link: https://lkml.kernel.org/r/YaodyZzu0MTCJcvO@elver.google.com Link: https://lkml.kernel.org/r/20220128105631.509772-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/Kconfig.kcsan | 11 ----------- lib/Kconfig.ubsan | 12 ------------ 2 files changed, 23 deletions(-) diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 63b70b8c55519..de022445fbba5 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -10,21 +10,10 @@ config HAVE_KCSAN_COMPILER For the list of compilers that support KCSAN, please see . -config KCSAN_KCOV_BROKEN - def_bool KCOV && CC_HAS_SANCOV_TRACE_PC - depends on CC_IS_CLANG - depends on !$(cc-option,-Werror=unused-command-line-argument -fsanitize=thread -fsanitize-coverage=trace-pc) - help - Some versions of clang support either KCSAN and KCOV but not the - combination of the two. - See https://bugs.llvm.org/show_bug.cgi?id=45831 for the status - in newer releases. - menuconfig KCSAN bool "KCSAN: dynamic data race detector" depends on HAVE_ARCH_KCSAN && HAVE_KCSAN_COMPILER depends on DEBUG_KERNEL && !KASAN - depends on !KCSAN_KCOV_BROKEN select STACKTRACE help The Kernel Concurrency Sanitizer (KCSAN) is a dynamic diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 236c5cefc4cc5..f3c57ed518381 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -27,16 +27,6 @@ config UBSAN_TRAP the system. For some system builders this is an acceptable trade-off. -config UBSAN_KCOV_BROKEN - def_bool KCOV && CC_HAS_SANCOV_TRACE_PC - depends on CC_IS_CLANG - depends on !$(cc-option,-Werror=unused-command-line-argument -fsanitize=bounds -fsanitize-coverage=trace-pc) - help - Some versions of clang support either UBSAN or KCOV but not the - combination of the two. 
- See https://bugs.llvm.org/show_bug.cgi?id=45831 for the status - in newer releases. - config CC_HAS_UBSAN_BOUNDS def_bool $(cc-option,-fsanitize=bounds) @@ -46,7 +36,6 @@ config CC_HAS_UBSAN_ARRAY_BOUNDS config UBSAN_BOUNDS bool "Perform array index bounds checking" default UBSAN - depends on !UBSAN_KCOV_BROKEN depends on CC_HAS_UBSAN_ARRAY_BOUNDS || CC_HAS_UBSAN_BOUNDS help This option enables detection of directly indexed out of bounds @@ -72,7 +61,6 @@ config UBSAN_ARRAY_BOUNDS config UBSAN_LOCAL_BOUNDS bool "Perform array local bounds checking" depends on UBSAN_TRAP - depends on !UBSAN_KCOV_BROKEN depends on $(cc-option,-fsanitize=local-bounds) help This option enables -fsanitize=local-bounds which traps when an From ab94bd5db44d58d4a1955298f0ecde8e5ff18df0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Feb 2022 15:32:37 +1100 Subject: [PATCH 334/334] ipc/mqueue: use get_tree_nodev() in mqueue_get_tree() When running the stress-ng clone benchmark with multiple testing threads, it was found that there was significant spinlock contention in sget_fc(). The contended spinlock was the sb_lock. It is under heavy contention because of the following code in the critical section of sget_fc(): hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } After testing with added instrumentation code, it was found that the benchmark could generate thousands of ipc namespaces with the corresponding number of entries in the mqueue's fs_supers list where the namespaces are the key for the search. This leads to excessive time in scanning the list for a match. Looking back at the mqueue calling sequence leading to sget_fc(): mq_init_ns() => mq_create_mount() => fc_mount() => vfs_get_tree() => mqueue_get_tree() => get_tree_keyed() => vfs_get_super() => sget_fc() Currently, mq_init_ns() is the only mqueue function that will indirectly call mqueue_get_tree() with a newly allocated ipc namespace as the key for searching. 
As a result, there will never be a match with the existing ipc namespaces stored in the mqueue's fs_supers list. So using get_tree_keyed() to do an existing ipc namespace search is just a waste of time. Instead, we could use get_tree_nodev() to eliminate the useless search. By doing so, we can greatly reduce the sb_lock hold time and avoid the spinlock contention problem in case a large number of ipc namespaces are present. Of course, if the code is modified in the future to allow mqueue_get_tree() to be called with an existing ipc namespace instead of a new one, we will have to use get_tree_keyed() in this case. The following stress-ng clone benchmark command was run on a 2-socket 48-core Intel system: ./stress-ng --clone 32 --verbose --oomable --metrics-brief -t 20 The "bogo ops/s" increased from 5948.45 before patch to 9137.06 after patch. This is an increase of 54% in performance. Link: https://lkml.kernel.org/r/20220121172315.19652-1-longman@redhat.com Fixes: 935c6912b198 ("ipc: Convert mqueue fs to fs_context") Signed-off-by: Waiman Long Cc: Al Viro Cc: David Howells Cc: Manfred Spraul Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- ipc/mqueue.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5becca9be867c..089c34d0732cf 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -45,6 +45,7 @@ struct mqueue_fs_context { struct ipc_namespace *ipc_ns; + bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 @@ -427,6 +428,14 @@ static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; + /* + * With a newly created ipc namespace, we don't need to do a search + * for an ipc namespace match, but we still need to set s_fs_info. 
+ */ + if (ctx->newns) { + fc->s_fs_info = ctx->ipc_ns; + return get_tree_nodev(fc, mqueue_fill_super); + } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } @@ -454,6 +463,10 @@ static int mqueue_init_fs_context(struct fs_context *fc) return 0; } +/* + * mq_init_ns() is currently the only caller of mq_create_mount(). + * So the ns parameter is always a newly created ipc namespace. + */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; @@ -465,6 +478,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) return ERR_CAST(fc); ctx = fc->fs_private; + ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns);