Skip to content

Commit

Permalink
BACKPORT: FROMGIT: userfaultfd: add UFFDIO_CONTINUE ioctl
Browse files Browse the repository at this point in the history
This ioctl is how userspace ought to resolve "minor" userfaults. The
idea is, userspace is notified that a minor fault has occurred. It might
change the contents of the page using its second non-UFFD mapping, or
not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured
the page contents are correct, carry on setting up the mapping".

Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in
the minor fault case, we already have some pre-existing underlying page.
Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
We'd just use memcpy() or similar instead.

It turns out hugetlb_mcopy_atomic_pte() already does very close to what
we want, if an existing page is provided via `struct page **pagep`. We
already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
just extend that design: add an enum for the three modes of operation,
and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
case. (Basically, look up the existing page, and avoid adding the
existing page to the page cache or calling set_page_huge_active() on
it.)

Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutn" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>

(cherry picked from commit 14ea86439abaf3423cd9b6712ed5ce8451d2d181
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm)
Link: https://lore.kernel.org/patchwork/patch/1388136/

Conflicts: fs/userfaultfd.c
	include/linux/hugetlb.h
	include/linux/userfaultfd_k.h
	include/uapi/linux/userfaultfd.h
	mm/hugetlb.c
	mm/userfaultfd.c
(1. 8f251a3 is not cherry-picked yet so
    switched SetHPageMigratable() to set_active_huge_page() in
    mm/hugetlb.c,
2. Other files conflicts due to lack of write-protect userfaultfd
support. Manually rebased accordingly
3. Included linux/mm.h in linux/userfaultfd_k.h for definitions of
VM_UFFD_*)
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Bug: 160737021
Bug: 169683130
Change-Id: I45b62959dcb1d343154cb831113a26e47e77c8af
  • Loading branch information
CmdrMoozy authored and toddkjos committed Apr 10, 2021
1 parent 5dc3e85 commit b69f713
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 31 deletions.
72 changes: 72 additions & 0 deletions fs/userfaultfd.c
Expand Up @@ -1516,6 +1516,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
up_write(&mm->mmap_sem);
mmput(mm);
if (!ret) {
__u64 ioctls_out;

ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
UFFD_API_RANGE_IOCTLS;

/* CONTINUE ioctl is only supported for MINOR ranges. */
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
Expand Down Expand Up @@ -1826,6 +1835,66 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
return ret;
}

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
__s64 ret;
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range;

user_uffdio_continue = (struct uffdio_continue __user *)arg;

ret = -EAGAIN;
if (READ_ONCE(ctx->mmap_changing))
goto out;

ret = -EFAULT;
if (copy_from_user(&uffdio_continue, user_uffdio_continue,
/* don't copy the output fields */
sizeof(uffdio_continue) - (sizeof(__s64))))
goto out;

ret = validate_range(ctx->mm, &uffdio_continue.range.start,
uffdio_continue.range.len);
if (ret)
goto out;

ret = -EINVAL;
/* double check for wraparound just in case. */
if (uffdio_continue.range.start + uffdio_continue.range.len <=
uffdio_continue.range.start) {
goto out;
}
if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
goto out;

if (mmget_not_zero(ctx->mm)) {
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len,
&ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
}

if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
return -EFAULT;
if (ret < 0)
goto out;

/* len == 0 would wake all */
BUG_ON(!ret);
range.len = ret;
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
range.start = uffdio_continue.range.start;
wake_userfault(ctx, &range);
}
ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
return ret;
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
Expand Down Expand Up @@ -1910,6 +1979,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_ZEROPAGE:
ret = userfaultfd_zeropage(ctx, arg);
break;
case UFFDIO_CONTINUE:
ret = userfaultfd_continue(ctx, arg);
break;
}
return ret;
}
Expand Down
4 changes: 3 additions & 1 deletion include/linux/hugetlb.h
Expand Up @@ -10,6 +10,7 @@
#include <linux/list.h>
#include <linux/kref.h>
#include <asm/pgtable.h>
#include <linux/userfaultfd_k.h>

struct ctl_table;
struct user_struct;
Expand Down Expand Up @@ -94,6 +95,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
enum mcopy_atomic_mode mode,
struct page **pagep);
#endif /* CONFIG_USERFAULTFD */
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
Expand Down Expand Up @@ -189,7 +191,7 @@ static inline void hugetlb_show_meminfo(void)
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
#ifdef CONFIG_USERFAULTFD
#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
src_addr, pagep) ({ BUG(); 0; })
src_addr, mode, pagep) ({ BUG(); 0; })
#endif /* CONFIG_USERFAULTFD */
#define huge_pte_offset(mm, address, sz) 0

Expand Down
19 changes: 19 additions & 0 deletions include/linux/userfaultfd_k.h
Expand Up @@ -14,6 +14,7 @@
#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */

#include <linux/fcntl.h>
#include <linux/mm.h>

/* The set of all possible UFFD-related VM flags. */
#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
Expand All @@ -35,13 +36,31 @@ extern int sysctl_unprivileged_userfaultfd;

extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

/*
* The mode of operation for __mcopy_atomic and its helpers.
*
* This is almost an implementation detail (mcopy_atomic below doesn't take this
* as a parameter), but it's exposed here because memory-kind-specific
* implementations (e.g. hugetlbfs) need to know the mode of operation.
*/
enum mcopy_atomic_mode {
/* A normal copy_from_user into the destination range. */
MCOPY_ATOMIC_NORMAL,
/* Don't copy; map the destination range to the zero page. */
MCOPY_ATOMIC_ZEROPAGE,
/* Just install pte(s) with the existing page(s) in the page cache. */
MCOPY_ATOMIC_CONTINUE,
};

extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
bool *mmap_changing);
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long len,
bool *mmap_changing);
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, bool *mmap_changing);

/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
Expand Down
21 changes: 19 additions & 2 deletions include/uapi/linux/userfaultfd.h
Expand Up @@ -38,10 +38,12 @@
#define UFFD_API_RANGE_IOCTLS \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE)
(__u64)1 << _UFFDIO_ZEROPAGE | \
(__u64)1 << _UFFDIO_CONTINUE)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY)
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_CONTINUE)

/*
* Valid ioctl command number range with this API is from 0x00 to
Expand All @@ -56,6 +58,7 @@
#define _UFFDIO_WAKE (0x02)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_ZEROPAGE (0x04)
#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_API (0x3F)

/* userfaultfd ioctl ids */
Expand All @@ -72,6 +75,8 @@
struct uffdio_copy)
#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
struct uffdio_zeropage)
#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \
struct uffdio_continue)

/* read() structure */
struct uffd_msg {
Expand Down Expand Up @@ -242,6 +247,18 @@ struct uffdio_zeropage {
__s64 zeropage;
};

struct uffdio_continue {
struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
__u64 mode;

/*
* Fields below here are written by the ioctl and must be at the end:
* the copy_from_user will not read past here.
*/
__s64 mapped;
};

/*
* Flags for the userfaultfd(2) system call itself.
*/
Expand Down
40 changes: 26 additions & 14 deletions mm/hugetlb.c
Expand Up @@ -37,7 +37,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

Expand Down Expand Up @@ -4343,8 +4342,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
enum mcopy_atomic_mode mode,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct address_space *mapping;
pgoff_t idx;
unsigned long size;
Expand All @@ -4354,8 +4355,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spinlock_t *ptl;
int ret;
struct page *page;
int writable;

if (!*pagep) {
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);

if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
if (!page)
goto out;
} else if (!*pagep) {
ret = -ENOMEM;
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page))
Expand Down Expand Up @@ -4384,13 +4394,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*/
__SetPageUptodate(page);

mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);

/*
* If shared, add to page cache
*/
if (vm_shared) {
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
Expand Down Expand Up @@ -4435,8 +4440,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}

_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
if (dst_vma->vm_flags & VM_WRITE)
/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
if (is_continue && !vm_shared)
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;

_dst_pte = make_huge_pte(dst_vma, page, writable);
if (writable)
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);

Expand All @@ -4450,15 +4461,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);

spin_unlock(ptl);
set_page_huge_active(page);
if (vm_shared)
if (!is_continue)
set_page_huge_active(page);
if (vm_shared || is_continue)
unlock_page(page);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
if (vm_shared)
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
put_page(page);
Expand Down

0 comments on commit b69f713

Please sign in to comment.