Skip to content

Commit

Permalink
pidfs: remove config option
Browse files Browse the repository at this point in the history
Enable pidfs unconditionally. There's no real reason not to do it.
For 32bit systems we add a simple inode allocation mechanism that still
guarantees that userspace can compare processes by inode number which
they already do as I found out in [1]. If they also need the uniqueness
property that we get by default on 64bit systems they should simply
parse the contents of /proc/<pid>/fd/<nr>. On 64bit we don't have to
deal with any of this and things are nice and simple.

Link: systemd/systemd#31713 [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
  • Loading branch information
brauner committed Mar 13, 2024
1 parent e9c5263 commit 7dc6964
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 67 deletions.
7 changes: 0 additions & 7 deletions fs/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,6 @@ source "fs/proc/Kconfig"
source "fs/kernfs/Kconfig"
source "fs/sysfs/Kconfig"

config FS_PID
bool "Pseudo filesystem for process file descriptors"
depends on 64BIT
default y
help
Pidfs implements advanced features for process file descriptors.

config TMPFS
bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
Expand Down
99 changes: 47 additions & 52 deletions fs/pidfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,6 @@

#include "internal.h"

/*
 * pidfd_release - ->release() handler for pidfd files
 * @inode: inode backing the file (unused)
 * @file:  the pidfd file being released
 *
 * When CONFIG_FS_PID is not set, the struct pid is kept in
 * file->private_data: detach it from the file and drop that reference.
 * When CONFIG_FS_PID is set there is nothing to do here.
 *
 * Return: always 0.
 */
static int pidfd_release(struct inode *inode, struct file *file)
{
#ifndef CONFIG_FS_PID
	struct pid *held = file->private_data;

	/* Clear the pointer before the reference goes away. */
	file->private_data = NULL;
	put_pid(held);
#endif
	return 0;
}

#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
Expand Down Expand Up @@ -120,7 +109,6 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
}

static const struct file_operations pidfs_file_operations = {
.release = pidfd_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
Expand All @@ -131,16 +119,47 @@ struct pid *pidfd_pid(const struct file *file)
{
if (file->f_op != &pidfs_file_operations)
return ERR_PTR(-EBADF);
#ifdef CONFIG_FS_PID
return file_inode(file)->i_private;
#else
return file->private_data;
#endif
}

#ifdef CONFIG_FS_PID
static struct vfsmount *pidfs_mnt __ro_after_init;

#if BITS_PER_LONG == 32
/*
* Provide a fallback mechanism for 32-bit systems so processes remain
* reliably comparable by inode number even on those systems.
*/
static DEFINE_IDA(pidfd_inum_ida);

/*
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This avoids
* collisions with the root inode which is 1 for pseudo filesystems.
*/
static int pidfs_inum(struct pid *pid, unsigned long *ino)
{
int ret;

ret = ida_alloc_range(&pidfd_inum_ida, 301, UINT_MAX, GFP_ATOMIC);
if (ret < 0)
return -ENOSPC;

*ino = ret;
return 0;
}

/*
 * pidfs_free_inum - release an inode number back to pidfd_inum_ida
 * @ino: the number to free
 *
 * Only built when BITS_PER_LONG == 32.
 */
static void pidfs_free_inum(unsigned long ino)
{
ida_free(&pidfd_inum_ida, ino);
}
#else
/*
 * pidfs_inum - pick the inode number for a pidfd (BITS_PER_LONG != 32)
 * @pid: pid whose ->ino is used
 * @ino: set to @pid->ino
 *
 * No allocation is needed in this configuration, so this cannot fail.
 *
 * Return: always 0.
 */
static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
{
	*ino = pid->ino;
	return 0;
}

/*
 * pidfs_free_inum - counterpart of pidfs_inum() (BITS_PER_LONG != 32)
 * @ino: inode number (ignored)
 *
 * pidfs_inum() allocates nothing in this configuration, so there is
 * nothing to release. A typed inline stub is used instead of a macro so
 * callers are type-checked the same way in both configurations.
 */
static inline void pidfs_free_inum(unsigned long ino)
{
}
#endif

/*
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
* implemented. Let's reject it completely until we have a clean
Expand Down Expand Up @@ -173,6 +192,7 @@ static void pidfs_evict_inode(struct inode *inode)

clear_inode(inode);
put_pid(pid);
pidfs_free_inum(inode->i_ino);
}

static const struct super_operations pidfs_sops = {
Expand All @@ -183,8 +203,10 @@ static const struct super_operations pidfs_sops = {

/*
 * pidfs_dname - build the displayed name of a pidfs dentry
 *
 * Produces a string of the form "pidfd:[<number>]" via dynamic_dname().
 *
 * NOTE(review): this body appears to show both sides of the diff hunk.
 * The first return (inode->i_ino printed with "%lu") looks like the
 * statement the commit removes, and the declarations plus the second
 * return (pid->ino printed with "%llu") look like its replacement. Read
 * literally as C, only the first return would execute - confirm against
 * the real file before relying on either form.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
d_inode(dentry)->i_ino);
struct inode *inode = d_inode(dentry);
/* The struct pid is read from the inode's i_private. */
struct pid *pid = inode->i_private;

return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino);
}

static const struct dentry_operations pidfs_dentry_operations = {
Expand Down Expand Up @@ -239,13 +261,13 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
struct file *pidfd_file;
struct path path;
int ret;
unsigned long ino;

/*
* Inode numbering for pidfs start at RESERVED_PIDS + 1.
* This avoids collisions with the root inode which is 1
* for pseudo filesystems.
*/
ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
ret = pidfs_inum(pid, &ino);
if (ret < 0)
return ERR_PTR(ret);

ret = path_from_stashed(&pid->stashed, ino, pidfs_mnt,
get_pid(pid), &path);
if (ret < 0)
return ERR_PTR(ret);
Expand All @@ -261,30 +283,3 @@ void __init pidfs_init(void)
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
}

bool is_pidfs_sb(const struct super_block *sb)
{
return sb == pidfs_mnt->mnt_sb;
}

#else /* !CONFIG_FS_PID */

/*
 * pidfs_alloc_file - create a pidfd file (variant without CONFIG_FS_PID)
 * @pid:   pid the file refers to; passed as the file's private data
 * @flags: file flags; O_RDWR is always OR-ed in
 *
 * Backs the pidfd with an anonymous inode file named "[pidfd]" that uses
 * pidfs_file_operations. A reference on @pid is taken only when the file
 * was created successfully.
 *
 * Return: the new file, or the ERR_PTR() returned by
 * anon_inode_getfile() on failure.
 */
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{
	struct file *f;

	f = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
			       flags | O_RDWR);
	if (!IS_ERR(f))
		get_pid(pid);

	return f;
}

/* Without CONFIG_FS_PID there is no pidfs mount to set up: empty stub. */
void __init pidfs_init(void) { }
/*
 * is_pidfs_sb - stub for builds without CONFIG_FS_PID
 * @sb: superblock to check (ignored)
 *
 * Return: always false in this configuration.
 */
bool is_pidfs_sb(const struct super_block *sb)
{
return false;
}
#endif
4 changes: 1 addition & 3 deletions include/linux/pid.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,8 @@ struct pid
refcount_t count;
unsigned int level;
spinlock_t lock;
#ifdef CONFIG_FS_PID
struct dentry *stashed;
unsigned long ino;
#endif
u64 ino;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
Expand Down
1 change: 0 additions & 1 deletion include/linux/pidfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@

struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
bool is_pidfs_sb(const struct super_block *sb);

#endif /* _LINUX_PID_FS_H */
4 changes: 0 additions & 4 deletions kernel/pid.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,11 @@ int pid_max = PID_MAX_DEFAULT;

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
#ifdef CONFIG_FS_PID
/*
* Pseudo filesystems start inode numbering after one. We use Reserved
* PIDs as a natural offset.
*/
static u64 pidfs_ino = RESERVED_PIDS;
#endif

/*
* PID-map pages start out as NULL, they get allocated upon
Expand Down Expand Up @@ -280,10 +278,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
#ifdef CONFIG_FS_PID
pid->stashed = NULL;
pid->ino = ++pidfs_ino;
#endif
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
Expand Down

0 comments on commit 7dc6964

Please sign in to comment.