Skip to content
Browse files

cgroup: Remove call to synchronize_rcu in cgroup_attach_task

synchronize_rcu can be very expensive, averaging 100 ms in
some cases.  In cgroup_attach_task, it is used to prevent
a task->cgroups pointer dereferenced in an RCU read side
critical section from being invalidated, by delaying the
call to put_css_set until after an RCU grace period.

To avoid the call to synchronize_rcu, make the put_css_set
call rcu-safe by moving the deletion of the css_set links
into free_css_set_work, scheduled by the rcu callback
free_css_set_rcu.

The decrement of the cgroup refcount is no longer
synchronous with the call to put_css_set, which can result
in the cgroup refcount staying positive after the last call
to cgroup_attach_task returns.  To allow the cgroup to be
deleted with cgroup_rmdir synchronously after
cgroup_attach_task, have rmdir check the refcount of all
associated css_sets.  If cgroup_rmdir is called on a cgroup
for which the css_sets all have refcount zero but the
cgroup refcount is nonzero, reuse the rmdir waitqueue to
block the rmdir until free_css_set_work is called.

Signed-off-by: Colin Cross <ccross@android.com>

Conflicts:

	kernel/cgroup.c

Change-Id: I3b3f245c8f5e2e5d33f1e54178b2bb6ef10a0817
  • Loading branch information...
1 parent 4a12178 commit befae2f2c9137d6af5c8b38670f00441019032bb @colincross colincross committed Nov 23, 2010
Showing with 87 additions and 48 deletions.
  1. +1 −0 include/linux/cgroup.h
  2. +86 −48 kernel/cgroup.c
View
1 include/linux/cgroup.h
@@ -247,6 +247,7 @@ struct css_set {
/* For RCU-protected deletion */
struct rcu_head rcu_head;
+ struct work_struct work;
};
/*
View
134 kernel/cgroup.c
@@ -287,6 +287,33 @@ static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
+/*
+ * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to goes down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
+ */
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ wake_up_all(&cgroup_rmdir_waitq);
+}
+
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
/*
@@ -346,6 +373,37 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
return &css_set_table[index];
}
+static void free_css_set_work(struct work_struct *work)
+{
+ struct css_set *cg = container_of(work, struct css_set, work);
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ write_lock(&css_set_lock);
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
+ struct cgroup *cgrp = link->cgrp;
+ list_del(&link->cg_link_list);
+ list_del(&link->cgrp_link_list);
+ if (atomic_dec_and_test(&cgrp->count)) {
+ check_for_release(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ }
+ kfree(link);
+ }
+ write_unlock(&css_set_lock);
+
+ kfree(cg);
+}
+
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+ struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+ INIT_WORK(&cg->work, free_css_set_work);
+ schedule_work(&cg->work);
+}
+
/* We don't maintain the lists running through each css_set to its
* task until after the first call to cgroup_iter_start(). This
* reduces the fork()/exit() overhead for people who have cgroups
@@ -362,8 +420,6 @@ static inline void get_css_set(struct css_set *cg)
static void put_css_set(struct css_set *cg)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
@@ -377,23 +433,11 @@ static void put_css_set(struct css_set *cg)
return;
}
- /* This css_set is dead. unlink it and release cgroup refcounts */
hlist_del(&cg->hlist);
css_set_count--;
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
- struct cgroup *cgrp = link->cgrp;
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- if (atomic_dec_and_test(&cgrp->count))
- check_for_release(cgrp);
-
- kfree(link);
- }
-
write_unlock(&css_set_lock);
- kfree_rcu(cg, rcu_head);
+ call_rcu(&cg->rcu_head, free_css_set_rcu);
}
/*
@@ -725,9 +769,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* cgroup_attach_task(), which overwrites one tasks cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
+ * task->cgroups without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
@@ -917,33 +961,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
}
/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
-/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
@@ -1875,6 +1892,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
struct cgroupfs_root *root = cgrp->root;
struct cgroup_taskset tset = { };
struct css_set *newcg;
+ struct css_set *cg;
/* @tsk either already exited or can't exit until the end */
if (tsk->flags & PF_EXITING)
@@ -1910,15 +1928,20 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
+ task_lock(tsk);
+ cg = tsk->cgroups;
+ get_css_set(cg);
+ task_unlock(tsk);
+
cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(cgrp, &tset);
}
-
set_bit(CGRP_RELEASABLE, &cgrp->flags);
- synchronize_rcu();
+ /* put_css_set will not destroy cg until after an RCU grace period */
+ put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -3931,6 +3954,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
return !failed;
}
+/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
+ * Must be called with css_set_lock held */
+static int cgroup_css_sets_empty(struct cgroup *cgrp)
+{
+ struct cg_cgroup_link *link;
+
+ list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
+ struct css_set *cg = link->cg;
+ if (atomic_read(&cg->refcount) > 0)
+ return 0;
+ }
+
+ return 1;
+}
+
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
@@ -3943,7 +3981,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
/* the vfs holds both inode->i_mutex already */
again:
mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
+ if (!cgroup_css_sets_empty(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
@@ -3976,7 +4014,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;

0 comments on commit befae2f

Please sign in to comment.
Something went wrong with that request. Please try again.