simple_lmk: Introduce Simple Low Memory Killer for Android

This is a complete low memory killer solution for Android that is small and simple. Processes are killed according to the priorities that Android gives them, so that the least important processes are always killed first. Processes are killed until memory deficits are satisfied, as observed from kswapd struggling to free up pages. Simple LMK stops killing processes when kswapd finally goes back to sleep. The only tunables are the desired amount of memory to be freed per reclaim event and desired frequency of reclaim events. Simple LMK tries to free at least the desired amount of memory per reclaim and waits until all of its victims' memory is freed before proceeding to kill more processes. Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
acuicultor · Jun 8, 2023 · d1b1a51 · d1b1a51
1 parent fccefe4
commit d1b1a51
Show file tree

Hide file tree

Showing 6 changed files with 409 additions and 0 deletions.
diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
@@ -74,6 +74,50 @@ config ANDROID_VENDOR_HOOKS
 	  Allow vendor modules to attach to tracepoint "hooks" defined via
 	  DECLARE_HOOK or DECLARE_RESTRICTED_HOOK.
 
+config ANDROID_SIMPLE_LMK
+	bool "Simple Android Low Memory Killer"
+	depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
+	help
+	  This is a complete low memory killer solution for Android that is
+	  small and simple. Processes are killed according to the priorities
+	  that Android gives them, so that the least important processes are
+	  always killed first. Processes are killed until memory deficits are
+	  satisfied, as observed from kswapd struggling to free up pages. Simple
+	  LMK stops killing processes when kswapd finally goes back to sleep.
+
+if ANDROID_SIMPLE_LMK
+
+config ANDROID_SIMPLE_LMK_AGGRESSION
+	int "Reclaim frequency selection"
+	range 1 3
+	default 1
+	help
+	  This value determines how frequently Simple LMK will perform memory
+	  reclaims. A lower value corresponds to less frequent reclaims, which
+	  maximizes memory usage. The values are on a logarithmic scale: each
+	  step doubles the aggression, so 2 is twice as aggressive as 1, and 3
+	  is twice as aggressive as 2 (four times as aggressive as 1).
+
+	  The aggression is set as a factor of kswapd's scan depth. This means
+	  that a system with more memory will have a more expensive aggression
+	  factor compared to a system with less memory. For example, setting an
+	  aggression factor of 1 with 4 GiB of memory would be like setting a
+	  factor of 2 with 8 GiB of memory; the more memory a system has, the
+	  more expensive it is to use a lower value.
+
+	  Choosing a value of 1 here works well with systems that have 4 GiB of
+	  memory. If the default doesn't work well, then this value should be
+	  tweaked based on empirical results using different values.
+
+config ANDROID_SIMPLE_LMK_MINFREE
+	int "Minimum MiB of memory to free per reclaim"
+	range 8 512
+	default 100
+	help
+	  Simple LMK will try to free at least this much memory per reclaim.
+
+endif
+
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
 obj-$(CONFIG_ANDROID_DEBUG_SYMBOLS)	+= debug_symbols.o
 obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o
+obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+
+#define pr_fmt(fmt) "simple_lmk: " fmt
+
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/oom.h>
+#include <linux/sort.h>
+#include <linux/version.h>
+
+/* The sched_param struct is located elsewhere in newer kernels */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+#include <uapi/linux/sched/types.h>
+#endif
+
+/*
+ * SEND_SIG_FORCED isn't present in newer kernels, so pick the siginfo value
+ * passed to do_send_sig_info() based on the kernel version being built.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)
+#define SIG_INFO_TYPE SEND_SIG_FORCED
+#else
+#define SIG_INFO_TYPE SEND_SIG_PRIV
+#endif
+
+/*
+ * The group argument to do_send_sig_info is different in newer kernels: a
+ * boolean before 4.18 and a pid type from 4.18 onwards.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define KILL_GROUP_TYPE true
+#else
+#define KILL_GROUP_TYPE PIDTYPE_TGID
+#endif
+
+/*
+ * The minimum number of pages to free per reclaim. The Kconfig value is given
+ * in MiB and is converted to a page count here.
+ */
+#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
+
+/* Kill up to this many victims per reclaim; also the size of victims[] */
+#define MAX_VICTIMS 1024
+
+/*
+ * One kill candidate recorded by find_victims().
+ *
+ * @tsk:  the task returned by find_lock_task_mm() for the candidate
+ * @mm:   that task's mm; simple_lmk_mm_freed() matches freed mms against it
+ * @size: the mm's RSS as reported by get_mm_rss(); treated as a page count
+ */
+struct victim_info {
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	unsigned long size;
+};
+
+/*
+ * Pulled from the Android framework. Lower adj means higher priority.
+ *
+ * scan_and_kill() walks this table from the first entry to the last, so the
+ * highest adj values are searched for victims first.
+ */
+static const short adj_prio[] = {
+	906, /* CACHED_APP_MAX_ADJ */
+	905, /* Cached app */
+	904, /* Cached app */
+	903, /* Cached app */
+	902, /* Cached app */
+	901, /* Cached app */
+	900, /* CACHED_APP_MIN_ADJ */
+	800, /* SERVICE_B_ADJ */
+	700, /* PREVIOUS_APP_ADJ */
+	600, /* HOME_APP_ADJ */
+	500, /* SERVICE_ADJ */
+	400, /* HEAVY_WEIGHT_APP_ADJ */
+	300, /* BACKUP_APP_ADJ */
+	200, /* PERCEPTIBLE_APP_ADJ */
+	100, /* VISIBLE_APP_ADJ */
+	0    /* FOREGROUND_APP_ADJ */
+};
+
+/* Candidates for the current reclaim; filled in by scan_and_kill() */
+static struct victim_info victims[MAX_VICTIMS];
+/* The reclaim thread sleeps here until simple_lmk_decide_reclaim() wakes it */
+static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
+/* Completed by simple_lmk_mm_freed() once every victim's mm has been freed */
+static DECLARE_COMPLETION(reclaim_done);
+/* Number of entries at the start of victims[] that are being killed */
+static int victims_to_kill;
+/* Set by simple_lmk_decide_reclaim(), cleared by simple_lmk_stop_reclaim() */
+static bool needs_reclaim;
+
+/*
+ * sort() comparator that orders victims by descending size.
+ *
+ * @lhs_ptr: pointer to the first struct victim_info
+ * @rhs_ptr: pointer to the second struct victim_info
+ *
+ * Returns a negative value when lhs is larger than rhs (so lhs sorts first),
+ * a positive value when lhs is smaller, and zero when the sizes are equal.
+ *
+ * The sizes are compared rather than subtracted: the difference of two
+ * unsigned longs does not fit in the int return value, so a subtraction could
+ * be truncated into the wrong sign or into a false zero.
+ */
+static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
+{
+	const struct victim_info *lhs = lhs_ptr;
+	const struct victim_info *rhs = rhs_ptr;
+
+	if (lhs->size > rhs->size)
+		return -1;
+	if (lhs->size < rhs->size)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Check whether a task's thread group already has an entry in a victim array.
+ *
+ * @varr: victim array to search
+ * @vlen: number of populated entries in @varr
+ * @vtsk: task to look for
+ *
+ * Returns true if any of the first @vlen victims is in the same thread group
+ * as @vtsk, false otherwise.
+ */
+static bool vtsk_is_duplicate(struct victim_info *varr, int vlen,
+			      struct task_struct *vtsk)
+{
+	/* The search has no side effects, so walking backwards is equivalent */
+	while (vlen-- > 0) {
+		if (same_thread_group(varr[vlen].tsk, vtsk))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Append every process whose oom_score_adj equals @target_adj to @varr.
+ *
+ * @varr:       victim array to append to
+ * @vindex:     in/out index of the next free slot in @varr
+ * @vmaxlen:    capacity of @varr
+ * @target_adj: the oom_score_adj value to look for
+ *
+ * Every task stored in @varr is left locked by find_lock_task_mm(); the caller
+ * must eventually call task_unlock() on each of them. The entries added by
+ * this call are sorted in descending order of size.
+ *
+ * Returns the total number of RSS pages used by the victims that were added.
+ */
+static unsigned long find_victims(struct victim_info *varr, int *vindex,
+				  int vmaxlen, short target_adj)
+{
+	unsigned long pages_found = 0;
+	int old_vindex = *vindex;
+	struct task_struct *tsk;
+
+	/* Never write past the end of the victim array */
+	if (old_vindex >= vmaxlen)
+		return 0;
+
+	for_each_process(tsk) {
+		struct task_struct *vtsk;
+		unsigned long tasksize;
+
+		/*
+		 * Search for tasks with the targeted importance (adj). Only
+		 * the adj values that the caller asks for are matched, and
+		 * tasks for which find_lock_task_mm() finds no mm are skipped
+		 * below. Although oom_score_adj can still be changed while
+		 * this code runs, it doesn't really matter. We just need to
+		 * make sure that if the adj changes, we won't deadlock trying
+		 * to lock a task that we locked earlier.
+		 */
+		if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj ||
+		    vtsk_is_duplicate(varr, *vindex, tsk))
+			continue;
+
+		vtsk = find_lock_task_mm(tsk);
+		if (!vtsk)
+			continue;
+
+		/* Store this potential victim away for later */
+		tasksize = get_mm_rss(vtsk->mm);
+		varr[*vindex].tsk = vtsk;
+		varr[*vindex].mm = vtsk->mm;
+		varr[*vindex].size = tasksize;
+
+		/* Keep track of the number of pages that have been found */
+		pages_found += tasksize;
+
+		/* Make sure there's space left in the victim array */
+		if (++*vindex >= vmaxlen)
+			break;
+	}
+
+	/*
+	 * Sort the victims in descending order of size to prioritize killing
+	 * the larger ones first.
+	 */
+	if (pages_found)
+		sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr),
+		     victim_size_cmp, NULL);
+
+	return pages_found;
+}
+
+/*
+ * Decide how many of the first @vlen victims in @varr must be killed to free
+ * at least @pages_needed pages, and unlock the rest.
+ *
+ * @varr:         victim array, every entry of which is still task-locked
+ * @vlen:         number of entries in @varr to consider
+ * @pages_needed: number of pages that the kills should free
+ *
+ * Victims are counted in array order until their combined size reaches
+ * @pages_needed. Every victim after that point is released with task_unlock()
+ * and will not be killed; the counted victims stay locked.
+ *
+ * Returns the number of victims, from the start of @varr, that should be
+ * killed.
+ */
+static int process_victims(struct victim_info *varr, int vlen,
+			   unsigned long pages_needed)
+{
+	unsigned long pages_found = 0;
+	int i, nr_to_kill = 0;
+
+	/*
+	 * Calculate the number of tasks that need to be killed and quickly
+	 * release the references to those that'll live.
+	 */
+	for (i = 0; i < vlen; i++) {
+		/* Use the array that was passed in, not the global one */
+		struct victim_info *victim = &varr[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* The victim's mm lock is taken in find_victims; release it */
+		if (pages_found >= pages_needed) {
+			task_unlock(vtsk);
+			continue;
+		}
+
+		pages_found += victim->size;
+		nr_to_kill++;
+	}
+
+	return nr_to_kill;
+}
+
+/*
+ * Pick victims worth at least @pages_needed pages, kill them, and block until
+ * simple_lmk_mm_freed() completes reclaim_done.
+ *
+ * @pages_needed: number of pages that this reclaim should try to free
+ *
+ * Victims are gathered one adj level at a time, in adj_prio order, until
+ * enough pages are found or the victim array is full. Returns early without
+ * killing anything if no victims were found.
+ */
+static void scan_and_kill(unsigned long pages_needed)
+{
+	int i, nr_to_kill = 0, nr_victims = 0;
+	unsigned long pages_found = 0;
+
+	/*
+	 * Hold the tasklist lock so tasks don't disappear while scanning. This
+	 * is preferred to holding an RCU read lock so that the list of tasks
+	 * is guaranteed to be up to date.
+	 */
+	read_lock(&tasklist_lock);
+	for (i = 0; i < ARRAY_SIZE(adj_prio); i++) {
+		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
+					    adj_prio[i]);
+		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	/* Pretty unlikely but it can happen */
+	if (unlikely(!nr_victims))
+		return;
+
+	/* First round of victim processing to weed out unneeded victims */
+	nr_to_kill = process_victims(victims, nr_victims, pages_needed);
+
+	/*
+	 * Try to kill as few of the chosen victims as possible by sorting the
+	 * chosen victims by size, which means larger victims that have a lower
+	 * adj can be killed in place of smaller victims with a high adj.
+	 */
+	sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
+
+	/* Second round of victim processing to finally select the victims */
+	nr_to_kill = process_victims(victims, nr_to_kill, pages_needed);
+
+	/*
+	 * Kill the victims. The batch size is published first so that
+	 * simple_lmk_mm_freed() knows how many entries of victims[] to check.
+	 */
+	WRITE_ONCE(victims_to_kill, nr_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* victim->size is in pages; the shift converts it to KiB */
+		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
+			vtsk->signal->oom_score_adj,
+			victim->size << (PAGE_SHIFT - 10));
+
+		/* Accelerate the victim's death by forcing the kill signal */
+		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, KILL_GROUP_TYPE);
+
+		/* Grab a reference to the victim for later before unlocking */
+		get_task_struct(vtsk);
+		task_unlock(vtsk);
+	}
+
+	/* Try to speed up the death process now that we can schedule again */
+	for (i = 0; i < nr_to_kill; i++) {
+		struct task_struct *vtsk = victims[i].tsk;
+
+		/* Increase the victim's priority to make it die faster */
+		set_user_nice(vtsk, MIN_NICE);
+
+		/* Allow the victim to run on any CPU */
+		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
+
+		/* Finally release the victim reference acquired earlier */
+		put_task_struct(vtsk);
+	}
+
+	/* Wait until all the victims die */
+	wait_for_completion(&reclaim_done);
+}
+
+/*
+ * Main loop of the reclaim kthread.
+ *
+ * @data: unused
+ *
+ * The thread switches itself to SCHED_FIFO and then sleeps on oom_waitq until
+ * it is either asked to stop or needs_reclaim is set. While needs_reclaim
+ * stays set, it keeps killing batches of victims.
+ *
+ * Returns 0 once kthread_should_stop() has been observed to be true.
+ */
+static int simple_lmk_reclaim_thread(void *data)
+{
+	static const struct sched_param rt_param = {
+		.sched_priority = MAX_RT_PRIO - 1
+	};
+	bool stop_requested;
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &rt_param);
+
+	for (;;) {
+		/* Record the stop check that actually ended the wait */
+		wait_event(oom_waitq,
+			   (stop_requested = kthread_should_stop()) ||
+			   READ_ONCE(needs_reclaim));
+		if (stop_requested)
+			break;
+
+		/*
+		 * Each pass kills a batch of processes and waits for their
+		 * memory to be freed, then sleeps for 20 ms so that OOM'd
+		 * allocations get a chance to scavenge the newly-freed pages.
+		 * Keep going for as long as reclaim is still requested.
+		 */
+		for (;;) {
+			scan_and_kill(MIN_FREE_PAGES);
+			msleep(20);
+			if (!READ_ONCE(needs_reclaim))
+				break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Request a reclaim when the reported kswapd priority matches the configured
+ * aggression level.
+ *
+ * @kswapd_priority: priority value reported by the caller
+ *
+ * Nothing happens unless @kswapd_priority equals
+ * CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION. When it does, needs_reclaim is flipped
+ * from false to true, and oom_waitq is woken only by the caller that actually
+ * performed the flip.
+ */
+void simple_lmk_decide_reclaim(int kswapd_priority)
+{
+	if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION &&
+	    !cmpxchg(&needs_reclaim, false, true))
+		wake_up(&oom_waitq);
+}
+
+/*
+ * Clear the reclaim request. The reclaim thread re-reads needs_reclaim after
+ * each batch, so it stops killing once its current batch has finished.
+ */
+void simple_lmk_stop_reclaim(void)
+{
+	WRITE_ONCE(needs_reclaim, false);
+}
+
+/*
+ * Account for a freed mm if it belonged to a victim of the current batch.
+ *
+ * @mm: the mm being reported as freed
+ *      NOTE(review): the call site is not in this file; presumably the mm
+ *      teardown path - confirm.
+ *
+ * The first victims_to_kill entries of victims[] are searched for @mm. A match
+ * is claimed by atomically swapping the entry's mm to NULL, so each entry is
+ * counted at most once. When the number of claimed entries reaches the batch
+ * size, the counters are reset and reclaim_done is completed, which releases
+ * the wait in scan_and_kill().
+ */
+void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+	/* Number of victims in the current batch whose mm has been freed */
+	static atomic_t nr_killed = ATOMIC_INIT(0);
+	int i, nr_to_kill;
+
+	nr_to_kill = READ_ONCE(victims_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		if (cmpxchg(&victims[i].mm, mm, NULL) == mm) {
+			/* Only the caller that frees the last victim gets here */
+			if (atomic_inc_return(&nr_killed) == nr_to_kill) {
+				WRITE_ONCE(victims_to_kill, 0);
+				nr_killed = (atomic_t)ATOMIC_INIT(0);
+				complete(&reclaim_done);
+			}
+			break;
+		}
+	}
+}
+
+/*
+ * Parameter setter that initializes Simple LMK when lmkd in Android writes to
+ * the minfree parameter.
+ *
+ * @val: the value that was written; ignored
+ * @kp:  the kernel parameter being set; ignored
+ *
+ * Only the first call starts the "simple_lmkd" reclaim thread; later calls do
+ * nothing. Failing to start the thread triggers BUG_ON().
+ *
+ * Always returns 0.
+ */
+static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
+{
+	static bool init_done;
+
+	/* cmpxchg returns the old value, so false means this is the first call */
+	if (!cmpxchg(&init_done, false, true)) {
+		struct task_struct *lmkd;
+
+		lmkd = kthread_run(simple_lmk_reclaim_thread, NULL,
+				   "simple_lmkd");
+		BUG_ON(IS_ERR(lmkd));
+	}
+
+	return 0;
+}
+
+/* Parameter ops with only a setter, which starts Simple LMK on first write */
+static const struct kernel_param_ops simple_lmk_init_ops = {
+	.set = simple_lmk_init_set
+};
+
+/*
+ * Needed to prevent Android from thinking there's no LMK and thus rebooting.
+ * The prefix override makes the parameter name lowmemorykiller.minfree; it is
+ * write-only (0200) and writes to it are handled by simple_lmk_init_set().
+ */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "lowmemorykiller."
+module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);