The VRQ Scheduler based on Con Kolivas' BFS 0472
VRQ comments

=== Below are comments from BFS 0472 ===

A single shared runqueue, O(n), strict-fairness, earliest-deadline-first design.

Excellent throughput and latency for 1 to many CPUs on commodity desktop
and server hardware.
Not recommended for 4096-CPU machines.
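
As a sketch of what that one-line description means (illustrative C with
made-up names, not the actual bfs.c code): every queued task carries a
virtual deadline, and picking the next task is a single O(n) walk of one
shared queue looking for the earliest deadline.

#include <stddef.h>

/* Sketch only: one shared queue, earliest-deadline-first selection. */
struct sketch_task {
        struct sketch_task *next;
        unsigned long long deadline;    /* virtual deadline: the EDF key */
};

static struct sketch_task *pick_next(struct sketch_task *queue)
{
        struct sketch_task *t, *best = NULL;

        for (t = queue; t; t = t->next)         /* O(n) over queued tasks */
                if (!best || t->deadline < best->deadline)
                        best = t;
        return best;
}

The single shared queue is why no balancing between per-CPU runqueues is
needed, and the linear scan is why very large CPU and task counts are a
poor fit.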

Scalability is optimal when your workload is equal to the number of CPUs
on bfs; i.e. you should ONLY do make -j4 on a quad core, -j2 on a dual
core, and so on.

Interactive mode is enabled by default but can be disabled for improved
throughput:

echo 0 > /proc/sys/kernel/interactive

Also features SCHED_IDLEPRIO and SCHED_ISO scheduling policies.
You do NOT need to use these policies for good performance; they are
purely optional, for even better performance in extreme conditions.

To run something idleprio, use schedtool like so:

schedtool -D -e make -j4

To run something isoprio, use schedtool like so:

schedtool -I -e amarok
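
A program can also request these policies directly with
sched_setscheduler(). A minimal sketch, assuming the policy number this
patch uses for SCHED_ISO (4, the slot mainline reserves for it; glibc
headers do not define it):

#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4     /* BFS policy number; assumption, not in glibc */
#endif

int main(int argc, char *argv[])
{
        struct sched_param sp = { .sched_priority = 0 };

        /* Request SCHED_ISO for ourselves, then exec the workload,
           which is roughly what schedtool -I -e does. */
        if (sched_setscheduler(0, SCHED_ISO, &sp) == -1)
                perror("sched_setscheduler"); /* fails on non-BFS kernels */
        if (argc > 1)
                execvp(argv[1], &argv[1]);
        return 0;
}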

Includes configurable SMT-nice support for better handling of nice levels
and scheduling policies across SMT (aka hyperthread) sibling CPUs.
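
In sketch form (illustrative only; the real checks live in the SMT_NICE
code this commit adds): before a task runs on a logical CPU, compare it
with what the hyperthread sibling is running, and idle the slot instead
when running would unfairly take CPU power from the sibling.

#include <sched.h>

/* Sketch with made-up names: may a task at nice level 'nice' run now,
   given the hyperthread sibling runs at 'sib_nice' under 'sib_policy'? */
static int smt_may_run(int nice, int sib_nice, int sib_policy)
{
        if (sib_policy == SCHED_FIFO || sib_policy == SCHED_RR)
                return 0;        /* never compete with a realtime sibling */
        return nice <= sib_nice; /* run only if at least as high priority */
}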

Includes accurate sub-tick accounting of tasks, so userspace-reported
CPU usage may be very different if you have very short-lived tasks.
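
A toy illustration of why (plain userspace C, not kernel code): with
tick-granular accounting a task is only billed when a timer tick fires
while it is on the CPU, so a task that always runs in sub-tick bursts can
be reported as using no CPU time at all.

#include <stdio.h>

#define TICK_NS 4000000ULL              /* one 4 ms tick, i.e. HZ=250 */

int main(void)
{
        unsigned long long ran_ns = 1000000;   /* task really ran 1 ms */

        /* tick accounting: only whole ticks seen while running count */
        unsigned long long by_tick = (ran_ns / TICK_NS) * TICK_NS;
        /* sub-tick accounting: clock read at every context switch */
        unsigned long long by_ns = ran_ns;

        printf("tick-based: %llu ns, sub-tick: %llu ns\n", by_tick, by_ns);
        return 0;
}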

-ck
cchalpha committed Oct 23, 2017
1 parent 851843f commit 779deb3
Showing 34 changed files with 8,624 additions and 39 deletions.
357 changes: 357 additions & 0 deletions Documentation/scheduler/sched-BFS.txt

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions Documentation/sysctl/kernel.txt
@@ -39,6 +39,7 @@ show up in /proc/sys/kernel:
- hung_task_timeout_secs
- hung_task_warnings
- kexec_load_disabled
- iso_cpu
- kptr_restrict
- l2cr [ PPC only ]
- modprobe ==> Documentation/debugging-modules.txt
@@ -72,6 +73,7 @@ show up in /proc/sys/kernel:
- randomize_va_space
- real-root-dev ==> Documentation/admin-guide/initrd.rst
- reboot-cmd [ SPARC only ]
- rr_interval
- rtsig-max
- rtsig-nr
- sem
@@ -394,6 +396,16 @@ When kptr_restrict is set to (2), kernel pointers printed using

==============================================================

iso_cpu: (BFS CPU scheduler only).

This sets the percentage of CPU that unprivileged SCHED_ISO tasks can
run at, effectively at realtime priority, averaged over a rolling five
seconds across the -whole- system, meaning all CPUs.

Set to 70 (percent) by default.
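
For example, to raise the cap to 90 percent (the value here is purely
illustrative):

echo 90 > /proc/sys/kernel/iso_cpu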

==============================================================

l2cr: (PPC only)

This flag controls the L2 cache of G3 processor boards. If
@@ -810,6 +822,20 @@ rebooting. ???

==============================================================

rr_interval: (BFS CPU scheduler only)

This is the smallest duration that any CPU process scheduling unit
will run for. Increasing this value can increase throughput of CPU-bound
tasks substantially, but at the expense of increased latencies overall.
Conversely, decreasing it will decrease average and maximum latencies but
at the expense of throughput. This value is in milliseconds and the
default value chosen depends on the number of CPUs available at scheduler
initialisation, with a minimum of 6.

Valid values are from 1 to 1000.
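
For example, to favour throughput on a batch or build machine (the value
here is purely illustrative):

echo 300 > /proc/sys/kernel/rr_interval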

==============================================================

rtsig-max & rtsig-nr:

The file rtsig-max can be used to tune the maximum number
5 changes: 0 additions & 5 deletions arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct task_struct *spusched_task;
static struct timer_list spusched_timer;
static struct timer_list spuloadavg_timer;

/*
* Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
*/
#define NORMAL_PRIO 120

/*
* Frequency of the spu scheduler tick. By default we do one SPU scheduler
* tick for every 10 CPU scheduler ticks.
22 changes: 19 additions & 3 deletions arch/x86/Kconfig
@@ -945,10 +945,26 @@ config SCHED_SMT
depends on SMP
---help---
SMT scheduler support improves the CPU scheduler's decision making
-	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  when dealing with Intel P4/Core 2 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
N here.

config SMT_NICE
bool "SMT (Hyperthreading) aware nice priority and policy support"
depends on SCHED_BFS && SCHED_SMT
default y
---help---
Enabling Hyperthreading on Intel CPUs decreases the effectiveness
of the use of 'nice' levels and different scheduling policies
(e.g. realtime) due to sharing of CPU power between hyperthreads.
SMT nice support makes each logical CPU aware of what is running on
its hyperthread siblings, maintaining appropriate distribution of
CPU according to nice levels and scheduling policies at the expense
of slightly increased overhead.

If unsure say Y here.


config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
@@ -2094,7 +2110,7 @@ config HOTPLUG_CPU
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
default n
-	depends on HOTPLUG_CPU
+	depends on HOTPLUG_CPU && !SCHED_BFS
---help---
Set whether default state of cpu0_hotpluggable is on or off.

@@ -2123,7 +2139,7 @@ config BOOTPARAM_HOTPLUG_CPU0
config DEBUG_HOTPLUG_CPU0
def_bool n
prompt "Debug CPU0 hotplug"
-	depends on HOTPLUG_CPU
+	depends on HOTPLUG_CPU && !SCHED_BFS
---help---
Enabling this option offlines CPU0 (if CPU0 can be offlined) as
soon as possible and boots up userspace with CPU0 offlined. User
5 changes: 5 additions & 0 deletions drivers/cpufreq/cpufreq.c
@@ -25,6 +25,7 @@
#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
@@ -1925,6 +1926,10 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,

/* Make sure that target_freq is within supported range */
target_freq = clamp_val(target_freq, policy->min, policy->max);
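	/*
	 * cpu_scaling()/cpu_nonscaling() are helpers this patch introduces:
	 * they record a per-CPU hint, where a "nonscaling" CPU is one
	 * already running at its maximum frequency. The scheduler side of
	 * the patch can use the hint to prefer full-speed CPUs when
	 * deciding where tasks run.
	 */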
if (target_freq >= policy->max)
cpu_nonscaling(policy->cpu);
else
cpu_scaling(policy->cpu);

pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n",
policy->cpu, target_freq, relation, old_target_freq);
4 changes: 2 additions & 2 deletions drivers/cpufreq/cpufreq_conservative.c
@@ -31,8 +31,8 @@ struct cs_dbs_tuners {
};

/* Conservative governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
-#define DEF_FREQUENCY_DOWN_THRESHOLD (20)
+#define DEF_FREQUENCY_UP_THRESHOLD (63)
+#define DEF_FREQUENCY_DOWN_THRESHOLD (26)
#define DEF_FREQUENCY_STEP (5)
#define DEF_SAMPLING_DOWN_FACTOR (1)
#define MAX_SAMPLING_DOWN_FACTOR (10)
4 changes: 2 additions & 2 deletions drivers/cpufreq/cpufreq_ondemand.c
@@ -21,7 +21,7 @@
#include "cpufreq_ondemand.h"

/* On-demand governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
+#define DEF_FREQUENCY_UP_THRESHOLD (63)
#define DEF_SAMPLING_DOWN_FACTOR (1)
#define MAX_SAMPLING_DOWN_FACTOR (100000)
#define MICRO_FREQUENCY_UP_THRESHOLD (95)
@@ -130,7 +130,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
}

/*
- * Every sampling_rate, we check, if current idle time is less than 20%
+ * Every sampling_rate, we check, if current idle time is less than 37%
* (default), then we try to increase frequency. Else, we adjust the frequency
* proportional to load.
*/
9 changes: 7 additions & 2 deletions drivers/cpufreq/intel_pstate.c
@@ -1325,8 +1325,13 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate)
vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
vid = ceiling_fp(vid_fp);
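
	/* Same BFS hook as in __cpufreq_driver_target() above: record
	   whether this CPU still has frequency headroom (cpu_scaling) or
	   is already at full speed (cpu_nonscaling). */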

-	if (pstate > cpudata->pstate.max_pstate)
-		vid = cpudata->vid.turbo;
+	if (pstate < cpudata->pstate.max_pstate)
+		cpu_scaling(cpudata->cpu);
+	else {
+		if (pstate > cpudata->pstate.max_pstate)
+			vid = cpudata->vid.turbo;
+		cpu_nonscaling(cpudata->cpu);
+	}

return val | vid;
}
2 changes: 1 addition & 1 deletion fs/proc/base.c
@@ -464,7 +464,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
-		   (unsigned long long)task->se.sum_exec_runtime,
+		   (unsigned long long)tsk_seruntime(task),
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);

76 changes: 73 additions & 3 deletions include/linux/init_task.h
@@ -177,8 +177,6 @@ extern struct cred init_cred;
# define INIT_VTIME(tsk)
#endif

#define INIT_TASK_COMM "swapper"

#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
@@ -229,6 +227,78 @@ extern struct cred init_cred;
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
#ifdef CONFIG_SCHED_BFS
#define INIT_TASK_COMM "BFS"
#define INIT_TASK(tsk) \
{ \
.state = 0, \
.stack = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = PF_KTHREAD, \
.prio = NORMAL_PRIO, \
.static_prio = MAX_PRIO-20, \
.normal_prio = NORMAL_PRIO, \
.deadline = 0, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
.active_mm = &init_mm, \
.restart_block = { \
.fn = do_no_restart_syscall, \
}, \
.run_list = LIST_HEAD_INIT(tsk.run_list), \
.time_slice = HZ, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
.parent = &tsk, \
.children = LIST_HEAD_INIT(tsk.children), \
.sibling = LIST_HEAD_INIT(tsk.sibling), \
.group_leader = &tsk, \
RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
RCU_POINTER_INITIALIZER(cred, &init_cred), \
.comm = INIT_TASK_COMM, \
.thread = INIT_THREAD, \
.fs = &init_fs, \
.files = &init_files, \
.signal = &init_signals, \
.sighand = &init_sighand, \
.nsproxy = &init_nsproxy, \
.pending = { \
.list = LIST_HEAD_INIT(tsk.pending.list), \
.signal = {{0}}}, \
.blocked = {{0}}, \
.alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
.timer_slack_ns = 50000, /* 50 usec default slack */ \
.pids = { \
[PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
[PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
}, \
.thread_group = LIST_HEAD_INIT(tsk.thread_group), \
.thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
INIT_IDS \
INIT_PERF_EVENTS(tsk) \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_TASK_RCU_TASKS(tsk) \
INIT_CPUSET_SEQ(tsk) \
INIT_RT_MUTEXES(tsk) \
INIT_PREV_CPUTIME(tsk) \
INIT_VTIME(tsk) \
INIT_NUMA_BALANCING(tsk) \
INIT_KASAN(tsk) \
}
#else /* CONFIG_SCHED_BFS */
#define INIT_TASK_COMM "swapper"
#define INIT_TASK(tsk) \
{ \
INIT_TASK_TI(tsk) \
@@ -306,7 +376,7 @@ extern struct cred init_cred;
INIT_LIVEPATCH(tsk) \
INIT_TASK_SECURITY \
}

#endif /* CONFIG_SCHED_BFS */

/* Attach to the init_task data structure for proper alignment */
#define __init_task_data __attribute__((__section__(".data..init_task")))
2 changes: 2 additions & 0 deletions include/linux/ioprio.h
@@ -51,6 +51,8 @@ enum {
*/
static inline int task_nice_ioprio(struct task_struct *task)
{
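	/* Added by this patch: SCHED_ISO tasks get the best (0) I/O nice
	   level, matching their elevated CPU scheduling treatment. */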
if (iso_task(task))
return 0;
return (task_nice(task) + 20) / 5;
}

2 changes: 1 addition & 1 deletion include/linux/jiffies.h
@@ -167,7 +167,7 @@ static inline u64 get_jiffies_64(void)
* Have the 32 bit jiffies value wrap 5 minutes after boot
* so jiffies wrap bugs show up earlier.
*/
-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))

/*
* Change timeval to jiffies, trying to avoid the
