@@ -59,6 +59,20 @@ config EXPERIMENTAL
you say Y here, you will be offered the choice of using features or
drivers that are currently considered to be in the alpha-test phase.

config SCHED_BFS
bool "BFS cpu scheduler"
---help---
The Brain Fuck CPU Scheduler for excellent interactivity and
responsiveness on the desktop and solid scalability on normal
hardware and commodity servers. Not recommended for 4096 CPUs.

Currently incompatible with the Group CPU scheduler and the RCU TORTURE
TEST, so these options are disabled.

Say Y here.
default y


config BROKEN
bool

@@ -333,7 +347,7 @@ choice
# Kind of a stub config for the pure tick based cputime accounting
config TICK_CPU_ACCOUNTING
bool "Simple tick based cputime accounting"
depends on !S390 && !NO_HZ_FULL
depends on !S390 && !NO_HZ_FULL && !SCHED_BFS
help
This is the basic tick based cputime accounting that maintains
statistics about user, system and idle time spent on per jiffies
@@ -356,7 +370,7 @@ config VIRT_CPU_ACCOUNTING_NATIVE

config VIRT_CPU_ACCOUNTING_GEN
bool "Full dynticks CPU time accounting"
depends on HAVE_CONTEXT_TRACKING && 64BIT
depends on HAVE_CONTEXT_TRACKING && 64BIT && !SCHED_BFS
select VIRT_CPU_ACCOUNTING
select CONTEXT_TRACKING
help
@@ -826,6 +840,7 @@ config NUMA_BALANCING
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
depends on SMP && NUMA && MIGRATION
depends on !SCHED_BFS
help
This option adds support for automatic NUMA aware memory/task placement.
The mechanism is quite primitive and is based on migrating memory when
@@ -888,6 +903,7 @@ config PROC_PID_CPUSET

config CGROUP_CPUACCT
bool "Simple CPU accounting cgroup subsystem"
depends on !SCHED_BFS
help
Provides a simple Resource Controller for monitoring the
total CPU consumed by the tasks in a cgroup.
@@ -990,6 +1006,7 @@ config CGROUP_PERF

menuconfig CGROUP_SCHED
bool "Group CPU scheduler"
depends on !SCHED_BFS
default n
help
This feature lets CPU scheduler recognize task groups and control CPU
@@ -1154,6 +1171,7 @@ config UIDGID_STRICT_TYPE_CHECKS

config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
depends on !SCHED_BFS
select EVENTFD
select CGROUPS
select CGROUP_SCHED
@@ -1571,38 +1589,8 @@ config COMPAT_BRK

On non-ancient distros (post-2000 ones) N is usually a safe choice.

choice
prompt "Choose SLAB allocator"
default SLUB
help
This option allows to select a slab allocator.

config SLAB
bool "SLAB"
help
The regular slab allocator that is established and known to work
well in all environments. It organizes cache hot objects in
per cpu and per node queues.

config SLUB
bool "SLUB (Unqueued Allocator)"
help
SLUB is a slab allocator that minimizes cache line usage
instead of managing queues of cached objects (SLAB approach).
Per cpu caching is realized using slabs of objects instead
of queues of objects. SLUB can use memory efficiently
and has enhanced diagnostics. SLUB is the default choice for
a slab allocator.

config SLOB
depends on EXPERT
bool "SLOB (Simple Allocator)"
help
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but
does not perform as well on large systems.

endchoice
def_bool y

config MMAP_ALLOW_UNINITIALIZED
bool "Allow mmapped anonymous memory to be uninitialized"
@@ -741,7 +741,6 @@ int __init_or_module do_one_initcall(initcall_t fn)
return ret;
}


extern initcall_t __initcall_start[];
extern initcall_t __initcall0_start[];
extern initcall_t __initcall1_start[];
@@ -862,6 +861,8 @@ static int __ref kernel_init(void *unused)

flush_delayed_fput();

print_scheduler_version();

if (ramdisk_execute_command) {
if (!run_init_process(ramdisk_execute_command))
return 0;
@@ -133,7 +133,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->se.sum_exec_runtime;
t3 = tsk_seruntime(tsk);

d->cpu_count += t1;

@@ -136,7 +136,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->sum_sched_runtime += tsk_seruntime(tsk);
}

sig->nr_threads--;
@@ -498,11 +498,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
{
cputime_t utime, stime;

add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
add_device_randomness((const void*) &tsk_seruntime(tsk),
sizeof(unsigned long long));
task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->cpu_timers,
utime, stime, tsk->se.sum_exec_runtime);
utime, stime, tsk_seruntime(tsk));

}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -513,7 +513,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->signal->cpu_timers,
utime + sig->utime, stime + sig->stime,
tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
tsk_seruntime(tsk) + sig->sum_sched_runtime);
}

static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -976,7 +976,7 @@ static void check_thread_timers(struct task_struct *tsk,
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
tsk->cputime_expires.sched_exp = t->expires.sched;
break;
}
@@ -993,15 +993,15 @@ static void check_thread_timers(struct task_struct *tsk,
ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);

if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
@@ -1282,7 +1282,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
struct task_cputime task_sample = {
.utime = utime,
.stime = stime,
.sum_exec_runtime = tsk->se.sum_exec_runtime
.sum_exec_runtime = tsk_seruntime(tsk)
};

if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
@@ -11,10 +11,14 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif

ifdef CONFIG_SCHED_BFS
obj-y += bfs.o clock.o
else
obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_CONCURRENCY) += consolidation.o
endif
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHEDSTATS) += stats.o

[Rendering note: one large diff was omitted by the viewer here ("Large diffs are not rendered by default") — presumably the new scheduler implementation file added by this patch; recover it from the original patch source.]

@@ -40,7 +40,8 @@ struct cpu_stopper {
};

static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);

static bool stop_machine_initialized = false;

static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -130,7 +130,12 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused three = 3;
static unsigned long one_ul = 1;
static int one_hundred = 100;
static int __maybe_unused one_hundred = 100;
#ifdef CONFIG_SCHED_BFS
extern int rr_interval;
extern int sched_iso_cpu;
static int __read_mostly one_thousand = 1000;
#endif
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -263,7 +268,7 @@ static struct ctl_table sysctl_base_table[] = {
{ }
};

#ifdef CONFIG_SCHED_DEBUG
#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
@@ -280,6 +285,7 @@ static int max_extfrag_threshold = 1000;
#endif

static struct ctl_table kern_table[] = {
#ifndef CONFIG_SCHED_BFS
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -443,6 +449,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
#endif /* !CONFIG_SCHED_BFS */
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -914,6 +921,26 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
#ifdef CONFIG_SCHED_BFS
{
.procname = "rr_interval",
.data = &rr_interval,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = &one,
.extra2 = &one_thousand,
},
{
.procname = "iso_cpu",
.data = &sched_iso_cpu,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one_hundred,
},
#endif
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
{
.procname = "spin_retry",
@@ -94,7 +94,7 @@ config NO_HZ_IDLE
config NO_HZ_FULL
bool "Full dynticks system (tickless)"
# NO_HZ_COMMON dependency
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
# We need at least one periodic CPU for timekeeping
depends on SMP
# RCU_USER_QS dependency
@@ -953,7 +953,7 @@ config SPARSE_RCU_POINTER

config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
depends on DEBUG_KERNEL && !SCHED_BFS
default n
help
This option provides a kernel module that runs torture tests