# Does cluster-packing save energy?
This notebook runs a workload at various frequencies. The workload is run both force-packed onto a single cluster and force-spread across two clusters, and the energy is compared.

The target configuration (with AEP configuration) should be in target.config.

This assumes an SMP-like system e.g. HiKey, Pixel, with two clusters.

## Setup...

In [1]:
# Various setup, probably mostly unused

from env import TestEnv
import pandas as pd
import json
import os
from conf import LisaLogging
from bart.common.Utils import area_under_curve
from trappy.plotter import plot_trace
from IPython.display import display
from trappy import ILinePlot
from trappy.stats.grammar import Parser
from bart.sched.SchedMultiAssert import SchedMultiAssert
import pandas as pd
from trace import Trace
LisaLogging.setup()
import logging
logging.getLogger('Trace').setLevel(logging.ERROR)
logging.getLogger('Analysis').setLevel(logging.WARNING)
logging.getLogger('EnergyMeter').setLevel(logging.DEBUG)
%matplotlib inline
from platforms.juno_energy import juno_energy
from platforms.pixel_energy import pixel_energy
import tests.eas.generic
import numpy as np
from IPython.display import display
from wlgen import RTA, Periodic
from executor import Executor
from platforms.hikey_energy import hikey_energy
import numpy as np

2017-03-15 18:37:11,831 INFO    : root         : Using LISA logging configuration:
2017-03-15 18:37:11,831 INFO    : root         :   /home/brendan/lisa/logging.conf


In [2]:
# Make sure the directory holding caiman is on PATH (host-specific install
# location; adjust for your machine).
caiman_path = '/opt/ds5_v5.23.0/bin'
current_path = os.environ.get('PATH')
# An unset or empty PATH must neither crash this cell nor leave an empty entry
# behind once the new directory is prepended.
path_entries = current_path.split(os.pathsep) if current_path else []
if caiman_path not in path_entries:
    path_entries.insert(0, caiman_path)
    os.environ['PATH'] = os.pathsep.join(path_entries)

In [3]:
# Create a fresh test environment that requests the 'cgroups' module and
# collects the ftrace events used by the analysis cells further down.
ftrace_events = [
    'sched_switch',
    'cpu_frequency',
    'cpu_idle',
    'sched_load_avg_cpu',
    'sched_load_avg_task',
    'irq*',
]
test_conf = {
    'modules': ['cgroups'],
    'ftrace': {'events': ftrace_events},
}
te = TestEnv(test_conf=test_conf, force_new=True)

2017-03-15 18:37:11,984 INFO    : TestEnv      : Using base path: /home/brendan/lisa
2017-03-15 18:37:11,984 INFO    : TestEnv      : Loading default (file) target configuration
2017-03-15 18:37:11,984 INFO    : TestEnv      : Loading target configuration [/home/brendan/lisa/target.config]...
2017-03-15 18:37:11,985 INFO    : TestEnv      : Loading custom (inline) test configuration
2017-03-15 18:37:11,985 INFO    : TestEnv      : External tools using:
2017-03-15 18:37:11,986 INFO    : TestEnv      :    ANDROID_HOME: /home/eas/Work/Android/android-sdk-linux/
2017-03-15 18:37:11,986 INFO    : TestEnv      :    CATAPULT_HOME: /home/brendan/lisa/tools/catapult
2017-03-15 18:37:11,986 INFO    : TestEnv      : Devlib modules to load: ['bl', 'cpufreq', 'cgroups']
2017-03-15 18:37:11,987 INFO    : TestEnv      : Connecting Android target [HT65B0300003]
2017-03-15 18:37:11,987 INFO    : TestEnv      : Connection settings:
2017-03-15 18:37:11,987 INFO    : TestEnv      :    {'device': u'HT65B03

In [4]:
# HiKey-only override: describe the board as two clusters of four CPUs each,
# both in the trappy topology and in the platform's cluster map.
if te.conf['board'] == 'hikey':
    from trappy.stats.Topology import Topology
    first_cluster = list(range(0, 4))   # CPUs 0-3
    second_cluster = list(range(4, 8))  # CPUs 4-7
    hikey_topology = Topology(clusters=[list(first_cluster), list(second_cluster)])
    te.topology = hikey_topology
    te.platform['clusters'] = {'big': first_cluster, 'little': second_cluster}

# Define Workloads
We're going to create two workloads with a handful of small tasks, such that they could fit onto a single cluster at the lowest OPP. 

To avoid everything being thrown out by the periodicity of the tasks, the start times and periods of the tasks are staggered.

We'll run this same workload both with and without using taskset to force them onto a single cluster.

In [5]:
def get_wloads(nrg_model, cpu=0, type='min', bandwidth_to_eat=0.25, ntasks=16):
    """
    Build a "packed" and a "spread" rt-app workload from one shared task set.

    The tasks are sized so that together they use ``bandwidth_to_eat`` of the
    capacity of the cluster containing ``cpu``, measured at that CPU's minimum
    or maximum capacity. The same task dictionary is used by both workloads;
    only the CPUs they are restricted to differ.

    Reads the module-level ``te`` test environment for the cluster layout.

    :param nrg_model: energy model; ``nrg_model.cpu_nodes[cpu]`` must provide
        ``parent.cpus``, ``min_capacity`` and ``max_capacity``.
    :param cpu: index of the CPU whose cluster is used to size the tasks.
    :param type: ``'min'`` or ``'max'``: which capacity of ``cpu`` to size against.
    :param bandwidth_to_eat: fraction of the cluster capacity to consume.
    :param ntasks: number of periodic tasks to create.
    :returns: dict with the workload descriptions ``'forced_pack'`` (tasks
        restricted to the first cluster) and ``'break_packing'`` (tasks
        restricted to a CPU set spanning both clusters).
    :raises ValueError: for an unknown ``type``, a non-positive ``ntasks``, a
        task count that would give a non-positive period, a duty cycle that
        rounds down to 0%, or clusters too small to build a spanning CPU set.
    """
    if ntasks < 1:
        raise ValueError('ntasks must be at least 1, got {!r}'.format(ntasks))

    cpu_node = nrg_model.cpu_nodes[cpu]
    ncpus = len(cpu_node.parent.cpus)
    if type == 'min':
        cpu_cap = cpu_node.min_capacity
    elif type == 'max':
        cpu_cap = cpu_node.max_capacity
    else:
        # Previously fell through and failed later on an unbound cpu_cap
        raise ValueError("type must be 'min' or 'max', got {!r}".format(type))

    # Force float arithmetic so integer arguments are not truncated by
    # Python 2 integer division.
    total_cap = bandwidth_to_eat * ncpus * cpu_cap
    target_task_cap = float(total_cap) / ntasks
    task_pct = int((target_task_cap / 1024.) * 100)
    if task_pct < 1:
        raise ValueError(
            'Per-task duty cycle rounds down to 0% ({} capacity per task); '
            'use fewer tasks or a larger bandwidth_to_eat'.format(target_task_cap))
    task_cap = int(1024 * (task_pct / 100.))

    print('Will create workloads with {} tasks each using {} capacity'.format(ntasks, task_cap))

    # Stagger the workloads so that they prevent shared idle states
    period_s = 10e-3
    stagger_s = 0.7e-3
    duration_s = 3
    delays = np.arange(period_s, step=stagger_s)

    if len(delays) > ntasks:
        print("WARNING: not enough tasks to cover period with wakeups")

    # Each task's period is offset by (i - ntasks // 2) ms around the base
    # period, so too many tasks would push the shortest period to <= 0 ms.
    base_period_ms = period_s * 1e3
    if base_period_ms - (ntasks // 2) <= 0:
        raise ValueError(
            'ntasks={} would give a non-positive task period'.format(ntasks))

    # Figure out a set of CPUs that spans both clusters.
    # NOTE(review): the clusters come from te.platform, not from the cluster
    # containing `cpu` that was used for sizing above -- confirm this is the
    # intended cluster to pack onto.
    [cluster0, cluster1] = te.platform['clusters'].values()
    cpu_pairs = list(zip(cluster0, cluster1))
    print('{} {} {}'.format(cluster0, cluster1, cpu_pairs))
    # Half of the (cluster0, cluster1) CPU pairs, flattened into one CPU set
    spread_pairs = cpu_pairs[:len(cpu_pairs) // 2]
    if not spread_pairs:
        raise ValueError(
            'Clusters {} and {} are too small to build a CPU set spanning '
            'both'.format(cluster0, cluster1))
    cluster_spread_cpuset = tuple(c for pair in spread_pairs for c in pair)

    tasks = {}
    for i in range(ntasks):
        tasks['{}pct_{}'.format(task_pct, i)] = {
            'kind': 'Periodic',
            'params': {
                'duty_cycle_pct': task_pct,
                'period_ms': base_period_ms + (i - (ntasks // 2)),
                'delay_s': delays[i % len(delays)],
                'duration_s': duration_s
            }
        }

    def rtapp_profile(cpus):
        # Both workloads share the same task set; only the CPU restriction differs
        return {
            'type': 'rt-app',
            'conf': {
                'class': 'profile',
                'params': tasks,
                'cpus': cpus,
                'duration': duration_s,
            }
        }

    return {
        'forced_pack': rtapp_profile(cluster0),
        'break_packing': rtapp_profile(cluster_spread_cpuset),
    }

In [6]:
# Size the tasks against the maximum capacity of the default CPU (CPU 0).
wloads = get_wloads(nrg_model=te.nrg_model, type='max')

Will create workloads with 16 tasks each using 20 capacity
[2, 3] [0, 1] [(2, 0), (3, 1)]


# Run workloads
Now we'll run the workloads with the userspace governor, pinning every CPU to each of a selection of shared frequencies in turn.

In [7]:
# Find frequencies that all CPUs can run at
# Frequencies supported by every CPU: intersect the per-CPU frequency sets
# and keep the result in ascending order.
per_cpu_freqs = [set(te.target.cpufreq.list_frequencies(cpu_id))
                 for cpu_id in range(te.target.number_of_cpus)]
shared_freqs = sorted(set.intersection(*per_cpu_freqs))

In [None]:
# Sample the shared frequencies with a stride of len // 5. The stride is
# clamped to 1: with fewer than five shared frequencies the integer division
# yields 0, and a zero slice step raises ValueError.
freq_stride = max(1, len(shared_freqs) // 5)

# One target configuration per sampled frequency: every CPU is pinned to that
# frequency through the userspace governor. Each workload is run 5 times per
# configuration.
executor = Executor(te, {
        'confs': [{
            'tag': 'freq_{}'.format(freq),
            'flags': ['ftrace', 'freeze_userspace'],
            'cpufreq': {
                'governor': 'userspace',
                'freqs': {cpu: freq for cpu in range(te.target.number_of_cpus)}
            },
        } for freq in shared_freqs[::freq_stride]],
        'wloads': wloads,
        'iterations': 5
    })

2017-03-15 18:37:15,204 INFO    : Executor     : Loading custom (inline) test configuration
2017-03-15 18:37:15,205 INFO    : Executor     : 
2017-03-15 18:37:15,206 INFO    : Executor     : ################################################################################
2017-03-15 18:37:15,207 INFO    : Executor     : Experiments configuration
2017-03-15 18:37:15,207 INFO    : Executor     : ################################################################################
2017-03-15 18:37:15,208 INFO    : Executor     : Configured to run:
2017-03-15 18:37:15,209 INFO    : Executor     :      6 target configurations:
2017-03-15 18:37:15,210 INFO    : Executor     :       freq_307200, freq_537600, freq_902400, freq_1132800, freq_1363200, freq_1593600
2017-03-15 18:37:15,210 INFO    : Executor     :      2 workloads (5 iterations each)
2017-03-15 18:37:15,211 INFO    : Executor     :       break_packing, forced_pack
2017-03-15 18:37:15,211 INFO    : Executor     : Total: 60 experiments
20

In [None]:
# Run every experiment the executor was configured with.
executor.run()

2017-03-15 18:37:15,943 INFO    : Executor     : 
2017-03-15 18:37:15,944 INFO    : Executor     : ################################################################################
2017-03-15 18:37:15,945 INFO    : Executor     : Experiments execution
2017-03-15 18:37:15,945 INFO    : Executor     : ################################################################################
2017-03-15 18:37:15,946 INFO    : Executor     : 
2017-03-15 18:37:15,947 INFO    : Executor     : configuring target for [freq_307200] experiments
2017-03-15 18:37:16,119 INFO    : Executor     : Configuring all CPUs to use [userspace] cpufreq governor
2017-03-15 18:37:16,183 INFO    : Executor     :        CPUFreq - CPU frequencies: {0: 307200, 1: 307200, 2: 307200, 3: 307200}
2017-03-15 18:37:17,068 INFO    : TestEnv      : Using RT-App calibration values:
2017-03-15 18:37:17,069 INFO    : TestEnv      :    {"0": 105, "1": 105, "2": 78, "3": 78}
2017-03-15 18:37:17,070 INFO    : Workload     : Setup new workl

# Examine Results

In [None]:
# Parse one trace per experiment, restricted to the events listed here.
trace_events = ['sched_switch', 'cpu_idle', 'cpu_frequency']
traces = [Trace(te.platform, experiment.out_dir, list(trace_events))
          for experiment in executor.experiments]

## Plot task residency
It's worth visually inspecting the task residency to check that the "un-packed" runs were really unpacked and that there aren't any aliasing problems that would obviously make un-packed runs just as efficient as packed runs

In [None]:
def get_energy(experiment):
    """Return the sum of all values stored in the experiment's ``energy.json``.

    :param experiment: object whose ``out_dir`` attribute is the directory
        containing ``energy.json``.
    """
    energy_path = os.path.join(experiment.out_dir, 'energy.json')
    with open(energy_path) as energy_file:
        readings = json.load(energy_file)
    return sum(readings.values())

In [None]:
if True:
    for trace, experiment in zip(traces, executor.experiments)[:20:5]:
        print get_energy(experiment)
        plot_trace(trace.ftrace)

# Compare energy usage between packed and unpacked runs

Split up workload runs between those that were packed onto a single cluster and those that weren't. Then compare the mean energy usage for those two classes of workload runs.

In [None]:
def examine_experiment(experiment, trace, packed_threshold_pct=90):
    """
    Classify a run as packed or not and report its frequency and energy.

    A cluster counts as "packed" when every task of the workload has a
    residency on that cluster above ``packed_threshold_pct`` percent.

    Reads the module-level ``te`` test environment for the topology.

    :param experiment: experiment providing ``wload.tasks``, ``conf`` and the
        output directory read by ``get_energy``.
    :param trace: object whose ``ftrace`` attribute is handed to
        ``SchedMultiAssert``.
    :param packed_threshold_pct: residency percentage every task must exceed
        on a cluster for that cluster to count as packed.
    :returns: tuple ``(frequency, packed, energy)`` where ``frequency`` is the
        one configured for CPU 0, ``packed`` is a bool and ``energy`` is the
        value returned by ``get_energy``.
    :raises RuntimeError: if more than one cluster appears packed, which means
        the residency analysis is inconsistent.
    """
    ma = SchedMultiAssert(trace.ftrace, te.topology, experiment.wload.tasks.keys())

    # Indices (within the topology's cluster level) of the clusters on which
    # all tasks exceeded the residency threshold
    clusters_packed = []
    for i, cluster in enumerate(te.topology.get_level('cluster')):
        per_task = ma.getResidency('cluster', cluster, percent=True)
        residencies = [t['residency'] for t in per_task.values()]
        if all(r > packed_threshold_pct for r in residencies):
            clusters_packed.append(i)

    if not clusters_packed:
        print("I don't think we packed onto any cluster")
        packed = False
    elif len(clusters_packed) == 1:
        [i] = clusters_packed
        print('I think we packed onto cluster {} ({})'.format(i, te.topology.get_node('cluster', i)))
        packed = True
    else:
        raise RuntimeError(
            'More than one cluster looks packed (cluster indices {}); '
            'the residency analysis is inconsistent'.format(clusters_packed))

    energy = get_energy(experiment)

    # Frequency configured for CPU 0
    frequency = experiment.conf['cpufreq']['freqs'][0]

    return frequency, packed, energy

In [None]:
# df = pd.DataFrame(columns=['freq', 'packed', 'energy'])
results = {f: {True: [], False: []} for f in te.target.cpufreq.list_frequencies(0)}
for experiment, trace in zip(executor.experiments, traces):
    freq, packed, energy = examine_experiment(experiment, trace)
    print freq, packed, energy
    results[freq][packed].append(energy)

mean_diffs = {}
for freq, freq_results in results.iteritems():
    if not freq_results[True] or not freq_results[False]:
        print "No results at frequency {}".format(freq)
        continue
    mean_unpacked = np.mean(freq_results[False])
    mean_packed = np.mean(freq_results[True])
    print freq
    mean_diffs[freq] = (mean_unpacked - mean_packed) / mean_packed

In [None]:
# Display the per-frequency mapping of (mean unpacked - mean packed) / mean packed energy.
mean_diffs

In [None]:
# Bar chart of the relative energy difference, ordered by increasing frequency.
index = sorted(mean_diffs)
ordered_diffs = [mean_diffs[f] for f in index]
pd.DataFrame(ordered_diffs, index=index).plot(kind='bar')