In [None]:
import os, re, sys, argparse, glob
import csv
import itertools
from datetime import datetime
from collections import namedtuple
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import *
from matplotlib_helper import *

# plt.rcParams['svg.fonttype'] = 'none'
#
# import matplotlib_inline.backend_inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
def removeprefix(self: str, prefix: str, /) -> str:
    """Return ``self`` without a leading ``prefix``.

    Mirrors the built-in ``str.removeprefix``: when ``self`` does not start
    with ``prefix`` the text comes back unchanged.
    """
    # Cutting at 0 keeps the whole string, so a single slice covers both cases.
    cut = len(prefix) if self.startswith(prefix) else 0
    return self[cut:]

def removesuffix(self: str, suffix: str, /) -> str:
    """Return ``self`` without a trailing ``suffix``.

    Mirrors the built-in ``str.removesuffix``: when ``self`` does not end with
    ``suffix`` (or ``suffix`` is empty) the text comes back unchanged.
    """
    # An empty suffix must be handled first: self[:-0] is self[:0], i.e. ''.
    if not suffix or not self.endswith(suffix):
        return self[:]
    return self[:-len(suffix)]

In [None]:
def common_prefix(l):
    """Return the longest leading string shared by every entry of ``l``.

    The comparison is character by character (not per path component).
    An empty input yields ''.
    """
    if not l:
        return ''
    # Every entry sorts between the smallest and the largest one, so whatever
    # those two extremes share is shared by all entries.
    lowest, highest = min(l), max(l)
    for position, (low_char, high_char) in enumerate(zip(lowest, highest)):
        if low_char != high_char:
            return lowest[:position]
    return lowest

In [None]:
def plot_cdf_array(array, label, include_count = False, index=0, color=None):
    """Plot the empirical CDF of ``array`` through ``plt.plot``.

    Args:
        array: sized collection of sortable values.
        label: legend label; ' (<len(array)>)' is appended when
            ``include_count`` is true.
        include_count: whether to append the sample count to the label.
        index: passed to ``get_linestyle`` to choose the line style.
        color: line colour; when None, ``get_next_color()`` supplies one.

    ``get_next_color`` and ``get_linestyle`` are not defined in this cell;
    presumably they come from the ``matplotlib_helper`` star import.
    """
    sorted_values = sorted(array)
    n_samples = len(array)
    # Cumulative fractions 1/n, 2/n, ..., 1 - one per sorted value.
    cumulative_fraction = np.linspace(0., 1., n_samples + 1)[1:]
    legend_label = label + ' (%d)' % n_samples if include_count else label
    line_color = get_next_color() if color is None else color
    plt.plot(
        sorted_values,
        cumulative_fraction,
        label=legend_label,
        color=line_color,
        linestyle=get_linestyle(index),
    )

In [None]:
def to_relative_timestamp(timestamp: datetime, l_timestamp: list[datetime]):
    """Return the seconds elapsed between ``l_timestamp[0]`` and ``timestamp``.

    The result is negative when ``timestamp`` precedes the first entry.
    """
    origin = l_timestamp[0]
    elapsed = timestamp - origin
    return elapsed.total_seconds()

In [None]:
def plot_timeseries(data_array, plot_axis=None, timestamp_column_name='timestamp', prefix=None, use_relative_time=False, color=None, index=0):
    """Plot every non-timestamp column of ``data_array`` against its timestamps.

    Args:
        data_array: non-empty list of dicts sharing the same keys; the keys of
            the first entry decide which series are drawn.
        plot_axis: axes to draw on; ``plt.gca()`` is used when None.
        timestamp_column_name: key holding the x value of each entry.
        prefix: optional text put in front of each series label as
            '<prefix> - <key>'.
        use_relative_time: when true, x values become seconds since the first
            entry's timestamp.
        color: colour shared by all series.
        index: ``get_linestyle`` index of the first series; each following
            series uses the next index.

    Returns:
        A list with one item per series, each being whatever
        ``plot_axis.plot`` returned (for a Matplotlib Axes, a list of lines).
    """
    timestamps = [row[timestamp_column_name] for row in data_array]
    if use_relative_time:
        x_values = [to_relative_timestamp(ts, timestamps) for ts in timestamps]
    else:
        x_values = timestamps

    series_names = [name for name in data_array[0].keys() if name != timestamp_column_name]

    lines = []
    for style_index, series_name in enumerate(series_names, start=index):
        y_values = [row[series_name] for row in data_array]
        series_label = ('%s - ' % prefix if prefix else '') + series_name
        # Resolved lazily so that pyplot is untouched when there is nothing to draw.
        if plot_axis is None:
            plot_axis = plt.gca()
        lines.append(
            plot_axis.plot(x_values, y_values, color=color, linestyle=get_linestyle(style_index), label=series_label)
        )
    return lines

In [None]:
def plot_interval_boundary(l_timestamps, interval, label, plot_axis=None, use_relative_time=False):
    """Mark the two ends of an interval with dashed lines and caption it.

    Args:
        l_timestamps: sequence of x values (timestamps) indexed by ``interval``.
        interval: exactly two indices into ``l_timestamps``.
        label: caption drawn at y=100, horizontally centred between the lines.
        plot_axis: axes to draw on; ``plt.gca()`` is used when None.
        use_relative_time: when true, both x values are converted to seconds
            since ``l_timestamps[0]``.

    Raises:
        AssertionError: if ``interval`` does not have exactly two elements.
    """
    assert len(interval) == 2, f"Interval {interval} must have exactly two elements."
    axis = plt.gca() if plot_axis is None else plot_axis

    boundaries = []
    for boundary_index in interval:
        boundary = l_timestamps[boundary_index]
        if use_relative_time:
            boundary = to_relative_timestamp(boundary, l_timestamps)
        boundaries.append(boundary)
        # The lines span y = 0..100, the range this helper hard-codes.
        axis.vlines(boundary, 0, 100, color='black', linestyle='dashed')

    left, right = boundaries
    # left + half the width (rather than (left + right) / 2) also works for datetimes.
    midpoint = left + (right - left) / 2
    axis.text(midpoint, 100, label, ha='center', va='bottom')

In [None]:
def get_rapl_data(rapl_log_file):
    """Load a RAPL energy log in CSV form.

    Args:
        rapl_log_file: path of a CSV file whose header contains at least
            'timestamp', 'total_intel_energy', 'total_cpu_energy' and
            'total_dram_energy'.

    Returns:
        A list with one dict per CSV row, in file order. Each dict has the
        keys 'timestamp' (parsed with ``datetime.fromisoformat``) followed by
        the three energy columns converted to ``float``.

    Raises:
        AssertionError: if any required column is missing from the header
            (including a file with no header at all).
        ValueError: if a timestamp or an energy value cannot be parsed.
    """
    required_columns = ('timestamp', 'total_intel_energy', 'total_cpu_energy', 'total_dram_energy')
    energy_columns = required_columns[1:]
    data_array = []
    # newline='' is the mode the csv module documents for files handed to a reader.
    with open(rapl_log_file, 'r', newline='') as f:
        csv_reader = csv.DictReader(f)
        # fieldnames is None for an empty file; treat that as "no columns".
        column_names = csv_reader.fieldnames or []
        missing_columns = [column for column in required_columns if column not in column_names]
        # The previous check asserted a non-empty list, which is always true.
        assert not missing_columns, \
            "%s is missing required columns: %s" % (rapl_log_file, ', '.join(missing_columns))
        for row in csv_reader:
            # Key order (timestamp first, then the energy columns) is kept
            # because plot_timeseries draws the series in dict-key order.
            entry = {'timestamp': datetime.fromisoformat(row['timestamp'])}
            for column in energy_columns:
                entry[column] = float(row[column])
            data_array.append(entry)
    return data_array

In [None]:
def get_cpu_mem_usage_data(usage_log_file):
    """Load a CPU/memory usage log in CSV form.

    Args:
        usage_log_file: path of a CSV file whose header contains at least
            'timestamp', 'cpu-user', 'cpu-kernel', 'cpu-idle', 'mem-used' and
            'mem-free'.

    Returns:
        A list with one dict per CSV row, in file order, with the keys
        'timestamp' (parsed with ``datetime.fromisoformat``), 'cpu'
        (cpu-user + cpu-kernel) and 'mem' (mem-used / (mem-used + mem-free)).

    Raises:
        AssertionError: if any required column is missing from the header
            (including a file with no header at all).
        ValueError: if a timestamp or a numeric value cannot be parsed.
        ZeroDivisionError: if a row has mem-used + mem-free == 0; the row is
            printed to stderr first.
    """
    required_columns = ('timestamp', 'cpu-user', 'cpu-kernel', 'cpu-idle', 'mem-used', 'mem-free')
    data_array = []
    # newline='' is the mode the csv module documents for files handed to a reader.
    with open(usage_log_file, 'r', newline='') as f:
        csv_reader = csv.DictReader(f)
        # fieldnames is None for an empty file; treat that as "no columns".
        column_names = csv_reader.fieldnames or []
        missing_columns = [column for column in required_columns if column not in column_names]
        # The previous check asserted a non-empty list, which is always true.
        assert not missing_columns, \
            "%s is missing required columns: %s" % (usage_log_file, ', '.join(missing_columns))
        for row in csv_reader:
            timestamp = datetime.fromisoformat(row['timestamp'])
            cpu_user = float(row['cpu-user'])
            cpu_kernel = float(row['cpu-kernel'])
            # Parsed only so that a malformed value is reported; not part of the output.
            float(row['cpu-idle'])
            mem_used = float(row['mem-used'])
            mem_free = float(row['mem-free'])
            mem_total = mem_used + mem_free
            if mem_total <= 0:
                # Dump the offending row for diagnosis; a zero total still
                # raises ZeroDivisionError on the division below.
                print(row, file=sys.stderr)
            data_array.append({
                'timestamp': timestamp,
                'cpu': cpu_user + cpu_kernel,
                'mem': mem_used / mem_total,
            })
    return data_array

In [None]:
def auto_detect_log_files(dirpath):
    """Pair up the '*.rapl.csv' and '*.usage.csv' logs found under ``dirpath``.

    Args:
        dirpath: directory (or glob pattern for directories) to search; a
            leading '~' is expanded.

    Returns:
        A list of (rapl_log_file, usage_log_file) path tuples, sorted by path.

    Raises:
        AssertionError: if the two kinds of logs differ in number, or if the
            sorted paths do not match one-to-one once their suffixes are
            stripped.
    """
    rapl_suffix = ".rapl.csv"
    usage_suffix = ".usage.csv"
    search_root = os.path.expanduser(dirpath)

    def find_logs(suffix):
        # Sorted so that both lists line up index by index.
        return sorted(glob.glob(os.path.join(search_root, '*' + suffix)))

    rapl_log_files = find_logs(rapl_suffix)
    usage_log_files = find_logs(usage_suffix)
    assert len(rapl_log_files) == len(usage_log_files)

    rapl_stems = [removesuffix(path, rapl_suffix) for path in rapl_log_files]
    usage_stems = [removesuffix(path, usage_suffix) for path in usage_log_files]
    assert rapl_stems == usage_stems
    return list(zip(rapl_log_files, usage_log_files))

In [None]:
def parse_energy_delta_into_stages(l_delta_power):
    """Split a series of power deltas into leading idle, running and trailing idle stages.

    A window of N_CONSECUTIVE consecutive deltas is "stable" when every delta
    in it has a magnitude below POWER_DIFF_THRESHOLD. The first and the last
    run of stable windows are taken as the idle stages; everything between
    them is the running stage. An unstable run before the first or after the
    last stable run is left out of all three stages.

    Args:
        l_delta_power: 1-D sequence of differences between consecutive power
            readings.

    Returns:
        (idle_start_range, running_range, idle_end_range), each a
        [start_index, end_index) pair of indices into ``l_delta_power``.

    Raises:
        ValueError: if the readings do not form at least three alternating
            stable/unstable runs.
    """
    POWER_DIFF_THRESHOLD = 1.
    N_CONSECUTIVE = 10
    SLIDING_WINDOW_SIZE = N_CONSECUTIVE - 1
    TupleIsStableAndDuration = namedtuple('TupleIsStableAndDuration', ['is_stable', 'duration'])

    # For each window of N_CONSECUTIVE deltas, count how many are below the
    # threshold; the window is stable only when all of them are.
    is_small_delta = np.abs(l_delta_power) < POWER_DIFF_THRESHOLD
    small_deltas_per_window = np.convolve(is_small_delta, v=np.ones(N_CONSECUTIVE), mode='valid')
    l_is_stable = small_deltas_per_window == N_CONSECUTIVE

    # Run-length encode the stable flags; groupby makes the runs alternate
    # between stable and unstable.
    l_stable_groups = [
        TupleIsStableAndDuration(is_stable, len(list(run)))
        for is_stable, run in itertools.groupby(l_is_stable)
    ]
    n_groups = len(l_stable_groups)
    if n_groups < 3:
        raise ValueError("Not enough groups for stable -> varying -> stable power readings")

    # Because the runs alternate, the first stable run is run 0 or 1 and the
    # last stable run is the last or second-to-last run.
    stable_positions = [position for position, run in enumerate(l_stable_groups) if run.is_stable]
    index_first_stable_group = stable_positions[0]
    index_last_stable_group = stable_positions[-1]
    assert index_first_stable_group in [0, 1]
    assert index_last_stable_group in [n_groups - 2, n_groups - 1]

    # Unstable readings at either end are discarded.
    head_run, tail_run = l_stable_groups[0], l_stable_groups[-1]
    discarded_head = 0 if head_run.is_stable else head_run.duration
    discarded_tail = 0 if tail_run.is_stable else tail_run.duration

    # A run of k stable windows covers k + SLIDING_WINDOW_SIZE deltas.
    first_stable_span = l_stable_groups[index_first_stable_group].duration + SLIDING_WINDOW_SIZE
    last_stable_span = l_stable_groups[index_last_stable_group].duration + SLIDING_WINDOW_SIZE

    first_stable_group_start = discarded_head
    first_stable_group_end = first_stable_group_start + first_stable_span
    last_stable_group_end = len(l_delta_power) - discarded_tail
    last_stable_group_start = last_stable_group_end - last_stable_span

    return (
        [first_stable_group_start, first_stable_group_end],
        [first_stable_group_end, last_stable_group_start],
        [last_stable_group_start, last_stable_group_end],
    )

In [None]:
def get_energy_stats(data_array):
    """Split an energy time series into idle/running/idle stages and summarise the run.

    Args:
        data_array: list of dicts already sorted by timestamp, each with a
            'timestamp' (datetime) and a 'total_intel_energy' reading. Each
            reading is divided by the sample interval to obtain power, i.e. it
            is treated as the energy of one sample interval.

    Returns:
        A dict with:
            'duration': length of the running stage in seconds,
            'timestamps': numpy array of all timestamps,
            'idle_start_range', 'running_range', 'idle_end_range':
                [start_index, end_index) pairs into the samples,
            'total_energy': energy of the running stage,
            'delta_energy': 'total_energy' minus the idle power integrated
                over the running stage,
            'sample_interval_s': sample interval in whole seconds,
            'idle_power': mean of the average power before and after the run.

    Raises:
        AssertionError: if there are fewer than three samples, the sample
            interval is below one second or not constant, or the idle power
            is too noisy.
        ValueError: propagated from ``parse_energy_delta_into_stages`` when no
            idle/running/idle structure is found.
    """
    assert len(data_array) > 2, "Time series is too short"
    l_timestamp = np.array([entry['timestamp'] for entry in data_array])
    # Intervals are truncated to whole seconds, so sub-second jitter is ignored.
    delta_timestamps = [int(delta.total_seconds()) for delta in np.diff(l_timestamp, n=1)]
    sample_interval_s = delta_timestamps[0]
    # A zero (sub-second) interval would otherwise surface as a division by zero below.
    assert sample_interval_s > 0, \
        "Sample interval must be at least one second (got %ds)" % sample_interval_s
    assert all(delta == sample_interval_s for delta in delta_timestamps), \
        "Sample interval is not constant"

    # detect idle power draw via diff among consecutive readings
    l_power = np.array([entry['total_intel_energy']/sample_interval_s for entry in data_array])
    l_delta_power = np.diff(l_power, n=1)
    assert len(l_power) == len(l_delta_power) + 1

    # Break down readings into three stages in [start_index, end_index) intervals
    idle_start_range, running_range, idle_end_range = parse_energy_delta_into_stages(l_delta_power)
    l_power_idle_before = l_power[idle_start_range[0]:idle_start_range[1]]
    l_power_workload = l_power[running_range[0]:running_range[1]]
    l_power_idle_after = l_power[idle_end_range[0]:idle_end_range[1]]

    avg_power_idle_before = np.average(l_power_idle_before)
    std_power_idle_before = np.std(l_power_idle_before)
    avg_power_idle_after = np.average(l_power_idle_after)
    std_power_idle_after = np.std(l_power_idle_after)

    # Idle stages must be flat: relative standard deviation below 5%.
    IDLE_POWER_STD_THRESHOLD = 0.05
    assert std_power_idle_before / avg_power_idle_before < IDLE_POWER_STD_THRESHOLD, \
        "Idle power std is too high (%.2f+/-%.2f)" % (avg_power_idle_before, std_power_idle_before)
    assert std_power_idle_after / avg_power_idle_after < IDLE_POWER_STD_THRESHOLD, \
        "Idle power std is too high (%.2f+/-%.2f)" % (avg_power_idle_after, std_power_idle_after)

    avg_power_idle = np.average([avg_power_idle_before, avg_power_idle_after])
    running_duration = len(l_power_workload) * sample_interval_s
    # Energy = sum of (power * interval). Summing the power samples alone is
    # only correct for a 1 s interval and would not be comparable with
    # running_duration * avg_power_idle for any other interval.
    workload_energy = np.sum(l_power_workload) * sample_interval_s
    return {
        'duration': running_duration,
        'timestamps': l_timestamp,
        'idle_start_range': idle_start_range,
        'running_range': running_range,
        'idle_end_range': idle_end_range,
        'total_energy': workload_energy,
        'delta_energy': workload_energy - running_duration * avg_power_idle,
        'sample_interval_s': sample_interval_s,
        'idle_power': avg_power_idle,
    }

In [None]:
# Configuration for the per-workload analysis below: where the logs live,
# which log directories to process, and the accumulators the loop fills in.
# ROOT_DIR is resolved relative to the current working directory.
ROOT_DIR = os.path.realpath('../data/')

# Sample interval of each processed workload, appended to by the loop below.
l_sample_interval_s = []
# prepend these with "video-transcoding/logs/"
# NOTE(review): this hand-written list is replaced further down by the
# auto-detected pairs, so entries enabled here are currently ignored.
log_files_pairs = [
    # ('combined/ffmpeg.rapl.csv', 'combined/ffmpeg.usage.csv'),
    # ('combined/scp.src.rapl.csv', 'combined/scp.src.usage.csv'),
    # ('combined/scp.dst.rapl.csv', 'combined/scp.dst.usage.csv'),
    # ('mbw/mbw.2GB.rapl.csv', 'mbw/mbw.2GB.usage.csv'),
    # ('mbw/mbw.32GB.rapl.csv', 'mbw/mbw.32GB.usage.csv'),
    # ('stress.cpu=40.timeout=15/stress.cpu=40.timeout=15.rapl.csv', 'stress.cpu=40.timeout=15/stress.cpu=40.timeout=15.usage.csv'),
    # 'ffmpeg-Rain.csv'
    # 'ffmpeg-Rain-10x.csv',
    # 'ffmpeg-Rain.data-copy.1G.csv',
    # 'ffmpeg.youtube-wnhvanMdx4s.720p.csv',
    # 'ffmpeg-Rain.data-copy.100G.csv',
    # 'spark-wordcount-short.csv',
    # 'spark-wordcount-long.csv',
    # 'spark-wordcount-long.data-copy.1G.csv',
    # 'spark-wordcount-long.data-copy.100G.csv',
]
# Directories (relative to ROOT_DIR, glob patterns allowed) that are searched
# for '*.rapl.csv' / '*.usage.csv' pairs by auto_detect_log_files.
wildcard_dir_names = [
    # 'video-transcoding/logs/stress.smt=on.cpu=*.timeout=60',
    # 'video-transcoding/logs/stress.smt=off.cpu=*.timeout=60',
    # 'video-transcoding/logs/mbw.*',
    # 'video-transcoding/logs/ffmpeg.youtube-wnhvanMdx4s.720p.grayscale',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.1x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.2x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.4x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.8x',
    # 'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.9x',
    # 'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.10x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.16x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.32x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.64x',
    'video-transcoding/logs/ffmpeg.resize.4k-1080p.copyrighted-A66C0008.128x',
    # 'code-compilation/compile.linux.ramdisk',
    # 'code-compilation/compile.linux.ssd',
    # 'file-compression/gzip.linux.ramdisk',
    # 'file-compression/gzip.linux.ssd',
    # 'file-compression/bzip.linux.ramdisk',
    # 'file-compression/bzip.linux.ssd',
    # 'jupyter-notebook/rawdata'
]
# One list of (rapl, usage) path pairs per entry of wildcard_dir_names ...
nested_array_of_file_pairs = [auto_detect_log_files(os.path.join(ROOT_DIR, wildcard_dir_name)) for wildcard_dir_name in wildcard_dir_names]
# ... flattened into a single list, in wildcard_dir_names order.
log_files_pairs = list(itertools.chain.from_iterable(nested_array_of_file_pairs))
# Number of parallel jobs per run, used by the summary cell to compute energy per job.
# NOTE(review): presumably one entry per detected log pair, in the same order
# as the '<N>x' directories above - confirm whenever wildcard_dir_names changes.
l_parallelism = np.array([1, 2, 4, 8, 16, 32, 64, 128])
# Per-workload results, appended to by the loop below in log_files_pairs order.
l_runtime = []
l_total_energy = []
l_delta_energy = []
# Single switch for the x axis of every figure: False plots the logged
# timestamps as-is, True plots seconds since the first sample. It also picks
# the matching x-axis label (previously 'Time (s)' was shown over timestamps).
USE_RELATIVE_TIME = False
# For each (RAPL log, usage log) pair: plot both time series on twin axes,
# compute the energy statistics, mark the idle/running/idle stages, collect
# the results in the l_* accumulators and save the figure as '<name>.svg'.
for (rapl_log_file, cpu_mem_usage_log_file) in log_files_pairs:
    # The workload name is the part of the file name both logs share, without
    # the trailing '.'. basename is used instead of splitting on '/' so that
    # the platform's own path separator is honoured.
    name = common_prefix([rapl_log_file, cpu_mem_usage_log_file])
    name = os.path.basename(name).rstrip('.')
    print("Workload: %s" % name)
    fig = plt.figure()
    ax1 = fig.subplots()
    # Second y axis sharing the same x axis, used for the usage series.
    ax2 = ax1.twinx()
    plot_lines = []
    # os.path.join leaves already-absolute paths untouched and anchors
    # relative ones at ROOT_DIR.
    energy_data_array = get_rapl_data(os.path.join(ROOT_DIR, rapl_log_file))
    plot_lines += plot_timeseries(energy_data_array, plot_axis=ax1, prefix="RAPL", use_relative_time=USE_RELATIVE_TIME, color='orange')
    energy_stats = get_energy_stats(energy_data_array)
    l_sample_interval_s.append(energy_stats['sample_interval_s'])
    # print('Workload: %s' % rapl_log_file)
    stats_duration = energy_stats['duration']
    stats_total_energy = energy_stats['total_energy']
    stats_delta_energy = energy_stats['delta_energy']
    print('Duration: %.fs' % stats_duration)
    print('Total energy: %.fJ' % stats_total_energy)
    print('Delta energy: %.fJ' % stats_delta_energy)
    energy_stats_text = f'{stats_duration:.0f}s {stats_total_energy:.0f}J / {stats_delta_energy:.0f}J'
    l_runtime.append(stats_duration)
    l_total_energy.append(stats_total_energy)
    l_delta_energy.append(stats_delta_energy)
    cpu_mem_usage_data_array = get_cpu_mem_usage_data(os.path.join(ROOT_DIR, cpu_mem_usage_log_file))
    plot_lines += plot_timeseries(cpu_mem_usage_data_array, plot_axis=ax2, prefix="Usage", use_relative_time=USE_RELATIVE_TIME, color='blue', index=0)
    # Mark the three detected stages on the usage axis.
    l_timestamps = energy_stats['timestamps']
    plot_interval_boundary(l_timestamps, energy_stats['idle_start_range'], 'start\nidle', plot_axis=ax2, use_relative_time=USE_RELATIVE_TIME)
    plot_interval_boundary(l_timestamps, energy_stats['running_range'], f'running\n{energy_stats_text}', plot_axis=ax2, use_relative_time=USE_RELATIVE_TIME)
    plot_interval_boundary(l_timestamps, energy_stats['idle_end_range'], 'end\nidle', plot_axis=ax2, use_relative_time=USE_RELATIVE_TIME)
    # plot_labels = [line.get_label() for line in plot_lines]
    # plot_labels = ax1.lines + ax2.lines
    # ax1.legend(plot_lines, plot_labels)
    # plt.xlim(0, 20)
    # plt.ylim(105, 115)
    ax1.set_xlabel('Time (s)' if USE_RELATIVE_TIME else 'Time')
    ax1.set_ylabel('Power (W)', color='orange')
    ax2.set_ylabel('Utilization (%)', color='blue')
    ax1.set_ylim(0, 300)
    # Head-room above 100 leaves space for the stage captions drawn at y=100.
    ax2.set_ylim(0, 120)
    ax1.tick_params(axis='x', rotation=15)
    ax1.grid()
    # plt.locator_params(axis='y', nbins=5)
    # The legend is placed outside the axes, to the right of the figure.
    fig.legend(loc='center left', bbox_to_anchor=(1, 0.52))
    # ax2.legend(loc='lower center')
    plt.title('Workload: %s' % name)
    plt.savefig('%s.svg' % name, bbox_inches='tight')

In [None]:
# Summary figure: runtime and energy per job as a function of the number of
# parallel jobs. The result lists were filled by the per-workload loop, one
# entry per processed log pair, so they must line up with l_parallelism.
assert len(l_runtime) == len(l_total_energy) == len(l_delta_energy) == len(l_parallelism), \
    "Expected one result per entry of l_parallelism (%d), got %d runtimes" % (len(l_parallelism), len(l_runtime))
l_total_energy_per_job = np.asarray(l_total_energy) / l_parallelism
l_delta_energy_per_job = np.asarray(l_delta_energy) / l_parallelism
fig = plt.figure()
ax1 = fig.subplots()
# Second y axis sharing the same x axis, used for the energy curves.
ax2 = ax1.twinx()
ax1.plot(l_parallelism, l_runtime, marker='.', label='Runtime', color='black', linestyle=get_linestyle(0))
ax2.plot(l_parallelism, l_total_energy_per_job, marker='.', label='Total energy/job', color='blue', linestyle=get_linestyle(1))
ax2.plot(l_parallelism, l_delta_energy_per_job, marker='.', label='Delta energy/job', color='blue', linestyle=get_linestyle(2))

ax1.set_xscale('log')
ax1.set_xlabel('# of parallel jobs')
ax1.set_ylabel('Runtime (s)', color='black')
ax2.set_ylabel('Energy/job (J)', color='blue')
ax1.set_ylim(0, 3000)
ax2.set_ylim(0, None)
# Show the actual parallelism values as plain numbers instead of powers of ten.
ax1.set_xticks(l_parallelism)
# ScalarFormatter comes from the `from matplotlib.ticker import *` at the top;
# the bare name `matplotlib` is not bound by this notebook's own imports.
ax1.get_xaxis().set_major_formatter(ScalarFormatter())
# ax1.tick_params(axis='x', rotation=15)
ax1.grid()
# plt.locator_params(axis='y', nbins=5)
# The legend is placed outside the axes, to the right of the figure.
fig.legend(loc='center left', bbox_to_anchor=(1, 0.52))