In [None]:
import os, re, sys, argparse, glob
import csv
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import *
from matplotlib_helper import *

In [None]:
def plot_cdf_array(array, label, include_count = False, index=0, color=None):
    """Draw the empirical CDF of `array` on the current pyplot figure.

    Args:
        array: sized collection of sortable values.
        label: legend label for the curve.
        include_count: when true, ' (<number of values>)' is appended to the label.
        index: passed to get_linestyle() to choose the line style.
        color: line colour; when None, get_next_color() supplies one.

    get_next_color / get_linestyle are not defined in this file -- presumably
    they come from the matplotlib_helper star import; confirm.
    """
    sorted_values = sorted(array)
    n_values = len(array)
    # n_values evenly spaced steps ending at 1.0; the leading 0.0 is dropped so
    # the smallest value already sits at 1/n_values.
    cumulative_fraction = np.linspace(0., 1., n_values + 1)[1:]
    legend_label = label + ' (%d)' % n_values if include_count else label
    line_color = get_next_color() if color is None else color
    plt.plot(
        sorted_values,
        cumulative_fraction,
        label=legend_label,
        color=line_color,
        linestyle=get_linestyle(index),
    )

In [None]:
def plot_timeseries(data_array, timestamp_column_name='timestamp', prefix=None, use_relative_time=False):
    """Plot every non-timestamp column of `data_array` against its timestamps.

    Args:
        data_array: non-empty list of dicts sharing the same keys; the keys of
            the first entry decide which series are drawn.
        timestamp_column_name: key holding the x value of each entry.
        prefix: when truthy, each legend label becomes '<prefix> - <key>'.
        use_relative_time: when true, x becomes seconds elapsed since the first
            entry (the timestamps must then support subtraction yielding an
            object with total_seconds(), e.g. datetime).
    """
    timestamps = [row[timestamp_column_name] for row in data_array]
    if use_relative_time:
        origin = timestamps[0]
        timestamps = [(stamp - origin).total_seconds() for stamp in timestamps]

    # Everything except the timestamp column is a series to draw.
    series_names = [name for name in data_array[0].keys() if name != timestamp_column_name]
    for name in series_names:
        values = [row[name] for row in data_array]
        if prefix:
            series_label = '%s - ' % prefix + name
        else:
            series_label = '' + name
        plt.plot(timestamps, values, label=series_label)

In [None]:
def get_rapl_data(rapl_log_file, energy_columns=('total_intel_energy',)):
    """Load a RAPL CSV log into a list of per-sample dicts.

    Args:
        rapl_log_file: path of a CSV file with a header row containing at least
            'timestamp', 'total_intel_energy', 'total_cpu_energy' and
            'total_dram_energy'.
        energy_columns: iterable of column names to parse as floats and include
            in every returned entry. The default keeps only
            'total_intel_energy'; pass e.g. ('total_intel_energy',
            'total_cpu_energy', 'total_dram_energy') to load the others too.

    Returns:
        A list with one dict per CSV row, in file order. Each dict holds
        'timestamp' (a datetime parsed with datetime.fromisoformat) followed by
        one float per requested energy column.

    Raises:
        AssertionError: a required or requested column is missing from the
            header (including when the file is empty).
        ValueError: a timestamp or a requested energy value cannot be parsed.
    """
    required_columns = {'timestamp', 'total_intel_energy', 'total_cpu_energy', 'total_dram_energy'}
    required_columns.update(energy_columns)
    # newline='' lets the csv module do its own newline handling.
    with open(rapl_log_file, 'r', newline='') as f:
        csv_reader = csv.DictReader(f)
        # fieldnames is None when the file has no header row at all.
        column_names = csv_reader.fieldnames or []
        missing_columns = sorted(required_columns.difference(column_names))
        # The check has to be on the missing names themselves: asserting a
        # list of booleans would always pass, because a non-empty list is truthy.
        assert not missing_columns, 'Missing required columns: %s' % ', '.join(missing_columns)
        data_array = []
        for row in csv_reader:
            entry = {'timestamp': datetime.fromisoformat(row['timestamp'])}
            for column in energy_columns:
                entry[column] = float(row[column])
            data_array.append(entry)
    return data_array

In [None]:
def get_energy_stats(data_array):
    """Summarise the workload phase of a RAPL energy time series.

    The series is expected to look like idle -> workload -> idle. The workload
    start is taken at the first sample-to-sample power rise above
    POWER_DIFF_THRESHOLD, its end at the last drop below -POWER_DIFF_THRESHOLD.

    Args:
        data_array: list of dicts, already sorted by timestamp. Each dict has a
            'timestamp' (datetime) and a 'total_intel_energy' value. The value
            is divided by the sample interval to obtain power, i.e. it is
            treated as the energy consumed during one sample interval
            (assumed to be joules -- confirm against the logger).

    Returns:
        dict with
            'duration': length of the whole series (all samples), in seconds,
            'start_index': index at which the workload slice starts,
            'total_energy': energy consumed over the workload slice,
            'delta_energy': workload energy above the idle baseline,
            'sample_interval_s': spacing between samples in whole seconds,
            'idle_power': idle baseline power.

    Raises:
        AssertionError: the series has fewer than three samples, the sample
            interval is below one second or not constant, there are no idle
            samples before the workload, or the idle power is too noisy.
    """
    assert len(data_array) > 2, "Time series is too short"
    l_timestamp = np.array([entry['timestamp'] for entry in data_array])
    # Intervals are truncated to whole seconds.
    delta_timestamps = [int(delta.total_seconds()) for delta in np.diff(l_timestamp, n=1)]
    sample_interval_s = delta_timestamps[0]
    # A sub-second interval truncates to 0 and would divide by zero below.
    assert sample_interval_s > 0, "Sample interval must be at least 1s"
    assert all(delta == sample_interval_s for delta in delta_timestamps), "Sample interval is not constant"

    # Per-sample average power: energy of the interval / length of the interval.
    l_power = np.array([entry['total_intel_energy'] / sample_interval_s for entry in data_array])
    delta_power = np.diff(l_power, n=1)
    assert len(l_power) == len(delta_power) + 1
    POWER_DIFF_THRESHOLD = 1
    IDLE_POWER_STD_THRESHOLD = 0.01

    # np.argmax on a boolean array gives the first True (0 when none is True).
    index_workload_start = np.argmax(delta_power > POWER_DIFF_THRESHOLD)
    # Scanning the reversed deltas finds the last drop; the result is the index
    # of the first sample after it. With no drop at all, only the final sample
    # counts as "idle after".
    index_workload_end = len(delta_power) - np.argmax(delta_power[::-1] < -POWER_DIFF_THRESHOLD)

    l_power_idle_before = l_power[:index_workload_start]
    # NOTE(review): the workload slice starts at the sample just before the
    # detected rise -- presumably to catch a partially loaded interval; confirm.
    l_power_workload = l_power[index_workload_start:index_workload_end]
    l_power_idle_after = l_power[index_workload_end:]

    # Without this check an empty slice yields NaN statistics and the failure
    # would be reported as "Idle power std is too high".
    assert len(l_power_idle_before) > 0, "No idle samples before the workload"

    avg_power_idle_before = np.average(l_power_idle_before)
    std_power_idle_before = np.std(l_power_idle_before)
    avg_power_idle_after = np.average(l_power_idle_after)
    std_power_idle_after = np.std(l_power_idle_after)

    assert std_power_idle_before / avg_power_idle_before < IDLE_POWER_STD_THRESHOLD, "Idle power std is too high"
    assert std_power_idle_after / avg_power_idle_after < IDLE_POWER_STD_THRESHOLD, "Idle power std is too high"

    # Use the mean of both idle phases, unless they disagree by more than the
    # threshold; then only the idle phase before the workload is used.
    avg_power_idle = np.average([avg_power_idle_before, avg_power_idle_after])
    if np.abs(avg_power_idle_before - avg_power_idle_after) > POWER_DIFF_THRESHOLD:
        avg_power_idle = avg_power_idle_before

    # Energy = power * time: the summed power samples must be scaled by the
    # sample interval, otherwise the result is only right for 1s sampling.
    workload_energy = np.sum(l_power_workload) * sample_interval_s
    idle_baseline_energy = len(l_power_workload) * avg_power_idle * sample_interval_s
    return {
        'duration': len(l_power) * sample_interval_s,
        'start_index': index_workload_start,
        'total_energy': workload_energy,
        'delta_energy': workload_energy - idle_baseline_energy,
        'sample_interval_s': sample_interval_s,
        'idle_power': avg_power_idle,
    }

In [None]:
# Driver cell: load each selected RAPL log, draw it on one shared figure and
# print its energy summary.

# Directory the log file names below are joined onto.
ROOT_DIR = '/data/scripts'

# One sample interval per processed log (from get_energy_stats); averaged for the title.
l_sample_interval_s = []
# Entries that are commented out are skipped in this run.
for log_file in [
#         'ffmpeg-Rain.csv',
        'ffmpeg-Rain-10x.csv',
        'ffmpeg-Rain.data-copy.1G.csv',
#         'ffmpeg-Rain.data-copy.100G.csv',
#         'spark-wordcount-short.csv',
#         'spark-wordcount-long.csv',
#         'spark-wordcount-long.data-copy.1G.csv',
#         'spark-wordcount-long.data-copy.100G.csv',
    ]:
    data_array = get_rapl_data(os.path.join(ROOT_DIR, log_file))
    # x axis is seconds since the first sample of each log, so the logs overlay.
    # NOTE(review): the plotted values are the raw per-sample readings, while the
    # y axis is labelled in watts -- that only matches for 1s sampling; confirm.
    plot_timeseries(data_array, prefix=log_file, use_relative_time=True)
    energy_stats = get_energy_stats(data_array)
    l_sample_interval_s.append(energy_stats['sample_interval_s'])
    print('Workload: %s' % log_file)
    print('Duration: %.fs' % energy_stats['duration'])
    print('Total energy: %.fJ' % energy_stats['total_energy'])
    print('Delta energy: %.fJ' % energy_stats['delta_energy'])

# Optional zoom, disabled:
# plt.xlim(0, 20)
# plt.ylim(105, 115)
plt.grid()
plt.legend()
plt.xlabel('Time (s)')
plt.ylabel('Power (W)')
# %d truncates the averaged interval to an integer number of seconds.
plt.title('RAPL total energy (sampled every %ds)' % np.average(l_sample_interval_s))
# plt.savefig('rapl-ffmpeg-Rain.png')