In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sortedcontainers import SortedDict
from collections import namedtuple
from operator import attrgetter
from pandarallel import pandarallel

In [3]:
# Intermediate results of this notebook are cached below the directory named by
# the VS environment variable (a KeyError is raised if it is not set).
cache_dir = os.path.join(
    os.environ['VS'],
    "energy_analysis_plot_cache",
    "dvfs_energy_gains",
)
os.makedirs(cache_dir, exist_ok=True)

# Set up pandarallel for the Series.parallel_map calls used further down; no progress bars.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# DVFS operating points, keyed by the delay factor a task must tolerate and
# mapping to the fractional energy saving obtained at that point.
# The 1.0 entry is the baseline: no delay, no saving.
dvfs_options = SortedDict()
for delay_factor, energy_saving in (
    (1.0, 0),
    (1.2286, 0.086),
    (1.5344, 0.126),
):
    dvfs_options[delay_factor] = energy_saving

# One record per machine type. energy_factor is derived from the other three
# fields as tdp / resources / base_clock.
# NOTE(review): base_clock looks like GHz and tdp like watts - confirm the units.
Machine = namedtuple('Machine', ['base_clock', 'resources', 'tdp', 'energy_factor'])

# (base_clock, resources, tdp) of every machine type considered.
_machine_specs = [
    (2.9, 128, 280),
    (3.7, 24, 105),
    (3.8, 16, 105),
    (4.1, 12, 95),
]
machines = [
    Machine(base_clock, resources, tdp, tdp / resources / base_clock)
    for base_clock, resources, tdp in _machine_specs
]

1.066438763376932
1.3102629656683709
1.8633137119113572
1.9308943089430897


In [6]:
input_path = "./WTA/parquet/"
slack_folder = os.path.join(os.environ['VS'], "ic2e-wta-output", "look_ahead")

# The per-task delay factors are cached in this file; delete it to force a
# rebuild from the raw traces.
df_filename = os.path.join(cache_dir, "task_slack_factors.parquet")
if not os.path.exists(df_filename):

    # Immediate sub-folders of input_path, one per trace. Sorted so that the row
    # order of the cached frame does not depend on the directory listing order.
    folders = sorted(next(os.walk(input_path))[1])

    # Folder-name substrings that identify a domain; every other trace is 'Scientific'.
    domains = [
        ("Industrial", {"google", "alibaba", "sigma", "shell"}),
        ("Engineering", {"askalon",})
    ]

    dataframes = []

    for folder in folders:
        folder_lower = str(folder).lower()

        # Traces left out of this analysis; of the Alibaba traces only "100k" ones are kept.
        if "google" in folder_lower: continue
        if "lanl" in folder_lower: continue
        if "two_sigma" in folder_lower: continue
        if "alibaba" in folder_lower and "100k" not in folder_lower: continue

        data_folder = os.path.join(input_path, folder)
        tasks_path = os.path.join(data_folder, "tasks", "schema-1.0")
        if not os.path.exists(tasks_path):
            continue

        slack_path = os.path.join(slack_folder, folder.replace("_parquet", "_slack.parquet"))

        try:
            df = pd.read_parquet(tasks_path,
                                 columns=[
                                     "workflow_id", "id", "runtime", "resource_amount_requested"
                                 ], engine='pyarrow')

            df2 = pd.read_parquet(slack_path,
                                  columns=["workflow_id", "task_id", "task_slack"],
                                  engine='pyarrow')
            df2 = df2.rename(columns={"task_id": "id"}).reset_index(drop=True)

            df = df.merge(df2, on=["workflow_id", "id"], how='inner')
        except Exception:
            # Only name the offending trace and re-raise. The frames are not printed
            # here: if a read failed they are unbound (first trace) or still hold the
            # previous trace's data, which would hide or misreport the real error.
            print(folder)
            raise

        # Some tasks have a runtime of 0, which would be unfair to take into account as you can delay them
        # by any factor. We filter those out. The copy makes the column assignment below
        # operate on an independent frame instead of a slice of the merged one.
        df = df[df['runtime'] > 0].copy()

        # Compute the maximum delay factor tasks can have
        df["factor"] = ((df["runtime"]+df["task_slack"]) / df["runtime"])

        domain = 'Scientific'
        if any(d in folder_lower for d in domains[0][1]):
            domain = domains[0][0]
        elif any(d in folder_lower for d in domains[1][1]):
            domain = domains[1][0]
        df["domain"] = domain

        dataframes.append(df)

    value_df = pd.concat(dataframes)
    value_df.reset_index(drop=True, inplace=True)
    value_df.to_parquet(df_filename, engine="pyarrow")
else:
    value_df = pd.read_parquet(df_filename, engine="pyarrow")
    

In [64]:
# Sanity check: factor is (runtime + task_slack) / runtime with runtime > 0, so a
# value below 1.0 would mean a task with negative slack. The comparison must use
# the same 1.0 threshold as the printed label.
print("Minimum factor >= 1.0: ", value_df["factor"].min() >= 1.0)

Minimum factor >= 1.0:  True


In [102]:
# This cell outputs the theoretical gains by solely using DVFS to trade-off slack for reduced energy
# LaTeX table template for str.format: slot {0} is the overall average, slots
# {1}-{3} the per-domain averages. Literal braces are doubled and every literal
# backslash is written as "\\" (no invalid escape sequences such as "\g" or "\%").
t = '''\\begin{{table}}[t]
\\caption{{Average Energy Reduction per domain using \\gls{{dvfs}}.}}
\\label{{slack:tbl:average-dvfs-gains}}
\\adjustbox{{max width=\\linewidth}}{{
\\begin{{tabular}}{{lrrr}}
\\toprule
Domain                   & Engineering & Industrial & Scientific \\\\ \\midrule
Energy reduction & {1}\\%            & {2}\\%           & {3}\\%           \\\\  \\cmidrule{{2-4}}
Overall average & \\multicolumn{{3}}{{c}}{{{0}\\%}}                  \\\\  \\bottomrule
\\end{{tabular}}
}}
\\end{{table}}'''

values = []

def map_factor_to_dvfs_gain(x, options=None):
    """Return the largest DVFS energy saving whose delay factor fits within ``x``.

    Args:
        x: Delay factor a task can tolerate, i.e. (runtime + slack) / runtime.
        options: Mapping of delay factor -> fractional energy saving. Defaults to
            the notebook-level ``dvfs_options``.

    Returns:
        The highest energy saving among the options with ``delay_factor <= x``,
        or 0 when no option fits (``x`` below every delay factor, or NaN) instead
        of failing on an empty ``max()``.
    """
    if options is None:
        options = dvfs_options
    return max(
        (energy_saving for delay_factor, energy_saving in options.items() if delay_factor <= x),
        default=0,
    )
            

# Template slot {0} is the average over all tasks; slots {1}-{3} are the
# per-domain averages, in the column order of the table header.
factors = value_df["factor"]
values = [factors.parallel_map(map_factor_to_dvfs_gain).mean() * 100]
for domain in ("Engineering", "Industrial", "Scientific"):
    in_domain = value_df['domain'] == domain
    values.append(factors[in_domain].parallel_map(map_factor_to_dvfs_gain).mean() * 100)

print(t.format(*(format(v, ",.2f") for v in values)))

\begin{table}[t]
\caption{Average Energy Reduction per domain using \gls{dvfs}.}
\label{slack:tbl:average-dvfs-gains}
\adjustbox{max width=\linewidth}{
\begin{tabular}{lrrr}
\toprule
Domain                   & Engineering & Industrial & Scientific \\ \midrule
Energy reduction & 7.62\%            & 11.35\%           & 4.24\%           \\  \cmidrule{2-4}
Overall average & \multicolumn{3}{c}{11.31\%}                  \\  \bottomrule
\end{tabular}
}
\end{table}


In [10]:
# This cell outputs the theoretical gains by solely using heterogeneity to trade-off slack for reduced energy
# LaTeX table template for str.format: slot {0} is the overall average, slots
# {1}-{3} the per-domain averages. Literal braces are doubled and every literal
# backslash is written as "\\" (no invalid escape sequences such as "\%").
t = '''\\begin{{table}}[t]
\\caption{{Average energy reduction per domain using heterogeneity.}}
\\label{{slack:tbl:average-heterogeneity-gains}}
\\adjustbox{{max width=\\linewidth}}{{
\\begin{{tabular}}{{lrrr}}
\\toprule
Domain                   & Engineering & Industrial & Scientific \\\\ \\midrule
Energy reduction & {1}\\%            & {2}\\%           & {3}\\%           \\\\  \\cmidrule{{2-4}}
Overall average & \\multicolumn{{3}}{{c}}{{{0}\\%}}                  \\\\  \\bottomrule
\\end{{tabular}}
}}
\\end{{table}}'''

values = []
# Reference machine: the entry of `machines` with the highest base clock
# (the first such entry on a tie).
fastest_machine = max(machines, key=lambda m: m.base_clock)
def map_factor_to_machine_efficiency(x, pool=None, reference=None):
    """Return the fractional energy reduction from running on the cheapest feasible machine.

    A machine is feasible when its slowdown relative to the reference machine,
    ``reference.base_clock / m.base_clock``, does not exceed ``x``. Among the
    feasible machines the one with the lowest ``slowdown * energy_factor`` is chosen.

    Args:
        x: Delay factor a task can tolerate, i.e. (runtime + slack) / runtime.
        pool: Candidate machines (objects with ``base_clock`` and ``energy_factor``).
            Defaults to the notebook-level ``machines``.
        reference: Machine the slowdown and the reduction are measured against.
            Defaults to the notebook-level ``fastest_machine``.

    Returns:
        ``1 - slowdown * energy_factor / reference.energy_factor`` for the chosen
        machine. When no machine is feasible (``x`` below every slowdown, or NaN)
        the reference machine is used, which yields 0.0, instead of failing on an
        empty ``min()``.
    """
    if pool is None:
        pool = machines
    if reference is None:
        reference = fastest_machine

    ref_clock = reference.base_clock
    best_machine = min(
        (m for m in pool if ref_clock / m.base_clock <= x),
        key=lambda m: (ref_clock / m.base_clock) * m.energy_factor,
        default=reference,
    )
    return 1 - ((ref_clock / best_machine.base_clock) * best_machine.energy_factor / reference.energy_factor)
            

# Template slot {0} is the average over all tasks; slots {1}-{3} are the
# per-domain averages, in the column order of the table header.
factors = value_df["factor"]
values = [factors.parallel_map(map_factor_to_machine_efficiency).mean() * 100]
for domain in ("Engineering", "Industrial", "Scientific"):
    in_domain = value_df['domain'] == domain
    values.append(factors[in_domain].parallel_map(map_factor_to_machine_efficiency).mean() * 100)

print(t.format(*(format(v, ",.2f") for v in values)))

\begin{table}[t]
\caption{Average energy reduction per domain using heterogeneity.}
\label{slack:tbl:average-heterogeneity-gains}
\adjustbox{max width=\linewidth}{
\begin{tabular}{lrrr}
\toprule
Domain                   & Engineering & Industrial & Scientific \\ \midrule
Energy reduction & 28.31\%            & 41.61\%           & 16.68\%           \\  \cmidrule{2-4}
Overall average & \multicolumn{3}{c}{41.47\%}                  \\  \bottomrule
\end{tabular}
}
\end{table}


In [13]:
# This cell outputs the theoretical gains by using heterogeneity and afterwards DVFS to trade-off slack for reduced energy
# LaTeX table template for str.format: slot {0} is the overall average, slots
# {1}-{3} the per-domain averages. Literal braces are doubled and every literal
# backslash is written as "\\" (no invalid escape sequences such as "\%").
t = '''\\begin{{table}}[t]
\\caption{{Average energy reduction per domain using both heterogeneity and \\gls{{dvfs}}.}}
\\label{{slack:tbl:average-heterogeneity-dvfs-gains}}
\\adjustbox{{max width=\\linewidth}}{{
\\begin{{tabular}}{{lrrr}}
\\toprule
Domain                   & Engineering & Industrial & Scientific \\\\ \\midrule
Energy reduction & {1}\\%            & {2}\\%           & {3}\\%           \\\\  \\cmidrule{{2-4}}
Overall average & \\multicolumn{{3}}{{c}}{{{0}\\%}}                  \\\\  \\bottomrule
\\end{{tabular}}
}}
\\end{{table}}'''

values = []
# Reference machine: the entry of `machines` with the highest base clock
# (the first such entry on a tie).
fastest_machine = max(machines, key=lambda m: m.base_clock)
def map_factor_to_machine_and_dvfs_reduction(x, pool=None, reference=None, options=None):
    """Return the fractional energy reduction from machine choice followed by DVFS.

    The machine is chosen first: among the machines whose slowdown relative to
    the reference, ``reference.base_clock / m.base_clock``, does not exceed ``x``,
    the one with the lowest ``slowdown * energy_factor`` wins. DVFS is then applied
    on that machine: the largest saving whose delay factor, multiplied by the
    machine's slowdown, still fits within ``x``.

    Args:
        x: Delay factor a task can tolerate, i.e. (runtime + slack) / runtime.
        pool: Candidate machines (objects with ``base_clock`` and ``energy_factor``).
            Defaults to the notebook-level ``machines``.
        reference: Machine the slowdown and the reduction are measured against.
            Defaults to the notebook-level ``fastest_machine``.
        options: Mapping of DVFS delay factor -> fractional energy saving.
            Defaults to the notebook-level ``dvfs_options``.

    Returns:
        ``1 - (slowdown * energy_factor / reference.energy_factor) * (1 - dvfs_saving)``.
        When no machine is feasible (``x`` below every slowdown, or NaN) the
        reference machine with no DVFS saving is used, which yields 0.0, instead
        of failing on an empty ``min()`` / ``max()``.
    """
    if pool is None:
        pool = machines
    if reference is None:
        reference = fastest_machine
    if options is None:
        options = dvfs_options

    ref_clock = reference.base_clock
    best_machine = min(
        (m for m in pool if ref_clock / m.base_clock <= x),
        key=lambda m: (ref_clock / m.base_clock) * m.energy_factor,
        default=reference,
    )
    dvfs_factor = max(
        (energy_saving for delay_factor, energy_saving in options.items()
         if delay_factor * (ref_clock / best_machine.base_clock) <= x),
        default=0,
    )

    return 1 - (((ref_clock / best_machine.base_clock) * best_machine.energy_factor / reference.energy_factor) * (1 - dvfs_factor))
            

# Template slot {0} is the average over all tasks; slots {1}-{3} are the
# per-domain averages, in the column order of the table header.
factors = value_df["factor"]
values = [factors.parallel_map(map_factor_to_machine_and_dvfs_reduction).mean() * 100]
for domain in ("Engineering", "Industrial", "Scientific"):
    in_domain = value_df['domain'] == domain
    values.append(factors[in_domain].parallel_map(map_factor_to_machine_and_dvfs_reduction).mean() * 100)

print(t.format(*(format(v, ",.2f") for v in values)))

\begin{table}[t]
\caption{Average energy reduction per domain using both heterogeneity and \gls{dvfs}.}
\label{slack:tbl:average-heterogeneity-dvfs-gains}
\adjustbox{max width=\linewidth}{
\begin{tabular}{lrrr}
\toprule
Domain                   & Engineering & Industrial & Scientific \\ \midrule
Energy reduction & 32.11\%            & 47.00\%           & 18.45\%           \\  \cmidrule{2-4}
Overall average & \multicolumn{3}{c}{46.85\%}                  \\  \bottomrule
\end{tabular}
}
\end{table}


In [91]:
# Display the machine list, including each entry's energy_factor, for reference.
machines

[Machine(base_clock=2.9, resources=128, tdp=280, energy_factor=0.7543103448275862),
 Machine(base_clock=3.7, resources=24, tdp=105, energy_factor=1.1824324324324325),
 Machine(base_clock=3.8, resources=16, tdp=105, energy_factor=1.7269736842105263),
 Machine(base_clock=4.1, resources=12, tdp=95, energy_factor=1.9308943089430897)]