In [47]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
def find_upper_bound(mu, std, alpha=0.95):
    z_value = norm.ppf(alpha)
    return mu + std * z_value

In [5]:
gpu_conv_value = 0.00021648585046
gpu_no_norm_conv_value = 6958.933333333333333
per_unit_energy_cons = 1

In [6]:
original_dataset = pd.read_csv("../saved_data/ali20/ali20_g.csv")
g_truth = original_dataset["avggpu"].values[-2582:]

In [7]:
hbnn_results = pd.read_csv("output_HBNN-ali20_g-gpu-w288-h2.csv")
hbnn_results.drop(labels=["Unnamed: 0"], inplace=True, axis=1)
hbnn_results["true_gpu"] = g_truth
hbnn_results["model"] = ["HBNN" for i in range(len(hbnn_results))]
hbnn_results.rename(columns={"avggpu": "pred_gpu", "std": "pred_std", "labels": "true_norm_gpu"}, inplace=True)
first_column = hbnn_results.pop('model')
hbnn_results.insert(0, 'model', first_column)
hbnn_results["true_n_gpu"] = hbnn_results.apply(lambda row: math.ceil(row["true_norm_gpu"]/gpu_conv_value), axis=1)
hbnn_results["ub_95"] = hbnn_results.apply(lambda row: find_upper_bound(row["pred_gpu"], row["pred_std"], alpha=0.95), axis=1)
hbnn_results["pred_n_gpu_95"] = hbnn_results.apply(lambda row: math.ceil(row["ub_95"]/gpu_conv_value), axis=1)
hbnn_results.head(2)

Unnamed: 0,model,pred_gpu,pred_std,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
0,HBNN,0.79237,0.058169,0.804885,29189800.0,3718,0.88805,4103
1,HBNN,0.787413,0.058161,0.81436,29408050.0,3762,0.883079,4080


In [8]:
hbnn_results.describe()

Unnamed: 0,pred_gpu,pred_std,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
count,2582.0,2582.0,2582.0,2582.0,2582.0,2582.0,2582.0
mean,0.35742,0.049786,0.328949,18227520.0,1519.997289,0.439311,2029.774981
std,0.187234,0.007063,0.23252,5355664.0,1074.064784,0.18936,874.69592
min,0.109442,0.016079,-0.172291,6682396.0,-795.0,0.183581,849.0
25%,0.143482,0.04626,0.106536,13104650.0,492.25,0.230175,1064.0
50%,0.372455,0.04921,0.36254,19001220.0,1675.0,0.449472,2076.5
75%,0.516378,0.05296,0.51967,22620420.0,2400.75,0.599429,2769.0
max,0.825197,0.110551,0.915283,31732610.0,4228.0,0.918887,4245.0


In [9]:
hbnn_results.columns

Index(['model', 'pred_gpu', 'pred_std', 'true_norm_gpu', 'true_gpu',
       'true_n_gpu', 'ub_95', 'pred_n_gpu_95'],
      dtype='object')

In [10]:
# monte_results = pd.read_csv("output_HBNN-ali20_g-gpu-w288-h2.csv")
# from sklearn.metrics import mean_squared_error
# print(mean_squared_error(monte_results.avggpu, monte_results.labels))

In [11]:
# monte_results = pd.read_csv("output_MCDLSTM-ali20_g-gpu-w288-h2.csv")
# from sklearn.metrics import mean_squared_error
# print(mean_squared_error(monte_results.avggpu, monte_results.labels))

In [12]:
monte_results = pd.read_csv("output_MCDLSTM-ali20_g-gpu-w288-h2.csv")
monte_results.drop(labels=["Unnamed: 0"], inplace=True, axis=1)
monte_results["true_gpu"] = g_truth
monte_results["model"] = ["MCDLSTM" for i in range(len(monte_results))]
monte_results.rename(columns={"avggpu": "pred_gpu", "std": "pred_std", "labels": "true_norm_gpu"}, inplace=True)
first_column = monte_results.pop('model')
monte_results.insert(0, 'model', first_column)
monte_results["true_n_gpu"] = monte_results.apply(lambda row: math.ceil(row["true_norm_gpu"]/gpu_conv_value), axis=1)
monte_results["ub_95"] = monte_results.apply(lambda row: find_upper_bound(row["pred_gpu"], row["pred_std"], alpha=0.95), axis=1)
monte_results["pred_n_gpu_95"] = monte_results.apply(lambda row: math.ceil(row["ub_95"]/gpu_conv_value), axis=1)
monte_results.head(2)

Unnamed: 0,model,pred_gpu,pred_std,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
0,MCDLSTM,0.810325,2.384186e-07,0.804885,29189800.0,3718,0.810325,3744
1,MCDLSTM,0.798122,5.960465e-08,0.81436,29408050.0,3762,0.798122,3687


In [13]:
monte_results.describe()

Unnamed: 0,pred_gpu,pred_std,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
count,2582.0,2582.0,2582.0,2582.0,2582.0,2582.0,2582.0
mean,0.337572,6.597421e-08,0.328949,18227520.0,1519.997289,0.337572,1559.831913
std,0.214559,6.55825e-08,0.23252,5355664.0,1074.064784,0.214559,991.103168
min,-0.039172,0.0,-0.172291,6682396.0,-795.0,-0.039172,-180.0
25%,0.125868,1.117587e-08,0.106536,13104650.0,492.25,0.125868,581.5
50%,0.367512,5.960465e-08,0.36254,19001220.0,1675.0,0.367512,1698.0
75%,0.512203,8.940697e-08,0.51967,22620420.0,2400.75,0.512203,2366.5
max,0.893295,2.980232e-07,0.915283,31732610.0,4228.0,0.893295,4127.0


In [14]:
flbnn_results = pd.read_csv("output_FLBNN-ali20_g-gpu-w288-h2.csv")
flbnn_results.drop(labels=["Unnamed: 0"], inplace=True, axis=1)
flbnn_results["true_gpu"] = g_truth
flbnn_results["model"] = ["FLBNN" for i in range(len(flbnn_results))]
flbnn_results.rename(columns={"avggpu": "pred_gpu", "std": "pred_std", "labels": "true_norm_gpu"}, inplace=True)
first_column = flbnn_results.pop('model')
flbnn_results.insert(0, 'model', first_column)
flbnn_results["true_n_gpu"] = flbnn_results.apply(lambda row: math.ceil(row["true_norm_gpu"]/gpu_conv_value), axis=1)
flbnn_results["ub_95"] = flbnn_results.apply(lambda row: find_upper_bound(row["pred_gpu"], row["pred_std"], alpha=0.95), axis=1)
flbnn_results["pred_n_gpu_95"] = flbnn_results.apply(lambda row: math.ceil(row["ub_95"]/gpu_conv_value), axis=1)
flbnn_results.head(2)

Unnamed: 0,model,pred_gpu,pred_std,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
0,FLBNN,0.614491,0.064371,0.804885,29189800.0,3718,0.720371,3328
1,FLBNN,0.614068,0.063943,0.81436,29408050.0,3762,0.719245,3323


In [15]:
lstmq_results = pd.read_csv("output_LSTMQ-ali20_g-gpu-w288-h2.csv")
lstmq_results.drop(labels=["Unnamed: 0", "std"], inplace=True, axis=1)
lstmq_results["true_gpu"] = g_truth
lstmq_results["model"] = ["LSTMQ" for i in range(len(lstmq_results))]
lstmq_results.rename(columns={"avggpu": "pred_gpu", "labels": "true_norm_gpu"}, inplace=True)
first_column = lstmq_results.pop('model')
lstmq_results.insert(0, 'model', first_column)
lstmq_results["true_n_gpu"] = lstmq_results.apply(lambda row: math.ceil(row["true_norm_gpu"]/gpu_conv_value), axis=1)
lstmq_results["ub_95"] = lstmq_results["pred_gpu"].values
lstmq_results["pred_n_gpu_95"] = lstmq_results.apply(lambda row: math.ceil(row["ub_95"]/gpu_conv_value), axis=1)
lstmq_results.head(2)

Unnamed: 0,model,pred_gpu,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
0,LSTMQ,0.867105,0.804885,29189800.0,3718,0.867105,4006
1,LSTMQ,0.860391,0.81436,29408050.0,3762,0.860391,3975


In [35]:
perc_upper_bound = 0.05
lstm_results = pd.read_csv("output_LSTM-ali20_g-gpu-w288-h2.csv")
lstm_results.drop(labels=["Unnamed: 0"], inplace=True, axis=1)
lstm_results["true_gpu"] = g_truth
lstm_results["model"] = ["LSTM" for i in range(len(lstm_results))]
lstm_results.rename(columns={"avggpu": "pred_gpu", "labels": "true_norm_gpu"}, inplace=True)
first_column = lstm_results.pop('model')
lstm_results.insert(0, 'model', first_column)
lstm_results["true_n_gpu"] = lstm_results.apply(lambda row: math.ceil(row["true_norm_gpu"]/gpu_conv_value), axis=1)
lstm_results["ub_95"] = lstm_results["pred_gpu"].values
lstm_results["ub_95"] = [value+(perc_upper_bound*value) for value in lstm_results["ub_95"].values]
lstm_results["pred_n_gpu_95"] = lstm_results.apply(lambda row: math.ceil(row["ub_95"]/gpu_conv_value), axis=1)
lstm_results.head(2)

Unnamed: 0,model,pred_gpu,true_norm_gpu,true_gpu,true_n_gpu,ub_95,pred_n_gpu_95
0,LSTM,0.835576,0.804885,29189800.0,3718,0.877354,4053
1,LSTM,0.822173,0.81436,29408050.0,3762,0.863281,3988


<h2>Baselines</h2>

a. Running exactly the required number of GPUs that would be specified by our schedulers acting as an oracle. Will have minimal energy use, and a 100% success rate.

b. Always running the maximum number of GPUs acting as a dummy predictor. Will have maximal energy use, and a 100% success rate.

c. Always running the GPUs that were specified by the oracle for the previous time window.

<h2>Energy scenario 1</h2>

All GPU machines have the same computational power and they consume the same amount of energy.

The scenario has no memory (no GPU state transitions), i.e. the energy consumption is calculated independently from the previous GPUs states

In [105]:
turn_on_cost = 100
running_cost = 230
n_total_gpus = 6742
energy_baseline_a = hbnn_results["true_n_gpu"].values.sum()
energy_baseline_b = n_total_gpus*len(hbnn_results)
baseline_c_values = list(hbnn_results["true_n_gpu"].values[:-1])
baseline_c_values.insert(baseline_c_values[0], 0) # because I do not know the first value
energy_baseline_c = np.sum(baseline_c_values)

In [106]:
scenario_1_costs = {"baseline_a": energy_baseline_a,
                "baseline_b": energy_baseline_b,
                "baseline_c": energy_baseline_c,
                "HBNN": hbnn_results["pred_n_gpu_95"].values.sum(),
                "MCD": monte_results["pred_n_gpu_95"].values.sum(),
                "HBNN++": flbnn_results["pred_n_gpu_95"].values.sum(),
                "LSTMQ": lstmq_results["pred_n_gpu_95"].values.sum(),
                "LSTM": lstm_results["pred_n_gpu_95"].values.sum(),
                }

In [107]:
model_column = [model for model in scenario_1_costs]
energy_value = [100 - round(scenario_1_costs[model]/scenario_1_costs["baseline_b"]*100, 2) for model in scenario_1_costs]
d = {"model": model_column, "% energy savings": energy_value}
df_scenario_1 = pd.DataFrame(data=d)
df_scenario_1

Unnamed: 0,model,% energy savings
0,baseline_a,77.45
1,baseline_b,0.0
2,baseline_c,77.45
3,HBNN,69.89
4,MCD,76.86
5,HBNN++,67.7
6,LSTMQ,72.98
7,LSTM,74.35


<h2>Energy scenario 2</h2>

All GPUs are of different characteristics. At the beginning of each time window, all GPUs are OFF.

<h2>Energy scenario 3</h2>

All GPU machines have the same computational power and they consume the same amount of energy when ON.
We model also the state of a GPU {ON, OFF}, and there is a fixed cost on transitioning from OFF to ON. No cost from ON to OFF.

Initial scenario: All GPUs needed are ON, and there is no associated cost for this. Then, GPUs are switched ON and OFF based on the predicted workload. The energy consumtpion is calculated accordingly.

In [108]:
def calculate_consumption(n_gpu_predicted, history, fixed_cost_run, fixed_cost_switch):
    if n_gpu_predicted > history:
        transition_cost = (n_gpu_predicted - history)*fixed_cost_switch
    else:
        transition_cost = 0
    return transition_cost + n_gpu_predicted*fixed_cost_run

In [109]:
n_total_gpus = 6742
running_cost = 230
turn_on_cost = 100
scenario_2_costs = {}
models = {"MCD": monte_results,
          "HBNN": hbnn_results,
          "HBNN++": flbnn_results,
          "LSTMQ": lstmq_results,
          "LSTM": lstm_results
          }

In [110]:
total_cost = 0
gpu_history = hbnn_results["true_n_gpu"].values
for i, n_gpu in enumerate(gpu_history):
    if i == 0:
        total_cost += n_gpu
        continue
    total_cost += calculate_consumption(n_gpu, gpu_history[i-1], running_cost, turn_on_cost)
scenario_2_costs["baseline_a"] = total_cost
scenario_2_costs["baseline_b"] = n_total_gpus*len(hbnn_results)*running_cost + turn_on_cost*n_total_gpus

In [None]:
# baseline c

In [111]:
for model in models:
    total_cost = 0
    gpu_history = models[model]["pred_n_gpu_95"].values
    for i, n_gpu in enumerate(gpu_history):
        if i == 0:
            total_cost += n_gpu*running_cost
            continue
        total_cost += calculate_consumption(n_gpu, gpu_history[i-1], running_cost, turn_on_cost)
    scenario_2_costs[model] = total_cost

In [112]:
model_column = [model for model in scenario_2_costs]
energy_value = [100 - round(scenario_2_costs[model]/scenario_2_costs["baseline_b"]*100, 2) for model in scenario_2_costs]
d = {"model": model_column, "% energy savings": energy_value}
df_scenario_2 = pd.DataFrame(data=d)
df_scenario_2

Unnamed: 0,model,% energy savings
0,baseline_a,77.27
1,baseline_b,0.0
2,MCD,76.67
3,HBNN,69.76
4,HBNN++,67.65
5,LSTMQ,72.77
6,LSTM,74.14


<h2>Accuracy</h2>

In [39]:
def print_results(df_results):

    mse_value = mean_squared_error(df_results.pred_gpu, df_results.true_norm_gpu)
    print(f"MSE: {mse_value}")

    mae_value = mean_absolute_error(df_results.pred_gpu, df_results.true_norm_gpu)
    print("MAE", mae_value)

    sr_value = sum(df_results.ub_95>df_results.true_norm_gpu)/df_results.shape[0]
    print("SR", sr_value)

    # up_value = sum((df_results.ub_95<df_results.true_norm_gpu)*(df_results.true_norm_gpu-df_results.ub_95))/sum((df_results.ub_95<df_results.true_norm_gpu)*df_results.true_norm_gpu)
    # up_value = sum((df_results.ub_95<df_results.true_norm_gpu)*(df_results.true_norm_gpu-df_results.ub_95))/sum(df_results.ub_95<df_results.true_norm_gpu)

    up_value = sum((df_results.ub_95<df_results.true_norm_gpu)*((df_results.true_norm_gpu-df_results.ub_95)))/sum(df_results.true_norm_gpu)
    print("UP", up_value)

    # op_value = sum((df_results.ub_95>df_results.true_norm_gpu)*(df_results.ub_95-df_results.true_norm_gpu))/sum((df_results.ub_95>df_results.true_norm_gpu)*df_results.true_norm_gpu)

    # op_value = sum((df_results.ub_95>df_results.true_norm_gpu)*(df_results.ub_95-df_results.true_norm_gpu))/sum(df_results.ub_95>df_results.true_norm_gpu)

    op_value = sum((df_results.ub_95>df_results.true_norm_gpu)*((df_results.ub_95-df_results.true_norm_gpu)))/sum(df_results.true_norm_gpu)
    print("OP", op_value)
    return mse_value, mae_value, sr_value, up_value, op_value

In [42]:
models = {"MCDLSTM": monte_results,
          "HBNN": hbnn_results,
          "FLBNN": flbnn_results,
          "LSTMQ": lstmq_results,
          "LSTM": lstm_results
          }
mse_values = []
mae_values = []
sr_values = []
up_values = []
op_values = []
for model in models:
    print(model)
    a, b, c, d, e = print_results(models[model])
    mse_values.append(a)
    mae_values.append(b)
    sr_values.append(c)
    up_values.append(d)
    op_values.append(e)
    print()
data = {"model": ["MCDLSTM", "HBNN", "FLBNN", "LSTMQ", "LSTM"],
        "MSE": mse_values,
        "MAE": mae_values,
        "SR": sr_values,
        "UP": up_values,
        "OP": op_values
        }
df_accuracy = pd.DataFrame(data=data)

MCDLSTM
MSE: 0.002068030780908928
MAE 0.03390599167512373
SR 0.6107668474051123
UP 0.03842959421563328
OP 0.06464405041741941

HBNN
MSE: 0.005862803756676761
MAE 0.05507856979953394
SR 0.9686289697908598
UP 0.0038730693286546914
OP 0.3393694417527197

FLBNN
MSE: 0.009376300999237301
MAE 0.0755649909627305
SR 0.9523625096824168
UP 0.008301780086253503
OP 0.44098144561955077

LSTMQ
MSE: 0.006166121405354145
MAE 0.06926118064031304
SR 0.935708752904725
UP 0.0059744431136667855
OP 0.20457833957367744

LSTM
MSE: 0.0032931226669689773
MAE 0.04235054682901876
SR 0.8679318357862122
UP 0.011700511268966924
OP 0.149381862297774



In [43]:
df_accuracy

Unnamed: 0,model,MSE,MAE,SR,UP,OP
0,MCDLSTM,0.002068,0.033906,0.610767,0.03843,0.064644
1,HBNN,0.005863,0.055079,0.968629,0.003873,0.339369
2,FLBNN,0.009376,0.075565,0.952363,0.008302,0.440981
3,LSTMQ,0.006166,0.069261,0.935709,0.005974,0.204578
4,LSTM,0.003293,0.042351,0.867932,0.011701,0.149382


In [22]:
df_accuracy.to_csv("accuracy_table.csv")