In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Lift every pandas display limit so frames are rendered without truncation
# (rows, columns, total width and per-cell text width).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

from tad4bj import DataStorage

# Columns describing the size of the problem solved by a run.
EXPERIMENT_SIZE_FIELDS = ["number_of_fragments", "points_per_fragment"]

# Columns describing the platform a run was executed on.
PLATFORM_FIELDS = ["nodes", "cpus_per_node", "backends_per_node"]

In [None]:
def smart_mean(row):
    """Annotate one experiment record with timing statistics and a mode label.

    Parameters
    ----------
    row : pandas.Series
        A single record. It must provide the entries ``iteration_time``
        (a sequence of per-iteration timings, or a missing value),
        ``dataclay`` and ``use_split`` (both evaluated for truthiness).

    Returns
    -------
    pandas.Series
        A copy of ``row`` that always carries three additional entries:

        * ``mean_time`` -- ``np.mean`` of the timings after the first one.
        * ``std_time``  -- ``np.std`` (numpy default, i.e. population
          standard deviation) of the same timings.
        * ``mode``      -- ``"COMPSs"``, ``"dC+split"`` or ``"dC"``.

        ``mean_time`` and ``std_time`` are NaN when fewer than two timings
        are available, so every returned row has the same set of entries.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to DataFrame.apply.
    row = row.copy()

    # The first measurement is always discarded (presumably a warm-up
    # iteration -- TODO confirm). Missing values (None / NaN) are not
    # iterable and are treated as "no measurements".
    try:
        samples = list(row["iteration_time"])[1:]
    except TypeError:
        samples = []

    if samples:
        row["mean_time"] = np.mean(samples)
        row["std_time"] = np.std(samples)
    else:
        # Keep the schema stable so later aggregations can rely on the columns.
        row["mean_time"] = np.nan
        row["std_time"] = np.nan

    if not row['dataclay']:
        row['mode'] = "COMPSs"
    elif row['use_split']:
        row['mode'] = "dC+split"
    else:
        row['mode'] = "dC"

    return row

# Open the tad4bj storage named "kmeans-split".
db = DataStorage("kmeans-split")

# Export it as a dataframe, annotate every row through smart_mean, and keep
# only the rows whose start_ts compares greater than '2021-02-09'.
df = (
    db.to_dataframe()
    .apply(smart_mean, axis=1)
    .query("start_ts > '2021-02-09'")
)

In [None]:
# Average the per-run statistics over all runs sharing the same experiment
# size, execution mode and platform.
byproduct = (
    df.groupby(EXPERIMENT_SIZE_FIELDS + ['mode'] + PLATFORM_FIELDS)
    .agg({"mean_time": ["mean"], "std_time": ["mean"]})
)

# The "condensed" dataframe: the relevant data with the group keys turned
# back into ordinary columns.
cdf = byproduct.reset_index()

# The aggregation yields two-level column labels; keep only the first level.
cdf.columns = list(cdf.columns.get_level_values(0))
cdf

In [None]:
# Bar chart of mean_time against number_of_fragments, one bar colour per mode.
sns.barplot(
    x="number_of_fragments",
    y="mean_time",
    hue="mode",
    data=cdf,
)