In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# pandas display options. Only the column limit is lifted; the remaining
# switches are kept commented out so they can be toggled while exploring.
# pd.options.display.max_rows = None
pd.options.display.max_columns = None
# pd.options.display.width = None
# pd.options.display.max_colwidth = None

# Default figure size, applied through seaborn's rc settings.
sns.set(rc={"figure.figsize": (10, 5)})

from tad4bj import DataStorage

# Columns describing the problem size of an experiment.
EXPERIMENT_SIZE_FIELDS = ["number_of_fragments", "points_per_fragment"]

# Columns describing the platform / runtime configuration of an experiment.
PLATFORM_FIELDS = [
    "nodes", "cpus_per_node", "backends_per_node", "dataclay", "use_split",
]

STD_VALUE_THRESHOLD = 2
# Bandwidth passed as `bw` to the violin plots.
VIOLIN_BW = 0.1

# Aggregation used by the bar plots. Alternative kept for reference:
#   from functools import partial
#   ESTIMATOR_TO_USE = partial(np.percentile, q=25)
ESTIMATOR_TO_USE = np.mean

# Execution ids excluded by hand.
# Many of them belong to the executions launched on 15th June at 7am (the 6am
# ones were still fine). Temperature issues? General jitter? A sysadmin
# updating the system? We may never know.
HANDPICKED_OUTLIERS = [
    23429588, 23424157, 23424125, 23505880, 23505871, 23505863,
    23429586, 23505895, 23505868, 23505904, 23505912, 23505886,
    23505860, 23505887, 23505892, 23505915, 23505913, 23505864,
    23503326, 23503411, 23429587, 23505920, 23505865,
]

def plot_things(data, estimator=ESTIMATOR_TO_USE):
    """Plot iteration times per node count, grouped by execution mode.

    Two figures are shown:

    1. A bar plot over every row of ``data``, titled "All".
    2. A two-panel figure restricted to the rows with ``dataclay == 1``: a
       bar plot on top and a split violin plot (bandwidth ``VIOLIN_BW``)
       underneath.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``nodes``, ``mode``, ``iteration_time`` and
        ``dataclay``.
    estimator : callable, optional
        Aggregation function forwarded to ``sns.barplot``.
    """
    axes_fields = dict(x="nodes", hue="mode", y="iteration_time")
    legend_outside = dict(loc='center left', bbox_to_anchor=(1.0, 0.5))

    # Figure 1: every execution mode together.
    overview_ax = sns.barplot(data=data, estimator=estimator, **axes_fields)
    overview_ax.legend(**legend_outside)
    overview_ax.set_title("All")
    plt.show()

    # Disabled: the equivalent two-panel figure for the non-dataClay rows.
#     fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12,10))
    
#     sns.barplot(data=data.query("dataclay == 0"), x="nodes", hue="mode", y="iteration_time", 
#                 palette="Set2", estimator=estimator, ax=ax1)

#     sns.violinplot(data=data.query("dataclay == 0"), x="nodes", hue="mode", y="iteration_time", 
#                    scale='width',
#                    palette="Set2", split=True, ax=ax2, inner="quartile", bw=VIOLIN_BW)

#     ax1.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
#     ax2.get_legend().remove()
#     plt.suptitle("COMPSs executions")
#     plt.show()

    # Figure 2: dataClay rows only, bars on top and violins below.
    fig, (bar_ax, violin_ax) = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))
    dataclay_runs = data.query("dataclay == 1")

    sns.barplot(data=dataclay_runs, estimator=estimator, ax=bar_ax,
                palette="Set2", **axes_fields)
    sns.violinplot(data=dataclay_runs, scale='width', split=True, ax=violin_ax,
                   inner="quartile", bw=VIOLIN_BW, palette="Set2", **axes_fields)

    # A single legend (next to the bar plot) serves both panels.
    bar_ax.legend(**legend_outside)
    violin_ax.get_legend().remove()
    fig.suptitle("dataClay executions")
    plt.show()

In [None]:
def smart_mean(row):
    """Annotate one execution row with its per-node load and a ``mode`` label.

    Despite the name, no mean is computed here. The function is meant to be
    used with ``DataFrame.apply(smart_mean, axis=1)`` and returns a copy of
    ``row`` with:

    * ``fragments_per_node``: ``number_of_fragments / (nodes - 1)``.
    * ``mode``: ``"COMPSs"`` (plus ``" (tracing)"`` when ``tracing == 1.0``)
      for non-dataClay rows, otherwise ``"dC+split"`` or ``"dC"`` depending
      on ``use_split``; followed by a
      ``" [<scheduler class name>/<working dir>]"`` suffix.
    * ``iteration_time``: replaced by NaN when the scheduler or the working
      directory is missing (empty, ``None`` or NaN); those rows get no suffix.

    The input row is left untouched, because ``DataFrame.apply`` does not
    support functions that mutate the object they are handed.
    """
    row = row.copy()
    row["fragments_per_node"] = row["number_of_fragments"] / (row["nodes"] - 1)

    if not row["dataclay"]:
        mode = "COMPSs"
        if row["tracing"] == 1.0:
            mode += " (tracing)"
    elif row["use_split"]:
        mode = "dC+split"
    else:
        mode = "dC"
    row["mode"] = mode

    scheduler = row["compss_scheduler"]
    working_dir = row["compss_working_dir"]

    # NaN is truthy, so a plain `not value` check would let it through and
    # the `.rsplit` below would then fail on a float.
    if any(pd.isna(value) or not value for value in (scheduler, working_dir)):
        row["iteration_time"] = np.nan
        return row

    # Only the class name of the fully qualified scheduler is shown.
    scheduler_name = scheduler.rsplit(".", 1)[-1]
    row["mode"] = f"{mode} [{scheduler_name}/{working_dir}]"
    return row

# Keep only the executions whose configuration "makes sense":
#   - dataClay runs: FIFODataLocationScheduler together with local_disk.
#   - non-dataClay runs: both schedulers make sense, but the location-aware
#     one should be paired with local_disk.
query_compss_gpfs = (
    "((dataclay == 0) "
    "and (compss_scheduler == 'es.bsc.compss.scheduler.fifodatanew.FIFODataScheduler') "
    "and (compss_working_dir == 'gpfs'))"
)
# NOTE(review): defined but not used by the filter below (in the code visible here).
query_compss_ld = (
    "((dataclay == 0) "
    "and (compss_scheduler == 'es.bsc.compss.scheduler.fifodatalocation.FIFODataLocationScheduler') "
    "and (compss_working_dir == 'local_disk'))"
)
query_dataclay = (
    "((dataclay == 1) "
    "and (compss_scheduler == 'es.bsc.compss.scheduler.fifodatalocation.FIFODataLocationScheduler') "
    "and (compss_working_dir == 'local_disk'))"
)

db = DataStorage("kmeans-split")

# Recent, non-traced executions only.
df = db.to_dataframe().query("start_ts > '2022-06-10'").query("tracing == 0")

# Discard the executions that were flagged by hand.
df = df[~df.id.isin(HANDPICKED_OUTLIERS)]

# Remove old split experiments which had a bogus _multiplicity_ and
# were inefficient.
# NOTE(review): after the `start_ts > '2022-06-10'` filter above, no row can
# satisfy `start_ts < '2022-04-14'`, so this selection looks empty — confirm.
to_drop = df.query("(start_ts < '2022-04-14') and (use_split == 1)").index
df = df.drop(to_drop)

# Keep the COMPSs/gpfs and the dataClay configurations, then label every row.
df = df.query(f"{query_compss_gpfs} or {query_dataclay}")
df = df.apply(smart_mean, axis=1)
#df = db.to_dataframe().apply(smart_mean, axis=1).query("(start_ts > '2021-02-18 20') and (start_ts < '2021-02-20')")

In [None]:
# One row per entry of `iteration_time`, ordered by mode, with the exploded
# column cast to float.
edf = (
    df.explode("iteration_time")
    .sort_values("mode")
    .assign(iteration_time=lambda frame: frame["iteration_time"].astype(float))
)

# Weak scaling (small blocks)

- 2304 fragments **per node**
- 64000 points per fragment

This shows the behaviour of split in a typical scalability environment. The quantity of objects is high and increases with the number of nodes. The benefits of the split should be more apparent as the number of nodes / work increases.

In [None]:
# Weak scaling, small blocks: 2304 fragments per (nodes - 1) and 64000 points
# per fragment.
data = edf.query(
    "((number_of_fragments / (nodes - 1)) == 2304) "
    "and (points_per_fragment == 64000)"
)
plot_things(data)

# Weak scaling (big blocks)

- 48 fragments **per node**
- Blocks are big: 3072000 points per fragment (48 × 64000)

This experiment has the same size as the previous one. 

This is a bad scenario for the split; data is perfectly balanced, so there is no real benefit of doing a split. Because the job load is high, the overhead may not be extremely big.

In [None]:
# Weak scaling, big blocks: 48 fragments per (nodes - 1), each one holding
# 64000 * 48 points.
data = edf.query(
    "((number_of_fragments / (nodes - 1)) == 48) "
    "and ((points_per_fragment) == (64000 * 48))"
)
plot_things(data)

## Blocksize sweep

With 8 worker nodes, we analyze multiple block sizes, from 64000 points per block (2304 blocks per node) up to 48 blocks per node. The two extremes correspond to the weak-scaling scenarios shown in the previous experiments.

In [None]:
# Blocksize sweep: executions on 9 nodes whose total number of points
# (number_of_fragments * points_per_fragment) is 1179648000, so that only the
# block size changes between them.
data = edf.query(
    "(nodes == 9) "
    "and ((number_of_fragments * points_per_fragment) == 1179648000)"
)

def eval_granularity_index(row):
    """Return a copy of ``row`` with a ``granularity_index`` entry added.

    The index is ``number_of_fragments // (48 * 8)``.
    NOTE(review): 48 * 8 presumably stands for 48 fragments per node times the
    8 worker nodes of the sweep — confirm.

    The input row is not modified: ``DataFrame.apply`` does not support
    functions that mutate the object they are handed.
    """
    row = row.copy()
    row["granularity_index"] = row["number_of_fragments"] // (48 * 8)
    return row

# Row-wise application: every execution gets its `granularity_index`.
data = data.apply(eval_granularity_index, axis="columns")

In [None]:
# Overview: every mode, one group of bars per granularity index.
sns.barplot(data=data, x="granularity_index", hue="mode", y="iteration_time", 
            estimator=ESTIMATOR_TO_USE)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.title("All")
# TODO: Change granularity_index to: fragments per core [or something like that]
plt.show()

# Distribution of the iteration time, one figure per granularity index.
# (`figsize` is not a violinplot parameter, so the former `figsize=123`
# argument has been dropped.)
for i in [1, 4, 16, 48]:
    ax = sns.violinplot(data=data.query("granularity_index == %d" % i),
                        x="granularity_index", hue="mode", y="iteration_time", 
                        scale='width', bw=VIOLIN_BW,
                        inner="quartile")
    ax.get_legend().remove()
    plt.show()

# Same bar plot as the overview, restricted to the dataClay executions.
sns.barplot(data=data.query("dataclay == 1"), x="granularity_index", hue="mode", y="iteration_time", 
            estimator=ESTIMATOR_TO_USE)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.title("dataClay executions")
plt.show()


## Split overhead

**Work in progress**

In [None]:
# Some low-number-of-fragments split overheads are off-the-charts
# and give distorted, outlier-full information, so they are filtered out.
data = data.query("split_time < 100")

# Only the rows with dataclay == 1 are plotted.
split_overhead = data.query("dataclay == 1")

# No `hue` is used here, so there are no labelled artists and no legend to draw.
sns.barplot(data=split_overhead, x="number_of_fragments", y="split_time")
plt.show()

sns.violinplot(data=split_overhead, x="number_of_fragments", y="split_time", 
               scale='width',
               inner="quartile", bw=VIOLIN_BW)
plt.show()
