In [13]:
import re
import pandas as pd
import plotly.express as px

from glob import glob

In [14]:
# Compiled patterns used when parsing the benchmark logs:
#   pt_time - "Done epoch <d>: Time <float>" line; group 1 is the single-digit
#             epoch number, group 2 the decimal elapsed time.
#   pt_dali - "da<digits>" tag; group 1 is the numeric DALI code.
#   pt_node - "node<digits>" tag; group 1 is the node number.
pt_time, pt_dali, pt_node = (
    re.compile(pattern)
    for pattern in (
        r"Done epoch (\d): Time (\d+\.\d+)",
        r"da(\d+)",
        r"node(\d+)",
    )
)

In [15]:
logs = sorted(glob("./**/**/torch.log"))
rows = []

for log in logs:
    with open(log, "r") as f:
        lines = f.readlines()

    time_per_epoch = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    for line in lines:
        sh_time = pt_time.search(line)
        if sh_time:
            time_per_epoch[int(sh_time.group(1))] = float(sh_time.group(2))

    dali = int(pt_dali.search(log).group(1))
    if dali == 0:
        dali = "off"
    elif dali == 1:
        dali = "cpu"
    else:
        dali = "gpu"

    node_num = int(pt_node.search(log).group(1))
    if node_num == 4:
        gpus = "A100 * 4"
    if node_num == 5:
        gpus = "H100 * 4"
    elif node_num == 7:
        gpus = "L40 * 4"
    elif node_num == 8:
        gpus = "A40 * 4"
    elif node_num == 9:
        gpus = "L4 * 4"

    rows.append(
        {
            "gpus": gpus,
            "storage": log.split("-")[3],
            "dali": dali,
            "epoch_1": round(time_per_epoch[1], 2),
            "epoch_2": round(time_per_epoch[2], 2),
            "epoch_3": round(time_per_epoch[3], 2),
            "epoch_4": round(time_per_epoch[4], 2),
            "epoch_5": round(time_per_epoch[5], 2),
        }
    )

In [16]:
# Per-epoch timing columns produced by the log-parsing cell.
epoch_cols = [f"epoch_{n}" for n in range(1, 6)]

# One row per log, ordered by GPU type and then DALI mode, with a fresh index.
df = (
    pd.DataFrame(rows)
    .sort_values(by=["gpus", "dali"], ascending=[True, True])
    .reset_index(drop=True)
)

# Row-wise statistics over the five epoch timings only.
df["mean"] = df[epoch_cols].mean(axis=1)
df["standard_deviation"] = df[epoch_cols].std(axis=1)

df.to_csv("data_storage.csv", index=False)
df

Unnamed: 0,gpus,storage,dali,epoch_1,epoch_2,epoch_3,epoch_4,epoch_5,mean,standard_deviation
0,A100 * 4,local,gpu,319.29,308.18,307.15,307.9,309.27,310.358,5.050769
1,A100 * 4,local,off,298.01,263.66,262.43,263.67,262.39,270.032,15.652767
2,A40 * 4,local,cpu,451.48,446.35,446.03,445.94,444.95,446.95,2.585894
3,A40 * 4,ontap,cpu,447.8,442.66,442.52,443.67,442.89,443.908,2.220624
4,A40 * 4,local,gpu,513.77,505.92,504.56,504.97,507.28,507.3,3.765043
5,A40 * 4,ontap,gpu,519.76,511.65,512.18,511.88,510.79,513.252,3.674693
6,A40 * 4,local,off,454.14,448.35,447.32,447.01,447.36,448.836,3.007479
7,A40 * 4,ontap,off,465.26,462.08,459.14,456.01,457.61,460.02,3.686591
8,H100 * 4,ontap,cpu,337.24,328.43,326.03,328.4,326.68,329.356,4.53189
9,H100 * 4,local,gpu,192.64,186.58,186.06,186.14,185.37,187.358,2.984346


In [17]:
# Runs that read from local storage with DALI switched off.
on_local_storage = df["storage"].eq("local")
dali_is_off = df["dali"].eq("off")

df_1 = df.loc[on_local_storage & dali_is_off]
df_1.to_csv("data_local_dali_off.csv", index=False)
df_1

Unnamed: 0,gpus,storage,dali,epoch_1,epoch_2,epoch_3,epoch_4,epoch_5,mean,standard_deviation
1,A100 * 4,local,off,298.01,263.66,262.43,263.67,262.39,270.032,15.652767
6,A40 * 4,local,off,454.14,448.35,447.32,447.01,447.36,448.836,3.007479
11,H100 * 4,local,off,257.29,229.0,228.54,227.1,225.45,233.476,13.384462
17,L4 * 4,local,off,888.14,877.61,876.97,876.49,876.19,879.08,5.093054
23,L40 * 4,local,off,390.43,424.9,425.92,422.96,424.44,417.73,15.298301


In [18]:
# Runs that read from local storage with DALI in "gpu" mode.
df_2 = df.query('storage == "local" and dali == "gpu"')
df_2.to_csv("data_local_dali_gpu.csv", index=False)
df_2

Unnamed: 0,gpus,storage,dali,epoch_1,epoch_2,epoch_3,epoch_4,epoch_5,mean,standard_deviation
0,A100 * 4,local,gpu,319.29,308.18,307.15,307.9,309.27,310.358,5.050769
4,A40 * 4,local,gpu,513.77,505.92,504.56,504.97,507.28,507.3,3.765043
9,H100 * 4,local,gpu,192.64,186.58,186.06,186.14,185.37,187.358,2.984346
15,L4 * 4,local,gpu,946.76,935.82,934.41,933.73,933.33,936.81,5.642238
21,L40 * 4,local,gpu,403.38,394.99,394.98,394.71,395.1,396.632,3.774979


In [None]:
# Mean epoch time per GPU configuration, one line per DALI mode, with the
# per-run standard deviation as error bars.
# The summary frame has no "number_of_nodes" column (its columns are gpus,
# storage, dali, epoch_1..epoch_5, mean, standard_deviation), so the x axis
# uses "gpus". The frame holds both storage backends, so line_dash keeps the
# "local" and "ontap" runs as separate series instead of joining them.
fig = px.line(
    df,
    x="gpus",
    y="mean",
    color="dali",
    line_dash="storage",
    error_y="standard_deviation",
    markers=True,
    labels={"mean": "elapsed_time_per_epoch"},
)
fig.show()