In [5]:
import json
import pandas as pd

from glob import glob
from collections import defaultdict

In [6]:
# Remove pandas' display truncation so rendered frames are shown in full.
pd.options.display.max_rows = None  # no cap on the number of rendered rows
pd.options.display.max_columns = None  # no cap on the number of rendered columns

In [7]:
# Read the recorded metrics CSV into `df`. The bare name on the last line
# makes the notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer="./output/bloom-560m/np2-bs8/metrics.csv")
df

Unnamed: 0,time,cpu_util,cpu_mem,gpu_0_util,gpu_1_util,gpu_0_mem,gpu_1_mem,gpu_0_power,gpu_1_power,gpu_0_temp,gpu_1_temp,disk_read,disk_write
0,2023-10-20 14:57:23.567798,6.059375,18814763008,0,0,0,0,9.556,9.166,31,27,0.0,235110.4
1,2023-10-20 14:57:28.572094,6.265625,18814763008,0,0,0,0,9.556,9.166,31,27,0.0,235110.4
2,2023-10-20 14:57:33.577217,7.4125,21593022464,0,0,0,0,9.556,9.166,31,27,0.0,5719654.4
3,2023-10-20 14:57:38.734310,6.01875,23755390976,0,0,0,0,9.556,9.166,31,27,87654.4,344883.2
4,2023-10-20 14:57:43.744203,6.01875,23755390976,0,0,0,0,9.556,9.166,31,27,157286.4,472678.4
5,2023-10-20 14:57:48.748600,6.375,23941623808,0,0,0,0,9.556,9.166,31,27,157286.4,472678.4
6,2023-10-20 14:57:53.753641,5.846875,23979106304,0,0,0,0,9.361,9.747,30,27,0.0,406323.2
7,2023-10-20 14:57:58.758627,6.234375,23980040192,0,0,0,0,9.361,9.747,30,27,0.0,367820.8
8,2023-10-20 14:58:03.763678,6.73125,23938371584,0,0,0,0,9.361,9.747,30,27,0.0,266240.0
9,2023-10-20 14:58:08.885687,,23933779968,0,0,0,0,9.361,9.747,30,27,,


In [8]:
# Print min / max / mean for every numeric metric column of `df`.
# "time" is always excluded (it holds the sample timestamp, not a metric), and
# select_dtypes() filters out any other non-numeric column, which would
# otherwise make the ":.2f" format spec raise.
numeric_metrics = df.drop(columns="time", errors="ignore").select_dtypes(include="number")

for col in numeric_metrics.columns:
    series = numeric_metrics[col]
    print(f"[{col}] min: {series.min():.2f}, max: {series.max():.2f}, avg: {series.mean():.2f}")

[cpu_util] min: 5.12, max: 7.41, avg: 6.21
[cpu_mem] min: 18814763008.00, max: 23980040192.00, avg: 22509491293.09
[gpu_0_util] min: 0.00, max: 0.00, avg: 0.00
[gpu_1_util] min: 0.00, max: 0.00, avg: 0.00
[gpu_0_mem] min: 0.00, max: 0.00, avg: 0.00
[gpu_1_mem] min: 0.00, max: 0.00, avg: 0.00
[gpu_0_power] min: 9.36, max: 9.56, avg: 9.47
[gpu_1_power] min: 9.17, max: 9.75, avg: 9.43
[gpu_0_temp] min: 30.00, max: 31.00, avg: 30.55
[gpu_1_temp] min: 27.00, max: 27.00, avg: 27.00
[disk_read] min: 0.00, max: 157286.40, avg: 40222.72
[disk_write] min: 235110.40, max: 168996044.80, avg: 17751654.40


In [2]:
def summarize_trace_events(json_dict):
    """Sum event durations per (category, name) for one loaded trace.

    Parameters
    ----------
    json_dict : dict
        Parsed trace JSON; its "traceEvents" entry is iterated as a list of
        event dicts.

    Returns
    -------
    pandas.DataFrame
        One row per (cat, name) pair with columns "cat", "name", "dur".
        The columns are always present, even when no event carries a
        duration, so later cells can index ``["name"]`` safely.

    Raises
    ------
    KeyError
        If "traceEvents" is missing, or an event with a non-zero duration
        lacks "cat" or "name".
    """
    totals = defaultdict(lambda: defaultdict(int))

    for event in json_dict["traceEvents"]:
        # Events without a "dur" key, or with a zero/empty one, are skipped.
        dur = event.get("dur")
        if not dur:
            continue
        totals[event["cat"]][event["name"]] += dur

    rows = [
        {"cat": cat, "name": name, "dur": dur}
        for cat, by_name in totals.items()
        for name, dur in by_name.items()
    ]
    # Explicit columns keep the schema stable for an empty `rows` list.
    return pd.DataFrame(rows, columns=["cat", "name", "dur"])


traces = glob("./logs/bigscience/bloom-1b1/**/*.json", recursive=True)
total_durs = {}

for trace in traces:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(trace, encoding="utf-8") as f:
        json_dict = json.load(f)

    # Keyed by the name of the directory that contains the trace file.
    # NOTE(review): several traces in the same directory overwrite each other
    # here (last one wins) — confirm there is exactly one trace per directory.
    total_durs[trace.split("/")[-2]] = summarize_trace_events(json_dict)

In [3]:
def collapse_matching_ops(frame, pattern, alt_name):
    """Replace every row whose name contains `pattern` by one summed row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with "cat", "name" and "dur" columns. It is not modified.
    pattern : str
        Literal substring (not a regular expression) searched in "name".
    alt_name : str
        Name given to the aggregated row.

    Returns
    -------
    pandas.DataFrame
        The non-matching rows in their original order, followed by a single
        ``{"cat": "cpu_op", "name": alt_name}`` row holding the summed "dur"
        of the matching rows (0 when nothing matched), with a fresh 0..n-1
        index.
    """
    names = frame["name"]
    # A row already named `alt_name` is folded in as well, so running this
    # cell twice does not append a second, zero-duration aggregate row.
    matched = names.str.contains(pattern, regex=False, na=False) | names.eq(alt_name)
    merged_row = pd.DataFrame(
        [{"cat": "cpu_op", "name": alt_name, "dur": frame.loc[matched, "dur"].sum()}]
    )
    return pd.concat([frame.loc[~matched], merged_row], ignore_index=True)


# Order matters: a name matching several targets is absorbed by the first one.
targets = {"aten::": "ManipulateTensor", "autograd::": "CalculateGradient", "Backward": "Backward"}

# Iterate over a snapshot of the keys because the values are replaced below.
for key in list(total_durs):
    print(key)
    for target, alt_name in targets.items():
        len_ori = len(total_durs[key])
        total_durs[key] = collapse_matching_ops(total_durs[key], target, alt_name)
        print(f"{alt_name} (length: {len_ori} --> {len(total_durs[key])})")
    print()

np8-bs32
ManipulateTensor (length: 226 --> 143)
CalculateGradient (length: 143 --> 120)
Backward (length: 120 --> 97)

np1-bs32
ManipulateTensor (length: 216 --> 133)
CalculateGradient (length: 133 --> 110)
Backward (length: 110 --> 87)



In [9]:
pd.set_option("display.max_rows", None)  # Display all rows
pd.set_option("display.max_columns", None)  # Display all columns
pd.set_option("display.max_colwidth", 60)

# Map each summary key to the directory of the trace it was built from.
# The key expression mirrors the one used when `total_durs` is filled
# (name of the trace file's parent directory), so the lookup stays correct
# even when several traces share a directory; pairing `traces` with
# `total_durs` by position would then write to the wrong directory.
trace_dirs = {trace.split("/")[-2]: trace.rsplit("/", 1)[0] for trace in traces}

for key, val in total_durs.items():
    # Sort by category, then by descending duration, on a copy so that the
    # frames stored in `total_durs` are left untouched.
    val = val.sort_values(by=["cat", "dur"], ascending=[True, False]).reset_index(drop=True)
    val.to_csv(f"{trace_dirs[key]}/trace.csv")
    display(val)

Unnamed: 0,cat,name,dur
0,Trace,PyTorch Profiler (0),3502589
1,cpu_op,ManipulateTensor,5108633
2,cpu_op,CalculateGradient,3396507
3,cpu_op,Backward,2518651
4,cpu_op,c10d::allreduce_,11061
5,cpu_op,record_param_comms,10246
6,cpu_op,GeLUFunction,8231
7,cpu_op,torch.distributed.ddp.reducer::copy_bucket_to_grad,5956
8,cpu_op,detach_,4
9,cuda_runtime,cudaLaunchKernel,2283745


Unnamed: 0,cat,name,dur
0,Trace,PyTorch Profiler (0),3443483
1,cpu_op,ManipulateTensor,4957827
2,cpu_op,CalculateGradient,3717495
3,cpu_op,Backward,2636824
4,cpu_op,GeLUFunction,8134
5,cpu_op,detach_,5
6,cuda_runtime,cudaLaunchKernel,2251305
7,cuda_runtime,cudaStreamSynchronize,772730
8,cuda_runtime,cudaMemcpyAsync,204751
9,cuda_runtime,cudaMemsetAsync,51274
