In [1]:
from typing import Dict, Iterable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# !pip install matplotlib

## Configs

In [3]:
# GPU counts to analyze; this tuple is the default argument of analyze(),
# which reads one profiler CSV per entry.
nGPUs = (1,2,4)

## Read data

In [4]:
def read_csv(f_name: str) -> pd.DataFrame:
    """Load a profiler CSV export with numeric ``Start``/``Duration`` columns.

    The first three lines of the file are skipped, the following line is used
    as the column header, and the first row after the header (index label 0)
    is dropped before ``Start`` and ``Duration`` are cast to ``float``.
    NOTE(review): that dropped row presumably carries the units of measure
    (see the old "UoM" debug print) -- confirm against the profiler output.

    If the file cannot be parsed or converted as-is, it is read a second time
    with its last line ignored.

    Args:
        f_name: Path of the CSV file to read.

    Returns:
        The parsed rows with ``Start`` and ``Duration`` as ``float``. The
        original row labels are kept, so the index starts at 1.

    Raises:
        FileNotFoundError: If ``f_name`` does not exist (raised immediately,
            without attempting the fallback read).
        ValueError: If the file still cannot be converted after the last line
            has been ignored.
    """
    def _load(**read_kwargs) -> pd.DataFrame:
        # Shared parsing steps for both the plain and the fallback attempt.
        raw = pd.read_csv(f_name, skiprows=3, **read_kwargs)
        return raw.drop(0).astype({'Start': float, 'Duration': float})

    try:
        return _load()
    except (ValueError, TypeError):
        # Parsing errors (pandas' ParserError is a ValueError) and failed float
        # conversions trigger the retry without the last line.
        # ``skipfooter`` is not supported by the C parser, so the python engine
        # is requested explicitly instead of relying on pandas' warning-emitting
        # automatic fallback.
        return _load(skipfooter=1, engine="python")

def get_f_name(n_gpu: int) -> str:
    """Build the profiler CSV file name for a run that used ``n_gpu`` GPUs."""
    name_template = "profiler_wangchanberta_bs16_gpu%d.csv"
    return name_template % n_gpu



## Analysis

In [5]:
def get_ins_group(ops: str) -> str:
    """Classify a profiled operation name into a coarse group.

    The checks run in priority order and the first match wins:

    1. ``"matrix-mul"``  -- the name contains ``gemm`` (case-sensitive).
    2. ``"memory_mgmt"`` -- the name contains ``CUDA memcpy`` or ``CUDA memset``
       (case-sensitive), or ``nccl`` / ``copy_device_to_device`` in any case.
    3. ``"custom_ops"``  -- the name contains ``::native``,
       ``vectorized_elementwise``, ``_cpp1_ii`` or ``reduce_kernel``.
    4. ``"other"``       -- none of the above.
    """
    if "gemm" in ops:
        return "matrix-mul"

    if "CUDA memcpy" in ops:
        return "memory_mgmt"
    lowered = ops.lower()
    if "nccl" in lowered or "copy_device_to_device" in lowered or "CUDA memset" in ops:
        return "memory_mgmt"

    custom_markers = ("::native", "vectorized_elementwise", "_cpp1_ii", "reduce_kernel")
    if any(marker in ops for marker in custom_markers):
        return "custom_ops"

    return "other"

def aggregate(df: pd.DataFrame, top_n: int = 10) -> None:
    """Print and display a timing summary of one profiler trace.

    The report consists of, in order: the span between the smallest and
    largest ``Start`` value, the distinct devices, the summed ``Duration`` per
    device for every memory-related operation (each name containing
    ``CUDA mem`` plus the NCCL broadcast kernel) and for all of them combined,
    the ``top_n`` largest (device, operation) totals, the per-device totals by
    operation group, and the overall totals by operation group with their
    percentage share.

    NOTE(review): the elapsed-time label says "ms", but the unit of ``Start``
    is not visible in this function -- confirm against the profiler export.

    Args:
        df: Trace with at least the columns ``Name``, ``Device``, ``Start``
            and ``Duration``. It is modified in place: an ``ops_group`` column
            holding the result of ``get_ins_group`` for each ``Name`` is added.
        top_n: Number of (device, operation) rows shown in the "Top N" table.

    Returns:
        None. All results are printed or rendered with ``display``.
    """
    # Element-wise map over one column instead of a row-wise DataFrame.apply:
    # same values, far less overhead, and it also works on an empty frame.
    df["ops_group"] = df["Name"].map(get_ins_group)

    memory_interfaces = [name for name in df["Name"].unique().tolist() if 'CUDA mem' in name]
    # The NCCL broadcast kernel is always reported, even when the trace does
    # not contain it (its table is then empty).
    memory_interfaces.append('ncclBroadcastRingLLKernel_copy_i8(ncclColl)')

    # Elapsed time
    print("--------- Elapsed time (ms): ", df["Start"].max() - df["Start"].min())

    # devices
    print("--------- devices ")
    display(df["Device"].unique().tolist())

    # memory mgmt
    print("--------- memory mgmt ")
    for mi in memory_interfaces:
        print(mi)
        # A boolean mask avoids building a query string, which would break on
        # operation names that contain quote characters.
        display(df[df["Name"] == mi].groupby('Device')["Duration"].sum().reset_index())
    display(df[df["Name"].isin(memory_interfaces)].groupby('Device')["Duration"].sum().reset_index())

    # Top N
    print("--------- Top N ")
    display(df.groupby(['Device', 'Name'])["Duration"].sum().nlargest(top_n).reset_index())
    display(df.groupby(['Device', 'ops_group'])["Duration"].sum().reset_index())

    # Aggregate
    print("--------- Aggregate ")
    agg_result = df.groupby('ops_group')["Duration"].sum().reset_index()
    display(agg_result)
    # Share of each operation group in the total duration, in percent.
    agg_result["Percent"] = 100 * agg_result["Duration"] / agg_result["Duration"].sum()
    display(agg_result)

def analyze(n_gpus: Iterable[int] = nGPUs) -> None:
    """Run the timing summary for the profiler trace of each GPU count.

    For every entry of ``n_gpus`` the matching CSV (named by ``get_f_name``)
    is loaded with ``read_csv`` and reported with ``aggregate``, framed by
    start/end banner lines.

    Args:
        n_gpus: GPU counts to process, in order. Defaults to the module-level
            ``nGPUs`` tuple.

    Returns:
        None. The report is printed/displayed by ``aggregate``.
    """
    for n_gpu in n_gpus:
        print("=========== Start %d GPUs ==========="%n_gpu)
        # Single source of truth for the file name: reuse get_f_name instead
        # of repeating its format string here.
        f_name = get_f_name(n_gpu)
        df = read_csv(f_name)
        aggregate(df)
        print("=========== End %d GPUs =========== \n\n\n"%n_gpu)

## Run the analysis for each GPU count (check unique GPUs)

In [6]:
# Use the GPU counts from the config cell rather than repeating the tuple here.
analyze(nGPUs)



  df = read_csv("profiler_wangchanberta_bs16_gpu%d.csv"%n_gpu)


--------- Elapsed time (ms):  2110.694652
--------- devices 


['Tesla V100-SXM2-32GB-LS (0)']

--------- memory mgmt 
[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),237.496007


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),757.402019


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),253.738735


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),1801.732152


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),3050.368913


--------- Top N 


Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_32x128_tn,135795.181913
1,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_nt,131064.973333
2,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x32_nn,129060.355232
3,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_64x32_sliced1x4_nt,123588.278827
4,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x32_sliced1x4_nt,122065.763288
5,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_tn,118811.667719
6,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_nn,118645.150663
7,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x128_tn,118569.752818
8,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x128_nn,117486.346398
9,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_64x64_tn,57767.068455


Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,335339.4
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,1263133.0
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,28986.88
3,Tesla V100-SXM2-32GB-LS (0),other,715.8498


--------- Aggregate 


Unnamed: 0,ops_group,Duration
0,custom_ops,335339.4
1,matrix-mul,1263133.0
2,memory_mgmt,28986.88
3,other,715.8498


Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,335339.4,20.596025
1,matrix-mul,1263133.0,77.579678
2,memory_mgmt,28986.88,1.78033
3,other,715.8498,0.043966







  df = read_csv("profiler_wangchanberta_bs16_gpu%d.csv"%n_gpu)
  return func(*args, **kwargs)


--------- Elapsed time (ms):  1126.125608
--------- devices 


['Tesla V100-SXM2-32GB-LS (0)', 'Tesla V100-SXM2-32GB-LS (1)']

--------- memory mgmt 
[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),233.385289
1,Tesla V100-SXM2-32GB-LS (1),0.03088


[CUDA memcpy PtoP]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),14.68876
1,Tesla V100-SXM2-32GB-LS (1),8.928476


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),385.469792
1,Tesla V100-SXM2-32GB-LS (1),240.549497


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),89.724055
1,Tesla V100-SXM2-32GB-LS (1),84.613114


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),801.296856
1,Tesla V100-SXM2-32GB-LS (1),20.10482


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),13712.522652
1,Tesla V100-SXM2-32GB-LS (1),15613.016338


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),15237.087404
1,Tesla V100-SXM2-32GB-LS (1),15967.243125


--------- Top N 


Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_nt,58427.206751
1,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_32x128_tn,58029.424337
2,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x32_nn,55980.616852
3,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_32x128_tn,55815.426552
4,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_64x32_sliced1x4_nt,55182.212475
5,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x32_sliced1x4_nt,54869.676568
6,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_nt,54861.393058
7,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x32_nn,53836.994915
8,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x128_nn,52516.189965
9,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_64x32_sliced1x4_nt,52288.061033


Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,146772.368932
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,555868.486497
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,42851.086556
3,Tesla V100-SXM2-32GB-LS (0),other,361.785871
4,Tesla V100-SXM2-32GB-LS (1),custom_ops,117979.945403
5,Tesla V100-SXM2-32GB-LS (1),matrix-mul,530388.067268
6,Tesla V100-SXM2-32GB-LS (1),memory_mgmt,40529.852734
7,Tesla V100-SXM2-32GB-LS (1),other,348.634761


--------- Aggregate 


Unnamed: 0,ops_group,Duration
0,custom_ops,264752.3
1,matrix-mul,1086257.0
2,memory_mgmt,83380.94
3,other,710.4206


Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,264752.3,18.44835
1,matrix-mul,1086257.0,75.692034
2,memory_mgmt,83380.94,5.810113
3,other,710.4206,0.049503







  df = read_csv("profiler_wangchanberta_bs16_gpu%d.csv"%n_gpu)


--------- Elapsed time (ms):  1262.04522
--------- devices 


['Tesla V100-SXM2-32GB-LS (0)',
 'Tesla V100-SXM2-32GB-LS (2)',
 'Tesla V100-SXM2-32GB-LS (3)',
 'Tesla V100-SXM2-32GB-LS (1)']

--------- memory mgmt 
[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),251.789394
1,Tesla V100-SXM2-32GB-LS (1),0.041376
2,Tesla V100-SXM2-32GB-LS (2),20.850114
3,Tesla V100-SXM2-32GB-LS (3),23.934569


[CUDA memcpy PtoP]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),18.229265
1,Tesla V100-SXM2-32GB-LS (1),10.723347


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),715.516177
1,Tesla V100-SXM2-32GB-LS (1),27.113958
2,Tesla V100-SXM2-32GB-LS (2),28.881034
3,Tesla V100-SXM2-32GB-LS (3),42.206686


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),371.85185
1,Tesla V100-SXM2-32GB-LS (1),228.911131
2,Tesla V100-SXM2-32GB-LS (2),217.912401
3,Tesla V100-SXM2-32GB-LS (3),268.138082


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),53.57811
1,Tesla V100-SXM2-32GB-LS (1),48.148365
2,Tesla V100-SXM2-32GB-LS (2),49.474574
3,Tesla V100-SXM2-32GB-LS (3),48.688193


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),119301.78898
1,Tesla V100-SXM2-32GB-LS (1),123168.228965
2,Tesla V100-SXM2-32GB-LS (2),124997.802063
3,Tesla V100-SXM2-32GB-LS (3),124178.124962


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),120712.753776
1,Tesla V100-SXM2-32GB-LS (1),123483.167142
2,Tesla V100-SXM2-32GB-LS (2),125314.920186
3,Tesla V100-SXM2-32GB-LS (3),124561.092492


--------- Top N 


Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (0),ncclReduceRingLLKernel_sum_f32(ncclColl),157060.957132
1,Tesla V100-SXM2-32GB-LS (2),ncclReduceRingLLKernel_sum_f32(ncclColl),156335.110976
2,Tesla V100-SXM2-32GB-LS (3),ncclReduceRingLLKernel_sum_f32(ncclColl),155233.676953
3,Tesla V100-SXM2-32GB-LS (1),ncclReduceRingLLKernel_sum_f32(ncclColl),152393.073526
4,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_tn,127909.728367
5,Tesla V100-SXM2-32GB-LS (2),ncclBroadcastRingLLKernel_copy_i8(ncclColl),124997.802063
6,Tesla V100-SXM2-32GB-LS (3),ncclBroadcastRingLLKernel_copy_i8(ncclColl),124178.124962
7,Tesla V100-SXM2-32GB-LS (1),ncclBroadcastRingLLKernel_copy_i8(ncclColl),123168.228965
8,Tesla V100-SXM2-32GB-LS (3),volta_sgemm_128x64_tn,122592.641869
9,Tesla V100-SXM2-32GB-LS (0),ncclBroadcastRingLLKernel_copy_i8(ncclColl),119301.78898


Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,120595.109126
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,393645.783055
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,286238.803875
3,Tesla V100-SXM2-32GB-LS (0),other,328.071178
4,Tesla V100-SXM2-32GB-LS (1),custom_ops,105301.398673
5,Tesla V100-SXM2-32GB-LS (1),matrix-mul,446267.40501
6,Tesla V100-SXM2-32GB-LS (1),memory_mgmt,285155.661257
7,Tesla V100-SXM2-32GB-LS (1),other,322.199943
8,Tesla V100-SXM2-32GB-LS (2),custom_ops,53997.735935
9,Tesla V100-SXM2-32GB-LS (2),matrix-mul,227996.47369


--------- Aggregate 


Unnamed: 0,ops_group,Duration
0,custom_ops,383738.3
1,matrix-mul,1501899.0
2,memory_mgmt,1147016.0
3,other,1270.684


Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,383738.3,12.648253
1,matrix-mul,1501899.0,49.503512
2,memory_mgmt,1147016.0,37.806352
3,other,1270.684,0.041883





