In [2]:
from IPython.display import display

import pandas as pd

## Read data

In [6]:
df = pd.read_csv("profiler_output.csv", skiprows=3)
print(df.iloc[0])
df = df.drop(0)
df = df.astype({'Start': float, 'Duration': float})
df

Start                      s
Duration                  ms
Grid X                   NaN
Grid Y                   NaN
Grid Z                   NaN
Block X                  NaN
Block Y                  NaN
Block Z                  NaN
Registers Per Thread     NaN
Static SMem               KB
Dynamic SMem              KB
Size                      MB
Throughput              GB/s
SrcMemType               NaN
DstMemType               NaN
Device                   NaN
Context                  NaN
Stream                   NaN
Src Dev                  NaN
Src Ctx                  NaN
Dst Dev                  NaN
Dst Ctx                  NaN
Name                     NaN
Correlation_ID           NaN
Name: 0, dtype: object


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Start,Duration,Grid X,Grid Y,Grid Z,Block X,Block Y,Block Z,Registers Per Thread,Static SMem,...,DstMemType,Device,Context,Stream,Src Dev,Src Ctx,Dst Dev,Dst Ctx,Name,Correlation_ID
1,9.245914,41.306943,,,,,,,,,...,Device,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],240.0
2,9.298157,0.311934,,,,,,,,,...,Device,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],253.0
3,9.299247,0.002944,,,,,,,,,...,Device,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],266.0
4,9.299432,0.002464,,,,,,,,,...,Device,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],278.0
5,9.299562,0.002464,,,,,,,,,...,Device,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],290.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75970,43.151437,0.005376,2304.0,1.0,1.0,64.0,1.0,1.0,16.0,0.0,...,,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784005.0
75971,43.151548,0.001184,3.0,1.0,1.0,64.0,1.0,1.0,16.0,0.0,...,,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784012.0
75972,43.151675,0.001184,6.0,1.0,1.0,64.0,1.0,1.0,16.0,0.0,...,,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784019.0
75973,43.151780,0.001216,1.0,1.0,1.0,64.0,1.0,1.0,16.0,0.0,...,,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784026.0


## GPU(s)

In [7]:
df["Device"].unique().tolist()

['Tesla V100-SXM2-32GB-LS (0)', 'Tesla V100-SXM2-32GB-LS (1)']

## Elapsed Time

In [8]:
df["Start"].max() - df["Start"].min()

33.906941

## CUDA Interfaces

In [4]:
called_cuda_interfaces = df["Name"].unique().tolist()
called_cuda_interfaces

['[CUDA memcpy HtoD]',
 '[CUDA memcpy PtoP]',
 '[CUDA memset]',
 'ncclBroadcastRingLLKernel_copy_i8(ncclColl)',
 'void at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_ii_cedd8df2::CatArrayBatchedCopy<float, unsigned int, int=1>(float*, at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_ii_cedd8df2::CatArrInputTensorMetadata<at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_ii_cedd8df2::CatArrayBatchedCopy<float, unsigned int, int=1>, unsigned int, int=128>, at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_ii_cedd8df2::OutputTensorSizeStride<at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_ii_cedd8df2::CatArrInputTensorMetadata, unsigned int=4>, int, at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_ii_cedd8df2::CatArrInputTensorMetadata)',
 'void at::native::_GLOBAL__N__52_tmpxft_0000330e_00000000_12_Shape_compute_80_cpp1_i

In [5]:
def get_ins_group(ops: str) -> str:
    if "gemm" in ops:
        return "matrix-mul"
    elif "CUDA memcpy" in ops or "nccl" in ops.lower() or "copy_device_to_device" in ops.lower()\
        or "CUDA memset" in ops:
        return "memory_mgmt"
    elif "::native" in ops or "vectorized_elementwise" in ops or "_cpp1_ii" in ops or "reduce_kernel" in ops:
        return "custom_ops"
    return "other"

In [6]:
df["ops_group"] = df.apply(lambda x: get_ins_group(x.Name), axis=1)
df

Unnamed: 0,Start,Duration,Grid X,Grid Y,Grid Z,Block X,Block Y,Block Z,Registers Per Thread,Static SMem,...,Device,Context,Stream,Src Dev,Src Ctx,Dst Dev,Dst Ctx,Name,Correlation_ID,ops_group
1,9.245914,41.306943,,,,,,,,,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],240.0,memory_mgmt
2,9.298157,0.311934,,,,,,,,,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],253.0,memory_mgmt
3,9.299247,0.002944,,,,,,,,,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],266.0,memory_mgmt
4,9.299432,0.002464,,,,,,,,,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],278.0,memory_mgmt
5,9.299562,0.002464,,,,,,,,,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,[CUDA memcpy HtoD],290.0,memory_mgmt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75970,43.1514,0.005376,2304.0,1.0,1.0,64.0,1.0,1.0,16.0,0,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784005.0,custom_ops
75971,43.1515,0.001184,3.0,1.0,1.0,64.0,1.0,1.0,16.0,0,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784012.0,custom_ops
75972,43.1517,0.001184,6.0,1.0,1.0,64.0,1.0,1.0,16.0,0,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784019.0,custom_ops
75973,43.1518,0.001216,1.0,1.0,1.0,64.0,1.0,1.0,16.0,0,...,Tesla V100-SXM2-32GB-LS (0),1.0,7.0,,,,,void at::native::vectorized_elementwise_kernel...,784026.0,custom_ops


### Memory management

In [7]:
memory_interfaces = list(filter(lambda x: 'CUDA mem' in x, called_cuda_interfaces))
memory_interfaces

['[CUDA memcpy HtoD]',
 '[CUDA memcpy PtoP]',
 '[CUDA memset]',
 '[CUDA memcpy DtoD]',
 '[CUDA memcpy DtoH]']

In [8]:
memory_interfaces.append('ncclBroadcastRingLLKernel_copy_i8(ncclColl)')

## Analyze the utilization

### Memory mgnts

In [9]:
for mi in memory_interfaces:
    print(mi)
    display(df.query("Name == '%s'"%mi).groupby('Device')["Duration"].sum().reset_index())

[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),164.017733
1,Tesla V100-SXM2-32GB-LS (1),0.019712


[CUDA memcpy PtoP]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),0.419357
1,Tesla V100-SXM2-32GB-LS (1),0.089632


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),4.5838
1,Tesla V100-SXM2-32GB-LS (1),2.428372


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),2.395567
1,Tesla V100-SXM2-32GB-LS (1),2.340048


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),0.299069
1,Tesla V100-SXM2-32GB-LS (1),0.184127


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),315.976963
1,Tesla V100-SXM2-32GB-LS (1),336.701188


In [10]:
df[df["Name"].isin(memory_interfaces)].groupby('Device')["Duration"].sum().reset_index()

Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),487.692489
1,Tesla V100-SXM2-32GB-LS (1),341.763079


### Runtime aggregation

In [11]:
df.groupby(['Device', 'Name'])["Duration"].sum().nlargest(10).reset_index()

Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x32_sliced1x4_nt,3061.359715
1,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x32_sliced1x4_nt,2868.311943
2,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x32_tn,1633.642295
3,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x128_tn,1573.485068
4,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x32_tn,1550.941053
5,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_tn,1530.902965
6,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x32_nn,1504.619838
7,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x128_tn,1492.82036
8,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x128_nn,1489.751305
9,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_nn,1462.603523


In [12]:
df.groupby(['Device', 'ops_group'])["Duration"].sum().reset_index()

Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,3734.905832
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,14763.939734
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,1100.304236
3,Tesla V100-SXM2-32GB-LS (0),other,3.451556
4,Tesla V100-SXM2-32GB-LS (1),custom_ops,3440.281922
5,Tesla V100-SXM2-32GB-LS (1),matrix-mul,15651.409602
6,Tesla V100-SXM2-32GB-LS (1),memory_mgmt,971.041406
7,Tesla V100-SXM2-32GB-LS (1),other,3.661411


### For all devices

In [13]:
agg_result = df.groupby('ops_group')["Duration"].sum().reset_index()
agg_result

Unnamed: 0,ops_group,Duration
0,custom_ops,7175.187754
1,matrix-mul,30415.349336
2,memory_mgmt,2071.345642
3,other,7.112967


In [14]:
agg_result["Percent"] = agg_result[["Duration",]].apply(lambda x: 100*x/x.sum())
agg_result

Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,7175.187754,18.087647
1,matrix-mul,30415.349336,76.672849
2,memory_mgmt,2071.345642,5.221573
3,other,7.112967,0.017931
