In [1]:
from IPython.display import display

from typing import Dict

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# !pip install matplotlib

## Configs

In [3]:
# GPU counts to analyse; used as the default argument of `analyze`.
nGPUs = (1, 2, 4)

## Read data

In [4]:
def read_csv(f_name: str) -> pd.DataFrame:
    """Load one profiler CSV export into a DataFrame.

    The first three lines of the file are skipped and the next line is used as
    the header. The first data row is then dropped (presumably the
    units-of-measure row, judging by the old "UoM" debug print -- TODO confirm)
    and the ``Start`` and ``Duration`` columns are cast to ``float``.

    Malformed lines (e.g. a truncated last line) are skipped with a warning.
    If that tolerant read or the float cast still fails, the file is read again
    with its last line discarded.

    Args:
        f_name: Path of the CSV file to read.

    Returns:
        The parsed rows, with ``Start`` and ``Duration`` as floats.

    Raises:
        Exception: Whatever the fallback read raises (e.g. ``FileNotFoundError``
            for a missing file, or a cast error if the data is still invalid).
    """
    def _finalize(raw: pd.DataFrame) -> pd.DataFrame:
        # Row 0 is non-numeric, so it must go before the float cast.
        return raw.drop(0).astype({'Start': float, 'Duration': float})

    try:
        try:
            # `on_bad_lines` superseded `error_bad_lines` (removed in pandas 2.0);
            # 'warn' keeps the "skip the line but report it" behaviour.
            raw = pd.read_csv(f_name, skiprows=3, engine='python', on_bad_lines='warn')
        except TypeError:
            # Older pandas releases do not know `on_bad_lines`.
            raw = pd.read_csv(f_name, skiprows=3, engine='python', error_bad_lines=False)
        return _finalize(raw)
    except Exception:
        # Best-effort fallback: discard the last line of the file and retry.
        # `skipfooter` is only supported by the python engine, so request it
        # explicitly instead of relying on pandas' engine fallback warning.
        return _finalize(pd.read_csv(f_name, skiprows=3, skipfooter=1, engine='python'))

def get_f_name(n_gpu: int) -> str:
    """Return the profiler CSV file name for a run on ``n_gpu`` GPUs."""
    template = "profiler_gpt2yelp_bs4_gpu%d.csv"
    return template % n_gpu



## Analysis

In [5]:
def get_ins_group(ops: str) -> str:
    """Map an operation name to one of four coarse groups.

    Rules are checked in order and the first match wins:

    1. contains ``gemm``                              -> ``"matrix-mul"``
    2. contains ``CUDA memcpy`` / ``CUDA memset``, or (case-insensitively)
       ``nccl`` / ``copy_device_to_device``           -> ``"memory_mgmt"``
    3. contains ``::native``, ``vectorized_elementwise``, ``_cpp1_ii`` or
       ``reduce_kernel``                              -> ``"custom_ops"``
    4. anything else                                  -> ``"other"``

    Args:
        ops: Operation name to classify.

    Returns:
        The group label.
    """
    if "gemm" in ops:
        return "matrix-mul"

    # Only the nccl / copy_device_to_device checks ignore case.
    case_insensitive_markers = ("nccl", "copy_device_to_device")
    if (
        "CUDA memcpy" in ops
        or any(marker in ops.lower() for marker in case_insensitive_markers)
        or "CUDA memset" in ops
    ):
        return "memory_mgmt"

    custom_markers = ("::native", "vectorized_elementwise", "_cpp1_ii", "reduce_kernel")
    for marker in custom_markers:
        if marker in ops:
            return "custom_ops"

    return "other"

def aggregate(df: pd.DataFrame) -> None:
    """Print and display a summary of one profiler trace.

    Reads the ``Name``, ``Start``, ``Device`` and ``Duration`` columns and
    shows, in order: the spread of ``Start`` values, the distinct devices,
    per-device ``Duration`` totals for every memory interface (individually and
    combined), the ten largest (device, name) totals, the per-device totals by
    operation group, and the overall totals by group with their percentage.

    Side effect: an ``ops_group`` column (see ``get_ins_group``) is added to
    ``df`` in place.

    Args:
        df: Profiler records; modified in place as described above.
    """
    # Classify each name directly; a row-wise `apply(axis=1)` builds a Series
    # per record and returns a DataFrame (not a column) for an empty frame.
    df["ops_group"] = df["Name"].map(get_ins_group)

    called_cuda_interfaces = df["Name"].unique().tolist()
    memory_interfaces = [name for name in called_cuda_interfaces if 'CUDA mem' in name]
    # This kernel is always reported with the memory interfaces, even when it
    # does not occur in the trace (its table is then empty).
    memory_interfaces.append('ncclBroadcastRingLLKernel_copy_i8(ncclColl)')

    # Elapsed time
    # NOTE(review): this is the spread of Start values only; the Duration of
    # the last record is not included, and the "(ms)" label assumes Start is
    # expressed in milliseconds -- confirm against the file's units row.
    print("--------- Elapsed time (ms): ", df["Start"].max() - df["Start"].min())

    # devices
    print("--------- devices ")
    display(df["Device"].unique().tolist())

    # memory mgmt
    print("--------- memory mgmt ")
    for mi in memory_interfaces:
        print(mi)
        # A boolean mask avoids building a query string, which would break on
        # names that contain a quote character.
        display(df[df["Name"] == mi].groupby('Device')["Duration"].sum().reset_index())
    display(df[df["Name"].isin(memory_interfaces)].groupby('Device')["Duration"].sum().reset_index())

    # Top N
    print("--------- Top N ")
    display(df.groupby(['Device', 'Name'])["Duration"].sum().nlargest(10).reset_index())
    display(df.groupby(['Device', 'ops_group'])["Duration"].sum().reset_index())

    # Aggregate
    print("--------- Aggregate ")
    agg_result = df.groupby('ops_group')["Duration"].sum().reset_index()
    display(agg_result)
    # Share of each group in the total duration, shown as a second table.
    agg_result["Percent"] = 100 * agg_result["Duration"] / agg_result["Duration"].sum()
    display(agg_result)

def analyze(n_gpus: tuple = nGPUs) -> None:
    """Load and summarise the profiler CSV for each GPU count.

    For every entry of ``n_gpus`` the matching file (named by ``get_f_name``)
    is read with ``read_csv`` and passed to ``aggregate``, framed by start/end
    banner lines.

    Args:
        n_gpus: GPU counts to process; any iterable of ints works. Defaults to
            the module-level ``nGPUs``.
    """
    for n_gpu in n_gpus:
        print("=========== Start %d GPUs ==========="%n_gpu)
        # Use the shared helper so the file-name pattern lives in one place.
        f_name = get_f_name(n_gpu)
        df = read_csv(f_name)
        aggregate(df)
        print("=========== End %d GPUs =========== \n\n\n"%n_gpu)

## Check Unique GPUs

In [6]:
# Rely on the default argument (the `nGPUs` config constant) instead of
# repeating the GPU counts here.
analyze()



Skipping line 67450: unexpected end of data


--------- Elapsed time (ms):  16.363323
--------- devices 


['Tesla V100-SXM2-32GB-LS (0)']

--------- memory mgmt 
[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),173.327077


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),7.658966


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),0.253756


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),0.800954


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),182.040753


--------- Top N 


Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_tn,1698.524756
1,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_nn,1410.02221
2,Tesla V100-SXM2-32GB-LS (0),void at::native::vectorized_elementwise_kernel...,748.877009
3,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_64x32_sliced1x4_nt,718.032153
4,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x128_nt,570.040032
5,Tesla V100-SXM2-32GB-LS (0),void at::native::vectorized_elementwise_kernel...,502.679115
6,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_32x128_tn,464.408549
7,Tesla V100-SXM2-32GB-LS (0),_ZN2at6native27unrolled_elementwise_kernelIZZZ...,459.705535
8,Tesla V100-SXM2-32GB-LS (0),void at::native::vectorized_elementwise_kernel...,456.498958
9,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_32x128_nn,443.851096


Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,4325.839326
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,6293.153052
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,413.665189
3,Tesla V100-SXM2-32GB-LS (0),other,7.788116


--------- Aggregate 


Unnamed: 0,ops_group,Duration
0,custom_ops,4325.839326
1,matrix-mul,6293.153052
2,memory_mgmt,413.665189
3,other,7.788116


Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,4325.839326,39.181745
1,matrix-mul,6293.153052,57.000897
2,memory_mgmt,413.665189,3.746816
3,other,7.788116,0.070542







Skipping line 447678: unexpected end of data


--------- Elapsed time (ms):  99.508235
--------- devices 


['Tesla V100-SXM2-32GB-LS (0)', 'Tesla V100-SXM2-32GB-LS (1)']

--------- memory mgmt 
[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),189.445695
1,Tesla V100-SXM2-32GB-LS (1),0.390557


[CUDA memcpy PtoP]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),2.150175
1,Tesla V100-SXM2-32GB-LS (1),0.918139


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),28.819911
1,Tesla V100-SXM2-32GB-LS (1),20.914438


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),1.812935
1,Tesla V100-SXM2-32GB-LS (1),1.141238


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),3.333587
1,Tesla V100-SXM2-32GB-LS (1),2.386565


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),3521.914578
1,Tesla V100-SXM2-32GB-LS (1),3700.675143


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),3747.476881
1,Tesla V100-SXM2-32GB-LS (1),3726.42608


--------- Top N 


Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_tn,7380.638151
1,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_tn,7209.306293
2,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_nn,6313.397334
3,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_nn,6144.265826
4,Tesla V100-SXM2-32GB-LS (1),ncclBroadcastRingLLKernel_copy_i8(ncclColl),3700.675143
5,Tesla V100-SXM2-32GB-LS (1),ncclReduceRingLLKernel_sum_f32(ncclColl),3646.06101
6,Tesla V100-SXM2-32GB-LS (0),ncclReduceRingLLKernel_sum_f32(ncclColl),3635.091898
7,Tesla V100-SXM2-32GB-LS (0),ncclBroadcastRingLLKernel_copy_i8(ncclColl),3521.914578
8,Tesla V100-SXM2-32GB-LS (0),void at::native::vectorized_elementwise_kernel...,3223.492764
9,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_64x32_sliced1x4_nt,3120.157816


Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,19167.726555
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,27676.138367
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,8468.581223
3,Tesla V100-SXM2-32GB-LS (0),other,34.95676
4,Tesla V100-SXM2-32GB-LS (1),custom_ops,16166.040273
5,Tesla V100-SXM2-32GB-LS (1),matrix-mul,26773.880864
6,Tesla V100-SXM2-32GB-LS (1),memory_mgmt,9594.198584
7,Tesla V100-SXM2-32GB-LS (1),other,33.684987


--------- Aggregate 


Unnamed: 0,ops_group,Duration
0,custom_ops,35333.766828
1,matrix-mul,54450.019231
2,memory_mgmt,18062.779807
3,other,68.641747


Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,35333.766828,32.742157
1,matrix-mul,54450.019231,50.456298
2,memory_mgmt,18062.779807,16.737937
3,other,68.641747,0.063607





--------- Elapsed time (ms):  113.000039
--------- devices 


['Tesla V100-SXM2-32GB-LS (0)',
 'Tesla V100-SXM2-32GB-LS (2)',
 'Tesla V100-SXM2-32GB-LS (3)',
 'Tesla V100-SXM2-32GB-LS (1)']

--------- memory mgmt 
[CUDA memcpy HtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),3100.428255
1,Tesla V100-SXM2-32GB-LS (1),0.261245
2,Tesla V100-SXM2-32GB-LS (2),2.117567
3,Tesla V100-SXM2-32GB-LS (3),2.061065


[CUDA memcpy PtoP]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),1.021369
1,Tesla V100-SXM2-32GB-LS (1),0.535999


[CUDA memcpy DtoH]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),4.46872
1,Tesla V100-SXM2-32GB-LS (1),1.247449
2,Tesla V100-SXM2-32GB-LS (2),1439.438756
3,Tesla V100-SXM2-32GB-LS (3),1439.310283


[CUDA memset]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),13.390262
1,Tesla V100-SXM2-32GB-LS (1),10.736368
2,Tesla V100-SXM2-32GB-LS (2),10.905927
3,Tesla V100-SXM2-32GB-LS (3),11.963802


[CUDA memcpy DtoD]


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),1.010455
1,Tesla V100-SXM2-32GB-LS (1),0.590585
2,Tesla V100-SXM2-32GB-LS (2),0.604696
3,Tesla V100-SXM2-32GB-LS (3),0.596185


ncclBroadcastRingLLKernel_copy_i8(ncclColl)


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),4208.870268
1,Tesla V100-SXM2-32GB-LS (1),4372.958164
2,Tesla V100-SXM2-32GB-LS (2),4452.41737
3,Tesla V100-SXM2-32GB-LS (3),4465.193033


Unnamed: 0,Device,Duration
0,Tesla V100-SXM2-32GB-LS (0),7329.189329
1,Tesla V100-SXM2-32GB-LS (1),4386.32981
2,Tesla V100-SXM2-32GB-LS (2),5905.484316
3,Tesla V100-SXM2-32GB-LS (3),5919.124368


--------- Top N 


Unnamed: 0,Device,Name,Duration
0,Tesla V100-SXM2-32GB-LS (0),ncclReduceRingLLKernel_sum_f32(ncclColl),5482.552254
1,Tesla V100-SXM2-32GB-LS (3),ncclReduceRingLLKernel_sum_f32(ncclColl),5411.964607
2,Tesla V100-SXM2-32GB-LS (2),ncclReduceRingLLKernel_sum_f32(ncclColl),5384.446903
3,Tesla V100-SXM2-32GB-LS (1),ncclReduceRingLLKernel_sum_f32(ncclColl),5303.46093
4,Tesla V100-SXM2-32GB-LS (3),ncclBroadcastRingLLKernel_copy_i8(ncclColl),4465.193033
5,Tesla V100-SXM2-32GB-LS (2),ncclBroadcastRingLLKernel_copy_i8(ncclColl),4452.41737
6,Tesla V100-SXM2-32GB-LS (1),ncclBroadcastRingLLKernel_copy_i8(ncclColl),4372.958164
7,Tesla V100-SXM2-32GB-LS (0),ncclBroadcastRingLLKernel_copy_i8(ncclColl),4208.870268
8,Tesla V100-SXM2-32GB-LS (1),volta_sgemm_128x64_tn,3991.2125
9,Tesla V100-SXM2-32GB-LS (0),volta_sgemm_128x64_tn,3975.629278


Unnamed: 0,Device,ops_group,Duration
0,Tesla V100-SXM2-32GB-LS (0),custom_ops,10293.078968
1,Tesla V100-SXM2-32GB-LS (0),matrix-mul,14621.97759
2,Tesla V100-SXM2-32GB-LS (0),memory_mgmt,13381.122406
3,Tesla V100-SXM2-32GB-LS (0),other,18.584924
4,Tesla V100-SXM2-32GB-LS (1),custom_ops,9246.981013
5,Tesla V100-SXM2-32GB-LS (1),matrix-mul,14896.915461
6,Tesla V100-SXM2-32GB-LS (1),memory_mgmt,10554.359265
7,Tesla V100-SXM2-32GB-LS (1),other,17.572509
8,Tesla V100-SXM2-32GB-LS (2),custom_ops,8509.42012
9,Tesla V100-SXM2-32GB-LS (2),matrix-mul,14424.244212


--------- Aggregate 


Unnamed: 0,ops_group,Duration
0,custom_ops,36693.225301
1,matrix-mul,57910.628717
2,memory_mgmt,47659.894966
3,other,72.240242


Unnamed: 0,ops_group,Duration,Percent
0,custom_ops,36693.225301,25.779303
1,matrix-mul,57910.628717,40.685865
2,memory_mgmt,47659.894966,33.484079
3,other,72.240242,0.050753





