In [None]:
from collections import namedtuple
import numpy as np
import pandas as pd
import torch
import re

In [None]:
# Widen text columns so long (demangled) kernel names are not truncated when frames are displayed.
pd.set_option("display.max_colwidth", 200)

In [None]:
# Path of the profiler CSV export to analyse; the commented line is an alternative input.
# file = '/home/ubuntu/MLPerf_ROCm/mlperf_training/v0p6/rnn_translator/pytorch/rnn_translator-nvtx.csv'
file = '/home/ubuntu/DeepLearningMisc/microbench/resnext101_32x8d-nvtx-1042s877m.csv'

# The first five lines of the export are skipped and the columns are named explicitly.
# Units: staticSMem - KB, dynamicSMem - KB, size - MB, throughput - GB/s
initdf = pd.read_csv(file, skiprows=5,
                     names=['start', 'duration', 'gridX', 'gridY', 'gridZ', 'blockX', 'blockY', 'blockZ',
                            'registersPerThread', 'staticSMem', 'dynamicSMem', 'size', 'throughput',
                            'srcMemType', 'dstMemType', 'device', 'context', 'stream', 'name', 'corrid'])

# Build the cleaned frame as one chain (no inplace mutation), so re-running the cell
# always starts from the freshly read CSV:
#   1. rows without a name are dropped,
#   2. launch-geometry / memory-type / device columns are not used by this notebook,
#   3. kernel names are demangled.
initdf = (
    initdf
    .dropna(subset=['name'])
    .drop(columns=['gridX', 'gridY', 'gridZ', 'blockX', 'blockY', 'blockZ',
                   'srcMemType', 'dstMemType', 'device'])
    .assign(name=lambda frame: frame['name'].apply(torch._C._demangle))
)

# Keep only the rows between the single __start_profile and __stop_profile markers.
# Raw strings: "\[" in a normal string literal is an invalid escape sequence.
startprof = initdf.index[initdf['name'].str.contains(r"\[Marker\] __start_profile")].tolist()
assert len(startprof) == 1, "expected exactly one __start_profile marker, found {}".format(len(startprof))
stopprof = initdf.index[initdf['name'].str.contains(r"\[Marker\] __stop_profile")].tolist()
assert len(stopprof) == 1, "expected exactly one __stop_profile marker, found {}".format(len(stopprof))
initdf = initdf.loc[startprof[0]:stopprof[0], :]

# df: rows that carry at least one kernel/memcpy metric (marker and API-call rows have none).
# .copy() makes df an independent frame, so later cells can add columns to it without
# pandas warning about writing to a slice of initdf.
df = initdf.dropna(subset=['registersPerThread', 'staticSMem', 'dynamicSMem', 'size', 'throughput'],
                   how='all').copy()
print(df.shape)
display(df.head())

## Working with markers

In [None]:
# markers: the rows needed to attribute GPU work to PyTorch ops -
# range start/end markers, plain markers, cudaLaunchKernel calls and cudaMemcpy* calls.
# Regex patterns are raw strings: "\[" in a normal string literal is an invalid escape sequence.
markerNames = initdf['name']
isRangeStart = markerNames.str.contains(r"\[Range start\]")
isRangeEnd = markerNames.str.contains(r"\[Range end\]")
isMarker = markerNames.str.contains("Marker")
isKernelLaunch = markerNames == "cudaLaunchKernel"
isMemcpy = markerNames.str.contains("cudaMemcpy")

markers = initdf[isRangeStart | isRangeEnd | isMarker | isKernelLaunch | isMemcpy]
print(markers.shape)
display(markers.head(50))

In [None]:
# One open profiler range:
#   index - row label of its "[Range start]" row in `markers`
#   name  - operation name, optionally followed by ", sizes = [...]"
#   depth - number of ranges that were already open when this one started
Marker = namedtuple('Marker', ['index', 'name', 'depth'])

In [None]:
# Walk the marker rows in order, keeping a stack of the ranges that are currently open,
# and record for every range the correlation ids of the cudaLaunchKernel / cudaMemcpy
# calls issued while it was open. A call made inside nested ranges is attributed to
# every range on the stack.
stack = []  # currently open ranges, outermost first
opsToCorrid = {}  # Marker of an operation -> correlation ids of the CUDA calls made inside it

# Compiled once here instead of on every "[Range start]" row.
rangeStartPattern = re.compile(r'\[Range start\] (?P<name>[a-zA-Z0-9_:]*), (seq = \d+)?(, )?(?P<size>sizes = \[[\[\],\d ]*\])? \(Domain: \<unnamed\>\)')

# The first and last rows of `markers` are skipped
# (presumably the __start_profile / __stop_profile markers - verify against the input).
for index, row in markers.iloc[1:-1].iterrows():
    name = row['name']
    if "pin_memory" in name:
        # pin_memory ranges are ignored entirely (both their start and their end rows).
        continue

    if "[Range start]" in name:
        details = rangeStartPattern.match(name)
        if not details:
            print(" *** Error handling regex name matching:{}".format(name))
            continue
        mname = details.group('name')
        if details.group('size'):
            mname = mname + ", " + details.group('size')
        marker = Marker(index, mname, len(stack))
        stack.append(marker)
        opsToCorrid[marker] = []

    elif "[Range end]" in name:
        if not stack:
            # Previously an end row arriving with no open range raised IndexError on pop().
            print(" *** Range end without any open range: {}".format(index))
            continue

        # Search from the innermost open range outwards for the range this row closes.
        # Only the leading tag is rewritten, so an operation name that itself contains
        # "start" still compares equal to its end row.
        skipped = []
        while stack:
            marker = stack.pop()
            expectedEnd = markers.loc[marker.index, 'name'].replace("[Range start]", "[Range end]", 1)
            if expectedEnd == name:
                break
            skipped.append(marker)

        if skipped:
            print(" *** does not match; this shouldn't happen ideally: {}".format(index))

        # Re-open every range that was popped without matching, in its original order.
        # This includes the outermost one when nothing matched at all, which used to be
        # silently dropped from the stack.
        stack.extend(reversed(skipped))

    elif (name == "cudaLaunchKernel") or ("cudaMemcpy" in name):
        for marker in stack:
            opsToCorrid[marker].append(row['corrid'])
        if len(stack) == 0:
            print(" *** Kernel with corrid: {} doesn't lie between any markers".format(row['corrid']))
    else:
        print(" *** wrong option")

print(len(opsToCorrid))

In [None]:
# Remove operations that issued no CUDA call at all, then report how many remain.
# The keys are collected first because a dict cannot be resized while iterating over it.
delkeys = [op for op, corrids in opsToCorrid.items() if not corrids]
for emptyOp in delkeys:
    del opsToCorrid[emptyOp]

print(len(opsToCorrid))

In [None]:
# Flatten every operation's correlation ids into one list, then deduplicate.
# The two printed counts differ when a call is attributed to several (nested) operations.
allCorrids = [corrid for corrids in opsToCorrid.values() for corrid in corrids]
print(len(allCorrids))

allCorrids = set(allCorrids)
print(len(allCorrids))

In [None]:
corridToKernelIndex = {}  # cuda launch kernel correlation id -> index of kernel with corresponding correlation id
# Helper dict (not essential): kernel row index -> the full row of df
kernelIndexToRow = {}

# Index df by correlation id once, instead of scanning the whole 'corrid' column
# for every id in allCorrids.
rowIndicesByCorrid = {}
for rowLabel, rowCorrid in df['corrid'].items():
    if pd.notna(rowCorrid):
        rowIndicesByCorrid.setdefault(rowCorrid, []).append(rowLabel)

for corrid in allCorrids:
    # int(...) mirrors the original comparison; an int key finds an equal float key in a dict.
    rowIndex = rowIndicesByCorrid.get(int(corrid), [])
    # The count can be 0 as well as >1, so report what was actually found.
    assert len(rowIndex) == 1, \
        "expected exactly one kernel row for corrid {}, found {}".format(corrid, len(rowIndex))
    corridToKernelIndex[corrid] = rowIndex[0]

    # can remove
    kernelIndexToRow[rowIndex[0]] = df.loc[rowIndex[0]]

print(len(corridToKernelIndex))

In [None]:
# Translate each operation's correlation ids into the df row labels of the matching kernels.
# marker operation -> list of kernel row indices, in the order the calls were recorded
opsToKernelIndex = {
    op: [corridToKernelIndex[corrid] for corrid in corrids]
    for op, corrids in opsToCorrid.items()
}

print(len(opsToKernelIndex))
# print(opsToKernelIndex)  # add 6 to indices to get line numbers in csv

In [None]:
# One row per operation:
#   pyName            - operation name taken from the range marker
#   depth             - nesting depth of the range
#   kernelNames       - "<kernel name>[<stream>]" for every kernel of the operation
#   numKernels        - number of kernels
#   startTime         - 'start' of the first recorded kernel
#   sumKernelDuration - sum of the kernels' 'duration', added up in row-index order
mappingRows = []
for op, kids in opsToKernelIndex.items():
    kernelRows = [kernelIndexToRow[kid] for kid in kids]
    kernelLabels = [kernelRow['name'] + "[" + str(kernelRow['stream']) + "]" for kernelRow in kernelRows]
    firstStart = kernelRows[0]['start']
    totalDuration = sum(kernelIndexToRow[kid]['duration'] for kid in sorted(kids))
    mappingRows.append([op.name, op.depth, kernelLabels, len(kids), firstStart, totalDuration])

mappingdf = pd.DataFrame(
    mappingRows,
    columns=['pyName', 'depth', 'kernelNames', 'numKernels', 'startTime', 'sumKernelDuration'],
)
mappingdf['startTime'] = mappingdf['startTime'].astype(float)
print(mappingdf.shape)
display(mappingdf.head(50))
# mappingdf.to_csv('/home/ubuntu/DeepLearningMisc/microbench/resnet101_32x8d-mapping.csv')
# mappingdf.to_csv('/home/ubuntu/logs/rnn_translator-mapping.csv')

In [None]:
# Top-level operations only (depth 0). The constant 'depth' column is removed with a
# chained drop, which returns a new frame: no in-place edit of a filtered slice of
# mappingdf (that raised SettingWithCopyWarning), and the cell can be re-run safely.
topdf = mappingdf[mappingdf['depth'] == 0].drop(columns=['depth'])
print(topdf.shape)
display(topdf.head())

In [None]:
# duration: time from the start of this top-level operation to the start of the next one
# (NaN for the last row, which has no successor).
# assign() returns a new frame instead of writing a column into a filtered slice,
# which avoids SettingWithCopyWarning.
topdf = topdf.assign(duration=topdf['startTime'].shift(-1) - topdf['startTime'])
print(topdf.shape)
display(topdf.head(50))
# topdf.to_csv('/home/ubuntu/logs/ncf-mapping-top.csv')
# topdf.to_csv('/home/ubuntu/logs/rnn_translator-mapping-top.csv')

In [None]:
# Show the dtype of every topdf column.
topdf.dtypes

In [None]:
# A batch is taken to start at every top-level op whose name matches this pattern
# (a conv2d with a [32, 3, 224, 224] input and a [64, 3, 7, 7] weight).
# Raw string: "\[" in a normal string literal is an invalid escape sequence.
# Alternative used for another workload: topdf['pyName'].str.contains("to,")
batchStartPattern = r"conv2d, sizes = \[\[32, 3, 224, 224\], \[64, 3, 7, 7\], \[\]"
toOps = topdf[topdf['pyName'].str.contains(batchStartPattern)].index.tolist()
print(toOps)

# Every match is used as a batch start. With the "to," alternative only every fourth
# match was kept:
# batchStartIndices = [toOps[i] for i in range(len(toOps)) if i%4 == 0]
batchStartIndices = toOps
print(batchStartIndices)

In [None]:
# Split topdf into the first five batches. Batch k runs from its own start label up to the
# row just before the next batch start; the fifth batch runs to the end of topdf.
# Every batch is re-indexed from 0 so the batches can be compared row by row.
assert len(batchStartIndices) >= 5, \
    "need at least 5 batch starts to build b1..b5, found {}".format(len(batchStartIndices))

batchBounds = list(zip(batchStartIndices[:5],
                       [nextStart - 1 for nextStart in batchStartIndices[1:5]] + [None]))
# reset_index(drop=True) returns a new frame, so no filtered slice of topdf is modified
# in place (the inplace variant raised SettingWithCopyWarning).
b1, b2, b3, b4, b5 = [topdf.loc[first:last].reset_index(drop=True) for first, last in batchBounds]
display(b4)

In [None]:
# The five shapes must all be equal for the side-by-side comparison below to line up.
batchShapes = [batch.shape for batch in (b1, b2, b3, b4, b5)]
print(*batchShapes)

In [None]:
# Put the five batches side by side; the columns become (batch key, original column) pairs.
batchFrames = {'b1': b1, 'b2': b2, 'b3': b3, 'b4': b4, 'b5': b5}
pivotBatch = pd.concat(list(batchFrames.values()), axis=1, keys=list(batchFrames.keys()))
print(pivotBatch.shape)

# Show the operation names of all batches next to each other.
pyNameColumns = [(batchKey, 'pyName') for batchKey in batchFrames]
display(pivotBatch[pyNameColumns])

In [None]:
# ref: the summed kernel duration of every operation, one column per batch.
durationColumns = [(batchKey, 'sumKernelDuration') for batchKey in ('b1', 'b2', 'b3', 'b4', 'b5')]
ref = pivotBatch[durationColumns]

# diffDuration: spread (max - min) of an operation's kernel time across the five batches.
durationSpread = ref.max(axis=1) - ref.min(axis=1)
pivotBatch['diffDuration'] = durationSpread
# display(pivotBatch)
# pivotBatch.sort_values('diffDuration', ascending=False, inplace=True)

# Batch-1 description of each operation, followed by the per-batch durations and their spread.
summaryColumns = (
    [('b1', 'pyName'), ('b1', 'kernelNames'), ('b1', 'numKernels')]
    + durationColumns
    + [('diffDuration', '')]
)
pivotBatch[summaryColumns]


In [None]:
# Column-wise sum of ref.
ref.sum()

#### fwd pass

In [None]:
# Forward pass of batch b4: every row up to and including the single log_softmax operation.
isLogSoftmax = b4['pyName'].str.contains("log_softmax")
fwd_end = b4.index[isLogSoftmax].tolist()
assert len(fwd_end) == 1, "ERROR"
lastFwdLabel = fwd_end[0]
fwd = b4.loc[:lastFwdLabel]
display(fwd)
fwd.to_csv('/home/ubuntu/logs/resnext101_32x8d-fwdall-nv-1042s877m.csv')

In [None]:
# Drop the add_ operations whose size lists are all empty, then renumber the rows from 0.
# reset_index(drop=True) is chained so a new frame is returned; the inplace variant modified
# a filtered slice and raised SettingWithCopyWarning.
fwd = fwd[fwd['pyName'] != "add_, sizes = [[], [], []]"].reset_index(drop=True)
display(fwd)


In [None]:
# Forward-pass operations whose name contains "batch_norm".
fwd[fwd['pyName'].str.contains("batch_norm")]

In [None]:
# Forward-pass operations that are neither conv2d nor batch_norm.
notConv = ~fwd['pyName'].str.startswith("conv2d")
notBatchNorm = ~fwd['pyName'].str.startswith("batch_norm")
tmp = fwd[notConv & notBatchNorm]
# display(tmp[tmp['numKernels'] > 1])
display(tmp)

## NCF

In [None]:
# Keep the profiler-reported time of each row as 'sumDuration' and recompute 'duration'
# as the gap between the start of a row and the start of the next row.
# The rename only happens the first time: re-running the cell used to rename the freshly
# computed 'duration' as well, leaving two columns called 'sumDuration'.
# rename() always returns a new frame, so df is not modified in place.
df = df.rename(columns={} if 'sumDuration' in df.columns else {'duration': 'sumDuration'})

startNumeric = pd.to_numeric(df['start'])
df['duration'] = startNumeric.shift(-1) - startNumeric
# 'start' is stored as text again after the arithmetic.
df['start'] = startNumeric.apply(str)
display(df)

In [None]:
# Host-to-device copies with size 0.015625 are used to locate batch boundaries
# (size is presumably in MB, i.e. 16 KB - confirm against the profiler export).
isBatchCopy = (df['name'] == '[CUDA memcpy HtoD]') & (df['size'] == 0.015625)
tmp = df.index[isBatchCopy].tolist()
print(tmp)

# Every second match (positions 0, 2, 4, ...) is taken as the start of a batch.
batchStartIndices = tmp[::2]
print(batchStartIndices)

In [None]:
# sgemm kernels only, without the columns that are not needed for the comparison.
# drop() is chained so a new frame is returned; the inplace variant modified a filtered
# slice of df and raised SettingWithCopyWarning.
gemmdf = df[df['name'].str.contains('sgemm')].drop(
    columns=['registersPerThread', 'staticSMem', 'dynamicSMem', 'size', 'throughput',
             'context', 'stream', 'corrid'])

# Batches 2..6 (the first batch is not used). Each one spans the rows between two
# consecutive batch-start labels; the sixth runs to the end of gemmdf. Every batch is
# re-indexed from 0 so the batches can be compared row by row.
assert len(batchStartIndices) >= 6, \
    "need at least 6 batch starts to build b2gemm..b6gemm, found {}".format(len(batchStartIndices))

gemmBounds = list(zip(batchStartIndices[1:6], list(batchStartIndices[2:6]) + [None]))
b2gemm, b3gemm, b4gemm, b5gemm, b6gemm = [
    gemmdf.loc[first:last].reset_index(drop=True) for first, last in gemmBounds
]

In [None]:
# Show the sgemm kernels of batch b2.
display(b2gemm)

In [None]:
# Put the sgemm batches side by side; the columns become (batch key, original column) pairs.
gemmBatches = {'b2': b2gemm, 'b3': b3gemm, 'b4': b4gemm, 'b5': b5gemm, 'b6': b6gemm}
pivot = pd.concat(list(gemmBatches.values()), axis=1, keys=list(gemmBatches.keys()))

# Row-wise averages over the five batches.
sumDurationColumns = [(batchKey, 'sumDuration') for batchKey in gemmBatches]
durationColumns = [(batchKey, 'duration') for batchKey in gemmBatches]
pivot['avgSumDuration'] = pivot[sumDurationColumns].mean(axis=1)
pivot['avgDuration'] = pivot[durationColumns].mean(axis=1)
display(pivot)

In [None]:
# Print the average duration of every row, one value per line.
avgDurations = pivot[('avgDuration', '')].tolist()
for avgDuration in avgDurations:
    print(avgDuration)

In [None]:
# 10th, 25th, 50th, 75th and 90th percentiles of sumDuration for the sgemm kernels of batch b2.
b2gemm['sumDuration'].quantile([0.1,0.25,0.5,0.75,0.9])

## RNN TRANSLATOR

In [None]:
# Keep the profiler-reported time of each row and recompute 'duration' as the gap between
# the start of a row and the start of the next row.
#
# The original cell dropped 'duration' before renaming it. On a df that had not been
# processed yet, that threw the profiler-reported time away, and the two renames that
# followed were no-ops. It only behaved as intended when re-run on an already-processed df.
if 'kernelDuration' in df.columns or 'sumDuration' in df.columns:
    # The reported time is already preserved under another name; any existing 'duration'
    # is a previously computed gap and is simply recomputed below.
    df = df.drop(columns=['duration'], errors='ignore')
else:
    # First run on this df: preserve the reported time as 'kernelDuration'.
    df = df.rename(columns={'duration': 'kernelDuration'})

startNumeric = pd.to_numeric(df['start'])
df['duration'] = startNumeric.shift(-1) - startNumeric
# 'start' is stored as text again after the arithmetic.
df['start'] = startNumeric.apply(str)
display(df)

In [None]:
# Rows whose kernel name contains "indexSelectLargeIndex".
df[df['name'].str.contains('indexSelectLargeIndex')]

In [None]:
# Rows that are host-to-device memory copies.
df[df['name'] == '[CUDA memcpy HtoD]']