In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
#global parameters
cudadir = "/usr/common/software/cuda/10.2.89"
homedir = os.path.dirname(os.getcwd())

In [3]:
#input and output dirs
#datadirs = ["../scripts/tf_cnn_kernels_nsight/runs/386219"]
#datadirs = ["../scripts/tf_cnn_kernels_nsight/runs/386058"]
#datadirs = os.path.join(homedir,"data/tf_2.0b/new_nsight")
datadirs = ["../data/pytorch_1.5"]
outputdir = "../results/pytorch_1.5"

# Functions

In [4]:
def transpose_frame(df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["ID", "Name", "Network Name", "Batch Size", "Pass", "Precision"]
    resultkeys = ["Precision", "Network Name", "Batch Size", "Pass", "Name"]
    
    tc_peak_perf_flops = 125*10**12
    
    #as metricdf use df_summary
    metricdf = df_metrics.copy()
    profiledf = pd.DataFrame(columns=selectkeys)

    ####### Get timing information
    ### CUDA Time
    # get cycles
    metricname = "CUDA Cycles"
    cyclesdf = metricdf.loc[(metricdf["Metric Name"]=="smsp__cycles_elapsed") & (metricdf["Metric Type"]=="total"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # get rates
    metricname = "CUDA Rates"
    ratesdf = metricdf.loc[(metricdf["Metric Name"]=="smsp__cycles_elapsed") & (metricdf["Metric Type"]=="rate"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # check consistency
    if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):
        raise ValueError("CUDA Time data not consistent")
    # adjust metric unit
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/nsecond"), ["CUDA Rates"]] *= 1e9
    # manual merge and compute CUDA Time
    cyclesdf["CUDA Rates"] = list(ratesdf["CUDA Rates"])
    cyclesdf["CUDA Time"] = cyclesdf["CUDA Cycles"] / cyclesdf["CUDA Rates"]
    # merge with output
    profiledf = cyclesdf[selectkeys+['CUDA Time']].copy()
    
    ### Tensor Core Time
    # get cycles
    metricname = "TC Cycles"
    cyclesdf = metricdf.loc[(metricdf["Metric Name"]=="smsp__pipe_tensor_op_hmma_cycles_active") & (metricdf["Metric Type"]=="total"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # get rates
    metricname = "TC Rates"
    ratesdf = metricdf.loc[(metricdf["Metric Name"]=="smsp__pipe_tensor_op_hmma_cycles_active") & (metricdf["Metric Type"]=="rate"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # check consistency
    if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):
        raise ValueError("TC Time data not consistent")
    # adjust metric unit
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/nsecond"), ["TC Rates"]] *= 1e9
    # manual merge and compute CUDA Time
    cyclesdf["TC Rates"] = list(ratesdf["TC Rates"])
    cyclesdf["TC Time"] = cyclesdf["TC Cycles"] / cyclesdf["TC Rates"]
    # merge & cleanup
    profiledf = profiledf.merge(cyclesdf[selectkeys+['TC Time']], on=selectkeys, how="outer").fillna(0.)
    
    ### Combine
    del profiledf['ID']
    del metricdf['ID']
    profiledf['Invocations'] = 1
    profiledf = profiledf.groupby(resultkeys).sum().reset_index()
    #profiledf.sort_values(by=resultkeys, inplace=True)
    #profiledf.reset_index(drop=True, inplace=True)

    ####### Get number of FLOPs
    
    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), ["Metric Value"]] *= 2
    
    ### FP32 FLOPs
    metrics = ['smsp__sass_thread_inst_executed_op_fadd_pred_on',
               'smsp__sass_thread_inst_executed_op_ffma_pred_on',
               'smsp__sass_thread_inst_executed_op_fmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP32 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP32 FLOPs"]], on=resultkeys, how="inner")
    
    ### FP16 FLOPs
    metrics = ['smsp__sass_thread_inst_executed_op_hadd_pred_on',
               'smsp__sass_thread_inst_executed_op_hfma_pred_on',
               'smsp__sass_thread_inst_executed_op_hmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP16 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP16 FLOPs"]], on=resultkeys, how="inner")
    
    #### TC FLOPs
    tmpdf = metricdf.loc[ metricdf["Metric Name"] == "sm__inst_executed_pipe_tensor_op_hmma", resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "TC Utilization"})
    tmpdf["TC Utilization"] = 0.01 * tmpdf["TC Utilization"]
    profiledf = profiledf.merge(tmpdf[resultkeys+["TC Utilization"]], on=resultkeys, how="inner")
    profiledf["TC Utilization"] = profiledf["TC Utilization"] / profiledf['Invocations']
    profiledf["TC FLOPs"] = tc_peak_perf_flops * profiledf["TC Utilization"] * profiledf["TC Time"]
    
    ### Total FLOPs
    profiledf["FLOPs"] = profiledf["FP32 FLOPs"] + profiledf["FP16 FLOPs"] + profiledf["TC FLOPs"] #+ metricdf["FP64 FLOPs"]
    
    ### FLOPs fractions
    #profiledf["FP64 FLOPs Fraction"] = profiledf["FP64 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP32 FLOPs Fraction"] = profiledf["FP32 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP16 FLOPs Fraction"] = profiledf["FP16 FLOPs"]/profiledf["FLOPs"]
    profiledf["TC FLOPs Fraction"]   = profiledf["TC FLOPs"]/profiledf["FLOPs"]
    
    
    ####### Get number of bytes
    
    ### Shared transactions
    #project out
    shareddf = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__data_pipe_lsu_wavefronts_mem_shared_op"), resultkeys+["Metric Value"] ].copy()
    shareddf = shareddf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "Shared Transactions"})
    #add to timings
    profiledf = profiledf.merge(shareddf[resultkeys+["Shared Transactions"]], on=resultkeys, how="inner")
    
    ### L1 atomic transactions
    # project out
    metrics = ['l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom',
               'l1tex__t_set_accesses_pipe_lsu_mem_global_op_red',
               'l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom',
               'l1tex__t_set_accesses_pipe_tex_mem_surface_op_red']
    atomicdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    # get reads and writes
    atomicdf = atomicdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L1 Atomic Transactions"})
    # add to timings
    profiledf = profiledf.merge(atomicdf[resultkeys+["L1 Atomic Transactions"]], on=resultkeys, how="inner")

    ### Local transactions 
    # project out
    localdf = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_sectors_pipe_lsu_mem_local_op"), resultkeys+["Metric Value"] ].copy()
    localdf = localdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "Local Transactions"})
    # add to timings
    profiledf = profiledf.merge(localdf[resultkeys+["Local Transactions"]], on=resultkeys, how="inner")
    
    ### Global transactions 
    # project out
    globaldf = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_sectors_pipe_lsu_mem_global_op"), resultkeys+["Metric Value"] ].copy()
    globaldf = globaldf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "Global Transactions"})
    # add to timings
    profiledf = profiledf.merge(globaldf[resultkeys+["Global Transactions"]], on=resultkeys, how="inner")
    
    ### L1 Bytes
    profiledf["L1 Transactions"] = (profiledf["Shared Transactions"] + profiledf["L1 Atomic Transactions"]
                            + profiledf["Local Transactions"] + profiledf["Global Transactions"])
    profiledf["L1 Bytes"] = profiledf["L1 Transactions"] * 32
    
    ### L2 atomic & reduction
    metricdf.loc[(metricdf["Metric Name"].str.contains("lts__t_sectors_op")) & (metricdf["Metric Type"]=="total"), ["Metric Value"]] *= 2

    ### L2 transactions
    # project out
    l2df = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_sectors_op"), resultkeys+["Metric Value"] ].copy()
    l2df = l2df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L2 Transactions"})
    l2df["L2 Bytes"] = l2df["L2 Transactions"] * 32
    # add to timings
    profiledf = profiledf.merge(l2df[resultkeys+["L2 Transactions", "L2 Bytes"]], on=resultkeys, how="inner")

    ### DRAM Bytes
    # project out
    dramdf = metricdf.loc[metricdf["Metric Name"].str.contains("dram__sectors"), resultkeys+["Metric Value"] ].copy()
    dramdf = dramdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "DRAM Transactions"})
    dramdf["DRAM Bytes"] = dramdf["DRAM Transactions"] * 32
    # add to timings
    profiledf = profiledf.merge(dramdf[resultkeys+["DRAM Transactions", "DRAM Bytes"]], on=resultkeys, how="inner")
        
    ### Host Memory Bytes
    # project out
    sysmemdf = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_sectors_aperture_sysmem_op"), resultkeys+["Metric Value"] ].copy()
    sysmemdf = sysmemdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "SYSMEM Transactions"})
    sysmemdf["SYSMEM Bytes"] = sysmemdf["SYSMEM Transactions"] * 32
    # add to timings
    profiledf = profiledf.merge(sysmemdf[resultkeys+["SYSMEM Transactions", "SYSMEM Bytes"]], on=resultkeys, how="inner")

    
    ### Get performance
    profiledf["Performance GFlop/s"]      = profiledf["FLOPs"]      / (profiledf["CUDA Time"]*10**9)
    profiledf["FP32 Performance GFlop/s"] = profiledf["FP32 FLOPs"] / (profiledf["CUDA Time"]*10**9)
    profiledf["FP16 Performance GFlop/s"] = profiledf["FP16 FLOPs"] / (profiledf["CUDA Time"]*10**9)
    profiledf["TC Performance GFlop/s"]   = profiledf["TC FLOPs"]   / (profiledf["TC Time"]*10**9)

    
    ### Get AI
    # L1
    profiledf["L1 AI"]        = profiledf["FLOPs"]      / profiledf["L1 Bytes"]
    profiledf["FP32 L1 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["FP16 L1 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["TC L1 AI"]     = profiledf["TC FLOPs"]   / profiledf["L1 Bytes"]
    # L2
    profiledf["L2 AI"]        = profiledf["FLOPs"]      / profiledf["L2 Bytes"]
    profiledf["FP32 L2 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["FP16 L2 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["TC L2 AI"]     = profiledf["TC FLOPs"]   / profiledf["L2 Bytes"]
    # DRAM
    profiledf["DRAM AI"]      = profiledf["FLOPs"]      / profiledf["DRAM Bytes"]
    profiledf["FP32 DRAM AI"] = profiledf["FP32 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["FP16 DRAM AI"] = profiledf["FP16 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["TC DRAM AI"]   = profiledf["TC FLOPs"]   / profiledf["DRAM Bytes"]
    # SYSMEM
    profiledf["SYSMEM AI"]      = profiledf["FLOPs"]      / profiledf["SYSMEM Bytes"]
    profiledf["FP32 SYSMEM AI"] = profiledf["FP32 FLOPs"] / profiledf["SYSMEM Bytes"]
    profiledf["FP16 SYSMEM AI"] = profiledf["FP16 FLOPs"] / profiledf["SYSMEM Bytes"]
    profiledf["TC SYSMEM AI"]   = profiledf["TC FLOPs"]   / profiledf["SYSMEM Bytes"]

    
    ### Cleanup
    profiledf.sort_values(by=resultkeys).reset_index(drop=True, inplace=True)
    #print(metricdf[['CUDA Time Avg', 'TC Time Avg']])
    
    return profiledf

# Import Data

In [5]:
#get all the files
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == ".ncu-rep") or (os.path.splitext(x)[-1] == ".csv"))]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-1])
    
    #append to records
    records.append({"prefix": prefix, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"]).reset_index(drop=True)
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):

In [6]:
#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
all_passes = set([re.match(r'.*\.pass_(.*?)\.', x).groups()[0] for x in recorddf["prefix"].unique()])

#metrics
df_profiles = []

for pref in all_prefixes:    
    #print prefix
    #print(pref)
    
    #loop over passes
    df_times = []
    df_metrics = []
    for pas in all_passes:
        
        #project frame
        files = recorddf.loc[recorddf["prefix"].apply(lambda x: re.match(r'.*\.pass_(.*?)\.', x).groups()[0]) == pas, "file"].values

        #project the invididual files
        metricfiles = [x for x in files if x.endswith(".ncu-rep")]
        metriccsvs  = [x for x in files if x.endswith(".csv")]
        
        ImportFromNsight = True
        if len(metricfiles) == len(metriccsvs):
            ImportFromNsight = False
            metricfiles = metriccsvs
        
        for metricfile in metricfiles:
            
            #print the file
            print(metricfile)
            
            #get the parameters from the filename
            parameters = parse_filename_nsight(os.path.basename(metricfile))
        
            #metrics
            metricdf = import_nsight_metric(ImportFromNsight, metricfile, cuda_dir=cudadir)
            for key in parameters:
                metricdf[key] = parameters[key]
        
            #fuse read/write metrics together:
            unique_metrics = metricdf["Metric Name"].unique()
            unique_metrics = set([x.split(".")[0].replace("_write","").replace("_read","").replace("_ld","").replace("_st","") for x in unique_metrics])
            #add the metric type
            metricdf["Metric Type"] = "total"
            #read
            metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_read"), "Metric Type" ] = "read"
            metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_ld"), "Metric Type" ] = "read"
            #write
            metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_write"), "Metric Type" ] = "write"
            metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_st"), "Metric Type" ] = "write"
            #rate
            metricdf.loc[ metricdf[ "Metric Name" ].str.contains(".per_second"), "Metric Type" ] = "rate"
        
            for metric in unique_metrics:
                metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric

            #append to DF:
            df_metrics.append(metricdf)
    
    #concat the frames
    metricdf = pd.concat(df_metrics).reset_index(drop=True)
    
    #compute the profile
    profiledf = transpose_frame(metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles).reset_index(drop=True)

../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_read.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_write.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum.csv
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum.csv
../data/pytorch_1.5/pr

In [7]:
profiledf

Unnamed: 0,Precision,Network Name,Batch Size,Pass,Name,CUDA Time,TC Time,Invocations,FP32 FLOPs,FP16 FLOPs,...,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI,SYSMEM AI,FP32 SYSMEM AI,FP16 SYSMEM AI,TC SYSMEM AI
0,mixed,deepCam,2,backward,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1...,0.063457,0.063608,384,1.298989e+09,0.0,...,0.000000,86.558155,461.544485,0.110112,0.000000,461.434373,6.330027e+07,1.510171e+04,0.000000,6.328516e+07
1,mixed,deepCam,2,backward,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1...,0.001441,0.001439,12,2.457600e+07,0.0,...,0.000000,176.590749,413.373309,0.117225,0.000000,413.256085,3.224081e+07,9.142857e+03,0.000000,3.223167e+07
2,mixed,deepCam,2,backward,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1...,0.012022,0.012020,6,4.158259e+07,0.0,...,0.000000,421.062833,1362.422128,0.056049,0.000000,1362.366079,7.520624e+08,3.093943e+04,0.000000,7.520315e+08
3,mixed,deepCam,2,backward,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x2...,0.031397,0.031395,12,4.246733e+07,0.0,...,0.000000,133.167388,1486.701917,0.020842,0.000000,1486.681075,1.126947e+09,1.579886e+04,0.000000,1.126932e+09
4,mixed,deepCam,2,backward,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_64x32...,0.005062,0.005076,24,5.632819e+07,0.0,...,0.000000,28.108023,64.476321,0.017258,0.000000,64.459064,3.914544e+07,1.047771e+04,0.000000,3.913497e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,mixed,deepCam,2,forward,volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f...,0.000477,0.000477,6,6.370099e+07,3981312.0,...,0.010521,63.151051,262.915682,0.698876,0.043680,262.173126,1.783048e+07,4.739657e+04,2962.285714,1.778013e+07
111,mixed,deepCam,2,forward,volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f...,0.001821,0.001844,6,3.397386e+08,21233664.0,...,0.010270,57.676963,234.274116,0.665453,0.041591,233.567072,8.899231e+07,2.527817e+05,15798.857143,8.872373e+07
112,mixed,deepCam,2,forward,volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nt,0.015462,0.015516,36,2.006581e+09,0.0,...,0.000000,59.159849,226.962966,0.387155,0.000000,226.575811,1.458735e+08,2.488320e+05,0.000000,1.456247e+08
113,mixed,deepCam,2,forward,volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1,0.002050,0.000000,12,2.199808e+10,0.0,...,0.000000,0.000000,30.543450,30.543450,0.000000,0.000000,8.183808e+06,8.183808e+06,0.000000,0.000000e+00


# Compute AI Results

In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Batch Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#sum up
combineddf = combineddf.groupby(by=combinedselectkeys).sum()#.reset_index()


#the flop fractions need to be recomputed
combineddf["FP32 FLOPs Fraction"] = combineddf["FP32 FLOPs"] / combineddf["FLOPs"]
combineddf["FP16 FLOPs Fraction"] = combineddf["FP16 FLOPs"] / combineddf["FLOPs"]
combineddf["TC FLOPs Fraction"]   = combineddf["TC FLOPs"]   / combineddf["FLOPs"]

### Get performance
combineddf["Performance GFlop/s"]      = combineddf["FLOPs"]      / (combineddf["CUDA Time"]*10**9)
combineddf["FP32 Performance GFlop/s"] = combineddf["FP32 FLOPs"] / (combineddf["CUDA Time"]*10**9)
combineddf["FP16 Performance GFlop/s"] = combineddf["FP16 FLOPs"] / (combineddf["CUDA Time"]*10**9)
combineddf["TC Performance GFlop/s"]   = combineddf["TC FLOPs"]   / (combineddf["TC Time"]*10**9)


### Get AI
# L1
combineddf["L1 AI"]        = combineddf["FLOPs"]      / combineddf["L1 Bytes"]
combineddf["FP32 L1 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L1 Bytes"]
combineddf["FP16 L1 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L1 Bytes"]
combineddf["TC L1 AI"]     = combineddf["TC FLOPs"]   / combineddf["L1 Bytes"]
# L2
combineddf["L2 AI"]        = combineddf["FLOPs"]      / combineddf["L2 Bytes"]
combineddf["FP32 L2 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L2 Bytes"]
combineddf["FP16 L2 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L2 Bytes"]
combineddf["TC L2 AI"]     = combineddf["TC FLOPs"]   / combineddf["L2 Bytes"]
# DRAM
combineddf["DRAM AI"]      = combineddf["FLOPs"]      / combineddf["DRAM Bytes"]
combineddf["FP32 DRAM AI"] = combineddf["FP32 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["FP16 DRAM AI"] = combineddf["FP16 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["TC DRAM AI"]   = combineddf["TC FLOPs"]   / combineddf["DRAM Bytes"]

combineddf.sort_values(by=combinedselectkeys).reset_index(drop=True, inplace=True)

In [9]:
combineddf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CUDA Time,TC Time,Invocations,FP32 FLOPs,FP16 FLOPs,TC Utilization,TC FLOPs,FLOPs,FP32 FLOPs Fraction,FP16 FLOPs Fraction,...,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI,SYSMEM AI,FP32 SYSMEM AI,FP16 SYSMEM AI,TC SYSMEM AI
Precision,Network Name,Batch Size,Pass,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
mixed,deepCam,2,backward,1.349231,0.260532,9136,1799501000000.0,372580700000.0,10.091165,21737610000000.0,23909690000000.0,0.075262,0.015583,...,0.5798,33.827449,98.23172,7.393158,1.530729,89.307834,5278744000.0,556285700.0,188272300.0,4534186000.0
mixed,deepCam,2,forward,0.615992,0.119549,4782,1388716000000.0,61459690000.0,3.279819,9519808000000.0,10969980000000.0,0.126592,0.005603,...,0.255036,39.503903,90.478692,11.453912,0.50691,78.51787,1458617000.0,342393200.0,3427154.0,1112797000.0


# Export Data

In [10]:
combineddf.to_csv("./combined.csv")
profiledf.to_csv("./profile.csv")