In [1]:
# IMPORTS from tvm, different required packages and the profiling infrastructure

from numpy.core.numeric import full
import tvm
from tvm.contrib import utils, graph_executor as runtime
from tvm.relay.op.nn.nn import dense, dilate, conv2d
#####
import numpy as np
import pynvml as nv
# from func_timeout import func_timeout
import time
import psutil

#####
from components import description_vector as dv
from components import serializer
from components import profiling

In [2]:
# set default figure size
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (20, 10)

In [3]:
# helpful to suppress output of debug runtime run function
from contextlib import contextmanager
import sys, os

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:  
            yield
        finally:
            sys.stdout = old_stdout

In [4]:
nv.nvmlInit()
deviceCount = nv.nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nv.nvmlDeviceGetHandleByIndex(i)
    print("GPU", i, ":", nv.nvmlDeviceGetName(handle))

GPU 0 : b'NVIDIA GeForce RTX 3070'


In [9]:
# defining important variables for the profiling system

target = "cuda"
target_class = "cuda"

device = "3070"
dev_idx = 0
dev = tvm.device(str("cuda"), dev_idx)
time_min_res = 0.2


state_path = "./states"
state_file = "state"

layer_name = "conv2d"

metrics = profiling.get_metrics(target, device, backend="nvml", dev_idx=dev_idx)
print(metrics)

['nvml:::NVIDIA_GeForce_RTX_3070:device_0:power']


In [10]:
nv.nvmlInit()
handle = nv.nvmlDeviceGetHandleByIndex(0)

#metrics.append("nvml:::NVIDIA_GeForce_GTX_980_Ti:device_1:pstate") #unable to read using TVM PAPI Profiler due to limitation to integer data
metrics.append("nvml:::NVIDIA_GeForce_RTX_3070:device_"+str(dev_idx)+":gpu_utilization")
metrics.append("nvml:::NVIDIA_GeForce_RTX_3070:device_"+str(dev_idx)+":memory_utilization")
metrics.append("nvml:::NVIDIA_GeForce_RTX_3070:device_"+str(dev_idx)+":graphics_clock")
metrics.append("nvml:::NVIDIA_GeForce_RTX_3070:device_"+str(dev_idx)+":sm_clock")
metrics.append("nvml:::NVIDIA_GeForce_RTX_3070:device_"+str(dev_idx)+":memory_clock")
metrics.append("nvml:::NVIDIA_GeForce_RTX_3070:device_"+str(dev_idx)+":allocated_memory")

In [11]:
config = {}
config["n"] = 1
config["h"] = 225
config["w"] = 225
config["c"] = 3
config["pad"] = 0
config["dilation"] = 1
config["kernel"] = 3
config["strides"] = 1
config["grps"] = 1
config["channels"] = 32

In [12]:
measurements = {}
a_range = [3]
b_range = range(1, 30, 1)
for n in a_range:
    for i in b_range:
        state = i
        config["c"] = n
        config["kernel"] = i
        
        #print(config)
        # prepare input tensors
        repeat = 1024
        required = int(config["n"]) * int(config["c"]) * int(config["h"]) * int(config["w"])
        '''
        inp_shape = (
            int(config["h"]),
            int(config["w"]),
            int(config["c"]),
            int(config["n"])
        )
        '''
        inp_shape = (
            int(config["n"]),
            int(config["c"]),
            int(config["h"]),
            int(config["w"]),
        )
        rand_data = np.random.rand(int(np.ceil(required/repeat)))
        inp_data = np.repeat(rand_data, repeat)[:required].reshape(inp_shape).astype("float32")
        #inp_data = np.random.rand(np.prod(inp_shape)).reshape(inp_shape).astype("float32")

        '''
        weight_shape = (
            int(config["kernel"]),
            int(config["kernel"]),
            int(config["c"] / config["grps"]),
            int(config["channels"]),
        )
        '''
        weight_shape = (
            int(config["channels"]),
            int(config["c"] / config["grps"]),
            int(config["kernel"]),
            int(config["kernel"])
        )
        required = int(config["channels"]) * int(config["c"] / config["grps"]) * int(config["kernel"]) * int(config["kernel"])
        rand_data = np.random.rand(int(np.ceil(required/repeat)))
        weight_data = np.repeat(rand_data, repeat)[:required].reshape(weight_shape).astype("float32")
        #weight_data = np.random.rand(np.prod(weight_shape)).reshape(weight_shape).astype("float32")
        x = tvm.relay.var("data", tvm.relay.TensorType(inp_shape), dtype="float32")
        y = tvm.relay.Constant(tvm.nd.array(weight_data))

        # compile with TVM
        expr = conv2d(
            data = x,
            weight= y,
            strides=int(config["strides"]),
            padding=int(config["pad"]),
            dilation=int(config["dilation"]),
            groups=int(config["grps"]),
            channels=int(config["channels"]),
            kernel_size=int(config["kernel"]),
            data_layout="NCHW",
            kernel_layout="OIHW",
            #data_layout="NHWC",
            #kernel_layout="HWIO",
        )
        
        mod = tvm.ir.IRModule.from_expr(expr)
        params = {}
        with tvm.transform.PassContext(opt_level=3):
            compiled_graph_lib = tvm.relay.build_module.build(mod, target_class, params=params)

        # measuring the execution time
        from tvm.contrib.debugger import debug_executor as graph_runtime

        ## building runtime
        debug_g_mod = graph_runtime.GraphModuleDebug(
            compiled_graph_lib["debug_create"]("default", dev),
            [dev],
            compiled_graph_lib.get_graph_json(),
            "."
        )
        
        #### NEW FUNCTION TO GET THE LAYER RUNTIME
        t_start  = time.monotonic()
        times = debug_g_mod.run_individual(10, 3, 1000)
        t_end = time.monotonic()
        
        try:
            for idx, node in enumerate(debug_g_mod.debug_datum._nodes_list):
                if layer_name in node["op"]:
                    layer_time = float(times[idx])*1000
                    actual_layer_name = node["op"]
            # print(1)
            print(layer_time, "ms")

            runs = int(max(1, np.ceil(time_min_res / (layer_time/1000))))

            # determine the noise
            iterations = 20
            powers = []
            gpu_utils = []
            mem_utils = []
            gpu_clocks = []
            sm_clocks = []
            mem_clocks = []
            alloc_memory = []
            profile_times = []

            # burn in 
            t_burn_in = 5
            t_start = time.monotonic()
            t_end = t_start + t_burn_in
            while time.monotonic() < t_end:
                # run debug runtime without profiling as burn in
                with suppress_stdout():
                    test_data = debug_g_mod.profile(collectors=[], data=tvm.nd.array(inp_data.astype("float32")), runs=runs)
            print(1)
            p_start = time.monotonic()
            for r in range(0, iterations):        
                # reload the Metric Collector due to issues with the PAPI backend
                data_collector = tvm.runtime.profiling.PAPIMetricCollector({dev: metrics}, component="nvml")    

                # run debug runtime with time measurements only
                #with suppress_stdout():
                test_data = debug_g_mod.profile(collectors=[data_collector], data=tvm.nd.array(inp_data.astype("float32")), runs=runs)
                pstate = nv.nvmlDeviceGetPowerState(handle)
                #print("\r",(r+1),"PState:", pstate, end="")

                # extract measurement of current run
                powers.append(test_data.calls[0][metrics[0]].value)
                gpu_utils.append(test_data.calls[0][metrics[1]].value)
                mem_utils.append(test_data.calls[0][metrics[2]].value)
                gpu_clocks.append(test_data.calls[0][metrics[3]].value)
                sm_clocks.append(test_data.calls[0][metrics[4]].value)
                mem_clocks.append(test_data.calls[0][metrics[5]].value)
                alloc_memory.append(test_data.calls[0][metrics[6]].value)
                profile_times.append(test_data.calls[0]["Duration (us)"].microseconds/1000000/runs) # in seconds
                #time.sleep(1)
            print(2)
            p_delta = time.monotonic() - p_start
            avg_power = np.mean(powers)/1000
            max_power = np.max(powers)/1000
            min_power = np.min(powers)/1000
            std_power = np.std(powers)/1000
            #calculate Z-Score
            z_scores = ((np.array(powers)/1000) - avg_power)/std_power
            cleaned_powers = []
            threshold = 0.25
            while len(cleaned_powers) < 3:
                cleaned_powers = []
                threshold += 0.05
                for idx, score in enumerate(z_scores):
                    if abs(score) < threshold:
                        cleaned_powers.append(powers[idx]/1000)
            print(3)
            layer_power = np.median(cleaned_powers)
            layer_memory = np.median(alloc_memory)/(1024**3)

            #print()
            measurements[state] = (layer_time, layer_power, layer_memory)
            print(state, (layer_time, layer_power, layer_memory))
            #print()
        except:
            print("failed measurement")
            #measurements[state] = (-1, -1, -1)

[03:26:35] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:65: Op #0 tvmgen_default_fused_nn_conv2d:
[03:26:35] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 0: 22.2101 us/iter
[03:26:35] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 1: 22.207 us/iter
[03:26:35] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 2: 22.2122 us/iter


failed measurement


[03:26:42] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:65: Op #0 tvmgen_default_fused_nn_conv2d:
[03:26:42] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 0: 21.8625 us/iter
[03:26:42] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 1: 21.8639 us/iter
[03:26:42] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 2: 21.8725 us/iter


failed measurement


[03:26:50] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:65: Op #0 tvmgen_default_fused_nn_conv2d:
[03:26:50] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 0: 363.566 us/iter
[03:26:50] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 1: 363.399 us/iter
[03:26:50] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 2: 363.454 us/iter


failed measurement


[03:26:57] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:65: Op #0 tvmgen_default_fused_nn_conv2d:
[03:26:57] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 0: 150.145 us/iter
[03:26:57] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 1: 150.122 us/iter
[03:26:57] /root/wang/tvm/src/runtime/graph_executor/debug/graph_executor_debug.cc:68: Iteration: 2: 150.237 us/iter


KeyboardInterrupt: 

In [18]:
debug_g_mod.debug_datum._nodes_list

[{'op': 'param',
  'name': 'data',
  'inputs': [],
  'attrs': {'T': 'type: float32'},
  'shape': [1, 3, 225, 225]},
 {'op': 'param',
  'name': 'p0',
  'inputs': [],
  'attrs': {'T': 'type: float32'},
  'shape': [32, 3, 4, 4]},
 {'op': 'tvmgen_default_fused_nn_conv2d',
  'name': 'tvmgen_default_fused_nn_conv2d',
  'attrs': {'num_outputs': '1',
   'num_inputs': '2',
   'flatten_data': '0',
   'func_name': 'tvmgen_default_fused_nn_conv2d',
   'out_layout': '',
   'data_layout': 'NCHW',
   'kernel_layout': 'OIHW',
   'hash': '84e7f8ea0e0f2896',
   'T': 'type: float32'},
  'inputs': ['data', 'p0'],
  'shape': [1, 32, 222, 222]}]