In [20]:
# Benchmark all the different variants of the DaCe convolution
# against cuDNN and TensorFlow's conv2d.
# 1DConv, 2DConv, 3DConv
# Verification and benchmarking

In [21]:
from torch import nn
import click
import dace
import numpy as np
from pprint import pprint
import dace.libraries.blas
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import glob
import os

In [22]:
# Functions for plotting

def _draw_bars(samples_by_name, title, labels, unit):
    """Draw one bar per implementation on the current matplotlib axes.

    Args:
        samples_by_name: mapping from implementation name to its timing
            sample(s); either a sequence of measurements or a single number.
        title: plot title.
        labels: when true, annotate every bar with its height.
        unit: unit string shown in the y-axis label.
    """
    names = sorted(samples_by_name)
    heights = [np.median(samples_by_name[name]) for name in names]
    spreads = [np.std(samples_by_name[name]) for name in names]
    # Names containing 'dace' are drawn in blue, everything else in salmon.
    colors = ['#86add9' if 'dace' in name else 'salmon' for name in names]

    bars = plt.bar(names, heights, yerr=spreads, color=colors)
    plt.ylabel(f'Runtime [{unit}]')
    plt.xlabel('Implementation')
    plt.title(title)
    if labels:
        plt.gca().bar_label(bars)


def barplot(title, labels=False, times=None, unit='s'):
    """Bar-plot median runtimes from result objects exposing a `.timings` list.

    Args:
        title: plot title.
        labels: when true, annotate every bar with its height.
        times: mapping name -> result object with a `.timings` attribute.
            Defaults to the notebook-global `TIMES`, looked up at call time.
        unit: unit shown on the y axis (default 's').
    """
    results = TIMES if times is None else times
    _draw_bars({name: result.timings for name, result in results.items()},
               title, labels, unit)


def barplotvalues(title, labels=False, times=None, unit='s'):
    """Bar-plot runtimes that are stored directly as numbers.

    Args:
        title: plot title.
        labels: when true, annotate every bar with its height.
        times: mapping name -> number (or sequence of numbers).
            Defaults to the notebook-global `TIMES`, looked up at call time.
        unit: unit shown on the y axis (default 's').
    """
    results = TIMES if times is None else times
    _draw_bars(dict(results), title, labels, unit)

In [23]:
# Symbolic sizes, so the DaCe programs below accept inputs of arbitrary shape
rows, cols, indepth, inputimages, outdepth = (
    dace.symbol(symbol_name)
    for symbol_name in ('rows', 'cols', 'indepth', 'inputimages', 'outdepth')
)
chunklength = dace.symbol('chunklength', dtype=dace.int64, integer=True, positive=True)

# Side length of the square w x w kernel
w = dace.symbol('w')

# Element type, once for NumPy buffers and once for DaCe signatures
np_dtype = np.float64
dtype = dace.float64

In [24]:
# Different convolution variants

# Simple convolution: a single flat six-dimensional map over output position
# (i, j), input channel d, output channel od and kernel offset (m, n). Every
# iteration accumulates one Input * kernel product into Output.
# Only image index 0 is read and written. Positions closer than w/2 to a
# row/column edge are not visited by the map and therefore keep the value 0.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
def dace_simple(Input: dtype[inputimages, rows, cols, indepth], 
                      kernel: dtype[ w, w, indepth, outdepth], 
                      Output: dtype[inputimages, rows, cols, outdepth]):
    # Clear the whole output first: the map below only accumulates with +=.
    Output[:] = 0
    for i,j,d,od,m,n in dace.map[w/2:rows-w/2, w/2:cols-w/2,0:indepth,0:outdepth, 0:w, 0:w]:
            # (i - w/2 + m, j - w/2 + n) walks the w x w input window centred on (i, j).
            Output[0, i, j, od] += Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[ m, n, d, od]




# Split into parallel and non parallel maps:
# the outer map iterates over output position (i, j) and output channel od,
# the inner map iterates over input channel d and kernel offset (m, n) and
# accumulates into a one-element temporary that is then stored to Output.
# Only image index 0 is read and written.
# TODO: Find why auto optimize is not working for simpleparallel
@dace.program(device=dace.DeviceType.GPU)
def dace_simpleparallel(Input: dtype[inputimages, rows, cols, indepth],
                              kernel: dtype[ w, w, indepth, outdepth],
                              Output: dtype[inputimages, rows, cols, outdepth]
                              ):
    # Border positions are never visited by the outer map, so zero everything up front.
    Output[:] = 0

    for i, j, od in dace.map[w/2:rows-w/2, w/2:cols-w/2, 0:outdepth]:
        # Per-output-element accumulator, same element type as Input.
        tmp = np.zeros([1], dtype = Input.dtype)
        for d,m,n in dace.map[0:indepth,0:w,0:w]:
            tmp = tmp + Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[m, n, d, od]
        Output[0,i,j,od] = tmp

        
# All-reduce variant: every individual Input * kernel product is first stored
# in `tmp` (axis 3 has one slot per (d, m, n) combination), then axis 3 is
# summed into Output with dace.reduce.
# Unlike the other two variants there is no explicit `Output[:] = 0` here.
# TODO: Function returns incorrect values for last rows. Debug the error.
# NOTE(review): the verification cell of this notebook reports this variant as incorrect.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
def dace_allreduce(Input: dtype[inputimages, rows, cols, indepth], 
                           kernel: dtype[ w, w, indepth, outdepth], 
                           Output: dtype[inputimages, rows, cols, outdepth]):
    # Zero-initialised, so entries the map never writes contribute 0 to the sum.
    tmp = np.zeros([inputimages, rows, cols, indepth*w*w, outdepth], dtype = Input.dtype)
    for i,j,d,od,m,n in dace.map[w/2:rows-w/2, w/2:cols-w/2,0:indepth,0:outdepth, 0:w, 0:w]:
        with dace.tasklet:
            in_A << Input[0, i - w/2 + m, j - w/2 + n, d]
            in_B << kernel[m, n, d, od]
            # Flattened (d, m, n) index along axis 3 of tmp.
            out >> tmp[0, i, j, (d*(w*w)) + (m*w)+n, od]
            out = in_A * in_B

    # Sum over the flattened (d, m, n) axis.
    dace.reduce(lambda a,b:a+b, tmp, Output, axis=3, identity=0)

In [25]:
# Problem size used for verification and benchmarking
InputImages = 1
Rows = 9
Cols = 9
InChannels = 1
OutChannels = 2
W = 3
Stride = 1 # TODO: Code dace convolutions with stride

# Prepare data with numpy.
# A fixed seed makes verification and benchmark inputs reproducible across
# kernel restarts; rng.random draws uniformly from [0, 1) like np.random.rand.
SEED = 0
rng = np.random.default_rng(SEED)
Input = rng.random((InputImages, Rows, Cols, InChannels)).astype(np_dtype)
kernel = rng.random((W, W, InChannels, OutChannels)).astype(np_dtype)
Output = np.zeros((InputImages, Rows, Cols, OutChannels), dtype=np_dtype)

In [26]:
# Verification
# Reference result: TensorFlow's conv2d on the same data with stride 1 and
# padding="VALID".
# NOTE: `input` and `filter` shadow the Python builtins of the same name. The
# names are kept because timetfgpu() reads them as globals.
input = tf.convert_to_tensor(Input)
filter = tf.convert_to_tensor(kernel)

op = tf.nn.conv2d(input, filter, strides=[1, 1, 1, 1], padding="VALID")


def verify_with_ref(dace_fun, dace_fun_name, refop, Input, kernel, Output,
                    rtol=1e-5, atol=1e-8):
    """Run a convolution and compare its interior against a reference result.

    The programs under test leave a border of half the kernel width untouched,
    while the reference only contains the interior, so the border is cropped
    from `Output` before comparing.

    Args:
        dace_fun: callable invoked as `dace_fun(Input, kernel, Output)`; it is
            expected to write its result into `Output`.
        dace_fun_name: name used in the printed verdict.
        refop: reference result (anything `np.asarray` accepts) with the shape
            of the cropped output.
        Input: input array passed through to `dace_fun`.
        kernel: kernel array; `kernel.shape[0]` is taken as the kernel width.
        Output: output buffer of shape (images, rows, cols, out_channels).
            It is zeroed before the call.
        rtol, atol: tolerances forwarded to `np.allclose`.

    Prints a success or failure line; returns None.
    """
    # Clear the shared buffer so results left behind by a previously verified
    # program cannot make a program that writes nothing look correct.
    Output[...] = 0
    dace_fun(Input, kernel, Output)

    # Derive the crop from the arguments instead of notebook globals.
    border = kernel.shape[0] // 2
    n_rows, n_cols = Output.shape[1], Output.shape[2]
    interior = np.asarray(Output)[:, border:n_rows - border, border:n_cols - border, :]
    reference = np.asarray(refop)

    # Element-wise comparison with tolerance: a signed sum of differences can
    # cancel to zero for wrong results, and exact float equality is too strict.
    matches = interior.shape == reference.shape and np.allclose(
        interior, reference, rtol=rtol, atol=atol)
    if matches:
        print(f"Verification successfull for {dace_fun_name}")
    else:
        print(f"!!! Incorrect convolution for {dace_fun_name}")

# Check every DaCe variant against the TensorFlow reference, in the same order as before
for _variant, _variant_name in (
        (dace_simple, 'dace_simple'),
        (dace_simpleparallel, 'dace_simpleparallel'),
        (dace_allreduce, 'dace_allreduce'),
):
    verify_with_ref(_variant, _variant_name, op, Input, kernel, Output)



Verification successfull for dace_simple




Verification successfull for dace_simpleparallel




!!! Incorrect convolution for dace_allreduce


In [None]:
# Place holder function for tf reference code for profiling.
def timetfgpu():
    """Run the TensorFlow reference convolution once, for use with %timeit.

    Reads the notebook globals `input` and `filter`. Returns None.
    """
    result = tf.nn.conv2d(input, filter, strides=[1, 1, 1, 1], padding='VALID')
    # An eager op may return before the GPU work has finished. Materialising
    # the result as a NumPy array waits for completion (and copies the result
    # to host memory), so the timed call covers the whole convolution.
    result.numpy()
    
# Warmup: run every implementation exactly once (-r 1 -n 1) before the real
# measurements, so one-off start-up cost is not part of the timings taken later
# (presumably DaCe compilation and TensorFlow GPU initialisation - confirm).
%timeit -r 1 -n 1 dace_allreduce(Input, kernel, Output)
%timeit -r 1 -n 1 dace_simple(Input, kernel, Output)
%timeit -r 1 -n 1 dace_simpleparallel(Input, kernel, Output)
%timeit -r 1 -n 1 timetfgpu()

In [None]:
# Naive profiling of baselines: time each implementation end-to-end from Python
# with %timeit. The -o flag makes %timeit return its result object; barplot()
# reads the `.timings` attribute of each entry stored in TIMES.
TIMES = {}
TIMES['dace_simple'] = %timeit -o dace_simple(Input, kernel, Output)
TIMES['dace_allreduce'] = %timeit -o dace_allreduce(Input, kernel, Output)
TIMES['dace_simpleparallel'] = %timeit -o dace_simpleparallel(Input, kernel, Output)
TIMES['tfgpu'] = %timeit -o timetfgpu()

barplot('Naive Baseline Profiling', labels=True)

In [None]:
# Dace profiling method, Returns median values in ms
def rundaceprofiling(dace_fun, dace_fun_name, reps):
    """Profile a DaCe program with DaCe's own profiler and return its median runtime.

    The program is called on the notebook globals `Input`, `kernel` and
    `Output` with the 'profiling' config enabled and 'treps' set to `reps`.
    The newest `results-*.csv` under `.dacecache/<dace_fun_name>/profiling/`
    is then read.

    Args:
        dace_fun: the DaCe program to call.
        dace_fun_name: name of the program's directory inside `.dacecache`.
        reps: value for the 'treps' (repetitions) configuration entry.

    Returns:
        Median of the report's 'Runtime_sec' column, multiplied by 1000 (ms).

    Raises:
        FileNotFoundError: if no profiling report matches the expected path.
    """
    # Temporarily enable profiling and set the number of repetitions; both
    # settings are restored when the block exits.
    with dace.config.set_temporary('profiling', value=True), \
            dace.config.set_temporary('treps', value=reps):
        dace_fun(Input, kernel, Output)

    report_pattern = f'.dacecache/{dace_fun_name}/profiling/results-*.csv'
    report_files = glob.glob(report_pattern)
    if not report_files:
        # max() on an empty list would only say "max() arg is an empty sequence".
        raise FileNotFoundError(
            f"No DaCe profiling report found matching '{report_pattern}'; "
            f"check that '{dace_fun_name}' is the program's cache directory name")

    latest_file = max(report_files, key=os.path.getctime)
    report = pd.read_csv(latest_file)
    return report['Runtime_sec'].median() * 1000

TIMES = {}
nrepeat = 100
TIMES['dace_simple'] = rundaceprofiling(dace_simple, 'dace_simple', nrepeat)
TIMES['dace_allreduce'] = rundaceprofiling(dace_allreduce, 'dace_allreduce', nrepeat)
TIMES['dace_simpleparallel'] = rundaceprofiling(dace_simpleparallel, 'dace_simpleparallel', nrepeat)
x = %timeit -o timetfgpu()
TIMES['tfgpu'] = np.median(x.timings) 

barplotvalues("Baselines", True)