In [49]:
# Benchmark all the different variants of the DaCe convolution.
# Compare them against cuDNN and PyTorch's Conv2d.
# Cases: 1D, 2D and 3D convolution.
# Steps: verification and benchmarking.

In [50]:
from torch import nn
import click
import dace
import numpy as np
from pprint import pprint
import dace.libraries.blas
import matplotlib.pyplot as plt
import tensorflow as tf

In [51]:
def barplot(title, labels=False):
    """Plot the median runtime of every implementation recorded in ``TIMES``.

    Reads the module-level ``TIMES`` mapping, whose values must expose a
    ``timings`` sequence (for example the object returned by ``%timeit -o``).
    One bar is drawn per entry, with the standard deviation of the timings as
    the error bar.

    Args:
        title: Title placed above the plot.
        labels: When true, annotate every bar with its height.
    """
    # The 'numpy' baseline goes first, but only if it was actually measured;
    # indexing TIMES['numpy'] unconditionally raises KeyError otherwise.
    names = sorted(TIMES.keys() - {'numpy'})
    if 'numpy' in TIMES:
        names.insert(0, 'numpy')

    medians = [np.median(TIMES[name].timings) for name in names]
    spreads = [np.std(TIMES[name].timings) for name in names]
    # Entries whose name contains 'dace' are drawn in blue, all others in salmon.
    colors = ['#86add9' if 'dace' in name else 'salmon' for name in names]

    container = plt.bar(names, medians, yerr=spreads, color=colors)
    plt.ylabel('Runtime [s]')
    plt.xlabel('Implementation')
    plt.title(title)
    if labels:
        plt.gca().bar_label(container)

In [52]:
# Symbolic array extents, so the programs are not tied to one input size
inputimages, rows, cols, indepth, outdepth = (
    dace.symbol(name)
    for name in ('inputimages', 'rows', 'cols', 'indepth', 'outdepth')
)
chunklength = dace.symbol('chunklength', positive=True, integer=True, dtype=dace.int64)

# Side length of the square convolution window
w = dace.symbol('w')

# Element type, once for numpy and once for DaCe
np_dtype = np.float64
dtype = dace.float64

In [53]:
# Different convolution versions
# Fully parallel variant: every single multiplication of the convolution is its
# own map iteration. The products are written to a scratch array and summed in
# a separate reduction step.
#   Input  -- [inputimages, rows, cols, indepth]; only image index 0 is read.
#   kernel -- [w, w, indepth, outdepth]; indexed as kernel[m, n, d, od] (not flipped).
#   Output -- [inputimages, rows, cols, outdepth]; receives the sum of the
#             scratch array over its axis 3.
# NOTE(review): the map bounds use `w/2` on a symbol and appear to rely on it
# behaving as integer division -- confirm.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
#@dace.program(device=dace.DeviceType.GPU)
def convolutionallparallel(Input: dtype[inputimages, rows, cols, indepth], 
                           kernel: dtype[ w, w, indepth, outdepth], 
                           Output: dtype[inputimages, rows, cols, outdepth]):
    # Scratch array: axis 3 has one slot per (input channel, kernel row,
    # kernel column) product. It starts out as all zeros.
    tmp = np.zeros([inputimages, rows, cols, indepth*w*w, outdepth], dtype = Input.dtype)
    # i, j: output pixel; d: input channel; od: output channel;
    # m, n: offset inside the w x w window.
    for i,j,d,od,m,n in dace.map[w/2:rows-w/2, w/2:cols-w/2,0:indepth,0:outdepth, 0:w, 0:w]:
        with dace.tasklet:
            # One input element and one kernel weight in ...
            in_A << Input[0, i - w/2 + m, j - w/2 + n, d]
            in_B << kernel[m, n, d, od]
            # ... one product out, at the slot (d, m, n) flattened to a single index.
            out >> tmp[0, i, j, (d*(w*w)) + (m*w)+n, od]

            out = in_A * in_B

    # Sum the products along axis 3. Entries of tmp that the map never wrote
    # are still zero and contribute nothing to the sum.
    dace.reduce(lambda a,b:a+b, tmp, Output, axis=3, identity=0)


# Simple convolution: one map over every (pixel, channel, window offset)
# combination that accumulates straight into Output.
#   Input  -- [inputimages, rows, cols, indepth]; only image index 0 is read.
#   kernel -- [w, w, indepth, outdepth]; indexed as kernel[m, n, d, od] (not flipped).
#   Output -- [inputimages, rows, cols, outdepth]; cleared, then image index 0
#             is accumulated into for the pixels covered by the map range.
# NOTE(review): the map bounds use `w/2` on a symbol and appear to rely on it
# behaving as integer division -- confirm.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
#@dace.program(device=dace.DeviceType.GPU)
def convolutionsimple(Input: dtype[inputimages, rows, cols, indepth], 
                      kernel: dtype[ w, w, indepth, outdepth], 
                      Output: dtype[inputimages, rows, cols, outdepth]):
    # Clear the whole output first: the map below only ever adds to it.
    Output[:] = 0
    # i, j: output pixel; d: input channel; od: output channel;
    # m, n: offset inside the w x w window.
    for i,j,d,od,m,n in dace.map[w/2:rows-w/2, w/2:cols-w/2,0:indepth,0:outdepth, 0:w, 0:w]:
            # Variant with the kernel indices reversed, kept for reference:
            #Output[0, i, j, od] += Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[ w - 1 - m, w - 1 - n, d, od]
            # All (d, m, n) iterations of one (i, j, od) add into the same Output element.
            Output[0, i, j, od] += Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[ m, n, d, od]


# Reduction along input depth: an outer map over the output elements; for each
# one an inner map stores all of its products in a private buffer, and a
# sum-reduction collapses that buffer into the output value.
#   Input  -- [inputimages, rows, cols, indepth]; only image index 0 is read.
#   kernel -- [w, w, indepth, outdepth]; indexed as kernel[m, n, d, od] (not flipped).
#   Output -- [inputimages, rows, cols, outdepth]; only image index 0 and the
#             pixels inside the outer map range are assigned. This program does
#             not clear Output, so every other element keeps the value the
#             caller passed in.
# NOTE(review): the map bounds use `w/2` on a symbol and appear to rely on it
# behaving as integer division -- confirm.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
#@dace.program(device=dace.DeviceType.GPU)
def convolutionindepthreduce(Input: dtype[inputimages,rows, cols, indepth], 
                             kernel: dtype[ w, w, indepth, outdepth], 
                             Output: dtype[inputimages, rows, cols, outdepth]):
    # i, j: output pixel; od: output channel.
    for i, j, od in dace.map[w/2:rows-w/2, w/2:cols-w/2, 0:outdepth]:
        # Per-output-element buffer with one slot per (d, m, n) product.
        tmp = np.zeros([indepth*w*w], dtype = Input.dtype)
        # d: input channel; m, n: offset inside the w x w window.
        for d,m,n in dace.map[0:indepth,0:w,0:w]:
            with dace.tasklet:
                in_A << Input[0, i - w / 2 + m, j - w / 2 + n, d]
                # Variant with the kernel indices reversed, kept for reference:
                #in_B << kernel[w - 1 - m, w - 1 - n, d, od]
                in_B << kernel[m, n, d, od]
                # (d, m, n) flattened to a single index into the buffer.
                out >> tmp[(d*(w*w)) + (m*w)+n]

                out = in_A * in_B
        # Sum of all products for this output element.
        Output[0,i,j,od] = dace.reduce(lambda a, b: a + b, tmp, identity=0)


# Split into parallel and non parallel maps: an outer map over the output
# elements and an inner map that accumulates the products of one element into
# a one-element buffer.
#   Input  -- [inputimages, rows, cols, indepth]; only image index 0 is read.
#   kernel -- [w, w, indepth, outdepth]; indexed as kernel[m, n, d, od] (not flipped).
#   Output -- [inputimages, rows, cols, outdepth]; cleared, then image index 0
#             is assigned for the pixels covered by the outer map range.
# Unlike the other variants this one is declared without auto_optimize=True.
# TODO: Find why auto optimize is not working for simpleparallel
# NOTE(review): the map bounds use `w/2` on a symbol and appear to rely on it
# behaving as integer division -- confirm.
@dace.program(device=dace.DeviceType.GPU)
def convolutionsimpleparallel(Input: dtype[inputimages, rows, cols, indepth],
                              kernel: dtype[ w, w, indepth, outdepth],
                              Output: dtype[inputimages, rows, cols, outdepth]
                              ):
    # Clear the whole output; only part of it is assigned below.
    Output[:] = 0

    # i, j: output pixel; od: output channel.
    for i, j, od in dace.map[w/2:rows-w/2, w/2:cols-w/2, 0:outdepth]:
        # One-element accumulator for the current output element.
        tmp = np.zeros([1], dtype = Input.dtype)
        # NOTE(review): this loop is declared as a dace.map, yet every iteration
        # reads and reassigns the same `tmp`. That looks like a loop-carried
        # dependency; confirm DaCe handles it as intended. It may be related to
        # the auto-optimize TODO above.
        for d,m,n in dace.map[0:indepth,0:w,0:w]:
            # Variant with the kernel indices reversed, kept for reference:
            #tmp = tmp + Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[w - 1 - m, w - 1 - n, d, od]
            tmp = tmp + Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[m, n, d, od]
        Output[0,i,j,od] = tmp


In [54]:
# Problem size. Arrays are laid out as (images, rows, cols, channels) and the
# kernel as (W, W, in channels, out channels).
InputImages = 1
Rows = 9
Cols = 9
InChannels = 1
OutChannels = 2
W = 3
Stride = 1 # TODO: Code dace convolutions with stride

# Seeded generator, so that restarting the kernel and re-running the notebook
# produces exactly the same input data.
Seed = 0
rng = np.random.default_rng(Seed)

# Prepare data with numpy: uniform random values in [0, 1) for the input and
# the kernel, and an all-zero output buffer.
Input = rng.random((InputImages, Rows, Cols, InChannels)).astype(np_dtype)
kernel = rng.random((W, W, InChannels, OutChannels)).astype(np_dtype)
Output = np.zeros((InputImages, Rows, Cols, OutChannels), dtype=np_dtype)

In [55]:
# Verify the DaCe result against TensorFlow's conv2d on the same data.
convolutionsimple(Input, kernel, Output)

# convolutionsimple only accumulates into pixels at least W//2 away from the
# border, so cut that border off to match the shape produced by padding='VALID'.
half = W // 2
opdace = tf.convert_to_tensor(Output)
opdace = opdace[:, half:Rows-half, half:Cols-half, :]
input = tf.convert_to_tensor(Input)
filter = tf.convert_to_tensor(kernel)

op = tf.nn.conv2d(input, filter, strides=[1, 1, 1, 1], padding='VALID')

# Compare element by element with a floating-point tolerance. Summing the
# signed differences and testing for == 0 lets errors of opposite sign cancel
# and stays silent when the results differ; this raises AssertionError instead.
np.testing.assert_allclose(
    np.asarray(opdace), np.asarray(op), rtol=1e-9, atol=1e-12,
    err_msg="convolutionsimple does not match tf.nn.conv2d")
print("Verification successfull")

# TODO: Look at the benchmarking code and check how you can make this code better.
# TODO: Convert the jupyter notebook to a script.

Verification succesfull


In [56]:
# TIMES = {}

# TIMES['allparallel'] = %timeit -o convolutionallparallel(Input, kernel, Output)
# TIMES['simple'] = %timeit -o convolutionsimple(Input, kernel, Output)
# TIMES['indepthreduce'] = %timeit -o convolutionindepthreduce(Input, kernel, Output)
# TIMES['simpleparallel'] = %timeit -o convolutionsimpleparallel(Input, kernel, Output)