In [1]:
# Benchmark the different DaCe convolution variants against each other
# and against reference implementations (cuDNN, PyTorch's Conv2d).
# Planned coverage: 1D, 2D and 3D convolution.
# The notebook covers both verification and benchmarking.

In [2]:
from torch import nn
import click
import dace
import numpy as np
from pprint import pprint
import dace.libraries.blas
import matplotlib.pyplot as plt
import tensorflow as tf


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def barplot(title, labels=False):
    """Draw a bar chart of the runtimes stored in the module-level ``TIMES`` dict.

    Every value in ``TIMES`` must expose a ``.timings`` sequence of runtimes in
    seconds (the result objects returned by ``%timeit -o`` do). Each bar shows
    the median of those timings, with their standard deviation as error bar.

    The ``'numpy'`` entry is drawn first when it exists; all other entries
    follow in alphabetical order. Entries whose key contains ``'dace'`` are
    drawn in blue, everything else in salmon.

    Args:
        title: Title placed above the plot.
        labels: If true, annotate every bar with its value.
    """
    # Put the baseline first only if it was actually measured; indexing
    # TIMES['numpy'] unconditionally raises KeyError when it was not.
    baseline = ['numpy'] if 'numpy' in TIMES else []
    x = baseline + sorted(TIMES.keys() - {'numpy'})
    bars = [np.median(TIMES[key].timings) for key in x]
    yerr = [np.std(TIMES[key].timings) for key in x]
    color = ['#86add9' if 'dace' in key else 'salmon' for key in x]

    p = plt.bar(x, bars, yerr=yerr, color=color)
    plt.ylabel('Runtime [s]')
    plt.xlabel('Implementation')
    plt.title(title)
    if labels:
        plt.gca().bar_label(p)

In [4]:
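# Trying out convolution: PyTorch nn.Conv2d reference, kept commented out
# (uncommenting it also needs `import torch` for torch.randn)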
# m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
# m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
# input = torch.randn(20, 16, 50, 100)
# output = m(input)

In [19]:
# Symbolic extents so the DaCe programs accept arbitrarily sized inputs:
# spatial size (rows, cols), channel counts (indepth, outdepth), batch size
# (inputimages) and the edge length of the square kernel (w).
rows, cols, indepth, inputimages, outdepth, w = (
    dace.symbol(symbol_name)
    for symbol_name in ('rows', 'cols', 'indepth', 'inputimages', 'outdepth', 'w')
)

# Strictly positive 64-bit integer symbol.
chunklength = dace.symbol('chunklength', dtype=dace.int64, integer=True, positive=True)

# Element type used on the DaCe side and its NumPy counterpart.
dtype, np_dtype = dace.float64, np.float64

In [58]:
# Different convolution versions
# Fully parallel variant: a single 6-D map computes every Input*kernel product
# into the scratch array `tmp`, and one dace.reduce then sums `tmp` along
# axis 3 (the flattened (d, m, n) index) into Output.
#
# Notes:
#   * Only batch index 0 is read and written inside the map; since `tmp` starts
#     as zeros, every other Output entry (other images and the spatial border
#     outside the map range) reduces to 0.
#   * The kernel is read at [w-1-m, w-1-n], i.e. mirrored in both spatial axes.
#   * `tmp` has inputimages*rows*cols*indepth*w*w*outdepth elements.
#   * NOTE(review): `w/2` is used as an integer offset; for odd w this
#     presumably truncates in the generated code - confirm.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
#@dace.program(device=dace.DeviceType.GPU)
def convolutionallparallel(Input: dtype[inputimages, rows, cols, indepth], 
                           kernel: dtype[ w, w, indepth, outdepth], 
                           Output: dtype[inputimages, rows, cols, outdepth]):
    # One slot per (image, row, col, flattened (d, m, n), output channel).
    tmp = np.zeros([inputimages, rows, cols, indepth*w*w, outdepth], dtype = Input.dtype)
    # (i, j): output pixel, d: input channel, od: output channel, (m, n): kernel tap.
    for i,j,d,od,m,n in dace.map[w/2:rows-w/2, w/2:cols-w/2,0:indepth,0:outdepth, 0:w, 0:w]:
        with dace.tasklet:
            in_A << Input[0, i - w/2 + m, j - w/2 + n, d]
            in_B << kernel[w-1-m, w-1-n, d, od]
            # (d, m, n) is flattened into a single index along axis 3.
            out >> tmp[0, i, j, (d*(w*w)) + (m*w)+n, od]

            out = in_A * in_B

    # Sum the products over the flattened (d, m, n) axis into Output.
    dace.reduce(lambda a,b:a+b, tmp, Output, axis=3, identity=0)


# Simple convolution: zero Output, then accumulate every Input*kernel product
# directly into Output[0, i, j, od] from a single 6-D map.
#
# Notes:
#   * Only batch index 0 is processed; the spatial border outside the map range
#     keeps the 0 written by `Output[:] = 0`.
#   * The kernel is read at [w-1-m, w-1-n], i.e. mirrored in both spatial axes.
#   * NOTE(review): several map iterations (all d, m, n) add into the same
#     Output element; this presumably relies on DaCe turning `+=` into a
#     conflict-resolved sum - confirm.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
#@dace.program(device=dace.DeviceType.GPU)
def convolutionsimple(Input: dtype[inputimages, rows, cols, indepth], 
                      kernel: dtype[ w, w, indepth, outdepth], 
                      Output: dtype[inputimages, rows, cols, outdepth]):
    Output[:] = 0
    # (i, j): output pixel, d: input channel, od: output channel, (m, n): kernel tap.
    for i,j,d,od,m,n in dace.map[w/2:rows-w/2, w/2:cols-w/2,0:indepth,0:outdepth, 0:w, 0:w]:
            Output[0, i, j, od] += Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[ w - 1 - m, w - 1 - n, d, od]


# Reduction along input depth: the outer map runs over output elements
# (i, j, od); for each one an inner map stores the indepth*w*w products in a
# local `tmp` vector, which dace.reduce then sums into Output[0, i, j, od].
#
# Notes:
#   * Only batch index 0 is processed.
#   * Unlike the other variants this one never zeroes Output, so entries
#     outside the map range keep whatever the caller passed in.
#   * The kernel is read at [w-1-m, w-1-n], i.e. mirrored in both spatial axes.
@dace.program(auto_optimize=True, device=dace.DeviceType.GPU)
#@dace.program(device=dace.DeviceType.GPU)
def convolutionindepthreduce(Input: dtype[inputimages,rows, cols, indepth], 
                             kernel: dtype[ w, w, indepth, outdepth], 
                             Output: dtype[inputimages, rows, cols, outdepth]):
    for i, j, od in dace.map[w/2:rows-w/2, w/2:cols-w/2, 0:outdepth]:
        # Scratch vector holding one product per (d, m, n) for this output element.
        tmp = np.zeros([indepth*w*w], dtype = Input.dtype)
        for d,m,n in dace.map[0:indepth,0:w,0:w]:
            with dace.tasklet:
                in_A << Input[0, i - w / 2 + m, j - w / 2 + n, d]
                in_B << kernel[w - 1 - m, w - 1 - n, d, od]
                # (d, m, n) is flattened into a single index.
                out >> tmp[(d*(w*w)) + (m*w)+n]

                out = in_A * in_B
        # Sum all products for this output element.
        Output[0,i,j,od] = dace.reduce(lambda a, b: a + b, tmp, identity=0)


# Split into parallel and non parallel maps: the outer map runs over output
# elements (i, j, od); the inner loop over (d, m, n) accumulates the products
# into a one-element `tmp`, which is then stored in Output[0, i, j, od].
#
# Notes:
#   * Only batch index 0 is processed; the spatial border outside the map range
#     keeps the 0 written by `Output[:] = 0`.
#   * The kernel is read at [w-1-m, w-1-n], i.e. mirrored in both spatial axes.
#   * This is the only variant declared without auto_optimize=True.
#   * NOTE(review): the inner loop is also a dace.map although every iteration
#     reads and rewrites `tmp`; this may be related to the TODO below - confirm.
# TODO: Find why auto optimize is not working for simpleparallel
@dace.program(device=dace.DeviceType.GPU)
def convolutionsimpleparallel(Input: dtype[inputimages, rows, cols, indepth],
                              kernel: dtype[ w, w, indepth, outdepth],
                              Output: dtype[inputimages, rows, cols, outdepth]
                              ):
    Output[:] = 0

    for i, j, od in dace.map[w/2:rows-w/2, w/2:cols-w/2, 0:outdepth]:
        # Running sum for this output element.
        tmp = np.zeros([1], dtype = Input.dtype)
        for d,m,n in dace.map[0:indepth,0:w,0:w]:
            tmp = tmp + Input[0, i - w / 2 + m, j - w / 2 + n, d] * kernel[w - 1 - m, w - 1 - n, d, od]
        Output[0,i,j,od] = tmp


In [59]:
# Problem size of the small verification run.
InputImages = 1
Rows = 9
Cols = 9
InChannels = 1
OutChannels = 2
W = 3
Stride = 1 # TODO: Code dace convolutions with stride

# Seed NumPy's global generator so that re-running the notebook reproduces the
# same inputs (and therefore the same printed verification tensors).
SEED = 0
np.random.seed(SEED)

# Prepare data with numpy. Layouts match the DaCe program signatures:
#   Input  : (images, rows, cols, input channels)
#   kernel : (W, W, input channels, output channels)
#   Output : (images, rows, cols, output channels), zero-initialised
Input = np.random.rand(InputImages, Rows, Cols, InChannels).astype(np_dtype)
kernel = np.random.rand(W, W, InChannels, OutChannels).astype(np_dtype)
Output = np.zeros((InputImages, Rows, Cols, OutChannels), dtype=np_dtype)

In [None]:
# Runtime measurements, one %timeit result object per DaCe variant.
# barplot() reads the `.timings` attribute of each stored result.
TIMES = {}

# NOTE(review): all four variants write into the same `Output` array, so after
# this cell `Output` holds the result of the last call only.
# NOTE(review): none of these keys contains 'dace', so barplot() colours every
# bar with its non-DaCe colour.
# NOTE(review): the first call of each program presumably includes DaCe
# compilation time - confirm whether a warm-up call is needed.
TIMES['allparallel'] = %timeit -o convolutionallparallel(Input, kernel, Output)
TIMES['simple'] = %timeit -o convolutionsimple(Input, kernel, Output)
TIMES['indepthreduce'] = %timeit -o convolutionindepthreduce(Input, kernel, Output)
TIMES['simpleparallel'] = %timeit -o convolutionsimpleparallel(Input, kernel, Output)



In [47]:
# TensorFlow reference for the DaCe kernels.
# tf.nn.conv2d expects an input tensor of shape
# batch_shape + [in_height, in_width, in_channels] and a filter / kernel tensor
# of shape [filter_height, filter_width, in_channels, out_channels].
# NOTE(review): `input` and `filter` shadow the Python builtins of the same
# name for the rest of the kernel session.
input, filter = (tf.convert_to_tensor(array) for array in (Input, kernel))

# DaCe result wrapped as a tensor so both results print in the same format.
opdace = tf.convert_to_tensor(Output)

# 'VALID' padding yields only the positions where the kernel fits entirely,
# whereas the DaCe programs write into an Output of the full input size.
# NOTE(review): the DaCe programs index the kernel mirrored ([w-1-m, w-1-n]);
# check whether tf.nn.conv2d applies the filter in the same orientation before
# comparing values.
op = tf.nn.conv2d(input, filter, strides=[1, 1, 1, 1], padding='VALID')

# TODO: Verify dace output with op.
# TODO: Look at the benchmarking code and check how you can make this code better.
# TODO: Convert the jupyter notebook to a script.

# refConv = nn.Conv2d(in_channels=InChannels, out_channels=OutChannels, kernel_size=W, stride=1)
# TODO: Figure out how to call 2D convolution in the same format.

In [48]:
# Verification: show the TensorFlow reference result for a visual comparison
# with the DaCe output printed further down.
print(op)

tf.Tensor(
[[[[2.00860497 1.61838472]
   [1.74412942 1.27407995]
   [1.84183245 1.36584612]
   [2.11589779 1.50862774]
   [1.62145124 1.73253209]
   [1.21612589 1.01742867]
   [1.41722904 1.31487633]]

  [[2.29046625 1.7421261 ]
   [1.45522289 0.94767536]
   [2.06453161 1.66854083]
   [1.88217863 1.82861582]
   [1.8113171  1.30107098]
   [1.85039381 1.21613048]
   [2.03543844 1.47276817]]

  [[2.502032   1.70942523]
   [1.61625817 1.49564158]
   [2.04833937 2.01633416]
   [2.05144994 2.05674971]
   [2.41800551 1.30399764]
   [1.32107044 1.33600429]
   [2.08616479 2.13142043]]

  [[3.06516078 2.11755522]
   [2.27745266 2.226651  ]
   [2.85256219 1.96124932]
   [2.33858708 1.71852988]
   [2.0173419  2.04252604]
   [2.28737377 1.67202275]
   [2.34861269 2.13661163]]

  [[2.02203427 2.00577376]
   [2.55221594 2.2342773 ]
   [2.93716282 1.81138556]
   [2.00827084 1.88063236]
   [1.97798295 1.58341122]
   [2.17384518 1.56495373]
   [1.92584004 1.38440471]]

  [[2.24222304 1.56281551]
   [2.9

In [50]:
#print(Output)

In [51]:
# Show the DaCe result. `opdace` was created from `Output` in the TensorFlow
# reference cell. NOTE(review): re-run that cell after the DaCe kernels so
# this reflects their latest output - confirm whether the conversion copies.
print(opdace)

tf.Tensor(
[[[[0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]
   [0.         0.        ]]

  [[0.         0.        ]
   [2.09000829 1.3715652 ]
   [1.1328043  1.27737368]
   [1.85538061 1.39029791]
   [1.99338967 1.4203543 ]
   [1.70547395 1.57285532]
   [1.25847695 1.01554551]
   [1.24998661 1.50468344]
   [0.         0.        ]]

  [[0.         0.        ]
   [2.38846561 1.79449533]
   [1.65409753 1.08800423]
   [2.71168631 1.58391867]
   [1.88110361 2.04440605]
   [1.75536367 1.27033845]
   [1.65609647 1.15036212]
   [2.46868225 1.4303703 ]
   [0.         0.        ]]

  [[0.         0.        ]
   [2.65170448 1.79061805]
   [1.84043673 1.66458651]
   [2.08749182 2.02371857]
   [2.49787779 1.85729677]
   [2.04645178 1.77924027]
   [2.2772024  1.28334171]
   [2.70250859 2.15014235]
   [0.         0.        ]]

  [[0.   