In [1]:
import tvm
import numpy as np

- Global declarations of environment

In [2]:
# Host-side target; handed to tvm.build as `target_host` further down.
tgt_host="llvm"
# Device target the kernel is compiled for. Change it to match the available
# GPU backend, e.g. cuda or opencl.
# NOTE(review): the GPU-specific cells in this notebook only test for
# tgt == "cuda"; other targets presumably need those cells adapted - confirm.
tgt="cuda"

## Describe the Computation

In [3]:
# Symbolic vector length: the computation is declared without fixing a size.
n = tvm.var("n")
# Two 1-D placeholder tensors of length n that act as the inputs.
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
# Element-wise sum over A's shape: C[i] = A[i] + B[i].
# The lambda parameter `i` is the axis that is split into i.outer / i.inner later.
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")

In [4]:
# Show which kind of object tvm.compute returned.
print(type(C))

<class 'tvm.tensor.Tensor'>


## Schedule the Computation

- A schedule is a set of transformations of the computation that transform the loops of the computation in the program.

- After we construct the schedule, by default the schedule computes C in a serial manner in a row-major order.

In [5]:
# Create a schedule from the operation that produces C.
s = tvm.create_schedule(C.op)

In [6]:
# Display the class of the schedule object.
type(s)

tvm.schedule.Schedule

In [7]:
# IPython help lookup on the schedule object (interactive inspection only).
s?

Type:        Schedule
String form: schedule(0x1d46a90)
File:        ~/opt/miniconda3/envs/tvm/lib/python3.6/site-packages/tvm-0.2.0-py3.6-linux-x86_64.egg/tvm/schedule.py
Docstring:   Schedule for all the stages.


- Use the split construct to split the first axis of C
    - This splits the original iteration axis into the product of two iterations (an outer and an inner one).

In [8]:
# Split C's first axis with factor 64; the call returns two iteration
# variables, kept here as bx (outer) and tx (inner).
bx, tx = s[C].split(C.op.axis[0], factor=64)

In [9]:
# Display the outer iteration variable produced by the split.
bx

iter_var(i.outer, )

In [10]:
# Display the inner iteration variable produced by the split.
tx

iter_var(i.inner, )

- Finally bind the iteration axis `bx` and `tx` to threads in the GPU compute grid. 
- These are GPU-specific constructs that allow us to generate code that runs on the GPU.

In [11]:
# Thread binding is applied only when the target is "cuda"; for any other
# value of `tgt` the schedule is left as it is.
if tgt == "cuda":
    # Outer axis -> blockIdx.x, inner axis -> threadIdx.x.
    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))

## Compilation

- After we have finished specifying the schedule, we can compile it into a TVM function.
- By default, TVM compiles into a type-erased function that can be called directly from the Python side.

In [12]:
# Compile the schedule into a callable module named "myadd".
# A, B and C form the argument list of the generated function, in that order.
fadd = tvm.build(
    sch=s,
    args=[A, B, C],
    target=tgt,
    target_host=tgt_host,
    name="myadd",
)

In [13]:
# Display the compiled module.
fadd

Module(llvm, 21b49b0)

In [14]:
# Display the class of the compiled module.
type(fadd)

tvm.module.Module

In [15]:
# IPython help lookup on the compiled module (interactive inspection only).
fadd?

Signature:   fadd(*args)
Type:        Module
String form: Module(llvm, 21b49b0)
File:        ~/opt/miniconda3/envs/tvm/lib/python3.6/site-packages/tvm-0.2.0-py3.6-linux-x86_64.egg/tvm/module.py
Docstring:   Module container of all TVM generated functions


## Run the function

- Create a gpu context
- Use `tvm.nd.array` to copy data to gpu
- fadd runs the actual computation
- Use `asnumpy()` to copy the gpu array to cpu so that we can use this to verify correctness. 

In [16]:
# Context for device 0 of the selected target; the arrays below are created on it.
ctx = tvm.context(dev_type=tgt, dev_id=0)

In [17]:
# Concrete problem size. It gets its own name so that the symbolic variable
# `n` used to declare A, B and C is not rebound to a plain Python int.
num_elements = 1024
# Seeded generator so the random inputs are reproducible across runs.
rng = np.random.RandomState(0)
# Inputs: uniform random vectors, cast to the placeholders' dtypes and
# created on the device context.
a = tvm.nd.array(rng.uniform(size=num_elements).astype(A.dtype), ctx=ctx)
b = tvm.nd.array(rng.uniform(size=num_elements).astype(B.dtype), ctx=ctx)
# Output buffer, zero-initialised so a kernel that writes nothing is detectable.
c = tvm.nd.array(np.zeros(num_elements, dtype=C.dtype), ctx=ctx)

In [18]:
%%time
# Run the compiled function on the device arrays; the result lands in `c`.
# NOTE(review): this is the first invocation, so the measured time may
# include one-time setup cost - confirm before quoting it as kernel time.
fadd(a, b, c)

CPU times: user 17 ms, sys: 274 µs, total: 17.2 ms
Wall time: 17.1 ms


In [19]:
# Verify the device result against the same sum computed with NumPy on the host.
np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

In [20]:
# Copy the result back to a NumPy array and display it.
c.asnumpy()

array([1.0944474 , 1.3696101 , 1.4258708 , ..., 1.0076509 , 1.4810429 ,
       0.42075178], dtype=float32)

In [21]:
# Display both inputs as NumPy arrays for comparison with the result.
a.asnumpy(), b.asnumpy()

(array([0.6563842 , 0.5160306 , 0.96769214, ..., 0.21516554, 0.57610613,
        0.11985361], dtype=float32),
 array([0.43806314, 0.8535795 , 0.45817864, ..., 0.7924853 , 0.9049368 ,
        0.30089816], dtype=float32))

In [22]:
# Display the TVM array itself; its repr includes the shape and the device.
a

<tvm.NDArray shape=(1024,), gpu(0)>
array([0.6563842 , 0.5160306 , 0.96769214, ..., 0.21516554, 0.57610613,
       0.11985361], dtype=float32)

## Inspect the Generated Code 

In [23]:
# Print the source code generated for the "myadd" function.
if tgt != "cuda":
    # Non-CUDA targets: print the source of the module itself.
    print(fadd.get_source())
else:
    # CUDA: the device code is the first imported module of the host module.
    dev_module = fadd.imported_modules[0]
    print("-----------------------GPU code--------------------------")
    print(dev_module.get_source())

-----------------------GPU code--------------------------
extern "C" __global__ void myadd__kernel0( float* __restrict__ C,  float* __restrict__ A,  float* __restrict__ B, int n) {
  if (((int)blockIdx.x) < (n / 64)) {
    C[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] = (A[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] + B[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))]);
  } else {
    if ((((int)blockIdx.x) * 64) < (n - ((int)threadIdx.x))) {
      C[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] = (A[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] + B[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))]);
    }
  }
}




## Save Compiled Module

- Besides runtime compilation, we can save the compiled modules into files and load them back later. This is called ahead-of-time compilation.

- The following code performs these steps:
    - It saves the compiled host module into an object file.
    - Then it saves the device module into a ptx file.
    - `cc.create_shared` calls the environment's compiler (GCC) to create a shared library.

In [24]:
from tvm.contrib import cc
from tvm.contrib import util

In [25]:
# Temporary directory that receives all exported artifacts.
temp = util.tempdir()
# Save the compiled host module as an object file.
fadd.save(file_name=temp.relpath("myadd.o"))

In [26]:
# Only for CUDA: save the device module (first imported module) as a ptx file.
if tgt == "cuda":
    fadd.imported_modules[0].save(temp.relpath("myadd.ptx"))

In [27]:
# Link the saved host object file into a shared library.
cc.create_shared(
    output=temp.relpath("myadd.so"),
    objects=[temp.relpath("myadd.o")],
)

In [28]:
# List the files written to the temporary directory so far.
print(temp.listdir())

['myadd.tvm_meta.json', 'myadd.o', 'myadd.ptx', 'myadd.so']


## Load Compiled Module

In [29]:
# Load the host shared library from the temporary directory.
fadd1 = tvm.module.load(temp.relpath("myadd.so"))
if tgt == "cuda":
    # The device code lives in a separate ptx file: load it and attach it
    # to the host module.
    fadd1_dev = tvm.module.load(temp.relpath("myadd.ptx"))
    fadd1.import_module(fadd1_dev)
# Start from a zeroed output buffer. `c` still holds the correct sum from the
# earlier run, so without this reset the check below would pass even if
# fadd1 wrote nothing.
c = tvm.nd.array(np.zeros(a.shape, dtype=C.dtype), ctx=ctx)
fadd1(a, b, c)
# Verify the reloaded function against the NumPy reference.
np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())


## Pack Everything into One Library

In the above example, we store the device and host code separately. TVM also supports exporting everything as one shared library. Under the hood, we pack the device modules into binary blobs and link them together with the host code.

In [30]:
# Export the module as a single shared library in the temporary directory.
fadd.export_library(temp.relpath("mypack.so"))

In [31]:
# Load the packed library; no separate device-module import is done here.
fadd2 = tvm.module.load(temp.relpath("mypack.so"))

In [32]:
# Zero the output first. `c` already contains the correct sum from the
# previous runs, so the correctness check that follows would otherwise pass
# even if fadd2 wrote nothing.
c = tvm.nd.array(np.zeros(a.shape, dtype=C.dtype), ctx=ctx)
# Run the function loaded from the packed library.
fadd2(a, b, c)

In [33]:
# Compare the contents of `c` against the NumPy reference sum.
np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

**NOTE**: Runtime API and Thread-Safety

The compiled modules of TVM do not depend on the TVM compiler. Instead, they only depend on a minimal runtime library. The TVM runtime library wraps the device drivers and provides thread-safe and device-agnostic calls into the compiled functions.

This means you can call the compiled TVM functions from any thread, on any GPU.

In [35]:
# Record the environment used to produce the outputs in this notebook.
# Requires the `version_information` IPython extension to be installed.
%load_ext version_information
%version_information numpy, tvm

The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information


Software,Version
Python,3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]
IPython,6.4.0
OS,Linux 4.13.0 41 generic x86_64 with debian stretch sid
numpy,1.14.3
tvm,0.2.0
Thu May 17 17:10:38 2018 CDT,Thu May 17 17:10:38 2018 CDT
