This notebook presents the API of `hybridcuda`.

## Initializing the CUDA context in the kernel runner process

In [9]:
import inspect
import hybridcuda
# Initialize the CUDA context in this (kernel runner) process and display the returned value.
# NOTE(review): the value shown in this run is 0, presumably a success status — confirm.
cures = hybridcuda.initcuda()
cures

0

In [10]:
class hybridkernel:
    """Callable wrapper that compiles a Python function with hybridcuda and launches it.

    Construction runs the three pipeline stages (``processfunction``,
    ``cudajitcode``, ``ptxlinkcode``) and keeps the resulting dictionary in
    ``self.hc``. Subscripting stores a launch configuration and returns the
    kernel itself, so a launch reads ``kernel[grid, block](arg0, arg1, ...)``.
    The configuration is sticky: it stays in effect for later calls until it
    is overridden by another subscript.
    """

    # Launch-configuration defaults; __getitem__ overrides them per instance.
    gridDimX = 1
    blockDimX = 1
    shared = 0
    stream = 0

    def __init__(self, func):
        """Run ``func`` through the hybridcuda pipeline and store the result in ``self.hc``."""
        self.hc = hybridcuda.processfunction(func)
        self.hc = hybridcuda.cudajitcode(self.hc)
        self.hc = hybridcuda.ptxlinkcode(self.hc)

    def __call__(self, *args):
        """Launch the kernel with the current configuration, forwarding ``args`` to it.

        Only the first grid and block values are configurable; the other two
        of each triple are fixed to 1. The value returned by
        ``hybridcuda.launch`` replaces ``self.hc``. Nothing is returned.
        """
        self.hc = hybridcuda.launch(
            self.hc,
            self.gridDimX, 1, 1,
            self.blockDimX, 1, 1,
            self.shared, self.stream,
            *args)

    def __getitem__(self, args):
        """Set the launch configuration and return ``self`` for call chaining.

        ``args`` is either a single value (the grid size) or a tuple of up to
        four values ``(grid, block, shared, stream)``. Positions that are not
        supplied keep their current value; elements beyond the fourth are
        ignored.
        """
        config = args if isinstance(args, tuple) else (args,)
        # Write to the attributes that __call__ actually reads. Storing into
        # separate `grid` / `block` attributes would leave every launch at 1x1.
        for name, value in zip(("gridDimX", "blockDimX", "shared", "stream"), config):
            setattr(self, name, value)
        return self

#decorator definition
def hybridfunction(func):
    """Decorator that replaces ``func`` with a ``hybridkernel`` built from it.

    The decorated name is therefore bound to the kernel wrapper object
    rather than to the original Python function.
    """
    kernel = hybridkernel(func)
    return kernel

## 1. Hello World sample

Kernel definition

In [23]:
# Element-wise addition kernel: for each i in [0, N) it stores a[i] + b[i] into c[i].
# The decorator passes this function to hybridkernel, so the name `mykernel`
# ends up bound to the kernel wrapper object, not to a plain Python function.
# The loop body uses no thread index; the full range is walked by the loop itself.
@hybridfunction
def mykernel(N : int, a,b,c):
    for i in range(0,N):
        c[i] = a[i] + b[i]

Running kernel on GPU

In [26]:
## prepare some data
import numpy as np
N = 10
# two inputs filled with ones and an output buffer filled with zeros, all of length N
a = np.ones(N)
b = np.ones(N)
c = np.zeros(N)

## launch kernel
# The subscript [1,1] is the launch configuration handled by hybridkernel.__getitem__;
# the call that follows forwards N, a, b, c to hybridcuda.launch.
mykernel[1,1](N,a,b,c)
# display c (the recorded output shows every element equal to 2)
c


array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

## 2. Walkthrough of a simple example (without syntactic sugar)

### Function to be transpiled

In [11]:
# Plain (undecorated) function used as the transpiler input in the steps below:
# for each i in [0, N) it stores a[i] + b[i] into c[i].
def func(N : int, a,b,c):
    for i in range(0,N):
        c[i] = a[i] + b[i]

### 1. Generate cuda source code 

In [12]:
# Stage 1: hand the Python function to hybridcuda. The result (displayed below) is a
# dictionary holding the generated CUDA source, the kernel name and the argument types.
hc = hybridcuda.processfunction(func)
hc

{'version': 42,
 'cuda': '// GENERATED BY HYBRIDIZER/PYTHON \n\n#include <hybpython.cuh>\n\n// declare all functions -- prototypes \n\n// declare all functions -- implementations \n\n// declare global function \nextern "C" \n__global__ void func (hybpython::pylong N , hybpython::thing a , hybpython::thing b , hybpython::thing c) \n{\n\tfor ( hybpython::pylong i = hybpython::cast <hybpython::pylong,int>::from (hybpython::pylong (0)) ; i < (N) ; ++ i  ) \n\t{\n\thybpython::set_index < hybpython::thing , hybpython::pylong , hybpython::thing > ( c , i , hybpython::binary_op <hybpython::binaryops::add <hybpython::thing,hybpython::thing,hybpython::thing>>::eval <> ( hybpython::get_index < hybpython::thing , hybpython::pylong > ( a , i ) , hybpython::get_index < hybpython::thing , hybpython::pylong > ( b , i ) ) ) ;\n\t}\n}\n\n',
 'kernelname': 'func',
 'argtypes': ['hybpython::pylong',
  'hybpython::thing',
  'hybpython::thing',
  'hybpython::thing']}

Function call returns a dictionary with the following entries:
* `version`: a version number
* `cuda`: string with the cuda source code of the generated module
* `kernelname`: the kernel function name - *that is the exported symbol of the kernel function*
* `argtypes`: contains the argument types in CUDA format

### 2. Generate PTX from CUDA source

In [17]:
# Stage 2: feed the stage-1 dictionary to cudajitcode and rebind hc to the result.
# The key list below shows two new entries: 'ptx' and 'nvrtclog'.
hc = hybridcuda.cudajitcode(hc)
hc.keys()

dict_keys(['version', 'cuda', 'kernelname', 'argtypes', 'ptx', 'nvrtclog'])

Two entries are added: 
* `ptx`: holds a string with the ptx assembly code
* `nvrtclog`: holds the log from the compilation

### 3. Generate CUBIN from PTX

In [18]:
# Stage 3: feed the stage-2 dictionary to ptxlinkcode and rebind hc to the result.
# The key list below shows one new entry: 'cubin'.
hc = hybridcuda.ptxlinkcode(hc)
hc.keys()

dict_keys(['version', 'cuda', 'kernelname', 'argtypes', 'ptx', 'nvrtclog', 'cubin'])

A `cubin` entry is added; it is a memory view.

### 4. Launching the kernel

In [19]:
## prepare some data
import numpy as np
N = 10
# two inputs filled with ones and an output buffer filled with zeros, all of length N
a = np.ones(N)
b = np.ones(N)
c = np.zeros(N)

## launch kernel
# Positional layout, read against hybridkernel.__call__ earlier in this notebook:
#   hc, 1,1,1 (grid), 1,1,1 (block), 0 (shared), 0 (stream), then the kernel arguments.
# NOTE(review): the 2nd/3rd value of each triple are presumably the Y/Z dimensions — confirm.
hc = hybridcuda.launch(hc, 1,1,1, 1,1,1, 0,0, N,a,b,c)
# display c (the recorded output shows every element equal to 2)
c


array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [21]:
# Arguments of the launch function
# The docstring printed below does not list them. As called in this notebook the
# positional layout is:
#   launch(hc, <3 grid values>, <3 block values>, shared, stream, *kernel_args)
# NOTE(review): layout inferred from the call sites here, not from library docs — confirm.
hybridcuda.launch.__doc__

'Launch kernel.'

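## DRAFT-BOOK

Scratch cells kept from development. Their execution counts are not sequential, so some recorded outputs below do not match the definitions shown earlier in the notebook.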

In [13]:
import os
os.getpid()

5968

In [5]:
hk = hybridkernel(func)

In [6]:
dir(hk)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'blockDimX',
 'gridDimX',
 'hc',
 'shared',
 'stream']

In [7]:
hk.hc

{'version': 42,
 'cuda': '// GENERATED BY HYBRIDIZER/PYTHON \n\n#include <hybpython.cuh>\n\n// declare all functions -- prototypes \n\n// declare all functions -- implementations \n\n// declare global function \nextern "C" \n__global__ void func (hybpython::pylong N , hybpython::thing a , hybpython::thing b , hybpython::thing c) \n{\n\tfor ( hybpython::pylong i = hybpython::cast <hybpython::pylong,int>::from (hybpython::pylong (0)) ; i < (N) ; ++ i  ) \n\t{\n\thybpython::set_index < hybpython::thing , hybpython::pylong , hybpython::thing > ( c , i , hybpython::binary_op <hybpython::binaryops::add <hybpython::thing,hybpython::thing,hybpython::thing>>::eval <> ( hybpython::get_index < hybpython::thing , hybpython::pylong > ( a , i ) , hybpython::get_index < hybpython::thing , hybpython::pylong > ( b , i ) ) ) ;\n\t}\n}\n\n',
 'kernelname': 'func',
 'argtypes': ['hybpython::pylong',
  'hybpython::thing',
  'hybpython::thing',
  'hybpython::thing'],
 'ptx': '//\n// Generated by NVIDIA NVVM

In [3]:
import numpy as np
N = 10
a = np.ones(N)
b = np.ones(N)
c = np.zeros(N)

In [9]:
hk[1,1](N,a,b,c)

In [4]:
c

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [5]:
def hybridfunction(func):
    """Decorator: wrap ``func`` in a ``hybridkernel`` and return the wrapper."""
    wrapped = hybridkernel(func)
    return wrapped

# Same element-wise addition as the earlier kernels, declared through the decorator:
# for each i in [0, N) it stores a[i] + b[i] into c[i].
@hybridfunction
def func2(N : int, a,b,c):
    for i in range(0,N):
        c[i] = a[i] + b[i]

In [6]:
func2[1,1](N,a,b,c)

In [7]:
c

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [6]:
dir(func)

['__annotations__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [8]:
# Unpack a tuple into positional arguments through the explicit __call__ attribute.
# NOTE(review): the recorded result 27 (= 12 + 15) does not fit the four-argument
# `func` defined earlier in this notebook, so `func` was presumably bound to a
# different two-argument function when this cell ran — confirm or re-run.
mytuple = 12,15
func.__call__(*mytuple)

27

In [9]:
def func2(*a):
    """Forward every positional argument to ``func`` and return its result.

    Scratch experiment for ``*args`` forwarding. Defining it rebinds the name
    ``func2``, which an earlier cell used for a decorated kernel.
    """
    result = func(*a)
    return result

In [10]:
func2(12,15)

27

In [8]:
import os
os.getpid()

20652

In [58]:
class itemgetter:
    """Scratch prototype of the ``obj[grid, block, shared, stream]`` subscript syntax.

    Subscripting stores the supplied values on the instance and returns the
    instance, so attribute access can be chained: ``ig[15, 12].block``.
    Values are sticky: positions not supplied keep whatever was set before.
    """

    # Defaults for all four settings, so each attribute is readable before
    # any subscript has supplied it.
    grid = 1
    block = 1
    shared = 0
    stream = 0

    def __getitem__(self, args):
        """Store up to four settings from ``args`` and return ``self``.

        ``args`` is either a single value (the grid) or a tuple read as
        ``(grid, block, shared, stream)``; elements beyond the fourth are ignored.
        """
        config = args if isinstance(args, tuple) else (args,)
        for name, value in zip(("grid", "block", "shared", "stream"), config):
            setattr(self, name, value)
        return self

In [59]:
ig = itemgetter()

In [60]:
ig[12].grid

12

In [61]:
ig[12].block

1

In [63]:
ig[15,12].block

len (args) = 2


12