<a href="https://colab.research.google.com/github/anshulsawant/llm-systems/blob/main/CUDA_simulator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import numba
import os
# os.environ['NUMBA_ENABLE_CUDASIM'] = '1'
import numpy as np
from numba import cuda

In [42]:
@cuda.jit
def vec_add(A, B, n, out):
    """Write the element-wise sum of A and B into out, one element per thread.

    Args:
        A: first input array, indexed by a single integer
        B: second input array, indexed by a single integer
        n: number of elements to process
        out: output array; out[i] receives A[i] + B[i] for each i < n

    Returns:
        None (fills in out)
    """
    # Global thread index: block offset plus the thread's position in its block.
    idx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # The launch may contain more threads than elements; surplus threads do nothing.
    if idx < n:
        out[idx] = A[idx] + B[idx]

In [43]:
# Host-side data for the vec_add demo: A = 0..n-1, B = all ones, and C is a
# zero-filled output buffer with the same shape and dtype as A.
n = 14
A = np.arange(n)
B, C = np.ones_like(A), np.zeros_like(A)

In [44]:
# Launch configuration: derive the number of blocks from n so that
# griddim * blockdim always covers every element (ceiling division).
# A hard-coded grid would silently leave the tail of C unwritten if n grew.
blockdim = 4
griddim = max(1, (n + blockdim - 1) // blockdim)  # at least one block
vec_add[griddim, blockdim](A, B, n, C)
C

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [None]:
def index_to_position(index, strides, num_dims):
    """Map a multidimensional tensor index to its flat position in storage.

    The position is the dot product of the first num_dims entries of
    index and strides.

    Args:
        index: index tuple of ints
        strides: tensor strides
        num_dims: number of dimensions in the tensor, e.g. shape/strides of
            [2, 3, 4] has 3 dimensions

    Returns:
        int - position in storage (0 when num_dims is 0)
    """
    return sum(index[dim] * strides[dim] for dim in range(num_dims))

def to_index(ordinal, shape, out_index, num_dims):
    """Convert an ordinal to a multidimensional index in the given shape.

    Enumerating ordinals 0 ... size - 1 of a tensor produces every index
    exactly once, with the last dimension varying fastest. It may not be the
    inverse of index_to_position (that depends on the strides in use).

    Args:
        ordinal: ordinal position to convert
        shape: tensor shape
        out_index: output buffer that receives the index for this ordinal
        num_dims: number of dimensions in the tensor

    Returns:
        None (fills in out_index)
    """
    remaining = ordinal
    for dim in reversed(range(num_dims)):
        # Integer divmod: the remainder is this dimension's coordinate and the
        # quotient carries over to the next (slower-varying) dimension.
        # True division (`/`) would turn the carry into a float and corrupt
        # every subsequent coordinate.
        remaining, out_index[dim] = divmod(remaining, shape[dim])

def broadcast_index(big_index, big_shape, shape, out_index, num_dims_big, num_dims):
    """Convert an index into a bigger tensor to an index into a smaller one.

    Follows broadcasting rules: the smaller shape is aligned with the
    trailing dimensions of the bigger one. Extra leading dimensions of
    big_index are dropped, and any dimension of size 1 (or less) in shape
    maps to index 0.

    Args:
        big_index: multidimensional index of bigger tensor
        big_shape: tensor shape of bigger tensor (not read by this function)
        shape: tensor shape of smaller tensor
        out_index: output buffer that receives the index of the smaller tensor
        num_dims_big: number of dimensions in bigger tensor
        num_dims: number of dimensions in smaller tensor

    Returns:
        None (fills in out_index)
    """
    # Number of leading dimensions present only in the bigger tensor.
    lead = num_dims_big - num_dims
    for dim in range(num_dims):
        out_index[dim] = big_index[dim + lead] if shape[dim] > 1 else 0
