In [20]:
from __future__ import annotations
from typing import Optional, Tuple, Union, Any, Dict, Callable, Type, List, ClassVar
from enum import Enum, auto
from abc import ABC
import numpy as np

import os
os.environ['DEBUG'] = '1'

In [7]:
from tinygrad.ops import Device
Device.DEFAULT = "CLANG"

In [8]:
from tinygrad.tensor import Tensor

In [10]:
a = Tensor([2])
b = Tensor([3])

res = a + b
print(f"{a.numpy()} + {b.numpy()} = {res.numpy()}")
assert res.numpy()[0] == 5

[2.] + [3.] = [5.]


In [None]:
import tinygrad.mlops as mlops

In [None]:
# okay so tensor class time
class Tensor:
    # some class attributes
    grad: Optional[Tensor] # grad is a tensor
    requires_grad: Optional[bool]

    # this is the graph for the autograd engine
    _ctx: Optional[Function] # pretty sure we'll define this later

    # this is where the data and other properties lice
    lazydata: LazyBuffer

    # high level ope (hlops) are defined on this class. ex: relu
    def relu(self): return self.maximum(0)

    # log is an mlp, this is the wrapper function in Tensor
    def log(self): return mlops.Log.apply(self)

# all the definitions of the derivatives are subclasses of Function (like mlops.Log)
# there's only 18 mlops for derivatives for everything (in tinygrad/mlops.py)
# read mlops.py and tensor.py --- they seem to be the core. maybe read mlops first?

# heres autodiff
class Function:
    # exampole forward and backward methods
    def forward(self, x:LazyBuffer) -> LazyBuffer: pass
    def backward(self, x:LazyBuffer) -> LazyBuffer: pass

In [13]:
from tinygrad.helpers import DType

# this is where the properties live that I tought were part of the tensor class
# LazyBuff is like a Tensor without derivatives, at the mlop layer

class LazyBuffer:
    # these three define the "type" of the buffer and are returned as Tensor properties
    device: str
    shape: Tuple[int, ...]
    dtype: DType
    # som eclas attrs


    # a ShapeTracker tracks reshapes and permutes
    # all MovementOps are zero copy 
    # the ShapeTracker specifies how the data in the RawBuffer matches to the shape
    st: ShapeTracker

    # if the LazyBuffer is realized, it has a RawBuffer
    realized: Optional[RawBuffer]

    # if the lazybuffer is unrealized, it has a LazyOp
    # this is the comp needed to realize the LazyBuffer
    op: Optional[LazyOp]

# LazyOp
# in a tree they form an AST for a single GPU kernel
class LazyOp:
    op: Op # they type of rthe compute
    src: Tuple[Union[LazyOp, LazyBuffer], ...] # the sources
    arg: Optional[Any] = None # the arguments

# theres currently 28 ops you have to implement for an accelerator
class UnaryOps(Enum): NOOP = auto(); EXP2 = auto(); LOG2 = auto(); CAST = auto(); SIN = auto()
class BinaryOps(Enum):   ADD = auto();  SUB = auto();  MUL = auto();  DIV = auto();  CMPLT = auto(); MAX = auto()
class ReduceOps(Enum):   SUM = auto();  MAX = auto()
class MovementOps(Enum): RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); STRIDE = auto()
class TernaryOps(Enum):  MULACC = auto(); WHERE = auto()
class LoadOps(Enum):     EMPTY = auto(); RAND = auto(); CONST = auto(); FROM = auto(); CONTIGUOUS = auto(); CUSTOM = auto()
# If you have a compiledbuffer (devicebuffer)
# you dont have to implement the MovementOps
# as they are handles bu the ShapeTracker

Op = Union[UnaryOps, BinaryOps, ReduceOps, MovementOps, TernaryOps, LoadOps]

# most of tinygrad/lazy.py is concerned with fusing Ops into LasyOps ASTs that map to GPU kernels
# it's beyond the scope of this tutorial but can read file if interested... maybe i will

In [16]:
# Example LazyBuffer for 2+3
from tinygrad.tensor import Tensor
from tinygrad.ops import LazyOp, BinaryOps, LoadOps

# 2 + 3 from before
res = Tensor([2]) + Tensor([3])
print(type(res.lazydata), res.lazydata)

lazyop: LazyOp = res.lazydata.op
assert lazyop.op == BinaryOps.ADD
assert len(lazyop.src) == 2

# first source is 2 which comes from the CPU
# the source is a LazyBuffer that is a "CPU" Tensor
# again, a LazyOp AST is like a GPU kerner. you have to copy the data on the device first
assert lazyop.src[0].op.op == LoadOps.FROM
assert lazyop.src[0].op.src[0].device == "CPU"
assert lazyop.src[0].op.src[0].op.src[0].realized._buf[0] == 2, "the src of the FROM LazyOP is a LazyBuffer on the CPU holding [2.]"
assert res.lazydata.realized is None, "the LazyBuffer is not realized yet"

<class 'tinygrad.lazy.LazyBuffer'> <LB (1,) dtypes.float op=BinaryOps.ADD st=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),))>


In [17]:
res.realize()

<Tensor <LB (1,) dtypes.float op=buffer<1, dtypes.float, 5020201360> st=ShapeTracker(views=(View(shape=(1,), strides=(0,), offset=0, mask=None, contiguous=True),))> on CLANG with grad None>

In [19]:
assert res.lazydata.realized is not None, "the LazyBuffer is realized!"
# brings us to DeviceBuffer (the realized ClangBuffer is a subclass of DeviceBuffer)
assert 'RawMallocBuffer' in str(type(res.lazydata.realized))
# can copy the device buffer to CPU
assert res.lazydata.realized.toCPU()[0] == 5, "when put in numpy with toCPU, it's 5"

In [21]:
# you can either write an "Interpreted" backend or "Compiled" backend

class Interpreted:
    # they have a backing RawBufer
    buffer: Type[RawBuffer]

    # and they have a lookup table to functions for the ops
    fxn_for_op: Dict[Op, Callable] = {
        UnaryOps.EXP2: lambda x: np.exp2(x),
        BinaryOps.ADD: lambda x, y: x + y
    }

# compiled bckends take a little more (ex: GPU and LLVM)
class Compiled:
    # they also have a backingRawBuffer
    buffer: Type[RawBuffer]

    # a code generator, which compiles the AST
    codegen: Type[Linearizer]

    # and a runtime, which runs the generated code
    runtime: Type[Runtime]

# runtime is what actually runs the kernels for a compiled backend
class Runtime(ABC):
    # name is the name of the function, and prg is the code
    # the constuctor takes the code and compiles it
    def __init__(self, name:str, prg:str): pass
    # call runs the code on the bufs. NOTE: the output is always bufs[0], but this is kust a convention
    def __call__(self, global_size:Optional[List[int]], local_size:Optional[List[int]], bufs:List[RawBuffer]): pass


# Rawbuffer is where the data is actually held, its pretty close to just memory
class RawBuffer(ABC):
    # create an empty rawbuffer that holds size elements of type dtype
    # buf is an opaque container class
    def __init__(self, size:int, dtype:DType, buf:Any): raise NotImplementedError

    # fromCPU is a classmethod that creates a RawBudder, its a classmethod since some runtimes are 0 copy
    @classmethod
    def fromCPU(cle:RawBuffer, x:np.ndarray) -> RawBuffer: raise NotImplementedError

    # toCPU converts the RawBuffer to a numpy array with shape (size,). many backends are 0 copy here
    def toCPU(self) -> np.ndarray: raise NotImplementedError

# RawNumpyBuffer is a RawBuffer example for numpy. It's very simple... or so you say
class RawNumpyBuffer(RawBuffer):
    def __init__(self, buf: np.ndarray):
        super().__init__(buf.size, dtypes.from_np(buf.dtype). buf)
    @classmethod
    def fromCPU(cls, x): return cls(x)
    def toCPU(self): return self.buf

# example 2+3 in raw clang 

# RawMallocBuffer is the simplest concrete version of RawBuffer (in tinygrad/ops.py)
# its used for CLANG and LLVM backends
# its just malloc(size * dtype.itemsize)
from tinygrad.runtime.lib import RawMallocBuffer

# ClangProgram is the simplest runtime
# __init__ calls clang, and __call__ calls the function in the *.so outputted by clang
# in CLANG, global_size and local_size are ignored
from tinygrad.runtime.ops_clang import ClangProgram

# a concrete example looks like this, this adds two size1 RawBuffer
# first we create two numpy buffers containing 2 and 3
# then we copy the numpy in to RawMallocBuffers
# last, we create an empty output buffer
from tinygrad.helpers import dtypes
numpy_a, numpy_b = np.array([2], dtype=np.float32), np.array([3], dtype=np.float32)
input_a, input_b = RawMallocBuffer.fromCPU(numpy_a), RawMallocBuffer.fromCPU(numpy_b)
output = RawMallocBuffer(1, dtypes.float32)

program = ClangProgram("add", f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}")
program(None, None, [output, input_a, input_b])
print(output.toCPU())
assert output.toCPU()[0] == 5, "the output is 5"

ArgumentError: argument 1: <class 'TypeError'>: Don't know how to convert parameter 1

In [23]:
# Linarizer

# the first sttep of transforming an ASI into code is to linearize it (think like topsort on the AST)
# for that we use the Linearizer, which turns an AST into a list of (linear) UOps

class UOps(Enum): LOOP = auto(); DEFINE_LOCAL = auto(); LOAD = auto(); ALU = auto(); CONST = auto(); ENDLOOP = auto(); STORE = auto()

class UOp:
    uop: UOps
    dtype: Optional[DType]
    vin: Tuple[UOp, ...]
    arg: Any
    num: int # UOps are unique

class Linearizer:
    # create the kernel with the AST
    # NOTE: The AST containes the ComiledBuffers themselves as the root nodes. this will change
    def __init__(self, ast:LazyOp): pass
    def linearize(self): pass

    uops: List[UOp] # the linearized UOps

from tinygrad.tensor import Tensor
result = Tensor(2).realize() + Tensor(3).realize()

# use the real linearizer to liearize 2 + 3

from tinygrad.codegen.linearizer import Linearizer
sched = result.lazydata.schedule()
linearizer = Linearizer(sched[-1].ast)
linearizer.linearize()

for uop in linearizer.uops: print(uop)

   0 UOps.DEFINE_GLOBAL  : ptr.dtypes.float          []                               ('data0', dtypes.float)
   1 UOps.CONST          : dtypes.float              []                               2.0
   2 UOps.CONST          : dtypes.float              []                               3.0
   3 UOps.ALU            : dtypes.float              [1, 2]                           BinaryOps.ADD
   4 UOps.CONST          : dtypes.int32              []                               0
   5 UOps.STORE          :                           [0, 4, 3]                        None


In [26]:
from tinygrad.tensor import Tensor

result = Tensor(2) + Tensor(3)

# we have a global cache used by the JIT
# from there, we can see the generated clang code

from tinygrad.jit import CacheCollector
CacheCollector.start() # enable the cache
result.realize() # realize the result
cache_saved = CacheCollector.finish() # disable the cash

# theres 1 ASTRunner in the cache
assert len(cache_saved) == 1
prg, bufs, _ = cache_saved[0]

print(prg.prg)

void E_n2(float* restrict data0) {
  data0[0] = (2.0f+3.0f);
}


In [27]:
from tinygrad.shape.shapetracker import ShapeTracker

# create a virtual (10, 10) Tensor. this is just a shape, there's no actual tensor
a = ShapeTracker.from_shape((10, 10))

print(a)

ShapeTracker(views=(View(shape=(10, 10), strides=(10, 1), offset=0, mask=None, contiguous=True),))


In [28]:
a = a.permute((1, 0))
print(a)

ShapeTracker(views=(View(shape=(10, 10), strides=(1, 10), offset=0, mask=None, contiguous=False),))


In [30]:
a = a.reshape((5,2,5,2))
print(a)

ShapeTracker(views=(View(shape=(5, 2, 5, 2), strides=(2, 1, 20, 10), offset=0, mask=None, contiguous=False),))


In [32]:
a = a.reshape((100,))
print(a)

ShapeTracker(views=(View(shape=(5, 2, 5, 2), strides=(2, 1, 20, 10), offset=0, mask=None, contiguous=False), View(shape=(100,), strides=(1,), offset=0, mask=None, contiguous=True)))


In [33]:
idx, _ = a.expr_idxs()
print(idx.render())

(((idx0%10)*10)+(idx0//10))


In [34]:
idx, _ = a.expr_idxs()
print(idx.render())

(((idx0%10)*10)+(idx0//10))


In [35]:
print(a)

ShapeTracker(views=(View(shape=(5, 2, 5, 2), strides=(2, 1, 20, 10), offset=0, mask=None, contiguous=False), View(shape=(100,), strides=(1,), offset=0, mask=None, contiguous=True)))


In [36]:
a = a.simplify()
print(a)

ShapeTracker(views=(View(shape=(5, 2, 5, 2), strides=(2, 1, 20, 10), offset=0, mask=None, contiguous=False), View(shape=(100,), strides=(1,), offset=0, mask=None, contiguous=True)))


In [37]:
a = a.permute((1, 0))
print(a)

AssertionError: invalid permute (1, 0) for (100,)

In [38]:
assert a.contiguous == True

AssertionError: 

In [39]:
from tinygrad.shape.symbolic import Variable

# Variable is the base class from symbolic
# its created with a name and a min and max (inclusive)
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)

# some math examples
print((a*10).min, (a*10).max)
print((a+b).min, (a+b).max)

0 100
0 20


In [40]:
expr = (a + b*10) % 10
print(expr.render())

(a%10)


In [41]:
expr = (a*40 + b) // 20
print(expr.render())
print(expr.min, expr.max)

(a*2)
0 20
