In [44]:
import numpy as np

# Broadcasted shape
def broadcasted_shape(shape_X, shape_Y):
    """Compute the NumPy-style broadcast of two shapes and how each was stretched.

    The shapes are aligned on their trailing dimensions. For every operand the
    function reports which axes of the *result* were produced by stretching a
    size-1 dimension of that operand, and how many leading axes the operand
    was missing entirely.

    Parameters
    ----------
    shape_X, shape_Y : sequence of int
        The two shapes to broadcast against each other.

    Returns
    -------
    (result_shape, axes_X, pad_X, axes_Y, pad_Y)
        result_shape : tuple of int -- the broadcast shape.
        axes_X, axes_Y : tuple of int -- ascending result-axis indices along
            which X (resp. Y) had size 1 while the other operand did not.
        pad_X, pad_Y : int -- number of leading result axes absent from
            X (resp. Y); at most one of the two is non-zero.

    Raises
    ------
    ValueError
        If two aligned dimensions differ and neither of them is 1.
    """
    shape_X, shape_Y = tuple(shape_X), tuple(shape_Y)
    out_rank = max(len(shape_X), len(shape_Y))
    pad_X = out_rank - len(shape_X)
    pad_Y = out_rank - len(shape_Y)

    result_shape = []
    axes_X = []
    axes_Y = []

    for axis in range(out_rank):
        # Leading axes that only the longer shape has are copied through.
        if axis < pad_X:
            result_shape.append(shape_Y[axis])
            continue
        if axis < pad_Y:
            result_shape.append(shape_X[axis])
            continue

        dim_X = shape_X[axis - pad_X]
        dim_Y = shape_Y[axis - pad_Y]

        if dim_X == dim_Y:
            result_shape.append(dim_X)
        elif dim_X == 1:
            # X is stretched; the result takes Y's size (which may be 0,
            # so taking max(dim_X, dim_Y) here would be wrong).
            axes_X.append(axis)
            result_shape.append(dim_Y)
        elif dim_Y == 1:
            axes_Y.append(axis)
            result_shape.append(dim_X)
        else:
            raise ValueError(f"Shapes {shape_X} and {shape_Y} not broadcastable")

    return tuple(result_shape), tuple(axes_X), pad_X, tuple(axes_Y), pad_Y

def unbroadcast(arr, ax, pad):
    """Sum `arr` down so that a broadcast gradient fits its original operand.

    Parameters
    ----------
    arr : array
        Array to reduce.
    ax : tuple of int
        Axes to sum over while keeping them as size-1 dimensions.
    pad : int
        Number of leading axes to sum over and drop afterwards.

    Returns
    -------
    The reduced array.
    """
    # Step 1: collapse the listed axes but keep them in place with size 1.
    collapsed = np.sum(arr, axis=ax, keepdims=True)
    # Step 2: remove the first `pad` axes entirely.
    leading_axes = tuple(range(pad))
    return np.sum(collapsed, axis=leading_axes)

class ag: # AutoGrad
    """Minimal reverse-mode automatic differentiation on top of NumPy.

    The class is used as a namespace: the differentiable operations are
    static functions (``ag.log(x)``, ``ag.sum(x, axis=...)``, ...) and
    ``ag.Tensor`` is the node type of the computation graph. Every operation
    returns a new ``Tensor`` whose ``_backward`` closure adds the incoming
    gradient (``output.grad``) into the ``grad`` of its inputs.

    The broadcasting arithmetic (``+``, ``*``, batched ``@``) relies on the
    module-level helpers ``broadcasted_shape`` and ``unbroadcast``.
    """

    #################
    # ENTRYWISE OPS #
    #################

    @staticmethod
    def log(input):
        """Entrywise natural logarithm of a Tensor."""
        output = ag.Tensor(np.log(input.value), inputs=[input], op="log")

        def _backward():
            # d/dx log(x) = 1/x
            input.grad += output.grad / input.value

        output._backward = _backward
        return output

    @staticmethod
    def exp(input):
        """Entrywise exponential of a Tensor."""
        output = ag.Tensor(np.exp(input.value), inputs=[input], op="exp")

        def _backward():
            # d/dx exp(x) = exp(x), which is already stored in output.value
            input.grad += output.grad * output.value

        output._backward = _backward
        return output

    @staticmethod
    def relu(input):
        """Entrywise max(0, x) of a Tensor."""
        output = ag.Tensor(np.maximum(0, input.value), inputs=[input], op="relu")

        def _backward():
            # gradient passes only where the input was strictly positive
            input.grad += (input.value > 0) * output.grad

        output._backward = _backward
        return output

    #################
    # REDUCTIVE OPS #
    #################

    @staticmethod
    def sum(input, axis=None, keepdims=False):
        """Sum of a Tensor over `axis` (all entries when `axis` is None).

        `axis` and `keepdims` are forwarded to ``np.sum``.
        """
        output = ag.Tensor(np.sum(input.value, axis=axis, keepdims=keepdims),
                           inputs=[input], op='sum')

        def _backward():
            if axis is None or keepdims:
                # output.grad is a scalar or has size-1 reduced axes, so plain
                # broadcasting spreads it over every summed entry
                input.grad += output.grad
            else:
                # put the reduced axes back as size-1 so broadcasting lines up
                input.grad += np.expand_dims(output.grad, axis=axis)

        output._backward = _backward
        return output

    @staticmethod
    def matmul(input1, input2):
        """Functional alias for ``input1 @ input2``."""
        return input1 @ input2

    ###############
    # SHAPING OPS #
    ###############

    @staticmethod
    def expand_dims(input, axis):
        """Insert size-1 axes at `axis` (same semantics as ``np.expand_dims``)."""
        output = ag.Tensor(np.expand_dims(input.value, axis=axis),
                           inputs=[input], op="expand_dims")

        def _backward():
            input.grad += np.squeeze(output.grad, axis=axis)

        output._backward = _backward
        return output

    @staticmethod
    def moveaxis(input, source, destination):
        """Move axes of a Tensor (same semantics as ``np.moveaxis``)."""
        output = ag.Tensor(np.moveaxis(input.value, source, destination),
                           inputs=[input], op="moveaxis")

        def _backward():
            # The gradient must undergo the INVERSE permutation: move the axes
            # that ended up at `destination` back to `source`. Re-applying
            # (source, destination) is only correct for a plain swap.
            input.grad += np.moveaxis(output.grad, destination, source)

        output._backward = _backward
        return output

    class Tensor: # Tensor with grads
        """A NumPy value plus its accumulated gradient and graph links.

        Attributes set by ``__init__``:
            value     -- float copy of the data (``1.0 * value``)
            grad      -- zeros shaped like ``value``; filled in by ``backward``
            shape     -- shape of the data at construction time
            inputs    -- Tensors this node was computed from
            _backward -- callable that pushes ``self.grad`` into the inputs
            op, label -- free-form strings describing the node
        """

        def __init__(self,
                     value,
                     op="",
                     _backward= lambda : None,
                     inputs=None,
                     label=""):
            # asarray handles Python scalars, NumPy scalars and nested lists
            value = np.asarray(value)
            self.value = 1.0*value
            self.grad = np.zeros_like(self.value)

            self.shape = value.shape

            self._backward = _backward
            # a fresh list per node: a mutable default would be shared by all leaves
            self.inputs = [] if inputs is None else inputs

            self.op = op
            self.label = label

        @staticmethod
        def _as_tensor(other):
            """Return `other` unchanged if it is a Tensor, else wrap it as a constant."""
            return other if isinstance(other, ag.Tensor) else ag.Tensor(other)

        def topological_sort(self):
            """Return the nodes reachable from self, every node after its inputs."""
            topo_order = []
            visited = set()

            def dfs(node):
                if node in visited:
                    return
                visited.add(node)
                for parent in node.inputs:
                    dfs(parent)
                topo_order.append(node)  # post-order: inputs come first

            dfs(self)
            return topo_order

        def backward(self):
            """Run reverse-mode differentiation starting from this node.

            The root gradient is seeded with ones of the root's own shape (for a
            scalar root this is the usual 1.0), so shape-sensitive backward
            closures also work when the root is not a scalar. Gradients of the
            other nodes are accumulated into, not reset.
            """
            self.grad = np.ones_like(self.value)

            # visit consumers before producers
            for node in reversed(self.topological_sort()):
                node._backward()

        def __add__(self, other):
            """Broadcasting addition; `other` may be a Tensor or a plain number/array."""
            other = self._as_tensor(other)
            _, ax1, pad1, ax2, pad2 = broadcasted_shape(self.shape, other.shape)

            output = ag.Tensor(self.value + other.value,
                               inputs=[self, other], op="add")

            def _backward():
                # sum the gradient back over whatever axes broadcasting created
                self.grad += unbroadcast(output.grad, ax1, pad1)
                other.grad += unbroadcast(output.grad, ax2, pad2)

            output._backward = _backward
            return output

        def __sub__(self,other):
            return self + other*(-1)

        def __neg__(self):
            output = ag.Tensor(-self.value, inputs=[self], op="neg")

            def _backward():
                self.grad -= output.grad

            output._backward = _backward
            return output

        def __mul__(self, other):
            """Broadcasting entrywise product; `other` may be a Tensor or a plain number/array."""
            other = self._as_tensor(other)
            _, ax1, pad1, ax2, pad2 = broadcasted_shape(self.shape, other.shape)

            output = ag.Tensor(self.value * other.value,
                               inputs=[self, other], op="mul")

            def _backward():
                self.grad += unbroadcast(output.grad*other.value, ax1, pad1)
                other.grad += unbroadcast(output.grad*self.value, ax2, pad2)

            output._backward = _backward
            return output

        def __truediv__(self,other):
            return self*(other**(-1))

        def __radd__(self, other):
            return self + other

        def __rmul__(self, other):
            return self * other

        def __rsub__(self, other):
            return (-self) + other

        def __rtruediv__(self, other):
            return ag.Tensor(other) / self

        def __pow__(self, exponent): # exponent is just a python float
            output = ag.Tensor(self.value ** exponent,
                               inputs=[self],
                               op=f"pow({exponent})")

            def _backward():
                # d/dx x^p = p * x^(p-1)
                self.grad += (exponent * self.value**(exponent-1)) * output.grad

            output._backward = _backward
            return output

        def __getitem__(self, idx):
            output = ag.Tensor(np.array(self.value[idx]),
                               inputs = [self],
                               op=f"[...]")

            def _backward():
                self.grad[idx] += output.grad # idx must not have repeats!

            output._backward = _backward
            return output

        def __matmul__(self,other):
            """
            matrix multiplication between two tensors
            where len(self.shape) > 1
            in particular, we DISALLOW vector-matrix product
            this includes the vector-vector product, in particular
            """

            assert(len(self.shape) > 1)

            output = ag.Tensor(np.matmul(self.value,other.value),
                               inputs = [self,other],
                               op="matmul")

            if len(other.value.shape) == 1:
                # (..., m, n) @ (n,) -> (..., m)
                def _backward():
                    # outer product over the last axis; works for any number of
                    # leading axes of self (the 2-D-only matmul form did not)
                    self.grad += output.grad[..., None] * other.value
                    other.grad += np.sum(np.moveaxis(self.value,-1,0)*output.grad,
                                         axis= tuple(range(1,len(self.shape))))
                    # example
                    # A = np.random.rand(3,4,5)
                    # B = np.random.rand(3,4)
                    # np.sum(np.moveaxis(A,-1,0)*B, axis= tuple(range(1,len(A.shape))))
                    # the result has shape (5,)
            else:
                # both operands have rank >= 2 here; write
                #   self.shape  = (subshape1, m, n)
                #   other.shape = (subshape2, n, p)
                # where subshape1 / subshape2 may be empty, and
                #   output.grad.shape = (broadcast(subshape1, subshape2), m, p)
                _, ax1, pad1, ax2, pad2 = broadcasted_shape(self.shape[:-2], other.shape[:-2])

                def _backward():
                    # the batch axes are leading, so the axis indices computed on
                    # the batch shapes are valid for the full arrays as well
                    self.grad += unbroadcast(output.grad@np.moveaxis(other.value,-1,-2), ax1, pad1)
                    other.grad += unbroadcast(np.moveaxis(self.value,-1,-2)@output.grad, ax2, pad2)

            output._backward = _backward
            return output

        def __repr__(self) -> str:
            return "Value:\n"+self.value.__repr__() + "\nGrad:\n" + self.grad.__repr__()


In [45]:
class nn:
    """Namespace for loss objects built from `ag` operations."""

    class BinaryCrossEntropyLoss:
        """Callable computing sum(log(1 + exp(-input * target))) / N.

        N is the length of the first axis of `target`.
        NOTE(review): this form presumably expects targets in {-1, +1} -- confirm.
        """

        def __call__(self, input, target):
            n_samples = target.value.shape[0]
            negated = -input
            per_entry = ag.log(1.0 + ag.exp(negated * target))
            total = ag.sum(per_entry)
            return total / n_samples

In [51]:
class SingleHeadAttention:
    """Single-head self-attention with square (n_features x n_features) weights.

    Parameters
    ----------
    n_features : int
        Size of the last axis of the input; Wq, Wk and Wv are all
        (n_features, n_features) and initialised with ``np.random.randn``.
    causal : bool, default True
        When True, output position j only attends to input positions i <= j,
        so editing a later token cannot change an earlier output row. Pass
        False for unmasked (bidirectional) attention.
    """

    def __init__(self, n_features, causal=True):
        self.Wq = ag.Tensor(np.random.randn(n_features, n_features), label="Wq") # this was W1
        self.Wk = ag.Tensor(np.random.randn(n_features, n_features), label="Wk") # this was W2
        self.Wv = ag.Tensor(np.random.randn(n_features, n_features), label="Wv") # this was w3
        self.causal = causal

    def __call__(self, Xin):
        # Xin is a (n_samples, n_context, n_features) tensor
        # Xout is *also* a (n_samples, n_context, n_features) tensor

        Queries = Xin @ self.Wq
        Keys = Xin @ self.Wk
        # KQ[s, i, j] = <key_i, query_j>, shape (n_samples, n_context, n_context)
        KQ = (Keys @ ag.moveaxis(Queries, 1,2))
        expKQ = ag.exp(KQ)

        if self.causal:
            # Query j may only see keys i <= j: keep the upper triangle
            # (including the diagonal) and zero everything below it. The
            # diagonal is always kept, so the normaliser below is never zero.
            n_context = Xin.shape[1]
            causal_mask = ag.Tensor(np.triu(np.ones((n_context, n_context))))
            expKQ = expKQ * causal_mask

        # softmax over the key index i (axis 1), separately for every query j
        softmaxKQ = expKQ / ag.sum(expKQ, axis=1, keepdims=True)
        # mix the inputs with the attention weights, then apply Wv
        Xout = ag.moveaxis(ag.moveaxis(Xin,1,2) @ softmaxKQ, 1,2) @ self.Wv
        return Xout

class MLP:
    """One affine layer followed by ReLU.

    Wh has shape (n_features, n_hidden) and bh has shape (n_hidden,); both
    are drawn with ``np.random.randn``.
    """

    def __init__(self, n_features, n_hidden):
        weight_init = np.random.randn(n_features, n_hidden)
        self.Wh = ag.Tensor(weight_init, label="Whidden")
        bias_init = np.random.randn(n_hidden)
        self.bh = ag.Tensor(bias_init, label="bhidden")

    def __call__(self, Xin):
        # affine map first; the bias is added through broadcasting
        pre_activation = (Xin @ self.Wh) + self.bh
        return ag.relu(pre_activation)

class TransformerBlock:
    """Attention layer followed by an MLP, applied in that order."""

    def __init__(self, n_features, n_hidden):
        # attention is built first, then the MLP
        self.att = SingleHeadAttention(n_features)
        self.mlp = MLP(n_features, n_hidden)

    def __call__(self, Xin):
        attended = self.att(Xin)
        return self.mlp(attended)

In [52]:
import numpy as np

n_context = 4
n_features = 3
# NOTE(review): the arrays below hold 3 samples while y_np has 5 entries -- confirm
n_samples = 5

# Reference input, shape (3, 4, 3): 3 samples x 4 context positions x 3 features.
X1_np = np.array([
    [[-0.707, -0.707, 1.0],
     [0.963, -0.268, 1.0],
     [0.391, 0.92, -1.0],
     [0.899, 0.437, -1.0]],
    [[0.327, -0.945, 1.0],
     [0.3, -0.954, -1.0],
     [-0.485, -0.874, -1.0],
     [-0.694, 0.72, 1.0]],
    [[-0.938, -0.346, 1.0],
     [-0.742, 0.67, -1.0],
     [0.742, 0.67, -1.0],
     [0.322, 0.947, -1.0]],
])

# Same data with one entry changed: first feature of the LAST context
# position of sample 0. Shape is (3, 4, 3) as well.
X2_np = X1_np.copy()
X2_np[0, 3, 0] = 1+0.899 # A TINY EDIT

y_np = np.array([-1.0, -1.0, 1.0, 1.0, -1.0]) # (5,)

# Unseeded uniform draws, so the weights differ from run to run.
Wq_np = np.random.rand(3,3)
Wk_np = np.random.rand(3,3)
Wv_np = np.random.rand(3,3)


In [53]:
# Wrap both raw inputs as autograd tensors.
X1, X2 = ag.Tensor(X1_np), ag.Tensor(X2_np)

# Build the layer, then replace its random initial weights with the
# matrices prepared in the previous cell.
att = SingleHeadAttention(3)
att.Wq.value, att.Wk.value, att.Wv.value = Wq_np, Wk_np, Wv_np



In [54]:
# Attention output for sample 0 of the reference input X1.
att(X1)[0]

Value:
array([[ 0.43231678,  0.30688179, -0.93973424],
       [ 1.00697546,  0.87703063,  0.43369102],
       [ 0.20277333,  0.19984271, -0.12572647],
       [ 0.04524706,  0.05245233,  0.28037472]])
Grad:
array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [55]:
# Same query for X2, which differs from X1 only in the last context position
# of sample 0. If the attention is causal, the first three rows must equal
# those of att(X1)[0] and only the last row may change.
att(X2)[0]

Value:
array([[ 0.43231678,  0.30688179, -0.93973424],
       [ 1.00697546,  0.87703063,  0.43369102],
       [ 0.20277333,  0.19984271, -0.12572647],
       [ 0.187932  ,  0.13987249,  0.88162416]])
Grad:
array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])