In [3]:
import numpy as np

A = np.ones((3,3))
B = np.triu((3,3, 3))

C = A + B

dC = np.ones_like(C)
dA = np.sum(dC, axis=0)
dB = dC

# dA.shape == A.shape, dB.shape == B.shape,
A.shape == C.shape
for i, j in zip(A.shape, C.shape):
    print("not equal", i == j)

not equal True
not equal True


In [9]:
# Create a 1D array
arr = np.linspace(0, 100, 101)
arr = arr[:-1]
reshaped_arr = arr.reshape((2, 5, -1))
arr = np.ones(10)
# Reshape the array to a 2D array with 2 rows and 3 columns

(reshaped_arr + arr).shape

(2, 5, 10)

In [1]:
import torch

A = torch.ones((5, 10), requires_grad=True)

B = torch.ones((10, 5))
B = torch.triu(B)
B.requires_grad=True

A.retain_grad()
B.retain_grad()

C = A @ B
grad_output = torch.ones_like(C)
C.backward(grad_output, retain_graph=True)

C.shape
dA = grad_output @ B.T
dB = A.T @ grad_output
(dA == A.grad).all(), (dB == B.grad).all()
A.shape, B.shape, C.shape
# WHAT OF TENSORS GREATER THAN 2 DIMENSIONS?

(torch.Size([5, 10]), torch.Size([10, 5]), torch.Size([5, 5]))

In [226]:
from typing import Union
import numpy as np

class Tensor:
    def __init__(self, arr=[], _children=set(), _backward=lambda:None):
        if not isinstance(arr, np.ndarray):
            if isinstance(arr, list):
                arr = np.array(arr)
            else:
                raise ValueError(f'data should be of type "numpy.ndarray" or a scalar,but received {type(arr)}')

        self.data = arr

        self.dtype = self.dtype
        self._children = _children
        self._backward = _backward
        self.grad = np.zeros_like(self.data, dtype=np.float64)  # is this really the best way to implement this?

    def zero_grad(self):
        self.grad = np.ones_like(self.data, dtype=np.float64)

    def __add__(self, other:'Tensor'):
        y = Tensor(self.data + other.data, (self, other))

        def _backward():
            if self.data.shape == other.data.shape:
                self.grad += y.grad
                other.grad += y.grad
            else:
                print("broadcasting of sizes", self.data.shape, other.data.shape)
                for i, j in zip(self.data.shape, y.data.shape):
                    if i != j:
                        raise ValueError(f"Shapes are different self:{self.data.shape} other:{other.data.shape}")

                if len(self.data.shape) < len(y.data.shape):
                    self.grad += np.sum(y.data, axis=0)
                    other.grad += y.data

        y._backward = _backward

        return y

    def __mul__(self, other:Union['Tensor', int, float]) -> 'Tensor':
        """
            dot and scalar product
        """
        if isinstance(other, (int, float)):
            other = Tensor([other])
            y = Tensor(other.data*self.data, (self,other))
        else:
            if self.data.shape != other.data.shape:
                raise ValueError(f"Shapes are different self:{self.data.shape} other:{other.data.shape}")
            y = Tensor(self.data * other.data, (self, other))

        def _backward():
            if isinstance(other, (int, float)):
                self.grad += other * y.grad
                return


            if other.data.shape == (1, ):
                self.grad += other.data * y.grad
                return

            if self.data.shape == other.data.shape:
                self.grad += other.data * y.grad # works for two dimensional but fails for the rest
                other.grad += self.data * y.grad
            else:
                raise ValueError(f"Shapes are different self:{self.data.shape} other:{other.data.shape}")

                # raise NotImplementedError # understanding how matrix multiplcation works

            return
        y._backward = _backward
        return y

    def __matmul__(self, other: 'Tensor'):
        if not isinstance(other, Tensor):
            raise ValueError(f'data should be of type "Tensor"  {type(other)}')
        if self.data.shape != other.data.shape:
            raise ValueError(f"Shapes are different self:{self.data.shape} other:{other.data.shape}")
        y = Tensor(self.data @ other.data, (self, ))

        def _backward():
            self.grad += np.dot(y.grad, other.grad.T)
            other.grad += np.dot(self.grad.T, y.grad)
        y._backward = _backward
        return y

    def __pow__(self, n):
        if n < 0: # numpy does not support negative exponents
            y = Tensor(1/(self.data ** -n), (self,))
        else:
            y = Tensor(self.data ** n, (self,))

        def _backward():
            if n-1 < 0: # numpy does not support negative exponents
                self.grad += ((n / self.data ** -(n-1))) * y.grad
            else:
                self.grad += (n * self.data ** (n-1)) * y.grad
        y._backward = _backward
        return y

    def __div__(self, other:Union['Tensor', int, float]): # other / self
        if not isinstance(other, Tensor):
            other = Tensor(other)
        return other * self**-1

    def T(self):
        y = Tensor(self.data.T, (self,))
        def _backward():
            self.grad += y.grad.T

        y._backward = _backward
        return y

    def backward(self):
        children = []
        visited = set()

        def build_topo(node):
            if node not in visited and node is not None:
                visited.add(node)
                if node._children:
                    for child in node._children:
                        build_topo(child)
                children.append(node)
        build_topo(self)

        children.reverse()

        for child in children:

            child._backward()
        return

    def __neg__(self):
        return self * -1

    def __sub__(self, other:'Tensor'):
        return self + (-other)

    def __radd__(self, other): # other + self
        return self + other

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other:'Tensor'): # self / other
        return self * other**-1

    def __rtruediv__(self, other:'Tensor'): # other / self
        return other * self**-1

    def dtype(self, _dtype):
        return self.data.astype(_dtype)

    def __repr__(self) -> str:
        data = self.data
        grad = self.grad

        return f"Tensor<{data.tolist()}, {grad=}>" if self.grad>0 else f"Tensor<{data.tolist()}>"


def exp(x:Tensor):
    y = np.exp(x.data)
    y = Tensor(y, (x,))

    def _backward():
        dy = np.exp(x.data)
        x.grad += dy * y.grad
        return

    y._backward = _backward
    return y

def log(x:Tensor):
    y = np.log(x.data)
    y = Tensor(y, (x,))
    def _backward():
        dy = x.data ** -1
        x.grad += dy * y.grad
        return

    y._backward = _backward
    return y

# ---------------------------------- Activation functions --------------------------------

def relu(x:Tensor):
    y = np.maximum(x.data, 0)
    y = Tensor(y, (x,))

    def _backward():
        x.grad[x.data>0] += y.grad[x.data>0]
        return

    y._backward = _backward
    return y


# checking if both implementation are equal
def sigmoid_2(x:Tensor):
    return (Tensor([1])+exp(-x)) ** -1

def sigmoid(x:Tensor):
    y = 1/(1+np.exp(-x.data))
    y = Tensor(y, (x,))

    def _backward():
        dy = x.data*(1-x.data)
        x.grad += dy * y.grad
        return

    y._backward = _backward
    return y

def tanh_2(x:Tensor): # this implementation through seems attractive leads to overflow error
    return (exp(x) - exp(-x))/(exp(x) + exp(-x))

def tanh(x:Tensor):
    y = np.tanh(x.data)
    y = Tensor(y, (x,))

    def _backward():
        dy = (1-y.data**2)
        x.grad += dy * y.grad
        return

    y._backward = _backward
    return y

def sum_(x:Tensor, axis=None): # sums all elements
    if axis:
        raise NotImplementedError
    y = x.data.sum()
    y = np.array(y)
    y = Tensor(y, (x,))
    
    def _backward():
        x.grad += np.ones_like(x) * y.grad
        return

    y._backward = _backward
    return y

In [227]:
# from autograd import Tensor, log, sigmoid
import numpy as np
a = Tensor([100.0])

b = Tensor([200.0])
c = a + b
d = Tensor([5.0])

e = c * d
f = Tensor([0.1])
g = e / f
h = g ** 2
i  = -h
j = Tensor([0.9])
k = j-i
l = log(k)
m = exp(l)
n = sigmoid(m ** -0.5)
o = tanh(n)
p = relu(o)

params = [p, o,n, m,l,k,j,i, h,g,f,e,d,c,b,a]
p.zero_grad()
p.backward()
print([i.grad for i in params])
autograd_grads = [i.grad for i in params]



[array([1.]), array([1.]), array([0.78643562]), array([-7.76674749e-18]), array([-1.74751819e-09]), array([-7.76674749e-18]), array([-7.76674749e-18]), array([7.76674749e-18]), array([-7.76674749e-18]), array([-2.33002425e-13]), array([0.]), array([-2.33002425e-12]), array([0.]), array([-1.16501212e-11]), array([-1.16501212e-11]), array([-1.16501212e-11])]


In [228]:
import torch
from torch import tensor
a = tensor([100.0], requires_grad=True)
b = tensor([200.0], requires_grad=True)
c = a + b
d = tensor([5.0], requires_grad=True)
e = c * d
f = tensor([.1], requires_grad=True)
g = e / f
h = g ** 2
i  = -h
j = tensor([0.9], requires_grad=True)
k = j-i
l = torch.log(k)
m = torch.exp(l)
n = torch.sigmoid(m ** -0.5)
o = torch.tanh(n)
p = torch.relu(o)
params = [p, o,n, m,l,k,j,i, h,g,f,e,d,c,b,a]

for i in params:
    i.retain_grad()
p.backward()
print([i.grad for i in params])
torch_grads = [i.grad.numpy() for i in params]

for t, a in zip(torch_grads,autograd_grads):
    if np.abs((t - a)) > 1e-5:
        print(t-a)

[tensor([1.]), tensor([1.]), tensor([0.7864]), tensor([-2.9127e-14]), tensor([-6.5536e-06]), tensor([-2.9127e-14]), tensor([-2.9127e-14]), tensor([2.9127e-14]), tensor([-2.9127e-14]), tensor([-8.7382e-10]), tensor([0.0001]), tensor([-8.7382e-09]), tensor([-2.6215e-06]), tensor([-4.3691e-08]), tensor([-4.3691e-08]), tensor([-4.3691e-08])]
[0.00013107]


In [231]:
# no broadcasting here
import numpy as np
a = Tensor(np.ones((3,3)))

b = Tensor(np.ones((3,3)))

c = a + b
d = Tensor(np.ones((3,3)))

e = c * d
f = Tensor(np.ones((3,3)))
g = e / f
h = g ** 2
i  = -h
j = Tensor(np.ones((3,3)))
# k = j-i
# l = log(k)
# m = exp(l)
# n = sigmoid(m ** -0.5)
# o = tanh(n)
# p = relu(o)
q = sum_(j)

params = [
    q, 
    #p, o,n,m,l,k,
    j,i, h,g,f,e,d,c, b,a]
params[0].zero_grad()
params[0].backward()
autograd_grads = [i.grad for i in params]


In [232]:
#------------------------ Torch -----------------------------
import torch

a = torch.ones(3,3, requires_grad = True)
b = torch.ones(3,3, requires_grad = True)


c = a + b
d = torch.ones(3,3, requires_grad = True)

e = c + d
f = torch.ones(3,3, requires_grad = True)

g = e / f
h = g ** 2
i  = -h
j = torch.ones(3,3, requires_grad = True)

# k = j-i
# l = torch.log(k)
# m = torch.exp(l)
# n = torch.sigmoid(m ** -0.5)
# o = torch.tanh(n)
# p = torch.relu(o)
q = torch.sum(j)

params = [
    # q, p, o,n,m,l,k,j,
    q, j,i, h,g,f,e,d,c, b,a]

params[0].grad = None
for i in params:
    i.retain_grad()

params[0].backward()
torch_grads = [i.grad for i in params]

for t, a in zip(torch_grads,autograd_grads):

    # if t is None: # resolve this
    #     print("- ", a)
    #     continue
    if t is not None:
        print(t-a)
torch_grads, len(autograd_grads)

tensor(0., dtype=torch.float64)
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], dtype=torch.float64)


([tensor(1.),
  tensor([[1., 1., 1.],
          [1., 1., 1.],
          [1., 1., 1.]]),
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 11)