In [13]:
import math
import numpy as np
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [100]:
class Value:
  """A scalar that records how it was computed so gradients can be backpropagated.

  Attributes:
    data: the wrapped number.
    grad: derivative of the node `backward()` was called on with respect to
      this node; starts at 0.0 and is accumulated into.
    _backward: closure that pushes this node's `grad` into the nodes in `_prev`.
    _prev: set of `Value` nodes this node was computed from.
    _op: short name of the operation that produced this node ('' for leaves).
    label: optional free-form name for the node.
  """

  def __init__(self, data, _children=(), _op='', label=''):
    self.data = data
    self.grad = 0.0
    # Leaves have nothing to propagate into, hence the no-op default.
    self._backward = lambda: None
    self._prev = set(_children)
    self._op = _op
    self.label = label

  def __repr__(self):
    return f'Value(data={self.data})'

  # ---------------------------------------------------------------- helpers

  @staticmethod
  def _stable_sigmoid(x):
    """Return 1 / (1 + exp(-x)) without ever exponentiating a positive number."""
    if x >= 0:
      return 1.0 / (1.0 + math.exp(-x))
    e = math.exp(x)
    return e / (1.0 + e)

  @staticmethod
  def _stable_softplus(z):
    """Return log(1 + exp(z)) without overflowing for large |z|."""
    return max(z, 0.0) + math.log1p(math.exp(-abs(z)))

  def _unary(self, out_data, local_grad, op):
    """Create the result node of a one-input operation.

    Args:
      out_data: forward value of the operation.
      local_grad: d(out)/d(self) evaluated at the current `self.data`.
      op: name stored in the result's `_op`.

    Returns:
      A new `Value` whose `_backward` adds `local_grad * out.grad` to `self.grad`.
    """
    out = Value(out_data, (self,), op)

    def _backward():
      self.grad += local_grad * out.grad
    out._backward = _backward

    return out

  # ------------------------------------------------------------- arithmetic

  def __add__(self, other):
    """Return `self + other`; plain numbers are wrapped in a `Value`."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad
    out._backward = _backward

    return out

  def __radd__(self, other):
    return self + other

  def __neg__(self):
    return self * -1

  def __sub__(self, other):
    return self + (-other)

  def __rsub__(self, other):
    """Support `number - Value`."""
    return other + (-self)

  def __mul__(self, other):
    """Return `self * other`; plain numbers are wrapped in a `Value`."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')

    def _backward():
      self.grad += other.data * out.grad
      other.grad += self.data * out.grad
    out._backward = _backward

    return out

  def __rmul__(self, other):
    return self * other

  def __truediv__(self, other):
    return self * (other**-1)

  def __rtruediv__(self, other):
    """Support `number / Value`."""
    return other * self**-1

  def __pow__(self, other):
    """Raise to a constant int/float power (the exponent is not differentiated)."""
    assert isinstance(other, (int, float)), 'Only supporting int/float powers for now'
    out = Value(self.data**other, (self,), f'**{other}')

    def _backward():
      self.grad += other * (self.data**(other - 1)) * out.grad
    out._backward = _backward

    return out

  def exp(self):
    """Return e ** self."""
    e = math.exp(self.data)
    return self._unary(e, e, 'exp')

  # ------------------------------------------------------------ activations

  def tanh(self):
    """Hyperbolic tangent; uses math.tanh so large inputs saturate instead of overflowing."""
    t = math.tanh(self.data)
    return self._unary(t, 1 - t**2, 'tanh')

  def relu(self):
    """max(0, x)."""
    x = self.data
    return self._unary(0 if x < 0 else x, x > 0, 'relu')

  def sigmoid(self):
    """Logistic sigmoid 1 / (1 + exp(-x))."""
    s = self._stable_sigmoid(self.data)
    return self._unary(s, s * (1 - s), 'sigmoid')

  def elu(self, alpha=1.0):
    """x for x > 0, else alpha * (exp(x) - 1)."""
    x = self.data
    if x > 0:
      return self._unary(x, 1, 'elu')
    return self._unary(alpha * (math.exp(x) - 1), alpha * math.exp(x), 'elu')

  def hard_shrink(self, _lambda=0.5):
    """x where |x| > _lambda, else 0."""
    x = self.data
    outside = (x > _lambda) + (x < -_lambda)
    # The gradient is 1 only where the input is passed through, i.e. outside
    # [-_lambda, _lambda]; inside that band the output is constant.
    return self._unary(x * (x > _lambda) + x * (x < -_lambda), outside, 'hard_shrink')

  def hard_sigmoid(self):
    """Piecewise-linear sigmoid: 0 for x <= -3, 1 for x >= 3, x/6 + 0.5 in between."""
    x = self.data
    inside = (x > -3 and x < 3)
    return self._unary((x >= 3) + ((x / 6) + 0.5) * inside, (1 / 6) * inside, 'hard_sigmoid')

  def hard_tanh(self, min_val=-1.0, max_val=1.0):
    """Clamp x to [min_val, max_val]."""
    x = self.data
    inside = (x >= min_val and x <= max_val)
    clamped = max_val * (x > max_val) + min_val * (x < min_val) + x * inside
    return self._unary(clamped, inside, 'hard_tanh')

  def hardswish(self):
    """0 for x <= -3, x for x >= 3, x * (x + 3) / 6 in between."""
    x = self.data
    inside = (x > -3 and x < 3)
    value = x * (x >= 3) + (x * (x + 3) / 6) * inside
    slope = (x >= 3) + ((2 * x + 3) / 6) * inside
    return self._unary(value, slope, 'hardswish')

  def leaky_relu(self, negative_slope=0.01):
    """x for x >= 0, else negative_slope * x."""
    x = self.data
    if x >= 0:
      return self._unary(x, 1, 'leaky_relu')
    return self._unary(negative_slope * x, negative_slope, 'leaky_relu')

  def log_sigmoid(self):
    """log(sigmoid(x)), computed as -softplus(-x) so very negative x does not overflow."""
    x = self.data
    # d/dx log(sigmoid(x)) = 1 - sigmoid(x) = sigmoid(-x)
    return self._unary(-self._stable_softplus(-x), self._stable_sigmoid(-x), 'log_sigmoid')

  def relu_6(self):
    """min(max(x, 0), 6)."""
    x = min(max(self.data, 0), 6)
    return self._unary(x, (x > 0 and x < 6), 'relu_6')

  def rrelu(self, lower=0.25, upper=0.3333333333333333, training=True):
    """Randomised leaky ReLU.

    The negative-side slope is drawn uniformly from [lower, upper] when
    `training` is true and is their midpoint otherwise.
    """
    slope = random.uniform(lower, upper) if training else (lower + upper) / 2
    # Still recorded on the input node as before, but the gradient below uses
    # the local value so a later rrelu() call on the same node cannot change
    # the gradient of this one.
    self.slope = slope

    x = self.data
    if x > 0:
      return self._unary(x, 1, 'rrelu')
    return self._unary(slope * x, slope, 'rrelu')

  def selu(self):
    """scale * x for x > 0, else scale * alpha * (exp(x) - 1)."""
    scale = 1.0507009873554804934193349852946
    alpha = 1.6732632423543772848170429916717
    x = self.data
    if x > 0:
      # Branching keeps exp() away from large positive inputs.
      return self._unary(scale * x, scale, 'selu')
    # The derivative is taken with respect to the input, so exp() is evaluated on x.
    return self._unary(scale * (alpha * (math.exp(x) - 1)), scale * alpha * math.exp(x), 'selu')

  def celu(self, alpha=1.0):
    """x for x > 0, else alpha * (exp(x / alpha) - 1)."""
    x = self.data
    if x > 0:
      return self._unary(x, 1, 'celu')
    e = math.exp(x / alpha)
    return self._unary(alpha * (e - 1), e, 'celu')

  def gelu(self):
    """GELU, tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
    x = self.data
    k = math.sqrt(2 / math.pi)
    t = math.tanh(k * (x + 0.044715 * x**3))
    # Product rule on 0.5 * x * (1 + t), with dt/dx = (1 - t^2) * k * (1 + 3 * 0.044715 * x^2).
    slope = 0.5 * (1 + t) + 0.5 * x * (1 - t**2) * k * (1 + 3 * 0.044715 * x**2)
    return self._unary(0.5 * x * (1 + t), slope, 'gelu')

  def silu(self):
    """x * sigmoid(x)."""
    x = self.data
    s = self._stable_sigmoid(x)
    return self._unary(x * s, s * (1 + x * (1 - s)), 'silu')

  def softplus(self, beta=1.0, threshold=20.0):
    """(1 / beta) * log(1 + exp(beta * x)), reverting to x once beta * x > threshold."""
    x = self.data
    z = beta * x
    if z > threshold:
      return self._unary(x, 1, 'softplus')
    return self._unary((1 / beta) * self._stable_softplus(z), self._stable_sigmoid(z), 'softplus')

  def mish(self, beta=1.0):
    """x * tanh(softplus(x)), where softplus uses the given beta."""
    x = self.data
    z = beta * x
    t = math.tanh((1 / beta) * self._stable_softplus(z))
    # Product rule; d softplus/dx = sigmoid(beta * x). The whole derivative is
    # scaled by the upstream gradient inside _unary.
    slope = t + x * self._stable_sigmoid(z) * (1 - t**2)
    return self._unary(x * t, slope, 'mish')

  def softshrink(self, _lambda=0.5):
    """Shrink x towards 0 by _lambda; 0 inside [-_lambda, _lambda]."""
    x = self.data
    value = (x - _lambda) * (x > _lambda) + (x + _lambda) * (x < -_lambda)
    return self._unary(value, (x > _lambda) + (x < -_lambda), 'softshrink')

  def softsign(self):
    """x / (1 + |x|)."""
    x = self.data
    return self._unary(x / (1 + abs(x)), 1 / (1 + abs(x)) ** 2, 'softsign')

  def tanhshrink(self):
    """x - tanh(x)."""
    x = self.data
    t = math.tanh(x)
    return self._unary(x - t, t ** 2, 'tanhshrink')

  # --------------------------------------------------------------- autodiff

  def backward(self):
    """Backpropagate from this node, accumulating into `grad` of every ancestor.

    Sets `self.grad` to 1.0 and then runs each node's `_backward` in reverse
    topological order. Gradients are accumulated, so callers reset `grad` on
    any reused leaves beforehand.
    """
    topo = []
    visited = set()
    # Iterative post-order DFS: an explicit stack keeps deep graphs from
    # hitting Python's recursion limit. The flag marks "children done, emit".
    stack = [(self, False)]
    while stack:
      node, children_done = stack.pop()
      if children_done:
        topo.append(node)
        continue
      if node in visited:
        continue
      visited.add(node)
      stack.append((node, True))
      for child in node._prev:
        if child not in visited:
          stack.append((child, False))

    self.grad = 1.0
    for node in reversed(topo):
      node._backward()

In [101]:
from graphviz import Digraph

def trace(root):
  """Collect every node reachable from `root` via `_prev`, plus the connecting edges.

  Returns:
    (nodes, edges): `nodes` is a set of the visited objects and `edges` a set
    of `(child, parent)` pairs, one for each entry in a visited node's `_prev`.
  """
  nodes, edges = set(), set()
  pending = [root]
  while pending:
    current = pending.pop()
    if current in nodes:
      continue
    nodes.add(current)
    for source in current._prev:
      edges.add((source, current))
      pending.append(source)
  return nodes, edges

def draw_dot(root):
  """Build a left-to-right graphviz `Digraph` (SVG format) of the graph ending at `root`.

  Each traced value becomes a record node showing its label, data and grad.
  A value with a non-empty `_op` also gets a separate node for that operation,
  which its inputs are wired into.
  """
  graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})
  values, links = trace(root)

  for value in values:
    value_id = str(id(value))
    record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
    graph.node(name=value_id, label=record, shape='record')
    if not value._op:
      continue
    # The operation node's name is the value's id with the op name appended.
    op_id = value_id + value._op
    graph.node(name=op_id, label=value._op)
    graph.edge(op_id, value_id)

  for source, target in links:
    # Inputs point at the target's operation node rather than at the target itself.
    graph.edge(str(id(source)), str(id(target)) + target._op)

  return graph

In [102]:
class Neuron:
    """A single unit computing `activation(w . x + b)`.

    Attributes:
        w: list of `nin` weights, each initialised uniformly in [-1, 1].
        b: bias, initialised uniformly in [-1, 1].
    """

    # Activation names accepted by __call__; each is the name of a method that
    # is looked up on the pre-activation value.
    _ACTIVATIONS = (
        'relu', 'tanh', 'sigmoid', 'elu', 'hard_shrink', 'hard_sigmoid',
        'hard_tanh', 'hardswish', 'leaky_relu', 'log_sigmoid', 'relu_6',
        'rrelu', 'selu', 'celu', 'gelu', 'silu', 'softplus', 'mish',
        'softshrink', 'softsign', 'tanhshrink',
    )

    def __init__(self, nin):
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x, activation=None):
        """Return the neuron's output for input sequence `x`.

        Args:
            x: inputs, paired positionally with the weights.
            activation: `None` or 'linear' for no activation, otherwise one of
                the names in `_ACTIVATIONS`.

        Raises:
            ValueError: if `activation` is not a supported name.
        """
        # w * x + b
        act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)

        if activation is None or activation == 'linear':
            return act
        if activation not in self._ACTIVATIONS:
            # Previously an unknown name fell through every branch and surfaced
            # as an UnboundLocalError on `out`.
            raise ValueError(
                f'Unknown activation {activation!r}; expected None, \'linear\' or one of '
                f'{", ".join(self._ACTIVATIONS)}'
            )
        return getattr(act, activation)()

    def parameters(self):
        """Return the weights followed by the bias as a new list."""
        return self.w + [self.b]

class Layer:
    """A list of `nout` neurons that all receive the same `nin`-sized input.

    Attributes:
        neurons: the layer's `Neuron` objects.
        activation: activation name given at construction; used by `__call__`
            when the caller does not pass one.
    """

    # Sentinel so that an explicit `activation=None` (meaning "linear") can be
    # told apart from "argument omitted".
    _UNSET = object()

    def __init__(self, nin, nout, activation=None):
        self.neurons = [Neuron(nin) for _ in range(nout)]
        # Previously accepted but discarded; kept so the layer can be called
        # without repeating the activation.
        self.activation = activation

    def __call__(self, x, activation=_UNSET):
        """Apply every neuron to `x` and return the outputs as a list.

        Args:
            x: input sequence forwarded to each neuron.
            activation: activation name forwarded to each neuron. When
                omitted, the value given to the constructor is used.
        """
        if activation is self._UNSET:
            activation = self.activation
        return [neuron(x, activation=activation) for neuron in self.neurons]

    def parameters(self):
        """Return the parameters of all neurons as one flat list, in neuron order."""
        return [p for neuron in self.neurons for p in neuron.parameters()]

class MLP:
    """A stack of layers applied one after another.

    Attributes:
        layers: one `Layer` per entry of `nouts`; layer k maps the previous
            width (`nin` for k = 0) to `nouts[k]`.
        activations: the per-layer activation names passed to the constructor.
    """

    def __init__(self, nin, nouts, activations):
        widths = [nin] + nouts
        stack = []
        for idx in range(len(nouts)):
            stack.append(Layer(widths[idx], widths[idx + 1], activation=activations[idx]))
        self.layers = stack
        self.activations = activations

    def __call__(self, x):
        """Feed `x` through every layer; a length-1 result is unwrapped to its element."""
        outputs = x
        for position, layer in enumerate(self.layers):
            outputs = layer(outputs, activation=self.activations[position])
        if len(outputs) > 1:
            return outputs
        return outputs[0]

    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [103]:
# Seed the RNG before building the network: the weights and biases are drawn
# with random.uniform, so without a seed the outputs of this and the following
# cells change on every Restart Kernel -> Run All.
random.seed(42)

# Smoke test: a 3-input MLP with two 4-unit ReLU layers and one linear output.
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1], ['relu', 'relu', None])
output = n(x)
print(output)

Value(data=1.7135779431317035)


In [104]:
# Training set, written as (input vector, target) rows so each input stays next
# to its label; the rows are then split column-wise into `xs` and `ys`.
xs, ys = (list(column) for column in zip(
    ([2.0, 3.0, -1.0], 1.0),
    ([3.0, -1.0, 0.5], -1.0),
    ([0.5, 1.0, 1.0], -1.0),
    ([1.0, 1.0, -1.0], 1.0),
))

In [105]:
for i in range(1000):
  # Forward pass: score every sample, then total the squared errors.
  ypred = list(map(n, xs))
  loss = sum((prediction - target)**2 for prediction, target in zip(ypred, ys))

  # Clear the previous step's gradients first, because backward() accumulates.
  for p in n.parameters():
    p.grad = 0.0

  loss.backward()

  # Plain gradient-descent step with a fixed 0.01 learning rate.
  for p in n.parameters():
    p.data += -0.01 * p.grad

  print(i + 1, loss.data)

1 10.995389905980034
2 4.907802145382933
3 4.452965745273988
4 4.216329064858661
5 4.066944622610989
6 3.9564274667566792
7 3.8741668243321636
8 3.8018206422493006
9 3.7358897817333028
10 3.670938659985543
11 3.6045967763376825
12 3.532694053398319
13 3.4546558139006205
14 3.3697642901201963
15 3.2772071850587965
16 3.1768457685433082
17 3.069169801775564
18 2.957623688324585
19 2.8429186860261555
20 2.7194438869782
21 2.5878204625576093
22 2.4533452111923917
23 2.3204327105767892
24 2.174468910572834
25 2.0409653779768537
26 1.893833524682318
27 1.7682653797297174
28 1.6382607134715181
29 1.513839193360521
30 1.3983152106212284
31 1.286911146038983
32 1.1775218735202686
33 1.0764744974227405
34 0.9831792286127987
35 0.8974987752228807
36 0.8214314347559867
37 0.7522084540618272
38 0.6923362272346242
39 0.6429806628262438
40 0.6117605192954353
41 0.5853960513703771
42 0.5616014601926735
43 0.5402143262967996
44 0.5177385162041959
45 0.497927817383309
46 0.4807069011694156
47 0.46276241

In [106]:
# Re-run the trained network on the training inputs and display the predictions.
ypred = list(map(n, xs))
ypred

[Value(data=1.0000000036312162),
 Value(data=-1.0000000023707856),
 Value(data=-0.9999999514623782),
 Value(data=0.9999999917177597)]

In [107]:
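# Scratch (disabled): plot of softplus(x) = (1 / beta) * log(1 + exp(beta * x)).
# NOTE: if re-enabled as written, this overwrites the training data `xs` / `ys`
# defined earlier in the notebook; rename the variables before uncommenting.
# beta = 1.0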

# xs = np.arange(-5, 5, 0.25)
# ys = (1 / beta) * np.log(1 + np.exp(beta * xs))

# plt.plot(xs, ys); plt.grid();

In [74]:
# Scratch check of the built-in abs() on a negative integer.
abs(-10)

10