diff --git a/src/tricycle/__init__.py b/src/tricycle/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/tricycle/activation.py b/src/tricycle/activation.py deleted file mode 100644 index c116095..0000000 --- a/src/tricycle/activation.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np - -from tricycle.ops import Tensor, einsum, tensor, to_tensor - - -@to_tensor -def relu(x: Tensor) -> Tensor: - """ - Compute the relu of a tensor - """ - result = tensor(np.maximum(x, 0)) - - def diff_relu(arg: Tensor) -> Tensor: - weight = (x > 0).astype(x.dtype) - return einsum(weight, arg, subscripts="ij,ij->ij") - - result.back_fn = (diff_relu,) - result.args = (x,) - - return result diff --git a/src/tricycle/dataset.py b/src/tricycle/dataset.py deleted file mode 100644 index 6dbaa7d..0000000 --- a/src/tricycle/dataset.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np - - -class InfiniteDataset: - """ - Iterator that infinitely generates random batches from the dataset - """ - - def __init__(self, X, y, batch_size=32): - self.X = X - self.y = y - self.batch_size = batch_size - - def __iter__(self): - while True: - indices = np.random.choice(len(self.X), self.batch_size) - yield self.X[indices], self.y[indices] - - def __len__(self): - return len(self.X) diff --git a/src/tricycle/experiments.py b/src/tricycle/experiments.py deleted file mode 100644 index 59b34de..0000000 --- a/src/tricycle/experiments.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Some utils that help with running experiments (I couldn't find anywhere else -for them to go) -""" - -from matplotlib import pyplot as plt -from matplotlib.widgets import Slider - - -def smooth(values: list[float], factor=0.99): - """ - Apply exponential smoothing to a list of values - """ - assert 0 <= factor <= 1 - - smoothed = values[0] - for val in values[1:]: - smoothed = factor * smoothed + (1 - factor) * val - yield smoothed - - -def plot_loss(losses): - """ - Plot a loss curve with a slider to control smoothing - """ - factor = 0.99 - - fig, ax = plt.subplots(figsize=(16, 9)) - ax.set_ylim(min(losses) * 0.95, max(losses) * 1.05) - - line = ax.plot(list(smooth(losses, factor=factor))) - fig.subplots_adjust(left=0.25, bottom=0.25) - - slider_ax = fig.add_axes((0.25, 0.1, 0.65, 0.03)) - slider = Slider( - ax=slider_ax, - label="Smoothing", - valmin=0, - valmax=1, - valinit=factor, - ) - - def update(factor: float): - line[0].set_ydata(list(smooth(losses, factor=factor))) - plt.gcf().canvas.draw() - - slider.on_changed(update) - - plt.show() diff --git a/src/tricycle/initialisers.py b/src/tricycle/initialisers.py deleted file mode 100644 index 21dbcb1..0000000 --- a/src/tricycle/initialisers.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -from tricycle.ops import tensor - - -def init_normal(shape, name: str = "", loc=0.0, scale=0.01): - """ - Initialize a tensor with values sampled from a normal distribution - """ - return tensor(np.random.normal(size=shape, loc=loc, scale=scale), name=name) - - -def init_zero(shape, name: str = ""): - """ - Initialize a tensor with zeros - """ - return tensor(np.zeros(shape), name=name) - - -def init_xavier(shape, name: str = ""): - """ - Initialize a tensor with xavier/glorot initialisation - """ - f_in, f_out = shape - bound = np.sqrt(6) / np.sqrt(f_in + f_out) - return tensor(np.random.uniform(low=-bound, high=bound, size=shape), name=name) diff --git a/src/tricycle/loss.py b/src/tricycle/loss.py deleted file mode 100644 index f751e49..0000000 --- a/src/tricycle/loss.py +++ /dev/null @@ 
-1,38 +0,0 @@ -from string import ascii_letters - -from tricycle.ops import Tensor, einsum, log, mean, softmax, tensor - - -def mean_square_error(y_pred: Tensor, y_true: Tensor) -> Tensor: - """ - Compute the mean square error between two identically shaped tensors - """ - assert y_pred.shape == y_true.shape - - errors = y_pred - y_true - - indices = ascii_letters[: len(errors.shape)] - subscripts = f"{indices}, {indices} -> {indices}" - - square_errors = einsum(errors, errors, subscripts=subscripts) - return mean(square_errors) - - -def categorical_crossentropy(y_logits: Tensor, y_true: Tensor) -> Tensor: - """ - NOTE: This does not currently work. I cannot get the test to pass - - Compute the cross entropy loss between two identically shaped tensors. - Note, y_pred are assumed to be logits. That is, they are expected to - be unnormalised. y_true is assumed to be normalised already. - Usually this is a one-hot encoding of a single categorical output - but sharing labels across multiple outputs is possible. - """ - y_pred = softmax(y_logits) - y_pred.name = "y_pred" - log_probs = log(y_pred) - log_probs.name = "log_probs" - loss = -einsum(log_probs, y_true, subscripts="ij,ij->i") - loss.name = "loss" - # raise Exception(f"{log_probs=}, {y_true=}, {y_pred=}, {y_true=}, {loss=}") - return einsum(loss, subscripts="i->") / tensor(loss.shape[0], requires_grad=False) diff --git a/src/tricycle/ops.py b/src/tricycle/ops.py deleted file mode 100644 index 3d8cc5f..0000000 --- a/src/tricycle/ops.py +++ /dev/null @@ -1,679 +0,0 @@ -import logging -import uuid -from functools import partial, wraps -from string import ascii_letters, ascii_lowercase -from typing import Callable, List, Optional, Tuple, Union - -import networkx as nx -import numpy as np -import pydot -from matplotlib import pyplot as plt -from networkx.drawing.nx_pydot import graphviz_layout - -# Type signature for Tensor operation -Op = Callable[..., "Tensor"] - -grad: bool = True - -logger = logging.getLogger(__name__) - - -def to_tensor(fn: Op) -> Op: - """ - A decorator to convert non-tensor arguments to tensors - """ - - @wraps(fn) - def wrapped(*args, **kwargs): - args = [arg if isinstance(arg, Tensor) else tensor(arg) for arg in args] - return fn(*args, **kwargs) - - return wrapped - - -class no_grad: - """ - A context manager to disable gradient calculation - """ - - def __enter__(self): - global grad - grad = False - - def __exit__(self, *_): - global grad - grad = True - - -class Tensor(np.ndarray): - """ - An N-dimensional grid of numbers. This is implemented as a subclass - of a standard numpy array - """ - - _id: int - args: tuple["Tensor", ...] | None = None - back_fn: tuple[Op, ...] | None = None - grad_fn: List[List[Op]] = [] - grad: Optional["Tensor"] = None - name: Optional[str] = None - requires_grad: bool = False - - def backward(self): - """ - Perform auto-differentiation by traversing the computational graph - to find every path from the root (loss) to each leaf (parameter).
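A minimal sketch of the path-composition idea this docstring describes, written with plain functions rather than the Tensor class; the helper and names here are illustrative only, not part of the deleted module:

import numpy as np

# Each edge on the path from the root to a leaf carries a back_fn; the leaf's
# gradient comes from feeding a tensor of ones through those functions in order,
# which is what backward() does once it reaches a leaf.
def gradient_along_path(path_of_back_fns, root_value):
    grad = np.ones_like(root_value)
    for back_fn in path_of_back_fns:
        grad = back_fn(grad)
    return grad

# Example: z = (x * 3) + 2, so dz/dx is 3 everywhere.
x = np.array([1.0, 2.0])
z = x * 3 + 2
path = [lambda g: g, lambda g: g * 3]  # back_fns for the add, then the multiply
print(gradient_along_path(path, z))    # [3. 3.]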
- - Once we reach a leaf, we calculate its gradient by applying - the gradient function for each operation that we passed along the - path - """ - - queue = [self] - labels = {} - edge_labels = {} - - G = nx.DiGraph() - G.add_node(str(self)) - labels[str(self)] = self.name or "" - - logger.info("\n\n") - while queue: - logger.info(f"stack: {queue}") - current_node = queue.pop() - - # At intermediate node - if ( - current_node.back_fn is not None - and current_node.args is not None # noqa: E501 - ): - # Get gradient functions for the operation - grad_fn = current_node.grad_fn - - logger.info( - f"{current_node.args=} {current_node.back_fn=} { current_node=}" - ) - # Update gradient functions for each parent node - for arg, op in zip(current_node.args, current_node.back_fn): - if str(arg) not in G.nodes: - G.add_node(str(arg)) - labels[str(arg)] = arg.name or "" - G.add_edge(str(arg), str(current_node)) - - try: - edge_labels[(str(arg), str(current_node))] = op.__name__ - except AttributeError: - edge_labels[(str(arg), str(current_node))] = op.func.__name__ - - arg.grad_fn = grad_fn + [op] - - # Add each arg to the stack for further processing - queue = [arg] + queue - - # At leaf node - elif current_node.requires_grad: - grad = tensor(np.ones_like(self)) - logger.warning(current_node.grad_fn) - for op in current_node.grad_fn: - # Follow the path from root to leaf, applying - # each function to get the gradient for the leaf - # - # TODO: this is slow - # - # - We can speed this up by caching intermediate values - # that are used by multiple leaves - # - # - We should also be able to speed this up by analysing - # the path to see if any operations can be fused - # - # - We should also probably ignore leaves that we don't - # need the gradient for (e.g the inputs) - grad = op(grad) - - if current_node.grad is None: - current_node.grad = grad - else: - # If there are multiple paths to this node, we - # need to add all of the gradients together - current_node.grad += grad - - fig, ax = plt.subplots(figsize=(10, 10)) - pos = graphviz_layout(G, prog="dot") - nx.draw(G, labels=labels, pos=pos, ax=ax) - nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edge_labels, ax=ax) - plt.show() - - @to_tensor - def __add__(self, other: "Tensor") -> "Tensor": - return add(self, other) - - @to_tensor - def __iadd__(self, other: "Tensor") -> "Tensor": - return add(self, other) - - @to_tensor - def __sub__(self, other: "Tensor") -> "Tensor": - return sub(self, other) - - @to_tensor - def __mul__(self, other: "Tensor") -> "Tensor": - return mul(self, other) - - @to_tensor - def __truediv__(self, other: "Tensor") -> "Tensor": - return div(self, other) - - @to_tensor - def __pow__(self, other: "Tensor") -> "Tensor": - return pow(self, other) - - @to_tensor - def __neg__(self) -> "Tensor": - return negate(self) - - @to_tensor - def __matmul__(self, other: "Tensor") -> "Tensor": - return matmul(self, other) - - def mean(self) -> "Tensor": - return mean(self) - - @property - def T(self) -> "Tensor": - return transpose(self) - - def __str__(self) -> str: - return self.__repr__() - - def __repr__(self) -> str: - if self.name is None: - return f"{super().__repr__()}" - return f"{super().__repr__()[:-1]}, {self.name=})" - - -class bind(partial): - """ - A version of partial which accepts Ellipsis (...) as a placeholder - credit: https://stackoverflow.com/questions/7811247/ - """ - - def __call__(self, *args, **keywords): - keywords = {**self.keywords, **keywords} - iargs = iter(args) - args = (next(iargs) if arg is ... 
else arg for arg in self.args) - return self.func(*args, *iargs, **keywords) - - -# -----------------utility functions-------- - - -def tensor( - *args, name: Optional[str] = None, requires_grad: bool = True, **kwargs -) -> Tensor: - """ - Create a new Tensor instance. First, we convert the argument to a numpy - array and then to a tensor - """ - result = np.asarray(*args, **kwargs).view(Tensor) - result.name = name - result.requires_grad = requires_grad - return result - - -@to_tensor -def nothing(x: Tensor) -> Tensor: - """ - Do nothing - """ - global grad - result = tensor(x) - if grad: - result.back_fn = (nothing,) - result.args = (x,) - return result - - -# ----------binary functions---------------- - - -@to_tensor -def sub(x: Tensor, y: Tensor) -> Tensor: - """ - Subtract two tensors - """ - global grad - result = tensor(np.subtract(x, y)) - if grad: - result.back_fn = (nothing, negate) - result.args = (x, y) - return result - - -@to_tensor -def negate(x: Tensor) -> Tensor: - """ - Swap the sign of every element of a tensor - """ - global grad - - result = tensor(np.multiply(x, -1)) - if grad: - result.back_fn = (negate,) - result.args = (x,) - return result - - -@to_tensor -def add(x: Tensor, y: Tensor) -> Tensor: - """ - Add two tensors - """ - global grad - result = tensor(np.add(x, y)) - if grad: - result.back_fn = (nothing, nothing) - result.args = (x, y) - return result - - -@to_tensor -def mul(x: Tensor, y: Tensor) -> Tensor: - """ - Multiply two tensors - """ - global grad - result = tensor(np.multiply(x, y)) - if grad: - result.back_fn = ( - partial(mul, y=y), - partial(mul, y=x), - ) - result.args = (x, y) - return result - - -@to_tensor -def div(x: Tensor, y: Tensor) -> Tensor: - """ - Divide two tensors - """ - # scalar / tensor or tensor / scalar - if len(x.shape) == 0 or len(y.shape) == 0: - minus_one = tensor(-1.0, requires_grad=False, name="minus_one") - return mul(x, pow(y, minus_one)) - - # tensor / tensor - elif len(x.shape) == len(y.shape): - one = tensor(1.0, requires_grad=False, name="one") - indices = ascii_lowercase[: len(x.shape)] - return einsum(x, one / y, subscripts=f"{indices},{indices}->{indices}") - else: - raise ValueError( - "Division between two tensors of different shapes is not supported."
- ) - - -# -----------elementwise unary functions---------------- - - -@to_tensor -def pow(x: Tensor, y: Tensor) -> Tensor: - """ - Raise every element of a tensor to the power of another tensor - """ - global grad - result = tensor(np.power(x, y)) - if not grad: - return result - - result.back_fn = ( - partial(mul, y=tensor(np.power(x, y - 1)) * y), - partial(mul, y=result * log(x)), - ) - result.args = (x, y) - return result - - -@to_tensor -def exp(x: Tensor) -> Tensor: - """ - Raise every element of a tensor to the power of e - """ - global grad - result = tensor(np.exp(x)) - if not grad: - return result - result.back_fn = (partial(mul, y=result),) - result.args = (x,) - return result - - -@to_tensor -def log(x: Tensor) -> Tensor: - """ - Find the natural log of every element of a tensor - """ - global grad - result = tensor(np.log(x)) - if not grad: - return result - - result.back_fn = (partial(div, y=x),) - result.args = (x,) - return result - - -@to_tensor -def sqrt(x: Tensor) -> Tensor: - """ - Find the square root of every element of a tensor - """ - global grad - result = tensor(np.sqrt(x)) - if not grad: - return result - - def diff_sqrt(arg: Tensor) -> Tensor: - return pow(arg, -0.5) * 0.5 - - result.back_fn = (diff_sqrt,) - result.args = (x,) - return result - - -@to_tensor -def sin(x: Tensor) -> Tensor: - """ - Find the sine of every element of a tensor - """ - global grad - result = tensor(np.sin(x)) - if not grad: - return result - - result.back_fn = (cos,) - result.args = (x,) - return result - - -@to_tensor -def cos(x: Tensor) -> Tensor: - """ - Find the cosine of every element of a tensor - """ - global grad - result = tensor(np.cos(x)) - if not grad: - return result - - def diff_cos(arg: Tensor) -> Tensor: - return -sin(arg) - - result.back_fn = (diff_cos,) - result.args = (x,) - return result - - -# ------------------reduce functions---------------- - - -def reduce(x: Tensor, method: Op, subscripts: str) -> Tensor: - """ - Reduce a tensor along some dimensions by applying a reduction function - to those indices. A reduction function generates a binary tensor which - is multiplied by the input and reduces according to the subscripts - """ - indices, output = _parse_subscripts(subscripts) - assert ( - len(indices) == 1 - ), f"Can only reduce a single tensor at a time. 
Indices suggested: {len(indices)} tensors: {indices}" - - [idx] = indices - axis = tuple(i for i, char in enumerate(idx) if char not in output) - assert axis, "Tensor must be reduced along at least one axis" - binary_tensor = method(x, axis) - binary_tensor.requires_grad = False - binary_tensor.name = "binary" - - subscripts = f"{idx},{idx}->{output}" - - return einsum(x, binary_tensor, subscripts=subscripts) - - -# --------------------indicator functions--------------------- -def bmax(x: Tensor, axis: Union[int, Tuple[int]]) -> Tensor: - """ - Return a binary tensor where each element is 1 if the corresponding element - is the largest along an axis that is being reduced along - """ - return (x == np.max(x, axis=axis, keepdims=True)).astype(int) - - -def bmin(x: Tensor, axis: Union[int, Tuple[int]]) -> Tensor: - """ - Return a binary tensor where each element is 1 if the corresponding element - is the smallest along an axis that is being reduced along - """ - return (x == np.min(x, axis=axis, keepdims=True)).astype(int) - - -@to_tensor -def max(x: Tensor) -> Tensor: - """ - Find the largest element of a tensor - """ - indices = ascii_lowercase[: len(x.shape)] - return reduce(x, bmax, f"{indices}->") - - -@to_tensor -def min(x: Tensor) -> Tensor: - """ - Find the smallest element of a tensor - """ - indices = ascii_lowercase[: len(x.shape)] - return reduce(x, bmin, f"{indices}->") - - -def _parse_subscripts(subscripts: str) -> tuple[list[str], str]: - """ - Parse a subscripts string into a list of indices and a result - """ - indices, result = subscripts.split("->") - indices = indices.split(",") - return indices, result - - -@to_tensor -def einsum(*tensors: Tensor, subscripts: str) -> Tensor: - """ - The way we'll be doing tensor manipulation is with the einsum function from - numpy. This uses a version of Einstein summation notation to do tensor - operations. It takes a bit of getting used to but ends up being a really - powerful way to do tensor operations. It feels like the natural way to - do deep learning. - - Examples - -------- - >>> a = np.arange(25).reshape(5,5) - - Trace of a matrix: - - >>> np.einsum('ii', a) - 60 - - Sum over an axis: - - >>> np.einsum('ij->i', a) - array([ 10, 35, 60, 85, 110]) - - Tensor contraction: - - >>> a = np.arange(60.).reshape(3,4,5) - >>> b = np.arange(24.).reshape(4,3,2) - >>> np.einsum('ijk,jil->kl', a, b) - array([[4400., 4730.], - [4532., 4874.], - [4664., 5018.], - [4796., 5162.], - [4928., 5306.]]) - - """ - global grad - - indices, output = _parse_subscripts(subscripts) - - # Every single input tensor is equivalent to a dual input where - # the second input is a 1 tensor with the indices that don't appear in the - # output - if len(tensors) == 1: - one_indices = "" - one_shape = [] - for size, idx in zip(tensors[0].shape, indices[0]): - if idx not in output: - one_indices += idx - one_shape.append(size) - - indices = [indices[0], one_indices] - ones = tensor(np.ones(one_shape), requires_grad=False, name="ones") - tensors = (tensors[0], ones) - - if not tensors or len(tensors) > 2: - raise NotImplementedError( - f"Einsum is only implemented for 1 or 2 inputs.
Found {len(tensors)}" - ) - - indices = ",".join(indices) - subscripts = f"{indices}->{output}" - - result = tensor(np.einsum(subscripts, *tensors)) - - if not grad: - return result - - back_fns = [] - indices, output = _parse_subscripts(subscripts) - - # The rule for differentiating einsum wrt one of its arguments is: - # Swap the indices of the output and the argument being differentiated - # in the subscripts. Then pass the current gradient as an argument in - # place of the argument being differentiated. - # for example: - # >>> a = np.arange(12).reshape(3, 4) - # >>> b = np.arange(12).reshape(4, 3) - # >>> c = np.einsum('ij,jk->ik', a, b) - # - # >>> c.backward() - # >>> assert a.grad == np.einsum('ik,jk->ij', np.ones_like(c), b) - # Notice that the indices have swapped and a has been replaced with - # the gradient of c (the current gradient) - for idx in range(len(tensors)): - left = tensors[:idx] - right = tensors[idx + 1 :] - - indices, output = _parse_subscripts(subscripts) - output, indices[idx] = indices[idx], output - indices = ",".join(indices) - diff_subscripts = f"{indices}->{output}" - - def diff_einsum( - arg: Tensor, left=left, right=right, subscripts=diff_subscripts - ) -> Tensor: - """ - Derivative of einsum wrt a single input tensor - """ - args = left + (arg,) + right - return einsum(*args, subscripts=subscripts) - - back_fns.append(diff_einsum) - - result.args = tuple(tensors) - result.back_fn = tuple(back_fns) - - return result - - -@to_tensor -def matmul(x: Tensor, y: Tensor) -> Tensor: - """ - Compute the matrix multiplication of two tensors - """ - # We are limited by the number of available letters but thankfully - # numpy has a max dimension of 32 so this shouldn't be too much of a - # problem. If we start doing giant einsums then i'll probably need to - # build a custom implementation of einsum but we should be good for now - assert len(x.shape) + len(y.shape) <= len( - ascii_letters - ), f"Cannot perform matmul on tensors with more than {len(ascii_letters)} dimensions in total" - assert len(x.shape), "Cannot perform matmul on a 0D tensor" - assert len(y.shape), "Cannot perform matmul on a 0D tensor" - - left_indices = ascii_letters[: len(x.shape) - 1] if len(x.shape) > 1 else "" - right_indices = ascii_letters[-len(y.shape) + 1 :] if len(y.shape) > 1 else "" - - subscripts = f"{left_indices}i,i{right_indices}->{left_indices}{right_indices}" - return einsum(x, y, subscripts=subscripts) - - -@to_tensor -def transpose(x: Tensor) -> Tensor: - """ - Compute the transpose of a tensor - """ - assert ( - len(x.shape) <= 2 - ), "Can only perform transpose on 2D tensors. Use einsum for more complex cases." 
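A quick numeric check of the index-swapping rule described in the comment above, using plain numpy and assumed example shapes, independent of the deleted Tensor class:

import numpy as np

# For c = einsum("ij,jk->ik", a, b), the rule says the gradient of sum(c) with
# respect to a is einsum("ik,jk->ij", ones_like(c), b): swap the output's
# subscripts with a's and substitute the incoming gradient for a.
a = np.arange(12.0).reshape(3, 4)
b = np.arange(12.0).reshape(4, 3)
c = np.einsum("ij,jk->ik", a, b)

grad_a = np.einsum("ik,jk->ij", np.ones_like(c), b)
print(np.allclose(grad_a, np.ones_like(c) @ b.T))  # True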
- return einsum(x, subscripts="ij->ji") - - -@to_tensor -def mean(x: Tensor): - """ - Compute the mean of a tensor - """ - return reduce_sum(x) / x.shape[0] - - -@to_tensor -def softmax(x: Tensor): - """ - Compute the softmax of a tensor - """ - # For numeric stability we'll subtract the max of the tensor - indices = ascii_letters[: len(x.shape)] - largest_x = reduce(x, bmax, subscripts=f"{indices}->{indices[:-1]}") - ones = tensor(np.ones_like(x), requires_grad=False, name="ones") - - largest_x = einsum( - largest_x, ones, subscripts=f"{indices[:-1]},{indices}->{indices}" - ) - normalised_x = x - largest_x - - normalised_x = exp(normalised_x) - assert len(normalised_x.shape) < len( - ascii_letters - ), f"Cannot perform softmax on tensors with more than {len(ascii_letters)} dimensions" - - if len(x.shape) == 1: - one = tensor(1, name="one", requires_grad=False) - denom = one / einsum(normalised_x, subscripts="i->") - return einsum(denom, normalised_x, subscripts=",i->i") - - indices = ascii_letters[: len(x.shape) - 1] - final_letter = ascii_letters[-1] - denom = 1 / einsum( - normalised_x, subscripts=f"{final_letter}{indices}->{final_letter}" - ) - return einsum( - denom, - normalised_x, - subscripts=f"{final_letter},{final_letter}{indices}->{final_letter}{indices}", - ) - - -@to_tensor -def sigmoid(x: Tensor) -> Tensor: - """ - Compute the sigmoid of a tensor - """ - return tensor(1) / (exp(-x) + 1) diff --git a/tests/test_autograd.py b/tests/test_autograd.py deleted file mode 100644 index bcf9ead..0000000 --- a/tests/test_autograd.py +++ /dev/null @@ -1,351 +0,0 @@ -from functools import partial - -import numpy as np - -from tricycle.ops import (add, bmax, cos, div, einsum, exp, log, matmul, max, - min, mul, negate, no_grad, nothing, pow, reduce, - reduce_sum, sin, sqrt, sub, tensor) - - -def partial_func_equals(a, b): - assert a.func == b.func - assert a.args == b.args - - -def grad_fn_equal(tensor_, ops): - assert len(tensor_.grad_fn) == len(ops) - - for fn, op in zip(tensor_.grad_fn, ops): - if isinstance(op, partial): - partial_func_equals(fn, op) - else: - assert fn == op - - -def test_can_differentiate_subtract(): - x = tensor(1.0) - y = tensor(2.0) - - z = sub(y, x) - - assert z == 1.0 - - z.backward() - - assert x.grad == -1.0 - assert y.grad == 1.0 - grad_fn_equal(x, [negate]) - grad_fn_equal(y, [nothing]) - - -def test_can_differentiate_negate(): - x = tensor(1.0) - - z = negate(x) - - assert z == -1.0 - - z.backward() - - assert x.grad == -1.0 - grad_fn_equal(x, [negate]) - - -def test_can_differentiate_nothing(): - x = tensor(1.0) - - z = nothing(x) - - assert z == 1.0 - - z.backward() - - assert x.grad == 1.0 - grad_fn_equal(x, [nothing]) - - -def test_can_differentiate_add(): - x = tensor(1.0) - y = tensor(2.0) - - z = add(x, y) - - assert z == 3.0 - - z.backward() - - assert x.grad == 1.0 - grad_fn_equal(x, [nothing]) - - assert y.grad == 1.0 - grad_fn_equal(y, [nothing]) - - -def test_can_differentiate_multiply(): - x = tensor(1.0) - y = tensor(2.0) - - z = mul(x, y) - - assert z == 2.0 - - z.backward() - - assert x.grad == 2.0 - grad_fn_equal(x, [partial(mul, y=y)]) - - assert y.grad == 1.0 - grad_fn_equal(x, [partial(mul, y=x)]) - - -def test_can_differentiate_divide(): - x = tensor(1.0) - y = tensor(2.0) - - z = div(x, y) - - assert z == 0.5 - - z.backward() - - assert x.grad == 0.5 - grad_fn_equal(x, [partial(mul, y=1/y)]) - - assert y.grad == -0.25 - assert grad_fn_equal(y, [partial(mul, y=1/x), partial(mul, y=1/y)]) - - -def test_can_differentiate_reduce_sum(): - x 
= tensor([1.0, 2.0, 3.0]) - z = reduce_sum(x) - assert z == 6.0 - - z.backward() - - assert np.allclose(x.grad, [1, 1, 1]) - - -def test_can_differentiate_power(): - x = tensor(2.0) - y = tensor(3.0) - - z = pow(x, y) - - assert z == 8.0 - - z.backward() - - assert x.grad == 12.0 - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_power_arg_1" - - assert y.grad == 8.0 * np.log(2.0) - assert len(y.grad_fn) == 1 - grad_fn = y.grad_fn[0] - assert grad_fn.__name__ == "diff_power_arg_2" - - -def test_can_differentiate_exp(): - x = tensor(2.0) - - z = exp(x) - - assert z == np.exp(2.0) - - z.backward() - - assert x.grad == np.exp(2.0) - grad_fn_equal(x, [partial(mul, y=np.exp(2.0))]) - - -def test_can_differentiate_log(): - x = tensor(2.0) - - z = log(x) - - assert z == np.log(2.0) - - z.backward() - - assert x.grad == 0.5 - assert len(x.grad_fn) == 1 - - -def test_can_differentiate_sqrt(): - x = tensor(2.0) - - z = sqrt(x) - - assert z == np.sqrt(2.0) - - z.backward() - - assert x.grad == 0.5 - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_sqrt" - - -def test_can_differentiate_sin(): - x = tensor(2.0) - - z = sin(x) - - assert z == np.sin(2.0) - - z.backward() - - assert x.grad == np.cos(1.0) - grad_fn_equal(x, [cos]) - - -def test_can_differentiate_cos(): - x = tensor(2.0) - - z = cos(x) - - assert z == np.cos(2.0) - - z.backward() - - assert x.grad == -np.sin(1.0) - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_cos" - - -def test_can_differentiate_max(): - x = tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - - z = max(x) - - assert np.allclose(z, [6.0]) - z.backward() - - assert np.allclose(x.grad, [[0, 0, 0], [0, 0, 1.0]]) - - -def test_can_reduce_max(): - x = tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - - z = reduce(x, bmax, "ij->i") - - assert np.allclose(z, [3.0, 6.0]) - z.backward() - - assert np.allclose(x.grad, [[0, 0, 1.0], [0, 0, 1.0]]) - - -def test_can_differentiate_min(): - x = tensor([1.0, 2.0, 3.0]) - - z = min(x) - - assert z == 1.0 - z.backward() - - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_min" - - -def test_can_differentiate_matmul(): - a = tensor([[1.0, 2.0], [3.0, 4.0]]) - b = tensor([[5.0, 6.0], [7.0, 8.0]]) - - z = matmul(a, b) - - assert np.allclose(z, tensor([[19.0, 22.0], [43.0, 50.0]])), z - - z.backward() - - assert np.allclose(np.array(a.grad), np.array([[11, 15], [11, 15]])) - - assert np.allclose(np.array(b.grad), np.array([[4, 4], [6, 6]])) - - -def test_can_differentiate_einsum_two_tensors(): - a = np.arange(12).reshape(3, 4) - b = np.arange(12).reshape(4, 3) - - a = tensor(a) - b = tensor(b) - - output = np.array([[42, 48, 54], [114, 136, 158], [186, 224, 262]]) - - z = einsum(a, b, subscripts="ij,jk->ik") - assert np.allclose(z, output) - - z.backward() - - # einsum(np.ones_like(z), b, subscripts="ik,jk->ij") - assert np.allclose( - a.grad, np.array([[3, 12, 21, 30], [3, 12, 21, 30], [3, 12, 21, 30]]) - ) - # einsum(a, np.ones_like(z), subscripts="ij,ij->jk") - assert np.allclose( - b.grad, np.array([[12, 12, 12], [15, 15, 15], [18, 18, 18], [21, 21, 21]]) - ) - - -def test_can_differentiate_einsum_one_tensor(): - a = tensor(np.arange(12).reshape(3, 4)) - - z = einsum(a, subscripts="ij->i") - assert np.allclose(z, np.array([6, 22, 38])) - - z.backward() - - # einsum(1, ones, subscripts="i,j->ij") - assert np.allclose(a.grad, np.array([[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])) - - -def 
test_can_combine_ops_correctly(): - x = tensor([1.0, 2.0, 3.0]) - y = tensor([4.0, 5.0, 6.0]) - - z = ((x + y) * 0.3) ** 2 - - assert np.allclose(z, tensor([2.25, 4.41, 7.29])) - - z.backward() - - assert np.allclose(np.array(x.grad), 0.18 * np.array(x + y)) - assert np.allclose(np.array(y.grad), 0.18 * np.array(x + y)) - - -def test_can_do_linear_regression_backprop(): - x = tensor(np.linspace(-1.0, 1.0, 100)) - - # hard code slope = 2, intercept = -1 - y = tensor(np.linspace(-1.0, 1.0, 100) * 2 - 1) - - slope = tensor(0.01) # start off with a small slope - intercept = tensor(0.0) # start off with a small intercept - - # This feels a bit high, and im not sure why - learning_rate = tensor(1e-1) - - prev_loss = np.inf - for _ in range(100): - z = einsum(slope, x, subscripts=",i->i") - z += einsum(intercept, np.ones_like(x), subscripts=",i->i") - - square_error = (z - y) ** 2 - loss = einsum(square_error, subscripts="i->") / square_error.shape[0] - - # Make sure loss is decreasing - assert prev_loss > loss - prev_loss = loss - - # Do backprop - loss.backward() - - with no_grad(): - slope = slope - slope.grad * learning_rate - intercept = intercept - intercept.grad * learning_rate - - # Make sure we are close to the true slope and intercept - assert np.allclose(slope, 2.0, rtol=0.1) - assert np.allclose(intercept, -1.0, rtol=0.1) diff --git a/tests/test_basic_operations.py b/tests/test_basic_operations.py deleted file mode 100644 index 668c09a..0000000 --- a/tests/test_basic_operations.py +++ /dev/null @@ -1,146 +0,0 @@ -import math - -import numpy as np - -from tricycle.ops import (add, cos, div, einsum, exp, log, matmul, max, min, mul, negate, nothing, pow, reduce_sum, sin, sqrt, sub) - - -def test_can_negate(): - a = np.array([1, 2, 3]) - result = np.array([-1, -2, -3]) - - assert np.allclose(negate(a), result) - - -def test_can_do_nothing(): - a = np.array([1, 2, 3]) - assert np.allclose(nothing(a), a) - - -def test_can_add(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - result = np.array([5, 7, 9]) - assert np.allclose(add(a, b), result) - - -def test_can_subtract(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - result = np.array([-3, -3, -3]) - assert np.allclose(sub(a, b), result) - - -def test_can_multiply(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - multiply_2 = np.array([2, 4, 6]) - multiply_minus_1 = np.array([-1, -2, -3]) - multiply_half = np.array([0.5, 1, 1.5]) - multiply_array = np.array([4, 10, 18]) - - assert np.allclose(mul(a, np.array(2)), multiply_2) - assert np.allclose(mul(a, np.array(-1)), multiply_minus_1) - assert np.allclose(mul(a, np.array(0.5)), multiply_half) - assert np.allclose(mul(a, b), multiply_array) - - -def test_can_divide(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - result = np.array([0.25, 0.4, 0.5]) - assert np.allclose(div(a, b), result) - - -def test_can_reduce_sum(): - a = np.array([1, 2, 3]) - assert np.allclose(reduce_sum(a), 6) - - -def test_can_exp(): - a = np.array([1, 2, 3]) - - result = np.array([math.exp(1), math.exp(2), math.exp(3)]) - assert np.allclose(exp(a), result) - - -def test_can_log(): - a = np.array([1, 2, 3]) - - result = np.array([math.log(1), math.log(2), math.log(3)]) - assert np.allclose(log(a), result) - - -def test_can_sqrt(): - a = np.array([1, 2, 3]) - - result = np.array([math.sqrt(1), math.sqrt(2), math.sqrt(3)]) - assert np.allclose(sqrt(a), result) - - -def test_can_sin(): - a = np.array([1, 2, math.pi]) - b = np.array([math.sin(1), math.sin(2), 0]) - assert 
np.allclose(sin(a), b) - - -def test_can_cos(): - a = np.array([1, 2, math.pi]) - b = np.array([math.cos(1), math.cos(2), -1]) - assert np.allclose(cos(a), b) - - -def test_can_power(): - a = np.array([1, 2, 3], dtype=float) - b = np.array([4, 5, 6]) - - power_2 = np.array([1, 4, 9]) - power_minus_1 = np.array([1, 1 / 2, 1 / 3]) - power_half = np.array([1, math.sqrt(2), math.sqrt(3)]) - power_b = np.array([1, 32, 729]) - - assert np.allclose(pow(a, np.array(2)), power_2) - assert np.allclose(pow(a, np.array(-1)), power_minus_1) - assert np.allclose(pow(a, np.array(0.5)), power_half) - assert np.allclose(pow(a, b), power_b) - - -def test_can_max(): - a = np.array([1, 2, 3]) - assert np.allclose(max(a), 3) - - -def test_can_min(): - a = np.array([1, 2, 3]) - assert np.allclose(min(a), 1) - - -def test_can_matrix_multiply(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - assert np.allclose(matmul(a, b), 32) - - -def test_can_einsum(): - a = np.arange(25).reshape(5, 5) - b = np.arange(25).reshape(5, 5) - v = np.arange(5) - - answer = np.einsum("ij,jk->ik", a, b) - assert np.allclose(einsum(a, b, subscripts="ij,jk->ik"), answer) - - answer = np.einsum("i,i->", v, v) - assert np.allclose(einsum(v, v, subscripts="i,i->"), answer) - - answer = np.einsum("i->", v) - assert np.allclose(einsum(v, subscripts="i->"), answer) - - answer = np.einsum("ij->j", a) - assert np.allclose(einsum(a, subscripts="ij->j"), answer) - - answer = np.einsum("ij,j->i", a, v) - assert np.allclose(einsum(a, v, subscripts="ij,j->i"), answer) diff --git a/tests/test_composite_functions.py b/tests/test_composite_functions.py deleted file mode 100644 index 7ed3419..0000000 --- a/tests/test_composite_functions.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np - -from tricycle.ops import mean, sigmoid, softmax, tensor - - -def test_can_mean(): - assert mean(tensor([1, 2, 3])) == 2 - - -def test_can_softmax(): - # Single row - result = softmax(tensor([0, 1, 0])) - expected = tensor([0.21194156, 0.57611688, 0.21194156]) - assert np.allclose(result, expected) - - # Multiple rows - result = softmax(tensor([[0, 1, 0], [1, 0, 0]])) - expected = tensor( - [[0.21194156, 0.57611688, 0.21194156], [0.57611688, 0.21194156, 0.21194156]] - ) - assert np.allclose(result, expected) - - -def test_can_differentiate_softmax(): - x = tensor([1, 2, 3]) - x.name = "x" - z = softmax(x) - z.name = "z" - z.backward() - - assert np.allclose(x.grad, [-0.09003057, -0.24472847, 0.33475904]) - - -def test_can_sigmoid(): - result = sigmoid(tensor([1, 2, 3])) - expected = tensor([0.73105858, 0.88079708, 0.95257413]) - assert np.allclose(result, expected) diff --git a/tests/test_loss_functions.py b/tests/test_loss_functions.py deleted file mode 100644 index d51b431..0000000 --- a/tests/test_loss_functions.py +++ /dev/null @@ -1,98 +0,0 @@ -import numpy as np -from sklearn.datasets import load_iris - -from tricycle.experiments import plot_loss -from tricycle.initialisers import init_xavier -from tricycle.loss import categorical_crossentropy, mean_square_error -from tricycle.ops import einsum, no_grad, sigmoid, tensor - - -def test_mean_square_error(): - x = tensor(np.linspace(-1.0, 1.0, 100)) - - # hard code slope = 2, intercept = -1 - y = tensor(np.linspace(-1.0, 1.0, 100) * 2 - 1) - - slope = tensor(0.01) # start off with a small slope - intercept = tensor(0.0) # start off with a small intercept - - learning_rate = tensor(0.1) - - prev_loss = np.inf - # sourcery skip: no-loop-in-tests - for _ in range(100): - expanded_intercept = einsum(intercept, 
np.ones_like(x), subscripts=",a->a") - - z = einsum(x, slope, subscripts="a,->a") + expanded_intercept - - # Calculate mean squared error - loss = mean_square_error(z, y) - - # Make sure loss is decreasing - assert prev_loss > loss - prev_loss = loss - - # Do backprop - loss.backward() - - with no_grad(): - slope = slope - slope.grad * learning_rate - intercept = intercept - intercept.grad * learning_rate - - # Make sure we are close to the true slope and intercept - assert np.allclose(slope, 2.0, rtol=0.1) - assert np.allclose(intercept, -1.0, rtol=0.1) - - -# This does not currently work and I haven't figured out why yet -def test_cross_entropy(): - """ - We'll test cross entropy loss by fitting a single layer neural - network to the iris dataset - """ - iris = load_iris() - x = tensor(iris.data, name="x", requires_grad=False) - - # one hot encode the labels - y = tensor( - np.eye(iris.target.max() + 1, dtype=np.float32)[iris.target], - name="y", - requires_grad=False, - ) - - slope = init_xavier(shape=(x.shape[1], y.shape[1])) # start off with a small slope - intercept = tensor( - [-0.05, 0.01, 0.05], name="intercept" - ) # start off with a small intercept - - learning_rate = tensor(1e-2) - learning_rate.requires_grad = False - - prev_loss = np.inf - losses = [] - # sourcery skip: no-loop-in-tests - for _ in range(100): - print(f"{slope.shape=}, {x.shape=}, {y.shape=}") - z = einsum(x, slope, subscripts="ij,jk->ik") - ones = tensor(np.ones_like(z), requires_grad=False) - broadcasted_intercept = einsum(intercept, ones, subscripts="k,jk->jk") - z += broadcasted_intercept - z.name = "z" - - print(f"{z.shape=}") - - # Calculate cross entropy - loss = categorical_crossentropy(z, y) - losses.append(loss) - assert prev_loss > loss - - # Do backprop - loss.backward() - - with no_grad(): - slope = slope - slope.grad * learning_rate - slope.grad = None - intercept = intercept - intercept.grad * learning_rate - intercept.grad = None - - plot_loss(losses) diff --git a/tests/test_simple_neural_network.py b/tests/test_simple_neural_network.py deleted file mode 100644 index ef967a4..0000000 --- a/tests/test_simple_neural_network.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np -from sklearn.datasets import load_diabetes -from sklearn.preprocessing import StandardScaler - -from tricycle.activation import relu -from tricycle.dataset import InfiniteDataset -from tricycle.experiments import smooth, plot_loss -from tricycle.initialisers import init_xavier, init_zero -from tricycle.loss import mean_square_error -from tricycle.ops import einsum, no_grad - - -def test_simple_neural_network(): - """ - Train a simple regression network on the diabetes dataset - """ - # Make sure our results are reproducible - np.random.seed(42) - - batch_size = 64 - learning_rate = 1e-0 - n_epochs = 1000 - n_labels = 1 - n_features = 10 - layer_1_size = 16 - layer_2_size = 8 - - diabetes = load_diabetes(scaled=True) - x = diabetes.data - - # We need to scale the target to avoid giant grads - y = diabetes.target.reshape(-1, 1) - y = StandardScaler().fit_transform(y) - - # Build an iterator that generates random mini-batches. 
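Before the ds = InfiniteDataset(...) line that follows, a plain-numpy sketch of the batch sampling that iterator performs, mirroring the InfiniteDataset class deleted above (np.random.choice samples with replacement by default):

import numpy as np

# One draw of a random mini-batch, as InfiniteDataset.__iter__ yields on each step.
X = np.arange(20.0).reshape(10, 2)
y = np.arange(10.0)
batch_size = 4
idx = np.random.choice(len(X), batch_size)  # indices sampled with replacement
X_batch, y_batch = X[idx], y[idx]
print(X_batch.shape, y_batch.shape)  # (4, 2) (4,)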
- ds = InfiniteDataset(x, y, batch_size=batch_size) - - # define a model - layer_1_weights = init_xavier((n_features, layer_1_size), name="layer_1_weights") - layer_2_weights = init_xavier((layer_1_size, layer_2_size), name="layer_2_weights") - layer_3_weights = init_xavier((layer_2_size, n_labels), name="layer_3_weights") - - layer_1_bias = init_zero((1, layer_1_size), name="layer_1_bias") - layer_2_bias = init_zero((1, layer_2_size), name="layer_2_bias") - layer_3_bias = init_zero((1, n_labels), name="layer_3_bias") - - params = [ - layer_1_weights, - layer_2_weights, - layer_3_weights, - layer_1_bias, - layer_2_bias, - layer_3_bias, - ] - - def model(x, params): - ( - layer_1_weights, - layer_2_weights, - layer_3_weights, - layer_1_bias, - layer_2_bias, - layer_3_bias, - ) = params - - layer_1 = relu( - einsum(x, layer_1_weights, subscripts="ij,jk->ik") + layer_1_bias - ) - layer_2 = relu( - einsum(layer_1, layer_2_weights, subscripts="ij,jk->ik") + layer_2_bias - ) - layer_3 = ( - einsum(layer_2, layer_3_weights, subscripts="ij,jk->ik") + layer_3_bias - ) - return layer_3 - - losses = [] - - for batch_idx, (X, y) in enumerate(ds): - preds = model(X, params) - loss = mean_square_error(preds, y) - - loss.backward() - losses.append(loss) - - with no_grad(): - for idx, param in enumerate(params): - params[idx] = param - (param.grad * learning_rate) - param.grad = None - - - # sourcery skip: no-conditionals-in-tests - if batch_idx >= n_epochs: - break - - # Check that our loss has actually decreased - # It starts at about 0.9 and drops to around 0.5 - # if the model actually learns - plot_loss(losses) - assert list(smooth(losses))[-1] < 0.6 diff --git a/tests/test_tensor.py b/tests/test_tensor.py deleted file mode 100644 index 884b00d..0000000 --- a/tests/test_tensor.py +++ /dev/null @@ -1,33 +0,0 @@ -import math - -import numpy as np - -from tricycle.ops import tensor - - -def test_can_handle_scalar(): - tensor(1) - - -def test_can_handle_vector(): - tensor([1, 2, 3]) - - -def test_can_handle_matrix(): - tensor([[1, 2, 3], [4, 5, 6]]) - - -def test_can_handle_3d(): - tensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) - - -def test_can_handle_irrational_matrix(): - tensor([[1, 2, math.sqrt(3)], [4, 5, 6], [7, 8, 9]]) - - -def test_can_handle_16d(): - # we arent going all the way to 32d because the - # array ends up being 32GB! - shape = [2] * 16 - big_array = np.ones(shape) - tensor(big_array)
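For reference, a minimal end-to-end sketch of the API removed in this diff, mirroring tests/test_autograd.py::test_can_combine_ops_correctly above. It only runs against the deleted sources, and this version of Tensor.backward() also draws the computational graph with matplotlib.

import numpy as np
from tricycle.ops import tensor, no_grad

x = tensor([1.0, 2.0, 3.0])
y = tensor([4.0, 5.0, 6.0])

z = ((x + y) * 0.3) ** 2  # graph is recorded through the operator overloads
z.backward()              # also plots the graph, per the implementation above

print(np.array(x.grad))   # 0.18 * (x + y), as the test asserts

with no_grad():           # updates inside this block are not traced
    x = x - x.grad * 0.1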