diff --git a/src/tricycle/__init__.py b/src/tricycle/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/tricycle/activation.py b/src/tricycle/activation.py deleted file mode 100644 index c116095..0000000 --- a/src/tricycle/activation.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np - -from tricycle.ops import Tensor, einsum, tensor, to_tensor - - -@to_tensor -def relu(x: Tensor) -> Tensor: - """ - Compute the relu of a tensor - """ - result = tensor(np.maximum(x, 0)) - - def diff_relu(arg: Tensor) -> Tensor: - weight = (x > 0).astype(x.dtype) - return einsum(weight, arg, subscripts="ij,ij->ij") - - result.back_fn = (diff_relu,) - result.args = (x,) - - return result diff --git a/src/tricycle/dataset.py b/src/tricycle/dataset.py deleted file mode 100644 index 6dbaa7d..0000000 --- a/src/tricycle/dataset.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np - - -class InfiniteDataset: - """ - Iterator that infinitely generates random batches from the dataset - """ - - def __init__(self, X, y, batch_size=32): - self.X = X - self.y = y - self.batch_size = batch_size - - def __iter__(self): - while True: - indices = np.random.choice(len(self.X), self.batch_size) - yield self.X[indices], self.y[indices] - - def __len__(self): - return len(self.X) diff --git a/src/tricycle/experiments.py b/src/tricycle/experiments.py deleted file mode 100644 index 59b34de..0000000 --- a/src/tricycle/experiments.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Some utils that help with running experiments (I couldn't find anywhere else -for them to go) -""" - -from matplotlib import pyplot as plt -from matplotlib.widgets import Slider - - -def smooth(values: list[float], factor=0.99): - """ - Apply exponential smoothing to a list of values - """ - assert 0 <= factor <= 1 - - smoothed = values[0] - for val in values[1:]: - smoothed = factor * smoothed + (1 - factor) * val - yield smoothed - - -def plot_loss(losses): - """ - Plot a loss curve with a slider to control smoothing - """ - factor = 0.99 - - fig, ax = plt.subplots(figsize=(16, 9)) - ax.set_ylim(min(losses) * 0.95, max(losses) * 1.05) - - line = ax.plot(list(smooth(losses, factor=factor))) - fig.subplots_adjust(left=0.25, bottom=0.25) - - slider_ax = fig.add_axes((0.25, 0.1, 0.65, 0.03)) - slider = Slider( - ax=slider_ax, - label="Smoothing", - valmin=0, - valmax=1, - valinit=factor, - ) - - def update(factor: float): - line[0].set_ydata(list(smooth(losses, factor=factor))) - plt.gcf().canvas.draw() - - slider.on_changed(update) - - plt.show() diff --git a/src/tricycle/initialisers.py b/src/tricycle/initialisers.py deleted file mode 100644 index 21dbcb1..0000000 --- a/src/tricycle/initialisers.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -from tricycle.ops import tensor - - -def init_normal(shape, name: str = "", loc=0.0, scale=0.01): - """ - Initialize a tensor with values sampled from a normal distribution - """ - return tensor(np.random.normal(size=shape, loc=loc, scale=scale), name=name) - - -def init_zero(shape, name: str = ""): - """ - Initialize a tensor with zeros - """ - return tensor(np.zeros(shape), name=name) - - -def init_xavier(shape, name: str = ""): - """ - Initialize a tensor with xavier/glorot initialisation - """ - f_in, f_out = shape - bound = np.sqrt(6) / np.sqrt(f_in + f_out) - return tensor(np.random.uniform(low=-bound, high=bound, size=shape), name=name) diff --git a/src/tricycle/loss.py b/src/tricycle/loss.py deleted file mode 100644 index f751e49..0000000 --- a/src/tricycle/loss.py +++ /dev/null @@ 
-1,38 +0,0 @@ -from string import ascii_letters - -from tricycle.ops import Tensor, einsum, log, mean, softmax, tensor - - -def mean_square_error(y_pred: Tensor, y_true: Tensor) -> Tensor: - """ - Compute the mean square error between two identically shaped tensors - """ - assert y_pred.shape == y_true.shape - - errors = y_pred - y_true - - indices = ascii_letters[: len(errors.shape)] - subscripts = f"{indices}, {indices} -> {indices}" - - square_errors = einsum(errors, errors, subscripts=subscripts) - return mean(square_errors) - - -def categorical_crossentropy(y_logits: Tensor, y_true: Tensor) -> Tensor: - """ - NOTE: This does not currently work. I cannot get the test to pass - - Compute the cross entropy loss between two identically shaped tensors. - Note, y_pred are assumed to be logits. That is, they are expected to - be unnormalised. y_true is assumed to be normalised already. - Usually this is a one-hot encoding of a single categorical output - but sharing labels across multiple outputs is possible. - """ - y_pred = softmax(y_logits) - y_pred.name = "y_pred" - log_probs = log(y_pred) - log_probs.name = "log_probs" - loss = -einsum(log_probs, y_true, subscripts="ij,ij->i") - loss.name = "loss" - # raise Exception(f"{log_probs=}, {y_true=}, {y_pred=}, {y_true=}, {loss=}") - return einsum(loss, subscripts="i->") / tensor(loss.shape[0], requires_grad=False) diff --git a/src/tricycle/ops.py b/src/tricycle/ops.py deleted file mode 100644 index 3d8cc5f..0000000 --- a/src/tricycle/ops.py +++ /dev/null @@ -1,679 +0,0 @@ -import logging -import uuid -from functools import partial, wraps -from string import ascii_letters, ascii_lowercase -from typing import Callable, List, Optional, Tuple, Union - -import networkx as nx -import numpy as np -import pydot -from matplotlib import pyplot as plt -from networkx.drawing.nx_pydot import graphviz_layout - -# Type signature for Tensor operation -Op = Callable[..., "Tensor"] - -grad: bool = True - -logger = logging.getLogger(__name__) - - -def to_tensor(fn: Op) -> Op: - """ - A decorator to convert non-tensor arguments to tensors - """ - - @wraps(fn) - def wrapped(*args, **kwargs): - args = [arg if isinstance(arg, Tensor) else tensor(arg) for arg in args] - return fn(*args, **kwargs) - - return wrapped - - -class no_grad: - """ - A context manager to disable gradient calculation - """ - - def __enter__(self): - global grad - grad = False - - def __exit__(self, *_): - global grad - grad = True - - -class Tensor(np.ndarray): - """ - An N-dimensional grid of numbers. This is implemented as a subclass - of a standard numpy array - """ - - _id: int - args: tuple["Tensor", ...] | None = None - back_fn: tuple[Op, ...] | None = None - grad_fn: List[List[Op]] = [] - grad: Optional["Tensor"] = None - name: Optional[str] = None - requires_grad: bool = False - - def backward(self): - """ - Perform auto-differentiation by traversing the computational graph - to find every path from the root (loss) to each leaf (parameter).
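A minimal sketch of the path-composition idea this docstring describes, written with plain functions rather than the Tensor class; the helper and names here are illustrative only, not part of the deleted module:

import numpy as np

# Each edge on the path from the root to a leaf carries a back_fn; the leaf's
# gradient comes from feeding a tensor of ones through those functions in order,
# which is what backward() does once it reaches a leaf.
def gradient_along_path(path_of_back_fns, root_value):
    grad = np.ones_like(root_value)
    for back_fn in path_of_back_fns:
        grad = back_fn(grad)
    return grad

# Example: z = (x * 3) + 2, so dz/dx is 3 everywhere.
x = np.array([1.0, 2.0])
z = x * 3 + 2
path = [lambda g: g, lambda g: g * 3]  # back_fns for the add, then the multiply
print(gradient_along_path(path, z))    # [3. 3.]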
- - Once we reach a leaf, we calculate its gradient by applying - the gradient function for each operation that we passed along the - path - """ - - queue = [self] - labels = {} - edge_labels = {} - - G = nx.DiGraph() - G.add_node(str(self)) - labels[str(self)] = self.name or "" - - logger.info("\n\n") - while queue: - logger.info(f"stack: {queue}") - current_node = queue.pop() - - # At intermediate node - if ( - current_node.back_fn is not None - and current_node.args is not None # noqa: E501 - ): - # Get gradient functions for the operation - grad_fn = current_node.grad_fn - - logger.info( - f"{current_node.args=} {current_node.back_fn=} { current_node=}" - ) - # Update gradient functions for each parent node - for arg, op in zip(current_node.args, current_node.back_fn): - if str(arg) not in G.nodes: - G.add_node(str(arg)) - labels[str(arg)] = arg.name or "" - G.add_edge(str(arg), str(current_node)) - - try: - edge_labels[(str(arg), str(current_node))] = op.__name__ - except AttributeError: - edge_labels[(str(arg), str(current_node))] = op.func.__name__ - - arg.grad_fn = grad_fn + [op] - - # Add each arg to the stack for further processing - queue = [arg] + queue - - # At leaf node - elif current_node.requires_grad: - grad = tensor(np.ones_like(self)) - logger.warning(current_node.grad_fn) - for op in current_node.grad_fn: - # Follow the path from root to leaf, applying - # each function to get the gradient for the leaf - # - # TODO: this is slow - # - # - We can speed this up by caching intermediate values - # that are used by multiple leaves - # - # - We should also be able to speed this up by analysing - # the path to see if any operations can be fused - # - # - We should also probably ignore leaves that we don't - # need the gradient for (e.g the inputs) - grad = op(grad) - - if current_node.grad is None: - current_node.grad = grad - else: - # If there are multiple paths to this node, we - # need to add all of the gradients together - current_node.grad += grad - - fig, ax = plt.subplots(figsize=(10, 10)) - pos = graphviz_layout(G, prog="dot") - nx.draw(G, labels=labels, pos=pos, ax=ax) - nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edge_labels, ax=ax) - plt.show() - - @to_tensor - def __add__(self, other: "Tensor") -> "Tensor": - return add(self, other) - - @to_tensor - def __iadd__(self, other: "Tensor") -> "Tensor": - return add(self, other) - - @to_tensor - def __sub__(self, other: "Tensor") -> "Tensor": - return sub(self, other) - - @to_tensor - def __mul__(self, other: "Tensor") -> "Tensor": - return mul(self, other) - - @to_tensor - def __truediv__(self, other: "Tensor") -> "Tensor": - return div(self, other) - - @to_tensor - def __pow__(self, other: "Tensor") -> "Tensor": - return pow(self, other) - - @to_tensor - def __neg__(self) -> "Tensor": - return negate(self) - - @to_tensor - def __matmul__(self, other: "Tensor") -> "Tensor": - return matmul(self, other) - - def mean(self) -> "Tensor": - return mean(self) - - @property - def T(self) -> "Tensor": - return transpose(self) - - def __str__(self) -> str: - return self.__repr__() - - def __repr__(self) -> str: - if self.name is None: - return f"{super().__repr__()}" - return f"{super().__repr__()[:-1]}, {self.name=})" - - -class bind(partial): - """ - A version of partial which accepts Ellipsis (...) as a placeholder - credit: https://stackoverflow.com/questions/7811247/ - """ - - def __call__(self, *args, **keywords): - keywords = {**self.keywords, **keywords} - iargs = iter(args) - args = (next(iargs) if arg is ... 
else arg for arg in self.args) - return self.func(*args, *iargs, **keywords) - - -# -----------------utility functions-------- - - -def tensor( - *args, name: Optional[str] = None, requires_grad: bool = True, **kwargs -) -> Tensor: - """ - Create a new Tensor instance. First, we convert the argument to a numpy - array and then to a tensor - """ - result = np.asarray(*args, **kwargs).view(Tensor) - result.name = name - result.requires_grad = requires_grad - return result - - -@to_tensor -def nothing(x: Tensor) -> Tensor: - """ - Do nothing - """ - global grad - result = tensor(x) - if grad: - result.back_fn = (nothing,) - result.args = (x,) - return result - - -# ----------binary functions---------------- - - -@to_tensor -def sub(x: Tensor, y: Tensor) -> Tensor: - """ - Subtract two tensors - """ - global grad - result = tensor(np.subtract(x, y)) - if grad: - result.back_fn = (nothing, negate) - result.args = (x, y) - return result - - -@to_tensor -def negate(x: Tensor) -> Tensor: - """ - Swap the sign of every element of a tensor - """ - global grad - - result = tensor(np.multiply(x, -1)) - if grad: - result.back_fn = (negate,) - result.args = (x,) - return result - - -@to_tensor -def add(x: Tensor, y: Tensor) -> Tensor: - """ - Add two tensors - """ - global grad - result = tensor(np.add(x, y)) - if grad: - result.back_fn = (nothing, nothing) - result.args = (x, y) - return result - - -@to_tensor -def mul(x: Tensor, y: Tensor) -> Tensor: - """ - Multiply two tensors - """ - global grad - result = tensor(np.multiply(x, y)) - if grad: - result.back_fn = ( - partial(mul, y=y), - partial(mul, y=x), - ) - result.args = (x, y) - return result - - -@to_tensor -def div(x: Tensor, y: Tensor) -> Tensor: - """ - Divide two tensors - """ - # scalar / tensor or tensor / scalar - if len(x.shape) == 0 or len(y.shape) == 0: - minus_one = tensor(-1.0, requires_grad=False, name="minus_one") - return mul(x, pow(y, minus_one)) - - # tensor / tensor - elif len(x.shape) == len(y.shape): - one = tensor(1.0, requires_grad=False, name="one") - indices = ascii_lowercase[: len(x.shape)] - return einsum(x, one / y, subscripts=f"{indices},{indices}->{indices}") - else: - raise ValueError( - "Division between two tensors of different shapes is not supported."
- ) - - -# -----------elementwise unary functions---------------- - - -@to_tensor -def pow(x: Tensor, y: Tensor) -> Tensor: - """ - Raise every element of a tensor to the power of another tensor - """ - global grad - result = tensor(np.power(x, y)) - if not grad: - return result - - result.back_fn = ( - partial(mul, y=tensor(np.power(x, y - 1)) * y), - partial(mul, y=result * log(x)), - ) - result.args = (x, y) - return result - - -@to_tensor -def exp(x: Tensor) -> Tensor: - """ - Raise every element of a tensor to the power of e - """ - global grad - result = tensor(np.exp(x)) - if not grad: - return result - result.back_fn = (partial(mul, y=result),) - result.args = (x,) - return result - - -@to_tensor -def log(x: Tensor) -> Tensor: - """ - Find the natural log of every element of a tensor - """ - global grad - result = tensor(np.log(x)) - if not grad: - return result - - result.back_fn = (partial(div, y=x),) - result.args = (x,) - return result - - -@to_tensor -def sqrt(x: Tensor) -> Tensor: - """ - Find the square root of every element of a tensor - """ - global grad - result = tensor(np.sqrt(x)) - if not grad: - return result - - def diff_sqrt(arg: Tensor) -> Tensor: - return pow(arg, -0.5) * 0.5 - - result.back_fn = (diff_sqrt,) - result.args = (x,) - return result - - -@to_tensor -def sin(x: Tensor) -> Tensor: - """ - Find the sine of every element of a tensor - """ - global grad - result = tensor(np.sin(x)) - if not grad: - return result - - result.back_fn = (cos,) - result.args = (x,) - return result - - -@to_tensor -def cos(x: Tensor) -> Tensor: - """ - Find the cosine of every element of a tensor - """ - global grad - result = tensor(np.cos(x)) - if not grad: - return result - - def diff_cos(arg: Tensor) -> Tensor: - return -sin(arg) - - result.back_fn = (diff_cos,) - result.args = (x,) - return result - - -# ------------------reduce functions---------------- - - -def reduce(x: Tensor, method: Op, subscripts: str) -> Tensor: - """ - Reduce a tensor along some dimensions by applying a reduction function - to those indices. A reduction function generates a binary tensor which - is multiplied by the input and reduces according to the subscripts - """ - indices, output = _parse_subscripts(subscripts) - assert ( - len(indices) == 1 - ), f"Can only reduce a single tensor at a time. 
Indices suggested: {len(indices)} tensors: {indices}" - - [idx] = indices - axis = tuple(i for i, char in enumerate(idx) if char not in output) - assert axis, "Tensor must be reduced along at least one axis" - binary_tensor = method(x, axis) - binary_tensor.requires_grad = False - binary_tensor.name = "binary" - - subscripts = f"{idx},{idx}->{output}" - - return einsum(x, binary_tensor, subscripts=subscripts) - - -# --------------------indicator functions--------------------- -def bmax(x: Tensor, axis: Union[int, Tuple[int]]) -> Tensor: - """ - Return a binary tensor where each element is 1 if the corresponding element - is the largest along an axis that is being reduced along - """ - return (x == np.max(x, axis=axis, keepdims=True)).astype(int) - - -def bmin(x: Tensor, axis: Union[int, Tuple[int]]) -> Tensor: - """ - Return a binary tensor where each element is 1 if the corresponding element - is the smallest along an axis that is being reduced along - """ - return (x == np.min(x, axis=axis, keepdims=True)).astype(int) - - -@to_tensor -def max(x: Tensor) -> Tensor: - """ - Find the largest element of a tensor - """ - indices = ascii_lowercase[: len(x.shape)] - return reduce(x, bmax, f"{indices}->") - - -@to_tensor -def min(x: Tensor) -> Tensor: - """ - Find the smallest element of a tensor - """ - indices = ascii_lowercase[: len(x.shape)] - return reduce(x, bmin, f"{indices}->") - - -def _parse_subscripts(subscripts: str) -> tuple[list[str], str]: - """ - Parse a subscripts string into a list of indices and a result - """ - indices, result = subscripts.split("->") - indices = indices.split(",") - return indices, result - - -@to_tensor -def einsum(*tensors: Tensor, subscripts: str) -> Tensor: - """ - The way we'll be doing tensor manipulation is with the einsum function from - numpy. This uses a version of Einstein summation notation to do tensor - operations. It takes a bit of getting used to but ends up being a really - powerful way to do tensor operations. It feels like the natural way to - do deep learning. - - Examples - -------- - >>> a = np.arange(25).reshape(5,5) - - Trace of a matrix: - - >>> np.einsum('ii', a) - 60 - - Sum over an axis: - - >>> np.einsum('ij->i', a) - array([ 10, 35, 60, 85, 110]) - - Tensor contraction: - - >>> a = np.arange(60.).reshape(3,4,5) - >>> b = np.arange(24.).reshape(4,3,2) - >>> np.einsum('ijk,jil->kl', a, b) - array([[4400., 4730.], - [4532., 4874.], - [4664., 5018.], - [4796., 5162.], - [4928., 5306.]]) - - """ - global grad - - indices, output = _parse_subscripts(subscripts) - - # Every single input tensor is equivalent to a dual input where - # the second input is a 1 tensor with the indices that don't appear in the - # output - if len(tensors) == 1: - one_indices = "" - one_shape = [] - for size, idx in zip(tensors[0].shape, indices[0]): - if idx not in output: - one_indices += idx - one_shape.append(size) - - indices = [indices[0], one_indices] - ones = tensor(np.ones(one_shape), requires_grad=False, name="ones") - tensors = (tensors[0], ones) - - if not tensors or len(tensors) > 2: - raise NotImplementedError( - f"Einsum is only implemented for 1 or 2 inputs.
Found {len(tensors)}" - ) - - indices = ",".join(indices) - subscripts = f"{indices}->{output}" - - result = tensor(np.einsum(subscripts, *tensors)) - - if not grad: - return result - - back_fns = [] - indices, output = _parse_subscripts(subscripts) - - # The rule for differentiating einsum wrt one of its arguments is: - # Swap the indices of the output and the argument being differentiated - # in the subscripts. Then pass the current gradient as an argument in - # place of the argument being differentiated. - # for example: - # >>> a = np.arange(12).reshape(3, 4) - # >>> b = np.arange(12).reshape(4, 3) - # >>> c = np.einsum('ij,jk->ik', a, b) - # - # >>> c.backward() - # >>> assert a.grad == np.einsum('ik,jk->ij', np.ones_like(c), b) - # Notice that the indices have swapped and a has been replaced with - # the gradient of c (the current gradient) - for idx in range(len(tensors)): - left = tensors[:idx] - right = tensors[idx + 1 :] - - indices, output = _parse_subscripts(subscripts) - output, indices[idx] = indices[idx], output - indices = ",".join(indices) - diff_subscripts = f"{indices}->{output}" - - def diff_einsum( - arg: Tensor, left=left, right=right, subscripts=diff_subscripts - ) -> Tensor: - """ - Derivative of einsum wrt a single input tensor - """ - args = left + (arg,) + right - return einsum(*args, subscripts=subscripts) - - back_fns.append(diff_einsum) - - result.args = tuple(tensors) - result.back_fn = tuple(back_fns) - - return result - - -@to_tensor -def matmul(x: Tensor, y: Tensor) -> Tensor: - """ - Compute the matrix multiplication of two tensors - """ - # We are limited by the number of available letters but thankfully - # numpy has a max dimension of 32 so this shouldn't be too much of a - # problem. If we start doing giant einsums then i'll probably need to - # build a custom implementation of einsum but we should be good for now - assert len(x.shape) + len(y.shape) <= len( - ascii_letters - ), f"Cannot perform matmul on tensors with more than {len(ascii_letters)} dimensions in total" - assert len(x.shape), "Cannot perform matmul on a 0D tensor" - assert len(y.shape), "Cannot perform matmul on a 0D tensor" - - left_indices = ascii_letters[: len(x.shape) - 1] if len(x.shape) > 1 else "" - right_indices = ascii_letters[-len(y.shape) + 1 :] if len(y.shape) > 1 else "" - - subscripts = f"{left_indices}i,i{right_indices}->{left_indices}{right_indices}" - return einsum(x, y, subscripts=subscripts) - - -@to_tensor -def transpose(x: Tensor) -> Tensor: - """ - Compute the transpose of a tensor - """ - assert ( - len(x.shape) <= 2 - ), "Can only perform transpose on 2D tensors. Use einsum for more complex cases." 
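A quick numeric check of the index-swapping rule described in the comment above, using plain numpy and assumed example shapes, independent of the deleted Tensor class:

import numpy as np

# For c = einsum("ij,jk->ik", a, b), the rule says the gradient of sum(c) with
# respect to a is einsum("ik,jk->ij", ones_like(c), b): swap the output's
# subscripts with a's and substitute the incoming gradient for a.
a = np.arange(12.0).reshape(3, 4)
b = np.arange(12.0).reshape(4, 3)
c = np.einsum("ij,jk->ik", a, b)

grad_a = np.einsum("ik,jk->ij", np.ones_like(c), b)
print(np.allclose(grad_a, np.ones_like(c) @ b.T))  # True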
- return einsum(x, subscripts="ij->ji") - - -@to_tensor -def mean(x: Tensor): - """ - Compute the mean of a tensor - """ - return reduce_sum(x) / x.shape[0] - - -@to_tensor -def softmax(x: Tensor): - """ - Compute the softmax of a tensor - """ - # For numeric stability we'll subtract the max of the tensor - indices = ascii_letters[: len(x.shape)] - largest_x = reduce(x, bmax, subscripts=f"{indices}->{indices[:-1]}") - ones = tensor(np.ones_like(x), requires_grad=False, name="ones") - - largest_x = einsum( - largest_x, ones, subscripts=f"{indices[:-1]},{indices}->{indices}" - ) - normalised_x = x - largest_x - - normalised_x = exp(normalised_x) - assert len(normalised_x.shape) < len( - ascii_letters - ), f"Cannot perform softmax on tensors with more than {len(ascii_letters)} dimensions" - - if len(x.shape) == 1: - one = tensor(1, name="one", requires_grad=False) - denom = one / einsum(normalised_x, subscripts="i->") - return einsum(denom, normalised_x, subscripts=",i->i") - - indices = ascii_letters[: len(x.shape) - 1] - final_letter = ascii_letters[-1] - denom = 1 / einsum( - normalised_x, subscripts=f"{final_letter}{indices}->{final_letter}" - ) - return einsum( - denom, - normalised_x, - subscripts=f"{final_letter},{final_letter}{indices}->{final_letter}{indices}", - ) - - -@to_tensor -def sigmoid(x: Tensor) -> Tensor: - """ - Compute the sigmoid of a tensor - """ - return tensor(1) / (exp(-x) + 1) diff --git a/tests/test_autograd.py b/tests/test_autograd.py deleted file mode 100644 index bcf9ead..0000000 --- a/tests/test_autograd.py +++ /dev/null @@ -1,351 +0,0 @@ -from functools import partial - -import numpy as np - -from tricycle.ops import (add, bmax, cos, div, einsum, exp, log, matmul, max, - min, mul, negate, no_grad, nothing, pow, reduce, - reduce_sum, sin, sqrt, sub, tensor) - - -def partial_func_equals(a, b): - assert a.func == b.func - assert a.args == b.args - - -def grad_fn_equal(tensor_, ops): - assert len(tensor_.grad_fn) == len(ops) - - for fn, op in zip(tensor_.grad_fn, ops): - if isinstance(op, partial): - partial_func_equals(fn, op) - else: - assert fn == op - - -def test_can_differentiate_subtract(): - x = tensor(1.0) - y = tensor(2.0) - - z = sub(y, x) - - assert z == 1.0 - - z.backward() - - assert x.grad == -1.0 - assert y.grad == 1.0 - grad_fn_equal(x, [negate]) - grad_fn_equal(y, [nothing]) - - -def test_can_differentiate_negate(): - x = tensor(1.0) - - z = negate(x) - - assert z == -1.0 - - z.backward() - - assert x.grad == -1.0 - grad_fn_equal(x, [negate]) - - -def test_can_differentiate_nothing(): - x = tensor(1.0) - - z = nothing(x) - - assert z == 1.0 - - z.backward() - - assert x.grad == 1.0 - grad_fn_equal(x, [nothing]) - - -def test_can_differentiate_add(): - x = tensor(1.0) - y = tensor(2.0) - - z = add(x, y) - - assert z == 3.0 - - z.backward() - - assert x.grad == 1.0 - grad_fn_equal(x, [nothing]) - - assert y.grad == 1.0 - grad_fn_equal(y, [nothing]) - - -def test_can_differentiate_multiply(): - x = tensor(1.0) - y = tensor(2.0) - - z = mul(x, y) - - assert z == 2.0 - - z.backward() - - assert x.grad == 2.0 - grad_fn_equal(x, [partial(mul, y=y)]) - - assert y.grad == 1.0 - grad_fn_equal(x, [partial(mul, y=x)]) - - -def test_can_differentiate_divide(): - x = tensor(1.0) - y = tensor(2.0) - - z = div(x, y) - - assert z == 0.5 - - z.backward() - - assert x.grad == 0.5 - grad_fn_equal(x, [partial(mul, y=1/y)]) - - assert y.grad == -0.25 - assert grad_fn_equal(y, [partial(mul, y=1/x), partial(mul, y=1/y)]) - - -def test_can_differentiate_reduce_sum(): - x 
= tensor([1.0, 2.0, 3.0]) - z = reduce_sum(x) - assert z == 6.0 - - z.backward() - - assert np.allclose(x.grad, [1, 1, 1]) - - -def test_can_differentiate_power(): - x = tensor(2.0) - y = tensor(3.0) - - z = pow(x, y) - - assert z == 8.0 - - z.backward() - - assert x.grad == 12.0 - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_power_arg_1" - - assert y.grad == 8.0 * np.log(2.0) - assert len(y.grad_fn) == 1 - grad_fn = y.grad_fn[0] - assert grad_fn.__name__ == "diff_power_arg_2" - - -def test_can_differentiate_exp(): - x = tensor(2.0) - - z = exp(x) - - assert z == np.exp(2.0) - - z.backward() - - assert x.grad == np.exp(2.0) - grad_fn_equal(x, [partial(mul, y=np.exp(2.0))]) - - -def test_can_differentiate_log(): - x = tensor(2.0) - - z = log(x) - - assert z == np.log(2.0) - - z.backward() - - assert x.grad == 0.5 - assert len(x.grad_fn) == 1 - - -def test_can_differentiate_sqrt(): - x = tensor(2.0) - - z = sqrt(x) - - assert z == np.sqrt(2.0) - - z.backward() - - assert x.grad == 0.5 - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_sqrt" - - -def test_can_differentiate_sin(): - x = tensor(2.0) - - z = sin(x) - - assert z == np.sin(2.0) - - z.backward() - - assert x.grad == np.cos(1.0) - grad_fn_equal(x, [cos]) - - -def test_can_differentiate_cos(): - x = tensor(2.0) - - z = cos(x) - - assert z == np.cos(2.0) - - z.backward() - - assert x.grad == -np.sin(1.0) - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_cos" - - -def test_can_differentiate_max(): - x = tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - - z = max(x) - - assert np.allclose(z, [6.0]) - z.backward() - - assert np.allclose(x.grad, [[0, 0, 0], [0, 0, 1.0]]) - - -def test_can_reduce_max(): - x = tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) - - z = reduce(x, bmax, "ij->i") - - assert np.allclose(z, [3.0, 6.0]) - z.backward() - - assert np.allclose(x.grad, [[0, 0, 1.0], [0, 0, 1.0]]) - - -def test_can_differentiate_min(): - x = tensor([1.0, 2.0, 3.0]) - - z = min(x) - - assert z == 1.0 - z.backward() - - assert len(x.grad_fn) == 1 - grad_fn = x.grad_fn[0] - assert grad_fn.__name__ == "diff_min" - - -def test_can_differentiate_matmul(): - a = tensor([[1.0, 2.0], [3.0, 4.0]]) - b = tensor([[5.0, 6.0], [7.0, 8.0]]) - - z = matmul(a, b) - - assert np.allclose(z, tensor([[19.0, 22.0], [43.0, 50.0]])), z - - z.backward() - - assert np.allclose(np.array(a.grad), np.array([[11, 15], [11, 15]])) - - assert np.allclose(np.array(b.grad), np.array([[4, 4], [6, 6]])) - - -def test_can_differentiate_einsum_two_tensors(): - a = np.arange(12).reshape(3, 4) - b = np.arange(12).reshape(4, 3) - - a = tensor(a) - b = tensor(b) - - output = np.array([[42, 48, 54], [114, 136, 158], [186, 224, 262]]) - - z = einsum(a, b, subscripts="ij,jk->ik") - assert np.allclose(z, output) - - z.backward() - - # einsum(np.ones_like(z), b, subscripts="ik,jk->ij") - assert np.allclose( - a.grad, np.array([[3, 12, 21, 30], [3, 12, 21, 30], [3, 12, 21, 30]]) - ) - # einsum(a, np.ones_like(z), subscripts="ij,ij->jk") - assert np.allclose( - b.grad, np.array([[12, 12, 12], [15, 15, 15], [18, 18, 18], [21, 21, 21]]) - ) - - -def test_can_differentiate_einsum_one_tensor(): - a = tensor(np.arange(12).reshape(3, 4)) - - z = einsum(a, subscripts="ij->i") - assert np.allclose(z, np.array([6, 22, 38])) - - z.backward() - - # einsum(1, ones, subscripts="i,j->ij") - assert np.allclose(a.grad, np.array([[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])) - - -def 
test_can_combine_ops_correctly(): - x = tensor([1.0, 2.0, 3.0]) - y = tensor([4.0, 5.0, 6.0]) - - z = ((x + y) * 0.3) ** 2 - - assert np.allclose(z, tensor([2.25, 4.41, 7.29])) - - z.backward() - - assert np.allclose(np.array(x.grad), 0.18 * np.array(x + y)) - assert np.allclose(np.array(y.grad), 0.18 * np.array(x + y)) - - -def test_can_do_linear_regression_backprop(): - x = tensor(np.linspace(-1.0, 1.0, 100)) - - # hard code slope = 2, intercept = -1 - y = tensor(np.linspace(-1.0, 1.0, 100) * 2 - 1) - - slope = tensor(0.01) # start off with a small slope - intercept = tensor(0.0) # start off with a small intercept - - # This feels a bit high, and im not sure why - learning_rate = tensor(1e-1) - - prev_loss = np.inf - for _ in range(100): - z = einsum(slope, x, subscripts=",i->i") - z += einsum(intercept, np.ones_like(x), subscripts=",i->i") - - square_error = (z - y) ** 2 - loss = einsum(square_error, subscripts="i->") / square_error.shape[0] - - # Make sure loss is decreasing - assert prev_loss > loss - prev_loss = loss - - # Do backprop - loss.backward() - - with no_grad(): - slope = slope - slope.grad * learning_rate - intercept = intercept - intercept.grad * learning_rate - - # Make sure we are close to the true slope and intercept - assert np.allclose(slope, 2.0, rtol=0.1) - assert np.allclose(intercept, -1.0, rtol=0.1) diff --git a/tests/test_basic_operations.py b/tests/test_basic_operations.py deleted file mode 100644 index 668c09a..0000000 --- a/tests/test_basic_operations.py +++ /dev/null @@ -1,146 +0,0 @@ -import math - -import numpy as np - -from tricycle.ops import (add, cos, div, einsum, exp, log, matmul, max, min, mul, negate, nothing, pow, reduce_sum, sin, sqrt, sub) - - -def test_can_negate(): - a = np.array([1, 2, 3]) - result = np.array([-1, -2, -3]) - - assert np.allclose(negate(a), result) - - -def test_can_do_nothing(): - a = np.array([1, 2, 3]) - assert np.allclose(nothing(a), a) - - -def test_can_add(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - result = np.array([5, 7, 9]) - assert np.allclose(add(a, b), result) - - -def test_can_subtract(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - result = np.array([-3, -3, -3]) - assert np.allclose(sub(a, b), result) - - -def test_can_multiply(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - multiply_2 = np.array([2, 4, 6]) - multiply_minus_1 = np.array([-1, -2, -3]) - multiply_half = np.array([0.5, 1, 1.5]) - multiply_array = np.array([4, 10, 18]) - - assert np.allclose(mul(a, np.array(2)), multiply_2) - assert np.allclose(mul(a, np.array(-1)), multiply_minus_1) - assert np.allclose(mul(a, np.array(0.5)), multiply_half) - assert np.allclose(mul(a, b), multiply_array) - - -def test_can_divide(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - - result = np.array([0.25, 0.4, 0.5]) - assert np.allclose(div(a, b), result) - - -def test_can_reduce_sum(): - a = np.array([1, 2, 3]) - assert np.allclose(reduce_sum(a), 6) - - -def test_can_exp(): - a = np.array([1, 2, 3]) - - result = np.array([math.exp(1), math.exp(2), math.exp(3)]) - assert np.allclose(exp(a), result) - - -def test_can_log(): - a = np.array([1, 2, 3]) - - result = np.array([math.log(1), math.log(2), math.log(3)]) - assert np.allclose(log(a), result) - - -def test_can_sqrt(): - a = np.array([1, 2, 3]) - - result = np.array([math.sqrt(1), math.sqrt(2), math.sqrt(3)]) - assert np.allclose(sqrt(a), result) - - -def test_can_sin(): - a = np.array([1, 2, math.pi]) - b = np.array([math.sin(1), math.sin(2), 0]) - assert 
np.allclose(sin(a), b) - - -def test_can_cos(): - a = np.array([1, 2, math.pi]) - b = np.array([math.cos(1), math.cos(2), -1]) - assert np.allclose(cos(a), b) - - -def test_can_power(): - a = np.array([1, 2, 3], dtype=float) - b = np.array([4, 5, 6]) - - power_2 = np.array([1, 4, 9]) - power_minus_1 = np.array([1, 1 / 2, 1 / 3]) - power_half = np.array([1, math.sqrt(2), math.sqrt(3)]) - power_b = np.array([1, 32, 729]) - - assert np.allclose(pow(a, np.array(2)), power_2) - assert np.allclose(pow(a, np.array(-1)), power_minus_1) - assert np.allclose(pow(a, np.array(0.5)), power_half) - assert np.allclose(pow(a, b), power_b) - - -def test_can_max(): - a = np.array([1, 2, 3]) - assert np.allclose(max(a), 3) - - -def test_can_min(): - a = np.array([1, 2, 3]) - assert np.allclose(min(a), 1) - - -def test_can_matrix_multiply(): - a = np.array([1, 2, 3]) - b = np.array([4, 5, 6]) - assert np.allclose(matmul(a, b), 32) - - -def test_can_einsum(): - a = np.arange(25).reshape(5, 5) - b = np.arange(25).reshape(5, 5) - v = np.arange(5) - - answer = np.einsum("ij,jk->ik", a, b) - assert np.allclose(einsum(a, b, subscripts="ij,jk->ik"), answer) - - answer = np.einsum("i,i->", v, v) - assert np.allclose(einsum(v, v, subscripts="i,i->"), answer) - - answer = np.einsum("i->", v) - assert np.allclose(einsum(v, subscripts="i->"), answer) - - answer = np.einsum("ij->j", a) - assert np.allclose(einsum(a, subscripts="ij->j"), answer) - - answer = np.einsum("ij,j->i", a, v) - assert np.allclose(einsum(a, v, subscripts="ij,j->i"), answer) diff --git a/tests/test_composite_functions.py b/tests/test_composite_functions.py deleted file mode 100644 index 7ed3419..0000000 --- a/tests/test_composite_functions.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np - -from tricycle.ops import mean, sigmoid, softmax, tensor - - -def test_can_mean(): - assert mean(tensor([1, 2, 3])) == 2 - - -def test_can_softmax(): - # Single row - result = softmax(tensor([0, 1, 0])) - expected = tensor([0.21194156, 0.57611688, 0.21194156]) - assert np.allclose(result, expected) - - # Multiple rows - result = softmax(tensor([[0, 1, 0], [1, 0, 0]])) - expected = tensor( - [[0.21194156, 0.57611688, 0.21194156], [0.57611688, 0.21194156, 0.21194156]] - ) - assert np.allclose(result, expected) - - -def test_can_differentiate_softmax(): - x = tensor([1, 2, 3]) - x.name = "x" - z = softmax(x) - z.name = "z" - z.backward() - - assert np.allclose(x.grad, [-0.09003057, -0.24472847, 0.33475904]) - - -def test_can_sigmoid(): - result = sigmoid(tensor([1, 2, 3])) - expected = tensor([0.73105858, 0.88079708, 0.95257413]) - assert np.allclose(result, expected) diff --git a/tests/test_loss_functions.py b/tests/test_loss_functions.py deleted file mode 100644 index d51b431..0000000 --- a/tests/test_loss_functions.py +++ /dev/null @@ -1,98 +0,0 @@ -import numpy as np -from sklearn.datasets import load_iris - -from tricycle.experiments import plot_loss -from tricycle.initialisers import init_xavier -from tricycle.loss import categorical_crossentropy, mean_square_error -from tricycle.ops import einsum, no_grad, sigmoid, tensor - - -def test_mean_square_error(): - x = tensor(np.linspace(-1.0, 1.0, 100)) - - # hard code slope = 2, intercept = -1 - y = tensor(np.linspace(-1.0, 1.0, 100) * 2 - 1) - - slope = tensor(0.01) # start off with a small slope - intercept = tensor(0.0) # start off with a small intercept - - learning_rate = tensor(0.1) - - prev_loss = np.inf - # sourcery skip: no-loop-in-tests - for _ in range(100): - expanded_intercept = einsum(intercept, 
np.ones_like(x), subscripts=",a->a") - - z = einsum(x, slope, subscripts="a,->a") + expanded_intercept - - # Calculate mean squared error - loss = mean_square_error(z, y) - - # Make sure loss is decreasing - assert prev_loss > loss - prev_loss = loss - - # Do backprop - loss.backward() - - with no_grad(): - slope = slope - slope.grad * learning_rate - intercept = intercept - intercept.grad * learning_rate - - # Make sure we are close to the true slope and intercept - assert np.allclose(slope, 2.0, rtol=0.1) - assert np.allclose(intercept, -1.0, rtol=0.1) - - -# This does not currently work and I haven't figured out why yet -def test_cross_entropy(): - """ - We'll test cross entropy loss by fitting a single layer neural - network to the iris dataset - """ - iris = load_iris() - x = tensor(iris.data, name="x", requires_grad=False) - - # one hot encode the labels - y = tensor( - np.eye(iris.target.max() + 1, dtype=np.float32)[iris.target], - name="y", - requires_grad=False, - ) - - slope = init_xavier(shape=(x.shape[1], y.shape[1])) # start off with a small slope - intercept = tensor( - [-0.05, 0.01, 0.05], name="intercept" - ) # start off with a small intercept - - learning_rate = tensor(1e-2) - learning_rate.requires_grad = False - - prev_loss = np.inf - losses = [] - # sourcery skip: no-loop-in-tests - for _ in range(100): - print(f"{slope.shape=}, {x.shape=}, {y.shape=}") - z = einsum(x, slope, subscripts="ij,jk->ik") - ones = tensor(np.ones_like(z), requires_grad=False) - broadcasted_intercept = einsum(intercept, ones, subscripts="k,jk->jk") - z += broadcasted_intercept - z.name = "z" - - print(f"{z.shape=}") - - # Calculate cross entropy - loss = categorical_crossentropy(z, y) - losses.append(loss) - assert prev_loss > loss - - # Do backprop - loss.backward() - - with no_grad(): - slope = slope - slope.grad * learning_rate - slope.grad = None - intercept = intercept - intercept.grad * learning_rate - intercept.grad = None - - plot_loss(losses) diff --git a/tests/test_simple_neural_network.py b/tests/test_simple_neural_network.py deleted file mode 100644 index ef967a4..0000000 --- a/tests/test_simple_neural_network.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np -from sklearn.datasets import load_diabetes -from sklearn.preprocessing import StandardScaler - -from tricycle.activation import relu -from tricycle.dataset import InfiniteDataset -from tricycle.experiments import smooth, plot_loss -from tricycle.initialisers import init_xavier, init_zero -from tricycle.loss import mean_square_error -from tricycle.ops import einsum, no_grad - - -def test_simple_neural_network(): - """ - Train a simple regression network on the diabetes dataset - """ - # Make sure our results are reproducible - np.random.seed(42) - - batch_size = 64 - learning_rate = 1e-0 - n_epochs = 1000 - n_labels = 1 - n_features = 10 - layer_1_size = 16 - layer_2_size = 8 - - diabetes = load_diabetes(scaled=True) - x = diabetes.data - - # We need to scale the target to avoid giant grads - y = diabetes.target.reshape(-1, 1) - y = StandardScaler().fit_transform(y) - - # Build an iterator that generates random mini-batches. 
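Before the ds = InfiniteDataset(...) line that follows, a plain-numpy sketch of the batch sampling that iterator performs, mirroring the InfiniteDataset class deleted above (np.random.choice samples with replacement by default):

import numpy as np

# One draw of a random mini-batch, as InfiniteDataset.__iter__ yields on each step.
X = np.arange(20.0).reshape(10, 2)
y = np.arange(10.0)
batch_size = 4
idx = np.random.choice(len(X), batch_size)  # indices sampled with replacement
X_batch, y_batch = X[idx], y[idx]
print(X_batch.shape, y_batch.shape)  # (4, 2) (4,)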
- ds = InfiniteDataset(x, y, batch_size=batch_size) - - # define a model - layer_1_weights = init_xavier((n_features, layer_1_size), name="layer_1_weights") - layer_2_weights = init_xavier((layer_1_size, layer_2_size), name="layer_2_weights") - layer_3_weights = init_xavier((layer_2_size, n_labels), name="layer_3_weights") - - layer_1_bias = init_zero((1, layer_1_size), name="layer_1_bias") - layer_2_bias = init_zero((1, layer_2_size), name="layer_2_bias") - layer_3_bias = init_zero((1, n_labels), name="layer_3_bias") - - params = [ - layer_1_weights, - layer_2_weights, - layer_3_weights, - layer_1_bias, - layer_2_bias, - layer_3_bias, - ] - - def model(x, params): - ( - layer_1_weights, - layer_2_weights, - layer_3_weights, - layer_1_bias, - layer_2_bias, - layer_3_bias, - ) = params - - layer_1 = relu( - einsum(x, layer_1_weights, subscripts="ij,jk->ik") + layer_1_bias - ) - layer_2 = relu( - einsum(layer_1, layer_2_weights, subscripts="ij,jk->ik") + layer_2_bias - ) - layer_3 = ( - einsum(layer_2, layer_3_weights, subscripts="ij,jk->ik") + layer_3_bias - ) - return layer_3 - - losses = [] - - for batch_idx, (X, y) in enumerate(ds): - preds = model(X, params) - loss = mean_square_error(preds, y) - - loss.backward() - losses.append(loss) - - with no_grad(): - for idx, param in enumerate(params): - params[idx] = param - (param.grad * learning_rate) - param.grad = None - - - # sourcery skip: no-conditionals-in-tests - if batch_idx >= n_epochs: - break - - # Check that our loss has actually decreased - # It starts at about 0.9 and drops to around 0.5 - # if the model actually learns - plot_loss(losses) - assert list(smooth(losses))[-1] < 0.6 diff --git a/tests/test_tensor.py b/tests/test_tensor.py deleted file mode 100644 index 884b00d..0000000 --- a/tests/test_tensor.py +++ /dev/null @@ -1,33 +0,0 @@ -import math - -import numpy as np - -from tricycle.ops import tensor - - -def test_can_handle_scalar(): - tensor(1) - - -def test_can_handle_vector(): - tensor([1, 2, 3]) - - -def test_can_handle_matrix(): - tensor([[1, 2, 3], [4, 5, 6]]) - - -def test_can_handle_3d(): - tensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) - - -def test_can_handle_irrational_matrix(): - tensor([[1, 2, math.sqrt(3)], [4, 5, 6], [7, 8, 9]]) - - -def test_can_handle_16d(): - # we arent going all the way to 32d because the - # array ends up being 32GB! - shape = [2] * 16 - big_array = np.ones(shape) - tensor(big_array)
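For reference, a minimal end-to-end sketch of the API removed in this diff, mirroring tests/test_autograd.py::test_can_combine_ops_correctly above. It only runs against the deleted sources, and this version of Tensor.backward() also draws the computational graph with matplotlib.

import numpy as np
from tricycle.ops import tensor, no_grad

x = tensor([1.0, 2.0, 3.0])
y = tensor([4.0, 5.0, 6.0])

z = ((x + y) * 0.3) ** 2  # graph is recorded through the operator overloads
z.backward()              # also plots the graph, per the implementation above

print(np.array(x.grad))   # 0.18 * (x + y), as the test asserts

with no_grad():           # updates inside this block are not traced
    x = x - x.grad * 0.1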