Completed linear regression
bclarkson-code committed Jan 14, 2024
1 parent d73bc0c commit 777bff5
Showing 7 changed files with 160 additions and 5 deletions.
12 changes: 12 additions & 0 deletions src/tricycle_v2/initialisers.py
@@ -0,0 +1,12 @@
import numpy as np

from tricycle_v2.ops import to_tensor


def init_xavier(shape, name: str = ""):
"""
Initialise a tensor with Xavier/Glorot initialisation
"""
f_in, f_out = shape
bound = np.sqrt(6) / np.sqrt(f_in + f_out)
return to_tensor(np.random.uniform(low=-bound, high=bound, size=shape), name=name)
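For reference, a minimal usage sketch of the initialiser above (assuming `Tensor` behaves like a NumPy array, as the `.view(Tensor)` call in tensor.py suggests; the shape and name are illustrative):

```python
import numpy as np

from tricycle_v2.initialisers import init_xavier

# Illustrative shape: 4 inputs feeding 2 outputs.
weights = init_xavier((4, 2), name="weights")

# Xavier/Glorot uniform bound: sqrt(6) / sqrt(f_in + f_out)
bound = np.sqrt(6) / np.sqrt(4 + 2)
assert weights.shape == (4, 2)
assert np.all(np.abs(weights) <= bound)  # all samples lie inside the bound
```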
24 changes: 24 additions & 0 deletions src/tricycle_v2/ops.py
@@ -1,4 +1,5 @@
from functools import partial
from string import ascii_lowercase

import numpy as np

@@ -66,3 +67,26 @@ def nothing(tensor):
Return a tensor
"""
return tensor


def softmax(tensor):
"""
Apply softmax. The softmax is only applied to the final
dimension of the tensor.
Note: max-subtraction for numeric stability is currently commented out below
"""
from tricycle_v2.reduce import radd, rmax
from tricycle_v2.unary import uexp

indices = ascii_lowercase[: len(tensor.shape)]
reduce_subscript = f"{indices}->{indices[:-1]}"
# largest = rmax(tensor, reduce_subscript)

expand_subscript = f"{indices[:-1]}->{indices}"
# largest = repeat(expand_subscript, largest, tensor.shape)
normalised = tensor  # - largest
exponentiated = uexp(normalised)

denom = radd(exponentiated, reduce_subscript)
denom = repeat(expand_subscript, denom, tensor.shape)
return exponentiated / denom
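As a cross-check, a plain-NumPy sketch of what this softmax computes over the final axis; the max-subtraction here mirrors the stability normalisation that is commented out above (`reference_softmax` is an illustrative name, not part of the library):

```python
import numpy as np

def reference_softmax(x: np.ndarray) -> np.ndarray:
    # Shift by the per-row max (the step commented out above), then
    # exponentiate and normalise along the last axis.
    shifted = x - x.max(axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)

# Every row sums to 1.
assert np.allclose(
    reference_softmax(np.arange(12).reshape(3, 4)).sum(axis=-1), 1.0
)
```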
5 changes: 4 additions & 1 deletion src/tricycle_v2/tensor.py
@@ -43,7 +43,7 @@ def backward(self):

else:
for arg, op in zip(current_node.args, current_node.back_fn):
logger.info(f"{hash(current_node)=} {hash(arg)=} {op=}")
# logger.info(f"{hash(current_node)=} {hash(arg)=} {op=}")
if not arg.requires_grad:
continue

@@ -53,15 +53,18 @@ def backward(self):

# calculate the gradient for each parameter
for leaf in leaves.values():
logger.info(f"leaf: {leaf}")
if leaf.grad_fn is None:
continue

for path in leaf.grad_fn:
grad = np.ones_like(self).view(Tensor)
grad.requires_grad = False

logger.info(grad)
for op in path:
grad = op(grad)
logger.info(grad)

leaf.grad = grad if leaf.grad is None else leaf.grad + grad

6 changes: 3 additions & 3 deletions src/tricycle_v2/unary.py
@@ -93,8 +93,8 @@ def udiv(arg_1: Union[Tensor, float], arg_2: Union[Tensor, float]) -> Tensor:

def umax(tensor: Tensor, constant: float) -> Tensor:
"""
Max a tensor by a constant, elementwise. The constant is not
differentiable.
If only a tensor is passed, find the max of the tensor.
If a constant is passed, find the max of the tensor and the constant, elementwise. The constant is not differentiable.
"""
assert isinstance(tensor, Tensor)
assert np.isscalar(constant)
@@ -120,7 +120,7 @@ def umin(tensor: Tensor, constant: float) -> Tensor:

result = to_tensor(np.minimum(tensor, constant))

indicator = to_tensor((tensor <= constant).astype(float))
indicator = to_tensor((tensor <= constant).astype(float), requires_grad=False)
indices = ascii_letters[: len(tensor.shape)]
subscripts = f"{indices},{indices}->{indices}"

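For context on the `requires_grad=False` change: the indicator is the elementwise derivative of `min(x, c)` with respect to `x`, so it should behave as a constant mask rather than a differentiable node. A rough NumPy sketch of that backward rule, under this reading:

```python
import numpy as np

x = np.arange(12, dtype=float).reshape(3, 4)
c = 4.0

out = np.minimum(x, c)           # forward: elementwise min with a constant

upstream = np.ones_like(out)     # stand-in for the incoming gradient
grad_x = upstream * (x <= c)     # 1 where x <= c, 0 elsewhere; c gets no gradient
```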
32 changes: 32 additions & 0 deletions tests/test_composite.py
@@ -0,0 +1,32 @@
import numpy as np

from tricycle_v2.ops import softmax
from tricycle_v2.reduce import radd
from tricycle_v2.tensor import to_tensor


def test_softmax():
in_tensor = to_tensor(np.arange(12).reshape(3, 4), name='in_tensor')

out_tensor = softmax(in_tensor)

assert out_tensor.shape == (3, 4)
assert np.allclose(radd(out_tensor, "ij->i"), [1, 1, 1])
assert np.allclose(
out_tensor,
np.array(
[
[0.0320586, 0.08714432, 0.23688282, 0.64391426],
[0.0320586, 0.08714432, 0.23688282, 0.64391426],
[0.0320586, 0.08714432, 0.23688282, 0.64391426],
]
),
)

out_tensor.backward()

breakpoint()
assert np.allclose(
in_tensor.grad,
np.ones_like(in_tensor),
)
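A quick hand check of where the expected rows come from: each row of `arange(12).reshape(3, 4)` differs only by a constant shift, and softmax is shift-invariant, so every row yields the same distribution as the softmax of `[0, 1, 2, 3]`:

```python
import numpy as np

exps = np.exp(np.array([0.0, 1.0, 2.0, 3.0]))
print(exps / exps.sum())
# ~ [0.0320586, 0.08714432, 0.23688282, 0.64391426]
```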
85 changes: 84 additions & 1 deletion tests/test_loss.py
@@ -1,8 +1,12 @@
import numpy as np
import pytest
from matplotlib import pyplot as plt
from sklearn.datasets import load_diabetes, load_linnerud
from sklearn.preprocessing import RobustScaler

from tricycle_v2.initialisers import init_xavier
from tricycle_v2.loss import mean_squared_error
from tricycle_v2.ops import repeat
from tricycle_v2.ops import einsum, repeat
from tricycle_v2.reduce import radd
from tricycle_v2.tensor import to_tensor

@@ -17,6 +21,7 @@ def test_can_mean_square_error():
assert np.allclose(mse, np.array([0, 2 / 3, 2 / 9]))


@pytest.mark.skip
def test_can_linear_regression():
np.random.seed(42)

@@ -53,3 +58,81 @@ def test_can_linear_regression():
ax.plot(losses)
ax.set_yscale("log")
plt.show()


@pytest.mark.skip
def test_linear_regression_multi_input():
X, y = load_diabetes(return_X_y=True)
x_scaler = RobustScaler()
y_scaler = RobustScaler()
X = x_scaler.fit_transform(X)
y = y_scaler.fit_transform(y.reshape(-1, 1))

X = to_tensor(X)
y = to_tensor(y)

learning_rate = 1e-1

slope = init_xavier((X.shape[1], 1), name="slope")
intercept = to_tensor([0], name="intercept")

losses = []
for _ in range(100):
repeated_intercept = repeat("j->ij", intercept, (X.shape[0], 1))

y_pred = einsum("ij,jk->ik", X, slope) + repeated_intercept
mse = mean_squared_error(y, y_pred)
loss = radd(mse, "i->") / y.shape[0]

losses.append(loss)

loss.backward()

slope = to_tensor(slope - slope.grad * learning_rate, name="slope")
intercept = to_tensor(
intercept - intercept.grad * learning_rate, name="intercept"
)

_, ax = plt.subplots()
ax.plot(losses)
ax.set_yscale("log")
plt.show()


@pytest.mark.skip
def test_linear_regression_multi_input_output():
X, y = load_linnerud(return_X_y=True)
x_scaler = RobustScaler()
y_scaler = RobustScaler()
X = x_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)

X = to_tensor(X)
y = to_tensor(y)

learning_rate = 1e-1

slope = init_xavier((X.shape[1], y.shape[1]), name="slope")
intercept = to_tensor([-0.01, 0.01, 0.02], name="intercept")

losses = []
for _ in range(100):
repeated_intercept = repeat("k->ik", intercept, (X.shape[0], y.shape[1]))

y_pred = einsum("ij,jk->ik", X, slope) + repeated_intercept
mse = mean_squared_error(y, y_pred)
loss = radd(mse, "i->") / y.shape[0]

losses.append(loss)

loss.backward()

slope = to_tensor(slope - slope.grad * learning_rate, name="slope")
intercept = to_tensor(
intercept - intercept.grad * learning_rate, name="intercept"
)

_, ax = plt.subplots()
ax.plot(losses)
ax.set_yscale("log")
plt.show()
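For comparison, a rough plain-NumPy sketch of the gradient-descent loop that the multi-input test drives through the autodiff engine, using the analytic gradients of the mean squared error (variable names such as `X_np` and `W` are illustrative):

```python
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import RobustScaler

X_np, y_np = load_diabetes(return_X_y=True)
X_np = RobustScaler().fit_transform(X_np)
y_np = RobustScaler().fit_transform(y_np.reshape(-1, 1))

n, d = X_np.shape
W = np.zeros((d, 1))   # analogue of `slope`
b = np.zeros((1, 1))   # analogue of `intercept`
lr = 1e-1

for _ in range(100):
    y_pred = X_np @ W + b                 # einsum("ij,jk->ik", X, slope) + intercept
    err = y_pred - y_np
    grad_W = 2.0 / n * X_np.T @ err       # d(mean squared error)/dW
    grad_b = 2.0 / n * err.sum(axis=0, keepdims=True)
    W = W - lr * grad_W
    b = b - lr * grad_b
```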
1 change: 1 addition & 0 deletions tests/test_unary_ops.py
@@ -149,6 +149,7 @@ def test_can_umax():
)



def test_can_umin():
in_tensor = to_tensor(np.arange(12).reshape(3, 4))
out_tensor = umin(in_tensor, 4)
