In [45]:
import numpy as np
import torch

from lib.Tensor import Tensor
from lib.NN import Module, force_tensor_method

In [3]:
# 2x3 sample input with gradient tracking enabled.
t = Tensor(
    [
        [1, 2, 3],
        [4, 5, 6],
    ],
    requires_grad=True,
)

In [28]:
class Softmax(Module):
    """Softmax activation applied along the last axis of the input.

    The forward pass is composed only of Tensor operations (``max``,
    subtraction, ``exp``, ``sum`` and division), so no hand-written backward
    function is defined here.
    NOTE(review): this assumes each of those Tensor ops records its own
    gradient in lib.Tensor's autograd -- confirm with a gradient check against
    a reference implementation.

    Derivation reference:
    https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
    """

    @force_tensor_method
    def forward(self, x: Tensor) -> Tensor:
        """Compute ``exp(x - max(x)) / sum(exp(x - max(x)))`` over ``axis=-1``.

        Args:
            x: Input Tensor; normalisation is performed over its last axis.

        Returns:
            A Tensor in which every slice along the last axis sums to 1,
            i.e. forms a probability distribution.
        """
        # Softmax is invariant to a constant shift, so subtracting the
        # per-slice maximum leaves the result unchanged while bounding the
        # argument of exp() at 0, which avoids overflow on large inputs.
        shiftx = x - x.max(axis=-1, keepdims=True)
        exps = shiftx.exp()
        # keepdims=True keeps the reduced axis so the denominator broadcasts
        # back across the last axis during the division.
        return exps / exps.sum(axis=-1, keepdims=True)

In [29]:
# Instantiate the custom Softmax module defined above.
softmax = Softmax()

In [30]:
# Apply the custom Softmax module to the sample input `t`.
out = softmax(t)

In [48]:
# Display the softmax result; each row should form a probability distribution.
out

Tensor([[0.09003057 0.24472847 0.66524096]
 [0.09003057 0.24472847 0.66524096]], requires_grad=True)

In [41]:
# Sum the softmax output over the last axis and take the raw `.data` of the result.
data = out.sum(axis=-1).data

In [44]:
# Inspect the dtype of the row sums held in `data`.
data.dtype

dtype('float64')

In [40]:
# Sanity check: every slice along the last axis of the softmax output must sum to 1.
# Comparing against the scalar 1.0 (broadcast by numpy) avoids hard-coding the
# number of rows, and axis=-1 matches the axis the Softmax module normalises over.
np.allclose(out.sum(axis=-1).data, 1.0)

True

In [51]:
# Reference computation: push the same 2x3 values through PyTorch's built-in
# softmax so the result can be compared with the custom implementation.
pt = torch.tensor(
    [
        [1, 2, 3],
        [4, 5, 6],
    ],
    dtype=torch.float32,
    requires_grad=True,
)
pt_softmax = torch.nn.Softmax(dim=-1)  # normalise over the last dimension
pt_out = pt_softmax(pt)

In [52]:
# Display the PyTorch reference output for visual comparison with the custom result.
pt_out

tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]], grad_fn=<SoftmaxBackward0>)