In [7]:
import cupy as cp
import numpy
import torch

In [8]:
def forward_propagation(g):
    # apply simple softmax
    y = cp.exp(g)
    return y / cp.sum(y)

def backward_propagation(g, true_labels):

    # variant with creating matrix
    matrix = cp.zeros((len(true_labels), len(true_labels)))
    for i in range(len(true_labels)):
        for j in range(len(true_labels)):
            e1 = cp.exp(g[i])
            e2 = cp.exp(g[j])
            s1 = 1 / cp.sum(cp.exp(g))
            s2 = -1 * (1 / cp.sum(cp.exp(g))**2)
            if i == j:
                matrix[i][j] = e1*s2*e2 + e1*s1
            else:
                matrix[i][j] = e1*s2*e2 + s1
    return cp.dot(true_labels, matrix)

    #   variant with computing gradients manually
    # e = cp.exp(g)
    # s = cp.sum(e)
    # s = 1 / s
    #   back to copy gate (2)
    # g0 = true_labels * e
    #   back to cope gate (1)
    # g1 = true_labels * s
    #   back to 1 / input gate
    # g2 = cp.sum(g0)
    #   back to add gate
    # g3 = g2*(-1 * s**2)
    #   back to exp gate
    # g4 = g3 + g1
    #   return the result
    # return g4*e


def forward_propagation_normalization(g):
    # apply simple softmax and normalize (subtract maximum)
    y = cp.exp(g - cp.max(g))
    return y / cp.sum(y)

def backward_propagation_normalization(g, true_labels):
    # variant with creating matrix (the same, because the normalization does not affect the gradient computation)
    matrix = cp.zeros((len(true_labels), len(true_labels)))
    for i in range(len(true_labels)):
        for j in range(len(true_labels)):
            e1 = cp.exp(g[i])
            e2 = cp.exp(g[j])
            s1 = 1 / cp.sum(cp.exp(g))
            s2 = -1 * (1 / cp.sum(cp.exp(g))**2)
            if i == j:
                matrix[i][j] = e1*s2*e2 + e1*s1
            else:
                matrix[i][j] = e1*s2*e2 + s1
    return cp.dot(true_labels, matrix)

    #   variant with computing gradients manually
    # e = cp.exp(g)
    # s = cp.sum(e)
    # s = 1 / s
    #   back to copy gate (2)
    # g0 = true_labels * e
    #   back to cope gate (1)
    # g1 = true_labels * s
    #   back to 1 / input gate
    # g2 = cp.sum(g0)
    #   back to add gate
    # g3 = g2*(-1 * s**2)
    #   back to exp gate
    # g4 = g3 + g1
    #   the normalization was applied by subtracting a constant (subtract maximum), thus the gradient will not change at this step
    #   return the result
    # return g4*e

In [9]:
# compare the output with torch
inputs = cp.array([[1, -1, 0, 0], [0, 99, 0, 0], [0, 1, 21, 4]])
for input in inputs:
    print()
    print(f"Forward pass: {forward_propagation(input)}")
    print(f"Backward grad: {backward_propagation(input, cp.array([1, 0, 0, 0]))}")
    print(f"Forward pass (normalization): {forward_propagation_normalization(input)}")
    print(f"Backward grad (normalization): {backward_propagation_normalization(input, cp.array([1, 0, 0, 0]))}")

    input_tensor = torch.Tensor(input)
    input_tensor.requires_grad = True
    softmax = torch.nn.Softmax()
    output_tensor = softmax(input_tensor)
    print(f"Forward pass (torch): {output_tensor.data.numpy()}")
    output_tensor.backward(torch.Tensor([1, 0, 0, 0]))
    print(f"Backward grad (torch): {input_tensor.grad.numpy()}")


Forward pass: [0.53444665 0.07232949 0.19661193 0.19661193]
Backward grad: [0.24881343 0.15795568 0.09153335 0.09153335]
Forward pass (normalization): [0.53444665 0.07232949 0.19661193 0.19661193]
Backward grad (normalization): [0.24881343 0.15795568 0.09153335 0.09153335]
Forward pass (torch): [0.53444666 0.07232949 0.19661194 0.19661194]
Backward grad (torch): [ 0.24881342 -0.03865625 -0.10507859 -0.10507859]

Forward pass: [1.01122149e-43 1.00000000e+00 1.01122149e-43 1.01122149e-43]
Backward grad: [1.01122149e-43 0.00000000e+00 1.01122149e-43 1.01122149e-43]
Forward pass (normalization): [1.01122149e-43 1.00000000e+00 1.01122149e-43 1.01122149e-43]
Backward grad (normalization): [1.01122149e-43 0.00000000e+00 1.01122149e-43 1.01122149e-43]
Forward pass (torch): [1.01e-43 1.00e+00 1.01e-43 1.01e-43]
Backward grad (torch): [ 1.01e-43 -1.01e-43 -0.00e+00 -0.00e+00]

Forward pass: [7.58256009e-10 2.06115353e-09 9.99999956e-01 4.13993754e-08]
Backward grad: [7.58256009e-10 7.58256008e-

  output_tensor = softmax(input_tensor)
