In [None]:
# | default_exp layers

# Layers

> Multi-layered feedforward structure with fully connected MVN.

The multi-layered feedforward structure with fully connected multi-valued neurons (MVN) is referred to as MLMVN. Consider an MLMVN structure $[n$-$N_1$-$\dots$-$N_{m-1}$-$N_m ]$ with $n$ inputs in the input layer, $m$-$1$ hidden layers, and the output layer $m$. The algorithm is divided into three steps. Before starting the iterative algorithm, the weights are randomly initialized, and the biases are set to zero.


In [None]:
# |hide
from nbdev.showdoc import *

In [None]:
# |export
import math
import torch
import torch.nn as nn
from torch.autograd import Function

## MLMVN

The algorithm is divided into three steps. Before starting the iterative algorithm, the weights are randomly initialized, and the biases are set to zero. 

For the weight adjustment, three cases are distinguished: the `FirstLayer`, the `HiddenLayer` (layers $2$ to $m-1$), and the `OutputLayer`. The weights are updated successively from layer $1$ to layer $m$. The first hidden layer is updated by
\begin{equation*}
	\tilde{w}_0^{k1} = w_0^{k1} + \frac{C_{k1}}{(n+1) \cdot |z_{k1}|} \cdot \delta_{k1} \,,
\end{equation*}

\begin{equation*}
	\tilde{w}_i^{k1} = w_i^{k1} + \frac{C_{k1}}{(n+1) \cdot |z_{k1}|} \cdot \delta_{k1} \cdot \bar{x}_{i} \,,
\end{equation*}

\begin{equation*}
	i = \{1, \dots, n\} \,.
\end{equation*}

In [None]:
# |export
class FirstLayer(nn.Module):
    """First MLMVN layer: a complex-valued affine map ``x @ weights + bias``.

    The forward pass is delegated to `FirstLayerFB`, whose custom backward
    computes the weight and bias corrections. The gradient that reaches the
    layer output during a backward pass is stored unchanged in
    ``self.grad_output``.

    Args:
        size_in: Number of input features.
        size_out: Number of neurons (output features).
    """

    def __init__(self, size_in, size_out):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out

        # Random complex weights of shape (size_in, size_out), scaled by
        # 1 / sqrt(size_in).
        weights = torch.randn(
            self.size_in, self.size_out, dtype=torch.cdouble
        ) / math.sqrt(self.size_in)
        self.weights = nn.Parameter(weights)

        # Zero bias kept as a (1, size_out) row; nn.Parameter enables
        # requires_grad on its own.
        self.bias = nn.Parameter(torch.zeros(1, size_out, dtype=torch.cdouble))

    def forward(self, x):
        """Apply the layer to ``x`` and arm the gradient-capturing hook."""
        x = FirstLayerFB.apply(x, self.weights, self.bias)
        # Tensor.register_hook raises on tensors that do not require grad
        # (e.g. inside torch.no_grad()), so only attach it when a backward
        # pass is actually possible.
        if x.requires_grad:
            x.register_hook(self._hook_fn)
        return x

    def _hook_fn(self, tensor):
        """Remember the gradient w.r.t. the layer output."""
        self.grad_output = tensor


class FirstLayerFB(Function):
    """Autograd function of the first layer.

    Forward is the affine map ``input @ weights + bias``. Backward does not
    return analytic derivatives; it returns the layer's update quantities:

    * input: ``grad_output`` multiplied by the transposed element-wise
      reciprocal of the weights (``conj(w) / |w|**2``),
    * weights / bias: the negated product of the pseudo-inverse of the
      ones-augmented input with ``grad_output / |output|``.

    NOTE(review): unlike the hidden and output variants, no
    ``1 / (size_in + 1)`` scaling is applied to ``grad_output`` here (the line
    was commented out in the original) -- confirm this is intended.
    """

    @staticmethod
    def forward(ctx, input, weights, bias):
        """Return ``input @ weights + bias`` and stash tensors for backward."""
        output = torch.add(torch.mm(input, weights), bias)
        ctx.save_for_backward(input, weights, bias, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_weight, grad_bias)``; unneeded ones are None."""
        input, weight, bias, output = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        need_input, need_weight, need_bias = ctx.needs_input_grad[:3]
        need_bias = need_bias and bias is not None

        if need_input:
            # Element-wise reciprocal of the weights: conj(w) / |w|^2 == 1 / w.
            cinv = (torch.conj(weight) / torch.square(torch.abs(weight))).T
            grad_input = grad_output.mm(cinv)

        # The pseudo-inverse is shared by the weight and the bias update, so
        # compute it whenever either is requested (previously it was only
        # bound inside the weight branch, breaking bias-only training).
        if need_weight or need_bias:
            ones = torch.ones(
                1, input.size(0), dtype=input.dtype, device=input.device
            )
            x_pinv = torch.linalg.pinv(torch.cat([ones, input.T])).T
            # Error scaled by 1 / |z| of each neuron's weighted sum.
            scaled_error = torch.div(grad_output, torch.abs(output))

            if need_weight:
                # Rows 1.. belong to the actual inputs.
                grad_weight = (x_pinv[1:, :] @ scaled_error) * (-1)
            if need_bias:
                # Row 0 belongs to the constant-one (bias) input.
                grad_bias = (x_pinv[0, :] @ scaled_error).unsqueeze(dim=0) * (-1)

        return grad_input, grad_weight, grad_bias

The hidden layer $2,\dots,m-1$ is updated by

\begin{equation*}
    \tilde{w}_0^{kj} = w_0^{kj} + \frac{C_{kj}}{(N_{j-1}+1) \cdot |z_{kj}|} \cdot \delta_{kj}
\end{equation*}

\begin{equation*}
    \tilde{w}_i^{kj} = w_i^{kj} + \frac{C_{kj}}{(N_{j-1}+1) \cdot |z_{kj}|} \cdot \delta_{kj} \cdot \bar{\tilde{Y}}_{i,j-1}
\end{equation*}

\begin{equation*}
    i = \{1, \dots, N_{j-1}\}; j = \{2, \dots, m-1\}.
\end{equation*}

In [None]:
# |export
class HiddenLayer(nn.Module):
    """Hidden MLMVN layer: a complex-valued affine map ``x @ weights + bias``.

    The forward pass is delegated to `HiddenLayerFB`, whose custom backward
    computes the weight and bias corrections. The gradient that reaches the
    layer output during a backward pass is stored in ``self.grad_output``,
    divided by ``size_in + 1``.

    Args:
        size_in: Number of input features.
        size_out: Number of neurons (output features).
    """

    def __init__(self, size_in, size_out):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out

        # Random complex weights of shape (size_in, size_out), scaled by
        # 1 / sqrt(size_in).
        weights = torch.randn(
            self.size_in, self.size_out, dtype=torch.cdouble
        ) / math.sqrt(self.size_in)
        self.weights = nn.Parameter(weights)

        # Zero bias kept as a (1, size_out) row; nn.Parameter enables
        # requires_grad on its own.
        self.bias = nn.Parameter(torch.zeros(1, size_out, dtype=torch.cdouble))

    def forward(self, x):
        """Apply the layer to ``x`` and arm the gradient-capturing hook."""
        x = HiddenLayerFB.apply(x, self.weights, self.bias)
        # Tensor.register_hook raises on tensors that do not require grad
        # (e.g. inside torch.no_grad()), so only attach it when a backward
        # pass is actually possible.
        if x.requires_grad:
            x.register_hook(self._hook_fn)
        return x

    def _hook_fn(self, tensor):
        """Remember the output gradient, scaled by ``1 / (size_in + 1)``."""
        self.grad_output = tensor / (self.size_in + 1)


class HiddenLayerFB(Function):
    """Autograd function of a hidden layer.

    Forward is the affine map ``input @ weights + bias``. Backward first
    divides ``grad_output`` by ``input.size(1) + 1`` and then returns the
    layer's update quantities rather than analytic derivatives:

    * input: the scaled ``grad_output`` multiplied by the transposed
      element-wise reciprocal of the weights (``conj(w) / |w|**2``),
    * weights / bias: the negated product of the pseudo-inverse of the
      ones-augmented input with ``scaled grad_output / |output|``.
    """

    @staticmethod
    def forward(ctx, input, weights, bias):
        """Return ``input @ weights + bias`` and stash tensors for backward."""
        output = torch.add(torch.mm(input, weights), bias)
        ctx.save_for_backward(input, weights, bias, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_weight, grad_bias)``; unneeded ones are None."""
        input, weight, bias, output = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        need_input, need_weight, need_bias = ctx.needs_input_grad[:3]
        need_bias = need_bias and bias is not None

        # Share the incoming error over the layer's inputs plus the bias.
        grad_output = grad_output / (input.size(1) + 1)

        if need_input:
            # Element-wise reciprocal of the weights: conj(w) / |w|^2 == 1 / w.
            cinv = (torch.conj(weight) / torch.square(torch.abs(weight))).T
            grad_input = grad_output.mm(cinv)

        # The pseudo-inverse is shared by the weight and the bias update, so
        # compute it whenever either is requested (previously it was only
        # bound inside the weight branch, breaking bias-only training).
        if need_weight or need_bias:
            ones = torch.ones(
                1, input.size(0), dtype=input.dtype, device=input.device
            )
            x_pinv = torch.linalg.pinv(torch.cat([ones, input.T])).T
            # Error scaled by 1 / |z| of each neuron's weighted sum.
            scaled_error = torch.div(grad_output, torch.abs(output))

            if need_weight:
                # Rows 1.. belong to the actual inputs.
                grad_weight = (x_pinv[1:, :] @ scaled_error) * (-1)
            if need_bias:
                # Row 0 belongs to the constant-one (bias) input.
                grad_bias = (x_pinv[0, :] @ scaled_error).unsqueeze(dim=0) * (-1)

        return grad_input, grad_weight, grad_bias

Finally, the weights of the output layer $m$ are updated
\begin{equation*}
    \tilde{w}_0^{km} = w_0^{km} + \frac{C_{km}}{N_{m-1}+1} \cdot \delta_{km} \, ,
\end{equation*}

\begin{equation*}
    \tilde{w}_i^{km} = w_i^{km} + \frac{C_{km}}{N_{m-1}+1} \cdot \delta_{km} \cdot \bar{\tilde{Y}}_{i,m-1} \, ,
\end{equation*}

\begin{equation*}
    i = \{1, \dots, N_{m-1}\} \, ,
\end{equation*}
where $\bar{\tilde{Y}}_{i,j-1}$ is the updated complex conjugated output of the $i$-th neuron from the $j-1$-th layer. The variable learning rate $\frac{1}{|z|}$ is an additional parameter for nonlinear mappings that makes learning smoother. The variable learning rate can be omitted in the output layer since the exact error is known here, and it is not computed heuristically as in the previous layers.

In [None]:
# |export
class OutputLayer(nn.Module):
    """Output MLMVN layer: a complex-valued affine map ``x @ weights + bias``.

    The forward pass is delegated to `OutputLayerFB`, whose custom backward
    computes the weight and bias corrections. The gradient that reaches the
    layer output during a backward pass is stored in ``self.grad_output``,
    divided by ``size_in + 1``.

    Args:
        size_in: Number of input features.
        size_out: Number of neurons (output features).
    """

    def __init__(self, size_in, size_out):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out

        # Random complex weights of shape (size_in, size_out), scaled by
        # 1 / sqrt(size_in).
        weights = torch.randn(
            self.size_in, self.size_out, dtype=torch.cdouble
        ) / math.sqrt(self.size_in)
        self.weights = nn.Parameter(weights)

        # Zero bias kept as a (1, size_out) row; nn.Parameter enables
        # requires_grad on its own.
        self.bias = nn.Parameter(torch.zeros(1, size_out, dtype=torch.cdouble))

    def forward(self, x):
        """Apply the layer to ``x`` and arm the gradient-capturing hook."""
        x = OutputLayerFB.apply(x, self.weights, self.bias)
        # Tensor.register_hook raises on tensors that do not require grad
        # (e.g. inside torch.no_grad()), so only attach it when a backward
        # pass is actually possible.
        if x.requires_grad:
            x.register_hook(self._hook_fn)
        return x

    def _hook_fn(self, tensor):
        """Remember the output gradient, scaled by ``1 / (size_in + 1)``."""
        self.grad_output = tensor / (self.size_in + 1)


class OutputLayerFB(Function):
    """Autograd function of the output layer.

    Forward is the affine map ``input @ weights + bias``. Backward first
    divides ``grad_output`` by ``input.size(1) + 1`` and then returns the
    layer's update quantities rather than analytic derivatives:

    * input: the scaled ``grad_output`` multiplied by the transposed
      element-wise reciprocal of the weights (``conj(w) / |w|**2``),
    * weights / bias: the negated product of the pseudo-inverse of the
      ones-augmented input with ``scaled grad_output / |output|``.

    NOTE(review): the accompanying text states that the variable learning
    rate ``1 / |z|`` can be omitted in the output layer, yet this backward
    still divides by ``|output|`` -- confirm which one is intended.
    """

    @staticmethod
    def forward(ctx, input, weights, bias):
        """Return ``input @ weights + bias`` and stash tensors for backward."""
        output = torch.add(torch.mm(input, weights), bias)
        ctx.save_for_backward(input, weights, bias, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_weight, grad_bias)``; unneeded ones are None."""
        input, weight, bias, output = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        need_input, need_weight, need_bias = ctx.needs_input_grad[:3]
        need_bias = need_bias and bias is not None

        # Share the incoming error over the layer's inputs plus the bias.
        grad_output = grad_output / (input.size(1) + 1)

        if need_input:
            # Element-wise reciprocal of the weights: conj(w) / |w|^2 == 1 / w.
            cinv = (torch.conj(weight) / torch.square(torch.abs(weight))).T
            grad_input = grad_output.mm(cinv)

        # The pseudo-inverse is shared by the weight and the bias update, so
        # compute it whenever either is requested (previously it was only
        # bound inside the weight branch, breaking bias-only training).
        if need_weight or need_bias:
            ones = torch.ones(
                1, input.size(0), dtype=input.dtype, device=input.device
            )
            x_pinv = torch.linalg.pinv(torch.cat([ones, input.T])).T
            scaled_error = torch.div(grad_output, torch.abs(output))

            if need_weight:
                # Rows 1.. belong to the actual inputs.
                grad_weight = (x_pinv[1:, :] @ scaled_error) * (-1)
            if need_bias:
                # Row 0 belongs to the constant-one (bias) input.
                grad_bias = (x_pinv[0, :] @ scaled_error).unsqueeze(dim=0) * (-1)

        return grad_input, grad_weight, grad_bias

## Activation

The activation function maps the weighted sum $z$ onto the unit circle, which is divided into $k$ sectors described by the set
\begin{equation}
	E_k = \{1, \varepsilon_k, \varepsilon_k^2, \dots, \varepsilon_k^{k-1}  \}, 
\end{equation}
with $ \varepsilon_k = e^{j\frac{2\pi}{k}} $, where $j$ is the imaginary unit and $k \in \mathbb{N}_{>0}$. Therefore, the activation function of a continuous MVN is defined by 
\begin{equation}
	P(w_0 + w_1 x_1 + \dots + w_n x_n) = P(z) = e^{j\varphi} = \frac{z}{|z|}, 
\end{equation}
where $w_0$ is the bias, $w_i$ is the corresponding weight to the input $x_i$ with $i = \{1,\dots,n\}$ and $\varphi \in [0,2\pi[$ is the argument of the weighted sum $z$. Fig. \ref{fig:complexActivation} illustrates this context. The discrete activation function differs only in that the phase is adjusted to the nearest bisector, i.e. $P(z) \in E_k \cdot e^{j\frac{\pi}{k}}$, where $e^{j\frac{\pi}{k}}$ realizes a shift of half a sector to move from the sector borders to the bisectors.

In [None]:
# |export
class phase_activation(Function):
    """Continuous MVN activation ``P(z) = z / |z|`` with a pass-through backward.

    Forward projects every element onto the unit circle by dividing it by its
    magnitude; an element equal to zero therefore produces NaN (0 / 0).
    Backward hands ``grad_output`` to the input unchanged.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input / |input|``."""
        # Nothing is saved on ctx: backward never reads the input, so keeping
        # it would only hold the tensor alive longer than needed.
        return input / torch.abs(input)

    @staticmethod
    def backward(ctx, grad_output):
        """Propagate the incoming gradient unchanged to the single input."""
        return grad_output


class cmplx_phase_activation(nn.Module):
    """Module wrapper around `phase_activation`.

    Holds no parameters or state of its own; calling it applies
    ``phase_activation`` to the input tensor.
    """

    # No custom __init__: nn.Module's default constructor is all that is needed.

    def forward(self, x):
        """Return ``phase_activation.apply(x)``."""
        activated = phase_activation.apply(x)
        return activated

## Dropout

In [None]:
# | export
class DropoutFB(Function):
    """Inverted dropout with a pass-through backward.

    Forward multiplies ``input`` by a 0/1 mask in which every element is kept
    with probability ``1 - p``, and rescales the result by ``1 / (1 - p)``.
    Backward returns ``grad_output`` unchanged (the mask is not applied to
    the gradient) and no gradient for ``p``.
    """

    @staticmethod
    def forward(ctx, input, p):
        """Return the masked and rescaled ``input`` for drop probability ``p``."""
        keep_prob = 1 - p
        if keep_prob == 0:
            # Every element is dropped; the generic 1 / (1 - p) rescale below
            # would raise ZeroDivisionError.
            return torch.zeros_like(input)

        # Build the probability on the input's device so the sampled mask can
        # be multiplied with non-CPU inputs as well.
        probs = torch.tensor(
            keep_prob, dtype=torch.get_default_dtype(), device=input.device
        )
        binomial = torch.distributions.binomial.Binomial(probs=probs)
        keep_mask = binomial.sample(input.size())
        return input * keep_mask * (1.0 / keep_prob)

    @staticmethod
    def backward(ctx, grad_output):
        """Pass ``grad_output`` through unchanged; ``p`` receives no gradient."""
        return grad_output, None


class MyDropout(nn.Module):
    """Dropout module that applies `DropoutFB` while in training mode.

    Args:
        p: Drop probability; must satisfy ``0 <= p <= 1``.

    Raises:
        ValueError: If ``p`` is below 0 or above 1.
    """

    def __init__(self, p: float = 0.5):
        super().__init__()
        out_of_range = p < 0 or p > 1
        if out_of_range:
            message = "dropout probability has to be between 0 and 1, but got {}"
            raise ValueError(message.format(p))
        self.p = p

    def forward(self, X):
        """Return ``X`` untouched in eval mode, otherwise its dropped-out version."""
        if not self.training:
            return X
        return DropoutFB.apply(X, self.p)