In [87]:
import numpy as np

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [88]:
class Module:
    """
    Base class for every layer and container in this notebook.

    A module keeps three pieces of state:
      - `output`:    result of the most recent forward pass,
      - `gradInput`: gradient w.r.t. the input from the most recent backward pass,
      - `training`:  flag toggled by `train()` / `evaluate()`.

    Subclasses override the `update*` / `accGradParameters` hooks; the base
    implementations are no-ops that return None.
    """

    def __init__(self):
        self.training = True
        self.gradInput = None
        self.output = None

    def forward(self, input):
        """Run the forward pass by delegating to `updateOutput`."""
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Run the backward pass.

        Calls `updateGradInput` first and `accGradParameters` second, then
        returns whatever is stored in `self.gradInput`.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """Hook: compute the output for `input`. No-op in the base class."""
        return None

    def updateGradInput(self, input, gradOutput):
        """Hook: compute the gradient w.r.t. `input`. No-op in the base class."""
        return None

    def accGradParameters(self, input, gradOutput):
        """Hook: compute the gradients w.r.t. parameters. No-op in the base class."""
        return None

    def zeroGradParameters(self):
        """Hook: reset parameter gradients. No-op in the base class."""
        return None

    def getParameters(self):
        """Return the list of parameters; empty for the base class."""
        return list()

    def getGradParameters(self):
        """Return the list of parameter gradients; empty for the base class."""
        return list()

    def train(self):
        """Switch the module to training mode."""
        self.training = True

    def evaluate(self):
        """Switch the module to evaluation mode."""
        self.training = False

    def __repr__(self):
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [89]:
class Sequential(Module):
    """
    Container that applies its modules one after another.

    The output of module `k` is the input of module `k + 1`. The outputs of all
    modules from the latest forward pass are kept in `self.cache` so that
    `backward` can hand each module the input it saw.

    An empty container acts as the identity in both directions.
    """

    def __init__(self):
        super().__init__()
        self.modules = []
        # Outputs of every module from the latest forward pass.
        # Initialised here so `backward` does not hit a missing attribute.
        self.cache = []

    def add(self, module):
        """Append `module` to the end of the chain."""
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Feed `input` through every module in order.

        Returns the output of the last module, or `input` itself when the
        container holds no modules.
        """
        activations = input
        cache = []
        for module in self.modules:
            activations = module.forward(activations)
            cache.append(activations)
        self.cache = cache
        # Using the running value (instead of cache[-1]) keeps an empty
        # container from raising IndexError.
        self.output = activations
        return self.output

    def backward(self, input, gradOutput):
        """
        Propagate `gradOutput` through the modules in reverse order.

        Module `k` receives as its input the cached output of module `k - 1`
        (or `input` for the first module). Requires that `forward` was called
        with the same `input` beforehand.
        """
        grad = gradOutput
        # activations[idx] is the input that self.modules[idx] saw.
        activations = [input] + self.cache
        for idx in reversed(range(len(self.modules))):
            grad = self.modules[idx].backward(activations[idx], grad)
        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """Reset parameter gradients of every contained module."""
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """Return one parameter list per contained module."""
        return [module.getParameters() for module in self.modules]

    def getGradParameters(self):
        """Return one gradient list per contained module."""
        return [module.getGradParameters() for module in self.modules]

    def __repr__(self):
        return "\n".join(str(module) for module in self.modules)

    def __getitem__(self, idx):
        return self.modules[idx]

    def train(self):
        """Switch the container and all contained modules to training mode."""
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """Switch the container and all contained modules to evaluation mode."""
        self.training = False
        for module in self.modules:
            module.evaluate()


# Layers

## 1 (0.2). Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [90]:
class Linear(Module):
    """
    Fully-connected layer: `output = input @ W.T + b`.

    - input:  `batch_size x n_in`
    - output: `batch_size x n_out`

    `W` has shape `(n_out, n_in)` and `b` has shape `(n_out,)`; both are drawn
    uniformly from `[-1/sqrt(n_in), 1/sqrt(n_in)]`.
    """

    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()
        stdv = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size=(n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size=n_out)
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Compute the affine transform of `input`."""
        # b is 1-D, so it broadcasts over the batch axis.
        self.output = input @ self.W.T + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Compute the gradient of the loss w.r.t. `input`."""
        self.gradInput = gradOutput @ self.W
        assert self.gradInput.shape == input.shape
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradients w.r.t. `W` and `b` for the current batch.

        The gradients are overwritten, not added to previous values.
        """
        # Sum over the batch of outer products gradOutput_i (x) input_i,
        # expressed as one matrix product to avoid a
        # (batch, n_out, n_in) temporary.
        self.gradW = gradOutput.T @ input
        self.gradb = np.sum(gradOutput, axis=0)

    def zeroGradParameters(self):
        """Reset both parameter gradients to zero in place."""
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        s = self.W.shape
        return 'Linear %d -> %d' % (s[1], s[0])

## 2. (0.2) SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) = \text{softmax}(x - \text{const})$. This makes it possible to avoid computing exp() of a large argument.

In [91]:
"""class SoftMax(Module):
    def __init__(self):
         super(SoftMax, self).__init__()

    def updateOutput(self, input):
        self.output = np.subtract(input, input.max(axis=1, keepdims=True))
        
        self.output = np.exp(self.output) 
        hq = self.output.sum(axis = 1,keepdims = True)
        self.output = np.divide(self.output,  hq)
        return self.output
    
    def updateGradInput(self, input, gradOutput):
    
        self.gradInput = gradOutput
        pred_grad = np.multiply(self.output, gradOutput)
        self.gradInput = pred_grad - np.multiply(self.output, np.sum(pred_grad, axis=1, keepdims=True))
        return self.gradInput
   

    def __repr__(self):
        return "SoftMax" """

'class SoftMax(Module):\n    def __init__(self):\n         super(SoftMax, self).__init__()\n\n    def updateOutput(self, input):\n        self.output = np.subtract(input, input.max(axis=1, keepdims=True))\n        \n        self.output = np.exp(self.output) \n        hq = self.output.sum(axis = 1,keepdims = True)\n        self.output = np.divide(self.output,  hq)\n        return self.output\n    \n    def updateGradInput(self, input, gradOutput):\n    \n        self.gradInput = gradOutput\n        pred_grad = np.multiply(self.output, gradOutput)\n        self.gradInput = pred_grad - np.multiply(self.output, np.sum(pred_grad, axis=1, keepdims=True))\n        return self.gradInput\n   \n\n    def __repr__(self):\n        return "SoftMax" '

In [92]:
class SoftMax(Module):
    """
    Row-wise softmax.

    - input:  `batch_size x n_feats`
    - output: `batch_size x n_feats`
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """Return softmax over axis 1, computed on max-shifted rows."""
        # Subtracting the row maximum leaves the result unchanged
        # but keeps exp() away from large arguments.
        row_max = np.max(input, axis=1, keepdims=True)
        unnormalized = np.exp(input - row_max)
        row_total = np.sum(unnormalized, axis=1, keepdims=True)
        self.output = unnormalized / row_total
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backpropagate through the softmax using the cached `self.output`."""
        weighted = self.output * gradOutput
        row_dot = np.sum(weighted, axis=1, keepdims=True)
        self.gradInput = weighted - self.output * row_dot
        return self.gradInput

    def __repr__(self):
        return "SoftMax"


## 3. (0.2) LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [93]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax: `x_i - log(sum_j exp(x_j))`.

    - input:  `batch_size x n_feats`
    - output: `batch_size x n_feats`
    """

    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """Return log-softmax over axis 1, computed on max-shifted rows."""
        row_max = np.max(input, axis=1, keepdims=True)
        shifted = input - row_max
        exp_total = np.sum(np.exp(shifted), axis=1, keepdims=True)
        self.output = shifted - np.log(exp_total)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backpropagate using the cached `self.output`."""
        # exp(log-softmax) recovers the softmax probabilities.
        probabilities = np.exp(self.output)
        grad_total = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - probabilities * grad_total
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"


In [94]:
# NOTE(review): this cell redefines LogSoftMax and shadows the definition in
# the previous cell; the two implementations compute the same thing.
class LogSoftMax(Module):
    """
    Row-wise log-softmax: `x_i - log(sum_j exp(x_j))`.

    - input:  `batch_size x n_feats`
    - output: `batch_size x n_feats`
    """

    def __init__(self):
        super().__init__()

    def updateOutput(self, input):
        """Return log-softmax over axis 1, computed on max-shifted rows."""
        centered = np.subtract(input, input.max(axis=1, keepdims=True))
        log_norm = np.log(np.sum(np.exp(centered), axis=1, keepdims=True))
        self.output = centered - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backpropagate using the cached `self.output`."""
        softmax = np.exp(self.output)
        summed_grad = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - softmax * summed_grad
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

## 4. (0.3) Batch normalization
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through the NN. This improves convergence for deep models, letting them train in days rather than weeks. **You are** to implement the first part of the layer: feature normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma^2 + \epsilon}}$$
where $\mu$ and $\sigma^2$ are the mean and variance of feature values in the **batch** and $\epsilon$ is just a small number for numerical stability. Also during training, the layer should maintain exponential moving average values for mean and variance:
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance.

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [95]:
class BatchNormalization(Module):
    """
    Feature-wise normalization (the parameter-free half of batch norm).

    - input:  `batch_size x n_feats`
    - output: `batch_size x n_feats`

    Training mode: `y = (x - mean) / sqrt(var + EPS)` with the mean and the
    biased variance of the current batch; exponential moving averages of both
    are updated with coefficient `alpha`.

    Evaluation mode: the same formula with the stored moving statistics.

    The layer has no learnable parameters; scaling and shifting are done by a
    separate layer.
    """

    EPS = 1e-3  # Small constant added to the variance to avoid division by zero

    def __init__(self, alpha=0.):
        super(BatchNormalization, self).__init__()
        self.alpha = alpha  # Weight of the old value in the moving averages
        self.moving_mean = None  # Running mean used in evaluation mode
        self.moving_variance = None  # Running variance used in evaluation mode

    def updateOutput(self, input):
        """Normalize `input` per feature (axis 0 is the batch axis)."""
        if self.training:
            batch_mean = np.mean(input, axis=0)
            batch_var = np.var(input, axis=0)

            # Same expression as the evaluation branch and as the backward
            # pass: centre on the batch mean, divide by sqrt(var + EPS).
            self.output = (input - batch_mean) / np.sqrt(batch_var + self.EPS)

            # The first batch seeds the moving statistics; later batches are
            # blended in.
            if self.moving_mean is None:
                self.moving_mean = batch_mean
                self.moving_variance = batch_var
            else:
                self.moving_mean = self.alpha * self.moving_mean + (1 - self.alpha) * batch_mean
                self.moving_variance = self.alpha * self.moving_variance + (1 - self.alpha) * batch_var
        else:
            # Inference: use the stored statistics.
            inv_std = 1 / np.sqrt(self.moving_variance + self.EPS)
            self.output = (input - self.moving_mean) * inv_std
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Compute the gradient of the loss w.r.t. `input`."""
        if self.training:
            batch_mean = np.mean(input, axis=0)
            batch_var = np.var(input, axis=0)
            inv_std = 1 / np.sqrt(batch_var + self.EPS)

            # Normalized input, i.e. the forward output.
            normalized = (input - batch_mean) * inv_std

            # The batch mean and variance both depend on every sample, which
            # gives the two correction terms subtracted from gradOutput.
            grad_mean = np.mean(gradOutput, axis=0)
            grad_proj = np.mean(gradOutput * normalized, axis=0)
            self.gradInput = (gradOutput - grad_mean - normalized * grad_proj) * inv_std
        else:
            # In evaluation mode the statistics are constants, so the layer is
            # a per-feature affine map whose slope is 1 / sqrt(var + EPS).
            self.gradInput = gradOutput / np.sqrt(self.moving_variance + self.EPS)
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"


In [96]:
class ChannelwiseScaling(Module):
    r"""
       Implements linear transform of input y = \gamma * x + \beta
       where \gamma, \beta - learnable vectors of length x.shape[-1]
    """

    def __init__(self, n_out):
        super().__init__()

        # Both vectors are drawn uniformly from [-1/sqrt(n_out), 1/sqrt(n_out)].
        bound = 1. / np.sqrt(n_out)
        self.gamma = np.random.uniform(-bound, bound, size=n_out)
        self.beta = np.random.uniform(-bound, bound, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """Scale every feature by `gamma` and shift it by `beta`."""
        scaled = input * self.gamma
        self.output = scaled + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Gradient w.r.t. the input: the output gradient scaled by `gamma`."""
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Compute (overwrite) the gradients of `beta` and `gamma` for this batch."""
        self.gradBeta = np.sum(gradOutput, axis=0)
        per_sample = gradOutput * input
        self.gradGamma = np.sum(per_sample, axis=0)

    def zeroGradParameters(self):
        """Reset both parameter gradients to zero in place."""
        for grad in (self.gradGamma, self.gradBeta):
            grad.fill(0)

    def getParameters(self):
        return [self.gamma, self.beta]

    def getGradParameters(self):
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements a function like `y = weight * x + bias`, then adding the bias becomes useless and can be omitted, since its effect is discarded by the batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer, then the parameter `gamma` in `ChannelwiseScaling` can be frozen, since it can be absorbed into the linear transformation layer.

## 5. (0.3) Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(1 - p)$ mask. Here $p$ is the probability of an element being zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [97]:
class Dropout(Module):
    """
    Inverted dropout.

    - input:  `batch_size x n_feats`
    - output: `batch_size x n_feats`

    Training mode: every element is zeroed with probability `p` and the
    survivors are multiplied by `1 / (1 - p)`.

    Evaluation mode: identity in both the forward and the backward pass.
    """

    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        self.p = p  # Probability of zeroing an element
        self.mask = None  # 0/1 mask sampled in the latest training forward pass

    def updateOutput(self, input):
        """Apply dropout in training mode; pass `input` through otherwise."""
        if not self.training:
            self.output = input
            return self.output

        # 1 with probability (1 - p), 0 with probability p.
        self.mask = np.random.binomial(1, 1 - self.p, size=input.shape)
        # Rescale so the expected value of the output equals the input.
        self.output = input * self.mask / (1 - self.p)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backpropagate through the mask sampled by the latest forward pass."""
        if not self.training:
            # The forward pass was the identity, so the gradient is unchanged.
            # Using self.mask here would fail (None) or apply a stale mask.
            self.gradInput = gradOutput
            return self.gradInput

        # Only the surviving elements receive gradient, with the same scaling.
        self.gradInput = np.multiply(gradOutput, self.mask) / (1 - self.p)
        return self.gradInput

    def __repr__(self):
        return "Dropout"

## 6. (2.0) Conv2d
Implement [**Conv2d**](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html). Use only this list of parameters: (in_channels, out_channels, kernel_size, stride, padding, bias, padding_mode) and fix dilation=1 and groups=1.

In [98]:
class Conv2d(Module):
    """
    2-D convolution (cross-correlation) with dilation=1 and groups=1.

    - input:  `batch_size x in_channels x H x W`
    - output: `batch_size x out_channels x H_out x W_out`, where
      `H_out = (H + 2 * pad_h - kernel_h) // stride_h + 1` (same for width).

    Parameters
    ----------
    in_channels, out_channels : int
    kernel_size, stride, padding : int or pair of ints (height, width)
    bias : bool
        Whether to add a learnable per-output-channel bias.
    padding_mode : {'zeros', 'reflect', 'replicate', 'circular'}
        How the border is filled. Any other value prints a warning and falls
        back to 'zeros'.
    """

    # Padding mode name accepted by this layer -> numpy.pad mode name.
    _NUMPY_PAD_MODES = {
        'zeros': 'constant',
        'reflect': 'reflect',
        'replicate': 'edge',
        'circular': 'wrap',
    }

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True, padding_mode='zeros'):
        super().__init__()

        # Unknown padding modes fall back to zero padding with a warning.
        if padding_mode not in self._NUMPY_PAD_MODES:
            print(f"Warning: Unsupported padding_mode '{padding_mode}', defaulting to 'zeros'.")
            padding_mode = 'zeros'
        self.padding_mode = padding_mode

        # Basic layer hyper-parameters, normalised to (height, width) pairs.
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = self._pair(kernel_size)
        self.stride = self._pair(stride)
        self.padding = self._pair(padding)

        # He (Kaiming) normal initialisation: std = sqrt(2 / fan_in).
        weight_std = np.sqrt(2.0 / (self.in_channels * np.prod(self.kernel_size)))
        self.W = np.random.randn(self.out_channels, self.in_channels, *self.kernel_size) * weight_std
        self.gradW = np.zeros_like(self.W)

        # Optional bias.
        self.bias = bias
        if bias:
            self.b = np.zeros(self.out_channels)
            self.gradb = np.zeros_like(self.b)
        else:
            self.b = None
            self.gradb = None

        self.input_cache = None  # Shape of the latest forward input
        self.col_buffer = None   # Unused; kept so existing attribute access still works

    @staticmethod
    def _pair(value):
        """Turn a scalar into `(value, value)`; pass sequences through as tuples."""
        if isinstance(value, (int, np.integer)):
            return (value, value)
        return tuple(value)

    def _pad(self, input):
        """Pad the two spatial axes of a 4-D batch according to `padding_mode`."""
        pad_h, pad_w = self.padding
        if pad_h == 0 and pad_w == 0:
            return input
        return np.pad(
            input,
            ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
            mode=self._NUMPY_PAD_MODES[self.padding_mode],
        )

    def _unpad_grad(self, padded_grad, in_h, in_w):
        """
        Map a gradient w.r.t. the padded input back onto the unpadded input.

        With zero padding the border is a constant, so it is simply cropped.
        For the other modes every border cell is a copy of some interior
        pixel, and its gradient has to be added to that pixel.
        """
        pad_h, pad_w = self.padding
        if self.padding_mode == 'zeros' or (pad_h == 0 and pad_w == 0):
            return padded_grad[:, :, pad_h:pad_h + in_h, pad_w:pad_w + in_w]

        batch_size, channels = padded_grad.shape[:2]
        # Padding an array of flat pixel indices with the same mode tells us,
        # for every padded cell, which input pixel it was copied from.
        source = np.pad(
            np.arange(in_h * in_w).reshape(in_h, in_w),
            ((pad_h, pad_h), (pad_w, pad_w)),
            mode=self._NUMPY_PAD_MODES[self.padding_mode],
        ).ravel()

        # Scatter-add along a leading "pixel" axis; np.add.at accumulates
        # correctly when the same source pixel appears several times.
        folded = np.zeros((in_h * in_w, batch_size, channels), dtype=padded_grad.dtype)
        np.add.at(folded, source, np.moveaxis(padded_grad.reshape(batch_size, channels, -1), -1, 0))
        return np.moveaxis(folded, 0, -1).reshape(batch_size, channels, in_h, in_w)

    def _window_slices(self, out_h, out_w):
        """Yield `(i, j, rows, cols)`: output position and its window in the padded input."""
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        for i in range(out_h):
            rows = slice(i * stride_h, i * stride_h + kernel_h)
            for j in range(out_w):
                yield i, j, rows, slice(j * stride_w, j * stride_w + kernel_w)

    def updateOutput(self, input):
        """Compute the convolution of a 4-D `input` batch."""
        # Remember the input shape.
        self.input_cache = input.shape
        batch_size, _, input_h, input_w = input.shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h, pad_w = self.padding

        # Output spatial size.
        out_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
        out_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

        padded_input = self._pad(input)
        self.output = np.zeros((batch_size, self.out_channels, out_h, out_w))

        # One tensor contraction per output position handles all samples and
        # all output channels at once:
        # (N, C, kh, kw) x (OC, C, kh, kw) -> (N, OC).
        for i, j, rows, cols in self._window_slices(out_h, out_w):
            self.output[:, :, i, j] = np.tensordot(
                padded_input[:, :, rows, cols], self.W, axes=([1, 2, 3], [1, 2, 3])
            )

        if self.bias:
            self.output += self.b[None, :, None, None]

        return self.output

    def updateGradInput(self, input, gradOutput):
        """Compute the gradient of the loss w.r.t. `input`."""
        batch_size, channels, input_h, input_w = input.shape
        pad_h, pad_w = self.padding

        # Gradients are real-valued even if the input array is integer typed.
        grad_dtype = input.dtype if np.issubdtype(input.dtype, np.floating) else np.float64
        padded_grad = np.zeros(
            (batch_size, channels, input_h + 2 * pad_h, input_w + 2 * pad_w), dtype=grad_dtype
        )

        # Every output position spreads its gradient over the window it read:
        # (N, OC) x (OC, C, kh, kw) -> (N, C, kh, kw).
        for i, j, rows, cols in self._window_slices(gradOutput.shape[2], gradOutput.shape[3]):
            padded_grad[:, :, rows, cols] += np.tensordot(
                gradOutput[:, :, i, j], self.W, axes=([1], [0])
            )

        # Undo the padding.
        self.gradInput = self._unpad_grad(padded_grad, input_h, input_w)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradients w.r.t. `W` and `b` for the current batch.

        The gradient arrays are reset first, so values are overwritten rather
        than added to earlier calls.
        """
        padded_input = self._pad(input)

        self.gradW.fill(0)
        # (N, OC) x (N, C, kh, kw) -> (OC, C, kh, kw), summed over positions.
        for i, j, rows, cols in self._window_slices(gradOutput.shape[2], gradOutput.shape[3]):
            self.gradW += np.tensordot(
                gradOutput[:, :, i, j], padded_input[:, :, rows, cols], axes=([0], [0])
            )

        if self.bias:
            # The bias is added to every position of its output channel.
            self.gradb[...] = gradOutput.sum(axis=(0, 2, 3))

    def zeroGradParameters(self):
        """Reset the parameter gradients to zero in place."""
        self.gradW.fill(0)
        if self.bias:
            self.gradb.fill(0)

    def getParameters(self):
        return [self.W, self.b] if self.bias else [self.W]

    def getGradParameters(self):
        return [self.gradW, self.gradb] if self.bias else [self.gradW]

    def __repr__(self):
        return f"Conv2d(in_channels={self.in_channels}, out_channels={self.out_channels}, kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding})"



## 7. (0.5) Implement [**MaxPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) and [**AvgPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html). 
Use only parameters like kernel_size, stride, padding (negative infinity for maxpool and zero for avgpool) and other parameters fixed as in framework.

In [82]:
class MaxPool2d(Module):
    """
    2-D max pooling.

    - input:  `batch x channels x H x W`
    - output: `batch x channels x H_out x W_out`, where
      `H_out = (H + 2 * pad_h - kernel_h) // stride_h + 1` (same for width).

    Parameters
    ----------
    kernel_size : int or pair of ints
    stride : int or pair of ints, optional
        Defaults to `kernel_size`.
    padding : int or pair of ints, optional
        Implicit negative-infinity padding on both sides. Defaults to 0.
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        super(MaxPool2d, self).__init__()

        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
        if stride is None:
            self.stride = self.kernel_size
        else:
            self.stride = (stride, stride) if isinstance(stride, int) else stride
        self.padding = (padding, padding) if isinstance(padding, int) else padding
        self.cache = None     # Shape of the latest forward input
        self.indices = None   # Arg-max positions in padded coordinates

    def updateOutput(self, input):
        """Take the maximum over every pooling window of a 4-D `input`."""
        batch, num_channels, H_in, W_in = input.shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h, pad_w = self.padding

        # Output spatial size.
        H_out = (H_in + 2 * pad_h - kernel_h) // stride_h + 1
        W_out = (W_in + 2 * pad_w - kernel_w) // stride_w + 1

        self.output = np.zeros((batch, num_channels, H_out, W_out))
        # (row, col) of every window maximum, in padded coordinates.
        self.indices = np.zeros((batch, num_channels, H_out, W_out, 2), dtype=int)

        # Pad with -inf so a padded cell can never win the maximum; zero
        # padding would be wrong for windows that hold only negative values.
        # Integer arrays cannot hold -inf, so they are converted to float.
        data = input if np.issubdtype(input.dtype, np.floating) else input.astype(float)
        padded = np.pad(
            data,
            ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
            mode='constant',
            constant_values=-np.inf,
        )

        # All samples and channels are processed together per window.
        for i in range(H_out):
            h_start = i * stride_h
            for j in range(W_out):
                w_start = j * stride_w
                window = padded[:, :, h_start:h_start + kernel_h, w_start:w_start + kernel_w]
                flat_window = window.reshape(batch, num_channels, -1)

                # First maximum in row-major order within the window.
                flat_idx = np.argmax(flat_window, axis=2)
                self.output[:, :, i, j] = np.max(flat_window, axis=2)
                self.indices[:, :, i, j, 0] = h_start + flat_idx // kernel_w
                self.indices[:, :, i, j, 1] = w_start + flat_idx % kernel_w

        self.cache = (batch, num_channels, H_in, W_in)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Route every output gradient to the input element that was the maximum."""
        batch, num_channels, H_in, W_in = self.cache
        pad_h, pad_w = self.padding

        grad_padded = np.zeros((batch, num_channels, H_in + 2 * pad_h, W_in + 2 * pad_w))

        # Scatter-add: overlapping windows may select the same element, and
        # np.add.at accumulates repeated indices correctly.
        batch_idx = np.arange(batch)[:, None, None, None]
        channel_idx = np.arange(num_channels)[None, :, None, None]
        np.add.at(
            grad_padded,
            (batch_idx, channel_idx, self.indices[..., 0], self.indices[..., 1]),
            gradOutput,
        )

        # Drop the padded border (a no-op when the padding is zero).
        self.gradInput = grad_padded[:, :, pad_h:pad_h + H_in, pad_w:pad_w + W_in]
        return self.gradInput

    def __repr__(self):
        return "MaxPool2d"


class AvgPool2d(Module):
    """
    2-D average pooling; zero-padded cells count towards the average.

    - input:  `batch x channels x H x W`
    - output: `batch x channels x H_out x W_out`, where
      `H_out = (H + 2 * pad_h - kernel_h) // stride_h + 1` (same for width).

    Parameters
    ----------
    kernel_size : int or pair of ints
    stride : int or pair of ints, optional
        Defaults to `kernel_size`.
    padding : int or pair of ints, optional
        Implicit zero padding on both sides. Defaults to 0.
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        super(AvgPool2d, self).__init__()

        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
        if stride is None:
            self.stride = self.kernel_size
        else:
            self.stride = (stride, stride) if isinstance(stride, int) else stride
        self.padding = (padding, padding) if isinstance(padding, int) else padding
        self.cache = None  # Shape of the latest forward input

    def updateOutput(self, input):
        """Average every pooling window of a 4-D `input`."""
        batch, num_channels, H_in, W_in = input.shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h, pad_w = self.padding

        # Output spatial size.
        H_out = (H_in + 2 * pad_h - kernel_h) // stride_h + 1
        W_out = (W_in + 2 * pad_w - kernel_w) // stride_w + 1

        self.output = np.zeros((batch, num_channels, H_out, W_out))

        # Zero padding on the two spatial axes.
        padded = np.pad(input, ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)), mode='constant')

        # All samples and channels are processed together per window.
        for i in range(H_out):
            h_start = i * stride_h
            for j in range(W_out):
                w_start = j * stride_w
                window = padded[:, :, h_start:h_start + kernel_h, w_start:w_start + kernel_w]
                self.output[:, :, i, j] = window.mean(axis=(2, 3))

        self.cache = (batch, num_channels, H_in, W_in)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Spread every output gradient evenly over its pooling window."""
        batch, num_channels, H_in, W_in = self.cache
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h, pad_w = self.padding

        grad_padded = np.zeros((batch, num_channels, H_in + 2 * pad_h, W_in + 2 * pad_w))

        # Every element of a window contributes 1 / (kernel area) to the mean.
        window_weight = 1 / (kernel_h * kernel_w)

        for i in range(gradOutput.shape[2]):
            h_start = i * stride_h
            for j in range(gradOutput.shape[3]):
                w_start = j * stride_w
                share = gradOutput[:, :, i, j] * window_weight
                grad_padded[:, :, h_start:h_start + kernel_h, w_start:w_start + kernel_w] += share[:, :, None, None]

        # Drop the padded border (a no-op when the padding is zero).
        self.gradInput = grad_padded[:, :, pad_h:pad_h + H_in, pad_w:pad_w + W_in]
        return self.gradInput

    def __repr__(self):
        return "AvgPool2d"



## 8. (0.3) Implement **GlobalMaxPool2d** and **GlobalAvgPool2d**.
They do not have testing and parameters are up to you but they must aggregate information within channels. Write test functions for these layers on your own.

In [83]:
class GlobalMaxPool2d(Module):
    """
    Maximum over the two spatial axes of a `batch x channels x H x W` input.

    The output has shape `batch x channels`, or `batch x channels x 1 x 1`
    when `keepdims` is true.
    """

    def __init__(self, keepdims=False):
        super().__init__()
        self.cache = None
        self.keepdims = keepdims

    def updateOutput(self, input_data):
        """Return the per-channel spatial maximum and remember where it was."""
        batch_size, channels, height, width = input_data.shape

        # The input is kept so the backward pass can recover the image width.
        self.cache = input_data

        # Collapse H and W into a single axis and reduce over it.
        flattened = input_data.reshape(batch_size, channels, -1)
        self.max_indices = flattened.argmax(axis=2)
        pooled = flattened.max(axis=2)

        self.output = pooled.reshape(batch_size, channels, 1, 1) if self.keepdims else pooled
        return self.output

    def updateGradInput(self, input_data, grad_output):
        """Send each output gradient to the position of the maximum; zeros elsewhere."""
        batch_size, channels = grad_output.shape[:2]

        self.gradInput = np.zeros_like(input_data)

        if self.keepdims:
            grad_output = grad_output.reshape(batch_size, channels)

        rows, cols = self._get_max_indices()
        self._assign_gradients_to_max_values(grad_output, batch_size, channels, rows, cols)

        return self.gradInput

    def _get_max_indices(self):
        """Split the stored flat arg-max indices into (row, column) arrays."""
        image_width = self.cache.shape[3]
        max_height, max_width = np.divmod(self.max_indices, image_width)
        return max_height, max_width

    def _assign_gradients_to_max_values(self, grad_output, batch_size, channels, max_height, max_width):
        """Write `grad_output[b, c]` into `gradInput` at the arg-max position of every (b, c)."""
        batch_idx = np.arange(batch_size)[:, None]
        channel_idx = np.arange(channels)[None, :]
        # Each (batch, channel) pair occurs once, so plain assignment is enough.
        self.gradInput[
            batch_idx,
            channel_idx,
            max_height[:batch_size, :channels],
            max_width[:batch_size, :channels],
        ] = grad_output[:batch_size, :channels]

    def __repr__(self):
        return f"GlobalMaxPool2d(keepdims={self.keepdims})"


class GlobalAvgPool2d(Module):
    """Global average pooling: averages every (sample, channel) map over its height and width.

    Expects a 4-D input laid out as (batch, channels, height, width).
    """

    def __init__(self, keepdims=False):
        super(GlobalAvgPool2d, self).__init__()
        # When truthy, the output keeps two trailing singleton axes: (batch, channels, 1, 1)
        self.keepdims = keepdims

    def updateOutput(self, input_data):
        """Average ``input_data`` over axes 2 and 3.

        Returns:
            Array shaped (batch, channels), or (batch, channels, 1, 1) when ``keepdims`` is set.
        """
        # Unpacking also enforces that the input is exactly 4-D
        batch_size, channels, _, _ = input_data.shape

        self.output = np.mean(input_data, axis=(2, 3))

        if self.keepdims:
            self.output = self.output.reshape(batch_size, channels, 1, 1)

        return self.output

    def updateGradInput(self, input_data, grad_output):
        """Spread each upstream gradient value evenly over the pixels it was averaged from.

        Args:
            input_data: the 4-D array that was passed to ``updateOutput``.
            grad_output: upstream gradient with one value per (sample, channel), shaped
                (batch, channels) or (batch, channels, 1, 1).

        Returns:
            ``self.gradInput``: gradient with the same shape and dtype as ``input_data``.
        """
        batch_size, channels, height, width = input_data.shape
        num_pixels = height * width

        # d(mean)/d(pixel) is 1 / (height * width) for every pixel of a map
        per_pixel = np.reshape(grad_output, (batch_size, channels, 1, 1)) / num_pixels

        # Broadcasting over height and width replaces four nested Python loops
        self.gradInput = np.zeros_like(input_data)
        self.gradInput[...] = per_pixel

        return self.gradInput

    def __repr__(self):
        return f"GlobalAvgPool2d(keepdims={self.keepdims})"


## 9. (0.2) Implement [**Flatten**](https://pytorch.org/docs/stable/generated/torch.flatten.html)

In [84]:
class Flatten(Module):
    """Collapses the axes from ``start_dim`` through ``end_dim`` (inclusive) into a single axis."""

    def __init__(self, start_dim=0, end_dim=-1):
        super(Flatten, self).__init__()
        self.start_dim = start_dim
        self.end_dim = end_dim

    def updateOutput(self, input_data):
        """Reshape ``input_data`` so the selected axis range becomes one axis.

        The original shape is remembered in ``self.input_shape`` for the backward pass.
        """
        shape = input_data.shape
        self.input_shape = shape

        # Turn a negative end_dim into the equivalent non-negative axis number
        last = self.end_dim
        if last < 0:
            last += len(shape)

        # Axes before start_dim and after end_dim are kept; -1 lets reshape infer the merged size
        leading = shape[:self.start_dim]
        trailing = shape[last + 1:]
        self.output = input_data.reshape(leading + (-1,) + trailing)
        return self.output

    def updateGradInput(self, input_data, grad_output):
        """Give the upstream gradient the shape the input had in the forward pass."""
        restored = grad_output.reshape(self.input_shape)
        self.gradInput = restored
        return restored

    def __repr__(self):
        return "Flatten(start_dim={}, end_dim={})".format(self.start_dim, self.end_dim)


# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**):

In [62]:
class ReLU(Module):
    """Rectified linear unit: keeps positive values and replaces the rest with zero."""

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """Compute the element-wise maximum of ``input`` and 0 and store it in ``self.output``."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return rectified

    def updateGradInput(self, input, gradOutput):
        """Pass the upstream gradient only where the input was strictly positive."""
        passes_through = input > 0
        self.gradInput = np.multiply(gradOutput, passes_through)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 10. (0.1) Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with slope.

In [85]:
class LeakyReLU(Module):
    """Leaky rectified linear unit: identity for positive inputs, ``slope * x`` otherwise."""

    def __init__(self, slope=0.03):
        super(LeakyReLU, self).__init__()
        # Multiplier applied to non-positive inputs
        self.slope_value = slope

    def updateOutput(self, input_data):
        """Return ``input_data`` where it is positive and ``slope * input_data`` elsewhere."""
        self.output = np.where(input_data > 0, input_data, self.slope_value * input_data)
        return self.output

    def updateGradInput(self, input_data, grad_output):
        """Scale the upstream gradient by 1 for positive inputs and by ``slope`` elsewhere.

        The branch condition is the same strict ``> 0`` used in the forward pass, so the
        gradient at exactly 0 is the derivative of the branch that was actually evaluated.
        """
        local_grad = np.where(input_data > 0, 1, self.slope_value)
        self.gradInput = local_grad * grad_output
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

## 11. (0.1) ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [64]:
class ELU(Module):
    """Exponential linear unit: ``x`` for ``x >= 0`` and ``alpha * (exp(x) - 1)`` otherwise."""

    def __init__(self, alpha=1.0):
        super(ELU, self).__init__()

        # Scale of the negative saturation branch
        self.alpha = alpha

    def updateOutput(self, input):
        """Apply ELU element-wise and store the result in ``self.output``."""
        # np.where evaluates both branches for every element; clamping the exp argument
        # to <= 0 keeps large positive inputs from overflowing in the branch that is discarded
        negative_part = np.minimum(input, 0)
        self.output = np.where(input >= 0, input, self.alpha * (np.exp(negative_part) - 1))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Multiply the upstream gradient by 1 for ``x >= 0`` and by ``alpha * exp(x)`` otherwise."""
        # Same clamping as in the forward pass to avoid overflow in the discarded branch
        negative_part = np.minimum(input, 0)
        local_grad = np.where(input >= 0, 1, self.alpha * np.exp(negative_part))
        self.gradInput = local_grad * gradOutput
        return self.gradInput

    def __repr__(self):
        return "ELU"

## 12. (0.1) SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Notice how closely they resemble ReLU.

In [65]:
class SoftPlus(Module):
    """SoftPlus activation ``log(1 + exp(x))``, a smooth approximation of ReLU."""

    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """Compute ``log(1 + exp(input))`` element-wise without overflowing for large inputs."""
        # logaddexp(0, x) == log(exp(0) + exp(x)); evaluated stably, so a large x returns ~x
        # where the naive formula would overflow to inf
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Multiply the upstream gradient by ``sigmoid(input)``, the derivative of SoftPlus."""
        # sigmoid(x) = 1 / (1 + exp(-x)) = exp(-log(1 + exp(-x))); this form never overflows
        sigmoid = np.exp(-np.logaddexp(0, -input))
        self.gradInput = sigmoid * gradOutput
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

## 13. (0.2) Gelu
Implement [**Gelu**](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html) activations.

In [66]:
class GeLU(Module):
    """GELU activation in its tanh form: ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``."""

    def __init__(self):
        super(GeLU, self).__init__()

    def updateOutput(self, input):
        """Apply the tanh-based GELU element-wise and store the result in ``self.output``."""
        scale = (2 / np.pi) ** 0.5
        cubic = 0.044715 * input**3
        gate = 1 + np.tanh(scale * (input + cubic))
        self.output = 0.5 * input * gate
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Multiply the upstream gradient by the derivative of the tanh-based GELU."""
        scale = (2 / np.pi) ** 0.5
        cubic = 0.044715 * input**3
        tanh_value = np.tanh(scale * (input + cubic))

        # Chain rule through tanh(u) with u = scale * (x + 0.044715 * x**3)
        one_minus_tanh_sq = 1 - tanh_value**2
        inner_slope = 1 + 3 * 0.044715 * input**2
        dtanh_dx = one_minus_tanh_sq * scale * inner_slope

        # Product rule on 0.5 * x * (1 + tanh(u))
        local_grad = 0.5 * (1 + tanh_value) + 0.5 * input * dtanh_dx
        self.gradInput = gradOutput * local_grad
        return self.gradInput

    def __repr__(self):
        return "GeLU"


# Criterions

Criterions are used to score the model's answers.

In [67]:
class Criterion(object):
    """Base class for loss functions.

    ``forward`` / ``backward`` delegate to ``updateOutput`` / ``updateGradInput``; the base
    versions of those two simply return whatever is currently stored on the instance.
    """

    def __init__(self):
        # Last computed loss value and last gradient with respect to the input
        self.output = None
        self.gradInput = None

    def forward(self, input, target):
        """Compute the loss for ``input`` against ``target`` via ``updateOutput``."""
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """Compute the gradient of the loss with respect to ``input`` via ``updateGradInput``."""
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """Return the stored loss value unchanged (base implementation)."""
        return self.output

    def updateGradInput(self, input, target):
        """Return the stored input gradient unchanged (base implementation)."""
        return self.gradInput

    def __repr__(self):
        return "Criterion"

The **MSECriterion**, which is basic L2 norm usually used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [68]:
class MSECriterion(Criterion):
    """Squared-error loss: the sum of squared differences divided by the batch size."""

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """Return ``sum((input - target) ** 2) / input.shape[0]`` as a scalar."""
        n_samples = input.shape[0]
        squared_error = np.power(input - target, 2)
        self.output = np.sum(squared_error) / n_samples
        return self.output

    def updateGradInput(self, input, target):
        """Return the gradient of the loss with respect to ``input``: ``2 * (input - target) / input.shape[0]``."""
        n_samples = input.shape[0]
        residual = input - target
        self.gradInput = residual * 2 / n_samples
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 14. (0.2) Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula,
remember that the targets are one-hot encoded, which simplifies the computation a lot. Note that criterions are the only place where you divide by the batch size. There is also a small hack: a small number is added to the probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [77]:
class ClassNLLCriterionUnstable(Criterion):
    """Negative log-likelihood over probabilities (not log-probabilities).

    ``input`` holds class probabilities and ``target`` a same-shaped weighting of them
    (one-hot in the intended use); the loss is averaged over ``input.shape[0]``.
    """

    # Probabilities are clipped to [EPS, 1 - EPS] so log() and the division never see 0
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()

    def updateOutput(self, input, target):
        """Return ``-sum(target * log(clip(input))) / batch_size`` as a scalar."""
        # Use this trick to avoid numerical errors
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
        self.output = -np.sum(target * np.log(input_clamp)) / input_clamp.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Return ``-target / clip(input) / batch_size``, the gradient with respect to ``input``."""
        # Use this trick to avoid numerical errors
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
        self.gradInput = -target / input_clamp / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterionUnstable"

## 15. (0.3) Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

The task is similar to the previous one, but now the criterion input is the output of a log-softmax layer. This decomposition allows us to avoid problems with computing the forward and backward passes of log().

In [86]:
class ClassNLLCriterion(Criterion):
    """Negative log-likelihood over log-probabilities.

    ``input`` holds log-probabilities and ``target`` a same-shaped weighting of them
    (one-hot in the intended use); the loss is averaged over ``input.shape[0]``.
    """

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """Return ``-sum(target * input) / batch_size`` as a scalar."""
        self.output = -np.sum(target * input) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Return ``-target / batch_size``, the gradient with respect to ``input``."""
        self.gradInput = -target / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"

1-я часть задания: реализация слоев, лосей и функций активации - 5 баллов. \\
2-я часть задания: реализация моделей на своих классах. Что должно быть:
  1. Выберите оптимизатор и реализуйте его, чтобы он работал с вашими классами. - 1 балл.
  2. Модель для задачи мультирегрессии на выбранных вами данных. Использовать FCNN, dropout, batchnorm, MSE. Пробуйте различные функции активации. Для первой модели попробуйте большую, среднюю и маленькую модель. - 1 балл.
  3. Модель для задачи мультиклассификации на MNIST. Использовать свёртки, макспулы, флэттэны, софтмаксы - 1 балл.
  4. Автоэнкодер для выбранных вами данных. Должен быть на свёртках и полносвязных слоях, дропаутах, батчнормах и тд. - 2 балла. \\

Дополнительно в оценке каждой модели будет учитываться:
1. Наличие правильно выбранной метрики и лосс функции.
2. Отрисовка графиков лосей и метрик на трейне-валидации. Проверка качества модели на тесте.
3. Наличие шедулера для lr.
4. Наличие вормапа.
5. Наличие механизма ранней остановки и сохранение лучшей модели.
6. Свитч лося (метрики) и оптимайзера.

In [None]:
class Adam:
    """Adam optimizer working on parameter objects.

    Every parameter must expose ``data`` and ``grad`` attributes and be hashable, because
    the moment estimates are kept in dicts keyed by the parameter object itself.

    Args:
        parameters: iterable of parameter objects (a list or a one-shot generator).
        lr: step size.
        beta1: decay rate of the first-moment (mean) estimate.
        beta2: decay rate of the second-moment (uncentered variance) estimate.
        eps: constant added to the denominator of the update.
    """

    def __init__(self, parameters, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        # Materialize once: the parameters are iterated several times, and a generator
        # would be exhausted by the first pass, leaving ``v`` empty and ``step`` a no-op
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        # First- and second-moment estimates, one array per parameter
        self.m = {param: np.zeros_like(param.data) for param in self.parameters}
        self.v = {param: np.zeros_like(param.data) for param in self.parameters}
        # Number of steps taken so far; drives the bias correction
        self.t = 0

    def step(self):
        """Apply one Adam update in place to every parameter whose ``grad`` is not None."""
        self.t += 1
        for param in self.parameters:
            if param.grad is None:
                continue

            # Exponential moving averages of the gradient and of its square
            self.m[param] = self.beta1 * self.m[param] + (1 - self.beta1) * param.grad
            self.v[param] = self.beta2 * self.v[param] + (1 - self.beta2) * (param.grad ** 2)

            # Bias correction compensates for the zero initialization of both averages
            m_hat = self.m[param] / (1 - self.beta1 ** self.t)
            v_hat = self.v[param] / (1 - self.beta2 ** self.t)

            param.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    def zero_grad(self):
        """Reset every existing gradient array to zeros in place."""
        for param in self.parameters:
            if param.grad is not None:
                param.grad.fill(0)