# Нейронные сети
- Найти данные в виде изображений для задачи классификации. Например, можно взять данные [отсюда](http://ufldl.stanford.edu/housenumbers/).
- Реализовать классы, необходимые для построения сети со следующими слоями
	- FullyConnectedLayer
	- ReluLayer
	- FullyConnectedLayer
- Использовать CrossEntropyLoss и L2-регуляризацию.
- Обучить модель на тренировочных данных, подбирать параметры (особенно learning rate) на валидационной и оценить качество на тестовой. Анализировать графики train/val loss, проверять на каждом шаге корректность вычисления градиентов с помощью разностной оценки.
- (**+2 балла**) Добавить Batch normalization.
- (**+2 балла**) В качестве оптимизатора использовать один из: Momentum, RMSprop.
- (**+1 балл**) Также реализовать оптимизатор Adam.

## Датасет
Для решения задачи был взят датасет http://ufldl.stanford.edu/housenumbers/, а именно уже разделённые тренировочные и тестовые данные, без учёта дополнительных данных. В репозитории этого датасета нет из-за его большого размера (~182 MB train и ~64 MB test).

In [1]:
!pip install --quiet matplotlib
!pip install --quiet numpy
!pip install --quiet pandas 
!pip install --quiet torch
!pip install --quiet torchvision

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import SVHN
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split
from itertools import product
import pandas as pd

Подготовка датасета

In [3]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# apply Normalize with mean = std = 0.5 on each of the three channels.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Build the 'train' and 'test' SVHN splits (in that order) with identical
# settings; download=True asks torchvision to fetch the data into `root`.
raw_train_data, test_data = [
    SVHN(root="../data/task07", split=split_name, download=True, transform=transform)
    for split_name in ('train', 'test')
]

Создание loader-ов

In [4]:
# Hold out 30% of the SVHN 'train' split for validation; the remainder is
# used for training. The split is random (no fixed generator is passed).
raw_train_size = len(raw_train_data)
val_size = int(raw_train_size * 0.3)
train_size = raw_train_size - val_size

train_data, val_data = random_split(raw_train_data, lengths=[train_size, val_size])

# Mini-batches of 64 for every split; only the test loader keeps a fixed order.
# NOTE(review): shuffling the validation loader looks unnecessary — confirm.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

## Слои сети

In [5]:
class FullyConnectedLayer(nn.Module):
    """Affine layer ``y = x @ W + b`` with an explicit, hand-written backward pass.

    ``W`` has shape ``(input_dim, output_dim)`` and is drawn from a normal
    distribution scaled by ``sqrt(2 / input_dim)``; ``b`` starts at zero.
    ``forward`` still participates in autograd, so ``loss.backward()`` works as
    usual; ``backward`` is an independent manual implementation that can be
    used to cross-check the analytic gradients.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.W = nn.Parameter(torch.randn(input_dim, output_dim) * torch.sqrt(torch.tensor(2.0 / input_dim)))
        self.b = nn.Parameter(torch.zeros(output_dim))
        # Input of the most recent forward pass; needed by the manual backward.
        self._x = None

    def forward(self, x):
        """Return ``x @ W + b`` and remember ``x`` for a later ``backward`` call."""
        # Cache a detached view: the manual backward only needs the values, and
        # holding the attached tensor would keep the upstream graph alive.
        self._x = x.detach()
        return x @ self.W + self.b

    def backward(self, grad_out):
        """Manually back-propagate ``grad_out`` through the layer.

        Overwrites ``W.grad`` and ``b.grad`` with the gradients of the loss
        w.r.t. the parameters and returns the gradient w.r.t. the layer input.

        Args:
            grad_out: gradient of the loss w.r.t. this layer's output; must be
                2-D with the same shape as the output of the last ``forward``.

        Returns:
            Gradient of the loss w.r.t. the input of the last ``forward``.

        Raises:
            RuntimeError: if called before any ``forward`` pass.
        """
        if self._x is None:
            raise RuntimeError("backward() called before forward()")

        # These are plain gradient values; recording them in the autograd graph
        # would attach a grad_fn to the results and waste memory.
        with torch.no_grad():
            grad_in = grad_out @ self.W.t()
            self.W.grad = self._x.t() @ grad_out
            self.b.grad = grad_out.sum(dim=0)
        return grad_in

In [6]:
class ReluLayer(nn.Module):
    """ReLU activation with a hand-written backward pass."""

    def forward(self, x):
        """Apply ReLU element-wise and remember the input for ``backward``."""
        self._x = x
        return F.relu(x)

    def backward(self, grad_output):
        """Return a copy of ``grad_output`` zeroed wherever the cached input was <= 0.

        ``grad_output`` itself is left unmodified.
        """
        blocked = self._x <= 0
        return grad_output.masked_fill(blocked, 0)

Нейронная сеть

In [7]:
class NeuralNetwork(nn.Module):
    """Two-layer perceptron: FullyConnected -> ReLU -> (optional BatchNorm) -> FullyConnected.

    Args:
        dim_hidden: width of the hidden layer.
        use_bn: when true, apply ``nn.BatchNorm1d`` to the hidden activations
            (after the ReLU).
        input_dim: number of features per sample after flattening; the default
            matches a 3 x 32 x 32 image.
        num_classes: number of output logits.
    """

    def __init__(self, dim_hidden=128, use_bn=False, input_dim=3 * 32 * 32, num_classes=10):
        super().__init__()
        self.use_bn = use_bn
        self.input_dim = input_dim
        self.layer_in = FullyConnectedLayer(input_dim, dim_hidden)
        self.activation = ReluLayer()
        self.bn_layer = nn.BatchNorm1d(dim_hidden) if use_bn else None
        self.layer_out = FullyConnectedLayer(dim_hidden, num_classes)

    def forward(self, inputs):
        """Flatten ``inputs`` to ``(-1, input_dim)`` and return the class logits."""
        # reshape (unlike view) also accepts non-contiguous tensors, e.g. the
        # result of a permute/transpose, copying only when it has to.
        out = inputs.reshape(-1, self.input_dim)
        out = self.layer_in(out)
        out = self.activation(out)
        if self.use_bn:
            out = self.bn_layer(out)
        return self.layer_out(out)

## Оптимизатор Adam

In [8]:
class AdamOptimizer:
    """Minimal Adam optimizer with optional L2 regularisation (coupled weight decay).

    Args:
        params: iterable of parameters to optimise (consumed once into a list).
        lr: learning rate.
        betas: ``(beta1, beta2)`` decay rates of the first and second moment
            running averages.
        eps: constant added to the denominator for numerical stability.
        weight_decay: L2 coefficient; when non-zero, ``weight_decay * param``
            is added to the gradient before the moment updates.
    """

    def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        self.params = list(params)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # One moment buffer pair per trainable parameter; None marks parameters
        # that do not require gradients so the list stays aligned with params.
        self.states = [
            {'m': torch.zeros_like(p.data), 'v': torch.zeros_like(p.data)}
            if p.requires_grad
            else None
            for p in self.params
        ]

        # Global step counter used for bias correction.
        self.t = 0

    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        self.t += 1
        beta1, beta2 = self.betas
        bias_correction1 = 1 - beta1 ** self.t
        bias_correction2 = 1 - beta2 ** self.t

        for param, state in zip(self.params, self.states):
            if param.grad is None or state is None:
                continue

            grad = param.grad.data

            if self.weight_decay != 0:
                # Out-of-place on purpose: `grad` aliases param.grad, and an
                # in-place add would corrupt the stored gradient (it would be
                # seen by later inspection, clipping or a repeated step()).
                grad = grad + self.weight_decay * param.data

            state['m'].mul_(beta1).add_(grad, alpha=1 - beta1)
            state['v'].mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

            m_hat = state['m'] / bias_correction1
            v_hat = state['v'] / bias_correction2

            param.data -= self.lr * m_hat / (v_hat.sqrt() + self.eps)

    def zero_grad(self):
        """Detach and zero the gradient of every parameter that has one."""
        for param in self.params:
            if param.grad is not None:
                param.grad.detach_()
                param.grad.zero_()

## Обучение 

In [9]:
# All tensors are moved to this device inside train(); the notebook runs on CPU.
device = torch.device('cpu')


def _mean_batch_loss(total, n_batches, loader_name):
    """Return ``total / n_batches``, rejecting a loader that yielded no batches."""
    if n_batches == 0:
        raise ValueError(f"{loader_name} yielded no batches")
    return total / n_batches


def train(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` and track the mean per-batch loss on both splits.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free evaluation pass over ``val_loader``, then prints both losses.

    Args:
        model: module to optimise; switched between train and eval mode.
        train_loader: iterable of ``(inputs, labels)`` batches for training.
        val_loader: iterable of ``(inputs, labels)`` batches for validation.
        criterion: callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: object exposing ``zero_grad()`` and ``step()``.
        epochs: number of passes over the training data.

    Returns:
        ``(train_loss, val_loss)``: two lists with one value per epoch, each
        value being the loss averaged over the batches of that epoch.

    Raises:
        ValueError: if a loader yields no batches during an epoch.
    """
    train_loss = []
    val_loss = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        # Count batches while iterating instead of calling len(loader), so
        # loaders without __len__ (e.g. over iterable datasets) also work.
        n_train_batches = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_train_batches += 1

        train_loss.append(_mean_batch_loss(running_loss, n_train_batches, "train_loader"))

        model.eval()
        val_running_loss = 0.0
        n_val_batches = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item()
                n_val_batches += 1

        val_loss.append(_mean_batch_loss(val_running_loss, n_val_batches, "val_loader"))

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}")

    return train_loss, val_loss

In [10]:
def get_optimizer(name, params, lr, **kwargs):
    """Build an optimizer by short name.

    Args:
        name: one of ``'adam'`` (the notebook's ``AdamOptimizer``),
            ``'momentum'`` (SGD with momentum, 0.9 unless overridden),
            ``'rmsprop'`` or ``'sgd'``.
        params: parameters to optimise, forwarded to the optimizer.
        lr: learning rate.
        **kwargs: extra keyword arguments forwarded to the optimizer
            constructor (e.g. ``weight_decay`` for L2 regularisation).

    Returns:
        The constructed optimizer.

    Raises:
        ValueError: if ``name`` is not a supported optimizer.
    """
    # Factories are lazy so that only the requested optimizer is constructed
    # (``params`` may be a one-shot generator such as ``model.parameters()``).
    factories = {
        'adam': lambda: AdamOptimizer(params, lr=lr, **kwargs),
        # Merge instead of passing momentum=0.9 directly, so a caller-supplied
        # ``momentum`` overrides the default rather than raising TypeError.
        'momentum': lambda: optim.SGD(params, lr=lr, **{'momentum': 0.9, **kwargs}),
        'rmsprop': lambda: optim.RMSprop(params, lr=lr, **kwargs),
        'sgd': lambda: optim.SGD(params, lr=lr, **kwargs),
    }

    try:
        factory = factories[name]
    except KeyError:
        raise ValueError(
            f"Unknown optimizer {name!r}; expected one of {sorted(factories)}"
        ) from None
    return factory()

def run_experiments(models, lrs, optimizers, train_loader, val_loader, num_epochs=10):
    """Train one model per (model config, learning rate, optimizer) combination.

    Args:
        models: iterable of dicts with a ``'batch_norm'`` flag and an optional
            ``'seed'``.
        lrs: iterable of learning rates.
        optimizers: iterable of optimizer names understood by ``get_optimizer``.
        train_loader: training batches, forwarded to ``train``.
        val_loader: validation batches, forwarded to ``train``.
        num_epochs: epochs per experiment.

    Returns:
        ``pd.DataFrame`` with one row per experiment that completed, holding
        the configuration, the per-epoch loss lists and the final losses.
        Experiments that raise during training or plotting are reported on
        stdout and skipped.
    """
    results = []

    for model_config, lr, optim_name in product(models, lrs, optimizers):
        use_bn = model_config['batch_norm']

        # Seed BEFORE building the model: the custom layers draw their weights
        # in __init__, so seeding afterwards would not make them reproducible.
        # Compare against None so that seed=0 is honoured too.
        seed = model_config.get('seed')
        if seed is not None:
            torch.manual_seed(seed)

        # The constructor argument is `use_bn`.
        model = NeuralNetwork(use_bn=use_bn)
        model = model.to(device)

        if seed is not None:
            # Only affects nn.Linear sub-modules, if the model has any.
            model.apply(init_weights)

        criterion = nn.CrossEntropyLoss()
        optimizer = get_optimizer(optim_name, model.parameters(), lr)

        print(f"\nTraining {model.__class__.__name__} "
              f"with lr={lr}, optimizer={optim_name}, "
              f"batch norm={model_config['batch_norm']}")

        # Deliberately broad: one failing configuration must not abort the
        # remaining experiments of the grid.
        try:
            train_loss, val_loss = train(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                criterion=criterion,
                optimizer=optimizer,
                epochs=num_epochs
            )

            results.append({
                'model_type': 'BN' if use_bn else 'NoBN',
                'lr': lr,
                'optimizer': optim_name,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'final_train_loss': train_loss[-1],
                'final_val_loss': val_loss[-1]
            })

            plot_losses(train_loss, val_loss, model_config, lr, optim_name)

        except Exception as e:
            print(f"Experiment failed: {str(e)}")
            continue

    return pd.DataFrame(results)

def init_weights(m):
    """Re-initialise an ``nn.Linear`` module in place; other modules are left untouched.

    The weight is redrawn with Kaiming-normal initialisation and every bias
    entry is set to 0.01. Intended for use with ``model.apply(init_weights)``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.kaiming_normal_(m.weight)
    m.bias.data.fill_(0.01)

def plot_losses(train_loss, val_loss, model_config, lr, optim_name):
    """Save a train/validation loss-per-epoch chart for one experiment.

    The figure is written to the current working directory as a PNG whose
    name encodes the batch-norm flag, the learning rate and the optimizer
    name, and is closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.plot(train_loss, label='Train Loss', marker='o')
    ax.plot(val_loss, label='Val Loss', marker='s')

    ax.set_title(f"BN={model_config['batch_norm']}, lr={lr}, Optim={optim_name}")
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)

    fig.savefig(f"loss_BN_{model_config['batch_norm']}_lr_{lr}_{optim_name}.png")
    plt.close(fig)

# Experiment grid: {without, with} batch norm x two learning rates x
# {Adam, SGD with momentum}, each trained for 10 epochs with the same seed.
experiment_config = {
    'models': [
        {'batch_norm': False, 'seed': 42},
        {'batch_norm': True, 'seed': 42},
    ],
    'lrs': [0.001, 0.01],
    'optimizers': ['adam', 'momentum'],
    'num_epochs': 10,
}

results_df = run_experiments(
    models=experiment_config['models'],
    lrs=experiment_config['lrs'],
    optimizers=experiment_config['optimizers'],
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=experiment_config['num_epochs'],
)

print("\nBest configurations:")
if results_df.empty:
    # run_experiments skips failed runs; with no successful run the frame has
    # no 'final_val_loss' column and sort_values would raise KeyError.
    print("No experiment completed successfully.")
else:
    print(results_df.sort_values('final_val_loss').head())

results_df.to_csv('experiment_results.csv', index=False)


Training NeuralNetwork with lr=0.001, optimizer=adam, batch norm=False
Epoch 1/10, Train Loss: 1.3897412085622327, Val Loss: 1.13796490915986
Epoch 2/10, Train Loss: 0.9446058612214657, Val Loss: 0.9425010339811791
Epoch 3/10, Train Loss: 0.820262461714911, Val Loss: 0.9216330870119638
Epoch 4/10, Train Loss: 0.7414359329941861, Val Loss: 0.79617451469219
Epoch 5/10, Train Loss: 0.6991120759314136, Val Loss: 0.7776406156982101
Epoch 6/10, Train Loss: 0.6454887787562951, Val Loss: 0.7084675283106261
Epoch 7/10, Train Loss: 0.6328630697474515, Val Loss: 0.7199454494687014
Epoch 8/10, Train Loss: 0.6003791517636128, Val Loss: 0.6894269903991804
Epoch 9/10, Train Loss: 0.5808715683006289, Val Loss: 0.7115949100359927
Epoch 10/10, Train Loss: 0.5634153442378353, Val Loss: 0.7720921026586101

Training NeuralNetwork with lr=0.001, optimizer=momentum, batch norm=False
Epoch 1/10, Train Loss: 1.9232909860753655, Val Loss: 1.5823067340046861
Epoch 2/10, Train Loss: 1.3881329245995404, Val Loss:

## Результаты 

In [11]:
# Reload the experiment summary saved to 'experiment_results.csv' by the cell above.
# NOTE(review): the per-epoch loss lists were written as text, so they presumably
# come back as strings rather than lists — confirm before using them numerically.
pd.read_csv('experiment_results.csv')

Unnamed: 0,model_type,lr,optimizer,train_loss,val_loss,final_train_loss,final_val_loss
0,NoBN,0.001,adam,"[1.3897412085622327, 0.9446058612214657, 0.820...","[1.13796490915986, 0.9425010339811791, 0.92163...",0.563415,0.772092
1,NoBN,0.001,momentum,"[1.9232909860753655, 1.3881329245995404, 1.164...","[1.5823067340046861, 1.2559536875680435, 1.106...",0.768442,0.810565
2,NoBN,0.01,adam,"[2.1891276205418415, 1.4821687030821964, 1.536...","[1.7062780803372695, 1.8731124335250189, 2.263...",1.441332,1.346241
3,NoBN,0.01,momentum,"[1.3169760640125323, 0.9084285393988997, 0.795...","[1.0189300059925679, 0.8788112791818242, 0.882...",0.546216,0.719027
4,BN,0.001,adam,"[1.487420904740431, 1.0393446443086847, 0.9219...","[1.167830377817154, 0.9558829102058743, 0.8927...",0.651107,0.715368
5,BN,0.001,momentum,"[1.6541047684866887, 1.1288522391247928, 0.983...","[1.238197989068752, 1.0107809487123822, 0.9366...",0.671751,0.724063
6,BN,0.01,adam,"[1.560680665354479, 1.1384312926682452, 1.0207...","[1.16967354576255, 1.026318756646888, 0.994482...",0.732441,0.789052
7,BN,0.01,momentum,"[1.4729630060475367, 1.0582162459889553, 0.931...","[1.1953521232272304, 0.9533800766911618, 0.865...",0.658688,0.720648


Наилучший результат на валидации показала модель с Batch Normalization, обученная оптимизатором Adam при learning rate = 0.001 (train loss: 0.651107, val loss: 0.715368).