# *Challenge 2*: *Discovering Complexity in Neural Networks*

Advanced Topics in Machine Learning -- Fall 2023, UniTS

<a target="_blank" href="https://colab.research.google.com/github/ganselmif/adv-ml-units/blob/main/notebooks/AdvML_Challenge_2.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

In this Notebook we will perform an automatic analysis of complexity of a CNN over the Cifar10 dataset, in order to understand how non-linear is our feature representation (I choose this challenge as second project instead the the Challenge 2)

### Main Idea
We'll analyze a naive CNN architecture: Convolution -> ReLU -> Convolution -> ReLU -> Convolution -> ReLU -> Linear -> Softmax

Now, we want to set a trainable non-linearity $R_{\beta}(x) = ReLU(x) - \beta \cdot ReLU(-x)$, if we take $\beta = (1 - \alpha)$ we obtain

$R_{\alpha}(x) = ReLU(x) - (1-\alpha)\cdot ReLU(-x)$

If we observe that $x = ReLU(x) - ReLU(-x)$ we finally obtain:

$$R_{\alpha}(x) = x + \alpha \cdot ReLU(-x)$$

Where $\alpha \in [0, 1]$

*Note*: you can also interpret the action of $R_\alpha(x)$ as a residual connection!
*Note*: we'll use a different $\alpha$ for each non linearity

There could be different way to model $\alpha$, Professor Anselmi suggested to simply penalize the final loss with the L1 Norm of the alphas, instead I'll use $\alpha_t = f((f(\gamma)-0.5)\nu_t)$, with $\gamma$ trainable, $f(x) = 1/(1+e^{-x})$, and $\nu_t$ increasing in time, so in the end the model will automatically select if the $\alpha$ is relevant or not.

You can find the same updating strategy in this paper: https://appliednetsci.springeropen.com/articles/10.1007/s41109-023-00542-x

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import random_split
from tqdm import tqdm
import logging
import matplotlib.pyplot as plt

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class GammaAlphaCompute(nn.Module):
    def __init__(self, name):
        super().__init__()
        self.gamma = nn.Parameter(torch.tensor(0.0))
        self.register_buffer('nu', torch.tensor(0.0))
        self.name = name
        self.alpha_history = []
        self.gamma_history = []

    def compute_alpha(self):
        if self.training:
            self.nu += 0.01  # Increase nu if we train

        f_gamma = torch.sigmoid(self.gamma)
        alpha = torch.sigmoid((f_gamma - 0.5) * self.nu)

        if self.training:
            self.alpha_history.append(alpha.item())
            self.gamma_history.append(self.gamma.item())

        return alpha

class CustomNonLinearity(nn.Module):
    def __init__(self, name):
        super().__init__()
        self.ga_compute = GammaAlphaCompute(name)

    def forward(self, x):
        alpha = self.ga_compute.compute_alpha()
        return x + alpha * F.relu(-x)

class CustomCNN(nn.Module):
    def __init__(self):
        super(CustomCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.nl1 = CustomNonLinearity("NL1")

        self.conv2 = nn.Conv2d(64, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.nl2 = CustomNonLinearity("NL2")

        self.conv3 = nn.Conv2d(64, 64, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.nl3 = CustomNonLinearity("NL3")

        self.fc = nn.Linear(64 * 4 * 4, 10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.nl1(self.bn1(self.conv1(x)))
        x = nn.MaxPool2d(2)(x)

        x = self.nl2(self.bn2(self.conv2(x)))
        x = nn.MaxPool2d(2)(x)

        x = self.nl3(self.bn3(self.conv3(x)))
        x = nn.MaxPool2d(2)(x)

        x = x.view(-1, 64 * 4 * 4)
        x = self.fc(x)
        return self.softmax(x)

transform = transforms.Compose([
    transforms.ToTensor()
    # transforms.Normalize((0., 0., 0.), (255, 255, 255))
])

fullset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

# Split the dataset
train_size = int(0.8 * len(fullset))
val_size = len(fullset) - train_size
trainset, valset = random_split(fullset, [train_size, val_size])

trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=False)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

model = CustomCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return running_loss / len(dataloader), 100 * correct / total

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:18<00:00, 9250527.36it/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [2]:
train_losses, train_accs = [], []
val_losses, val_accs = [], []

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    running_correct = 0
    total_samples = 0

    print(f"\nEpoch {epoch+1}/{num_epochs}")
    print("Model parameters:")
    for name, param in model.named_parameters():
        if 'gamma' in name:
            print(f"{name}: {param.item():.4f}")

    pbar = tqdm(enumerate(trainloader), total=len(trainloader), desc=f'Training: ')
    for i, data in pbar:
        inputs, labels = data

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Compute accuracy
        _, predicted = torch.max(outputs.data, 1)
        running_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

        # Compute running averages
        avg_loss = running_loss / (i + 1)
        avg_accuracy = 100 * running_correct / total_samples

        # Update progress bar
        pbar.set_postfix({
            'loss': f'{avg_loss:.4f}',
            'accuracy': f'{avg_accuracy:.2f}%'
        })

    train_losses.append(avg_loss)
    train_accs.append(avg_accuracy)

    # Validation
    val_loss, val_acc = evaluate(model, valloader)
    val_losses.append(val_loss)
    val_accs.append(val_acc)

    # Print epoch summary
    print(f'Epoch {epoch+1} summary:')
    print(f'Train Loss: {avg_loss:.4f}, Train Accuracy: {avg_accuracy:.2f}%')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.2f}%')

print('Finished Training')



Epoch 1/10
Model parameters:
nl1.ga_compute.gamma: 0.0000
nl2.ga_compute.gamma: 0.0000
nl3.ga_compute.gamma: 0.0000


Training: 100%|██████████| 625/625 [00:38<00:00, 16.25it/s, loss=1.9830, accuracy=48.24%]


Epoch 1 summary:
Train Loss: 1.9830, Train Accuracy: 48.24%
Val Loss: 1.9575, Val Accuracy: 50.15%

Epoch 2/10
Model parameters:
nl1.ga_compute.gamma: 0.3195
nl2.ga_compute.gamma: 0.3453
nl3.ga_compute.gamma: -0.5148


Training: 100%|██████████| 625/625 [00:37<00:00, 16.48it/s, loss=1.8486, accuracy=61.75%]


Epoch 2 summary:
Train Loss: 1.8486, Train Accuracy: 61.75%
Val Loss: 1.9026, Val Accuracy: 55.69%

Epoch 3/10
Model parameters:
nl1.ga_compute.gamma: 0.5253
nl2.ga_compute.gamma: 0.6409
nl3.ga_compute.gamma: -0.6907


Training: 100%|██████████| 625/625 [00:38<00:00, 16.23it/s, loss=1.8013, accuracy=66.30%]


Epoch 3 summary:
Train Loss: 1.8013, Train Accuracy: 66.30%
Val Loss: 1.8216, Val Accuracy: 64.17%

Epoch 4/10
Model parameters:
nl1.ga_compute.gamma: 0.6002
nl2.ga_compute.gamma: 0.7731
nl3.ga_compute.gamma: -0.7375


Training: 100%|██████████| 625/625 [00:38<00:00, 16.30it/s, loss=1.7754, accuracy=68.75%]


Epoch 4 summary:
Train Loss: 1.7754, Train Accuracy: 68.75%
Val Loss: 1.8645, Val Accuracy: 59.46%

Epoch 5/10
Model parameters:
nl1.ga_compute.gamma: 0.6566
nl2.ga_compute.gamma: 0.8361
nl3.ga_compute.gamma: -0.7443


Training: 100%|██████████| 625/625 [00:38<00:00, 16.41it/s, loss=1.7536, accuracy=70.95%]


Epoch 5 summary:
Train Loss: 1.7536, Train Accuracy: 70.95%
Val Loss: 1.7801, Val Accuracy: 68.20%

Epoch 6/10
Model parameters:
nl1.ga_compute.gamma: 0.6907
nl2.ga_compute.gamma: 0.8701
nl3.ga_compute.gamma: -0.7449


Training: 100%|██████████| 625/625 [00:38<00:00, 16.33it/s, loss=1.7386, accuracy=72.48%]


Epoch 6 summary:
Train Loss: 1.7386, Train Accuracy: 72.48%
Val Loss: 1.7780, Val Accuracy: 68.42%

Epoch 7/10
Model parameters:
nl1.ga_compute.gamma: 0.7072
nl2.ga_compute.gamma: 0.8858
nl3.ga_compute.gamma: -0.7473


Training: 100%|██████████| 625/625 [00:38<00:00, 16.23it/s, loss=1.7232, accuracy=74.12%]


Epoch 7 summary:
Train Loss: 1.7232, Train Accuracy: 74.12%
Val Loss: 1.7831, Val Accuracy: 67.79%

Epoch 8/10
Model parameters:
nl1.ga_compute.gamma: 0.7148
nl2.ga_compute.gamma: 0.8924
nl3.ga_compute.gamma: -0.7469


Training: 100%|██████████| 625/625 [00:38<00:00, 16.30it/s, loss=1.7121, accuracy=75.09%]


Epoch 8 summary:
Train Loss: 1.7121, Train Accuracy: 75.09%
Val Loss: 1.7436, Val Accuracy: 71.85%

Epoch 9/10
Model parameters:
nl1.ga_compute.gamma: 0.7200
nl2.ga_compute.gamma: 0.8955
nl3.ga_compute.gamma: -0.7465


Training: 100%|██████████| 625/625 [00:37<00:00, 16.51it/s, loss=1.7010, accuracy=76.25%]


Epoch 9 summary:
Train Loss: 1.7010, Train Accuracy: 76.25%
Val Loss: 1.7651, Val Accuracy: 69.58%

Epoch 10/10
Model parameters:
nl1.ga_compute.gamma: 0.7227
nl2.ga_compute.gamma: 0.8967
nl3.ga_compute.gamma: -0.7462


Training: 100%|██████████| 625/625 [00:37<00:00, 16.59it/s, loss=1.6907, accuracy=77.29%]


Epoch 10 summary:
Train Loss: 1.6907, Train Accuracy: 77.29%
Val Loss: 1.7352, Val Accuracy: 72.75%
Finished Training


In [3]:
# Final test evaluation
test_loss, test_acc = evaluate(model, testloader)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%')

# Plot training and validation metrics
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss over epochs')

plt.subplot(1, 2, 2)
plt.plot(train_accs, label='Train')
plt.plot(val_accs, label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Accuracy over epochs')

plt.tight_layout()
plt.savefig('training_validation_metrics.png')
plt.close()

# Plot alpha evolution
plt.figure(figsize=(10, 6))
plt.plot(model.nl1.ga_compute.alpha_history, label='NL1')
plt.plot(model.nl2.ga_compute.alpha_history, label='NL2')
plt.plot(model.nl3.ga_compute.alpha_history, label='NL3')
plt.xlabel('Training Steps')
plt.ylabel('Alpha Value')
plt.title('Evolution of Alpha Values')
plt.legend()
plt.savefig('alpha_evolution.png')
plt.close()

print("Training/validation metrics plot saved as 'training_validation_metrics.png'")
print("Alpha evolution plot saved as 'alpha_evolution.png'")

Test Loss: 1.7379, Test Accuracy: 72.34%
Training/validation metrics plot saved as 'training_validation_metrics.png'
Alpha evolution plot saved as 'alpha_evolution.png'
