In [3]:
import torch
import numpy as np
from ALOptimizer import ALOptimizer

In [4]:
# Device switch read by the DataLoader setup below: when True, the loaders are
# given a torch.Generator created on the CUDA device.
cuda = False
# Seed torch's global RNG so steps that draw from it are repeatable across runs.
torch.manual_seed(42)

<torch._C.Generator at 0x28440c4a410>

In [5]:
import torchvision
import torchvision.transforms.v2 as transforms

# Directory where torchvision keeps (and downloads) the MNIST files.
mnist_root = '/data'

# Fetch the files once; the datasets created below read the local copy.
dataset = torchvision.datasets.MNIST(mnist_root, download=True, train=True)

batch_size = 16

transform = transforms.Compose([transforms.ToTensor()])


def _loader_generator():
    """Return a fresh CUDA generator when `cuda` is set, otherwise None."""
    if cuda:
        return torch.Generator(device='cuda')
    return None


# Training data: the first 20000 MNIST training images, served in fixed order.
trainset = torch.utils.data.Subset(
    torchvision.datasets.MNIST(mnist_root, train=True, transform=transform),
    np.arange(20000),
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    generator=_loader_generator(),
)

# Held-out data: 95% for evaluation, 5% reserved for the loss constraint.
testset = torchvision.datasets.MNIST(mnist_root, train=False, transform=transform)
true_test, constr_test = torch.utils.data.random_split(testset, [0.95, 0.05])

constr_test_loader = torch.utils.data.DataLoader(
    constr_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    generator=_loader_generator(),
)
true_test_loader = torch.utils.data.DataLoader(
    true_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    generator=_loader_generator(),
)

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """LeNet-style convolutional classifier for 28x28 images (e.g. MNIST).

    Args:
        n_in: number of channels in the input images.
        n_out: number of output classes (length of the returned logit vector).

    The first fully connected layer is sized for 28x28 inputs; other spatial
    sizes will fail at `fc1` with a shape mismatch.
    """

    def __init__(self, n_in, n_out):
        super().__init__()

        # Two unpadded 5x5 convolutions; each is followed by 2x2 max-pooling
        # in forward().
        self.conv1 = nn.Conv2d(n_in, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Affine layers y = Wx + b. For a 28x28 input the feature map entering
        # fc1 is 16 channels of 4x4 (28 -> 24 -> 12 -> 8 -> 4).
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, n_out)

    def forward(self, input):
        """Map a batch of images (N, n_in, 28, 28) to class logits (N, n_out)."""
        # C1: 5x5 convolution + ReLU -> (N, 6, 24, 24)
        c1 = F.relu(self.conv1(input))
        # S2: 2x2 max-pooling, no parameters -> (N, 6, 12, 12)
        s2 = F.max_pool2d(c1, (2, 2))
        # C3: 5x5 convolution + ReLU -> (N, 16, 8, 8)
        c3 = F.relu(self.conv2(s2))
        # S4: 2x2 max-pooling, no parameters -> (N, 16, 4, 4)
        s4 = F.max_pool2d(c3, 2)
        # Flatten everything except the batch dimension -> (N, 256)
        s4 = torch.flatten(s4, 1)
        # F5: fully connected + ReLU -> (N, 120)
        f5 = F.relu(self.fc1(s4))
        # F6: fully connected + ReLU -> (N, 84)
        f6 = F.relu(self.fc2(f5))
        # Output layer: raw logits, no activation -> (N, n_out)
        output = self.fc3(f6)
        return output

***
***
Try a constraint on the loss evaluated on a separate, held-out subset of the data (a stochastic constraint).

In [7]:
def test_loss_constr(net, loss_fn, testset, threshold):
    loss = 0
    
    for i, (inputs, labels) in enumerate(testset):
        out = net.forward(inputs)
        loss += loss_fn(out, labels)
    loss /= i
    return torch.max((loss-threshold), torch.zeros(1))

In [11]:
# Fresh network and loss for the held-out-loss constraint experiment.
n_classes = 10
class_test_net = Net(1, n_classes)
loss = torch.nn.CrossEntropyLoss()

def constraint(net):
    """Constraint value for `net`: test_loss_constr on the constraint split with threshold 0.1."""
    return test_loss_constr(net, loss, constr_test_loader, 0.1)

# NOTE(review): `m` is presumably the number of constraints — confirm against ALOptimizer.
alo = ALOptimizer(net=class_test_net, loss_fn=loss, m=1, constraint_fn=constraint)

In [20]:
# Train under the held-out-loss constraint.
# NOTE(review): `maxiter` / `epochs` are presumably outer iterations and passes
# over `trainloader` per iteration — confirm against ALOptimizer.optimize.
alo.optimize(trainloader, maxiter=3, epochs=3)

tensor([0.])10144695406779647, 0.075935646891593933
tensor(0.0010)


tensor([0.])9201057921164e-05, 0.0027400493621826112
tensor(3.0992e-05)


tensor([0.])41950340080075e-05, 0.03191259503364566
tensor(2.2842e-05)




In [None]:
# Top-1 accuracy on the evaluation split, scoring one image at a time.
acc = 0
with torch.no_grad():
    for inputs, labels in true_test:
        # The dataset yields single images; add the leading batch dimension.
        out = class_test_net.forward(inputs.unsqueeze(0))
        acc += int(out.argmax().item() == labels)

acc/len(true_test)

In [22]:
# Evaluate the constraint function on the constraint split with threshold 0,
# so the printed value is the averaged loss itself rather than an excess.
with torch.no_grad():
    x = test_loss_constr(class_test_net, loss, constr_test_loader, 0)
    print(x)

tensor([0.0227])


In [21]:
# Same check on the evaluation split (data not used by the constraint), again
# with threshold 0 so the printed value is the averaged loss itself.
with torch.no_grad():
    x = test_loss_constr(class_test_net, loss, true_test_loader, 0)
    print(x)

tensor([0.1529])


***
***

An L2 constraint on the weights. This is a deterministic constraint.

We get

$$
L := l(outputs, labels) + \lambda*h(W) + 0.5 \lambda r * h(W)^2 
$$

where

$$
h(W) := \begin{cases} 0 \qquad \textrm{if} \quad  {||W||_2}^2 - c \leq 0 \\ {||W||_2}^2 - c \quad \textrm{otherwise} \end{cases}
$$

In [1]:
def total_w_l2_constr(params, c):
    """Return the violation of the bound `sum(||p||^2 for p in params) <= c`.

    Args:
        params: iterable of tensors (e.g. `net.parameters()`); may be empty
            or a one-shot generator.
        c: upper bound on the total squared L2 norm.

    Returns:
        Tensor of shape (1,) holding `max(total_squared_norm - c, 0)`.
    """
    sq_norm = None
    for param in params:
        term = torch.sum(torch.square(param))
        sq_norm = term if sq_norm is None else sq_norm + term

    if sq_norm is None:
        # No parameters at all: the squared norm is zero by definition.
        sq_norm = torch.zeros(())

    # Build the zero floor from the accumulated norm's dtype and device rather
    # than from the loop variable, which is unbound for an empty iterable and
    # lives on the default device.
    floor = torch.zeros(1, dtype=sq_norm.dtype, device=sq_norm.device)
    cval = torch.max(sq_norm - c, floor)
    return cval

In [2]:
# Fresh network and loss for the weight-norm constraint experiment; this
# rebinds `class_test_net`, `loss`, `constraint` and `alo` from the cells above.
n_classes = 10
class_test_net = Net(1, n_classes)
loss = torch.nn.CrossEntropyLoss()

def constraint(net):
    """Constraint value for `net`: total_w_l2_constr over its parameters with bound c = 15."""
    return total_w_l2_constr(net.parameters(), 15)

# NOTE(review): `m` is presumably the number of constraints and `lr` the
# learning rate — confirm against ALOptimizer.
alo = ALOptimizer(net=class_test_net, loss_fn=loss, m=1, constraint_fn=constraint, lr=0.005)

NameError: name 'Net' is not defined

In [9]:
# Train under the weight-norm constraint.
# NOTE(review): `maxiter` / `epochs` are presumably outer iterations and passes
# over `trainloader` per iteration — confirm against ALOptimizer.optimize.
alo.optimize(trainloader, maxiter=5, epochs=10)

0, 870, 2.2839787006378174, 0.04061050415039886

KeyboardInterrupt: 

In [11]:
# Total squared L2 norm of the network's parameters, for comparison with the
# bound passed to the weight-norm constraint.
l2 = sum(torch.sum(torch.square(weights)) for weights in class_test_net.parameters())
l2

tensor(39.3950, grad_fn=<AddBackward0>)

In [12]:
# Top-1 accuracy of the weight-constrained network on the evaluation split,
# scoring one image at a time.
acc = 0
with torch.no_grad():
    for inputs, labels in true_test:
        # The dataset yields single images; add the leading batch dimension.
        out = class_test_net.forward(inputs.unsqueeze(0))
        acc += int(out.argmax().item() == labels)

acc/len(true_test)

0.7650526315789473