# Test the multi-objective optimization

In [1]:
# Enable IPython's autoreload extension; mode 2 is meant to pick up edits to
# imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:

# Check versions
import torch
import numpy
from platform import python_version
# Report the interpreter and PyTorch versions used to produce the outputs below.
print(
    *(
        " ".join((label, str(version)))
        for label, version in (
            ("python_version() ---> ", python_version()),
            ("torch.__version__ --> ", torch.__version__),
        )
    ),
    sep="\n",
)

# Seed both random number generators before any random draw so that
# re-running the notebook from a fresh kernel gives the same numbers.
torch.manual_seed(0)
numpy.random.seed(0)

python_version() --->  3.8.8
torch.__version__ -->  1.8.0


# Test the Multi-Objective Solver

In [4]:
from genus.util_moo import MinNormSolver

n_tasks = 4
n_dim = 1000

# One random float64 vector per task, standing in for per-task gradients.
grads = [torch.randn(n_dim, dtype=torch.float64) for _ in range(n_tasks)]
grads[0].add_(1)     # in-place shift of the first vector
grads[1].mul_(3000)  # in-place rescale of the second vector by a large factor

# NOTE(review): `sol` is presumably the per-task weighting and `norm` the norm of
# the resulting combination -- confirm against genus.util_moo.
sol, norm = MinNormSolver.find_min_norm_element(vecs=grads, verbose=False)
print(sol, norm)

tensor([2.1764e-01, 1.3512e-05, 4.0679e-01, 3.7556e-01], dtype=torch.float64) 400.04693780713717


# Experiment with detaching

In [42]:
import time

# Dimensions of the dummy input: batch size, channel count, and the two
# spatial extents (both 80).
batch = 64
c = 1
w = h = 80


class Net(torch.nn.Module):
    """Two-stage network: a backbone followed by a head.

    Both forward variants return the head output together with the backbone
    output, so the gradient at the backbone/head boundary can be inspected.

    Args:
        backbone: module applied to the input first.
        head: module applied to the backbone output.
    """

    def __init__(self, backbone, head):
        super().__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, x):
        """Run backbone then head with autograd tracking both stages.

        Returns:
            (z, y): the head output and the backbone output.
        """
        y = self.backbone(x)
        z = self.head(y)
        return z, y

    def forward_no_grad(self, x):
        """Run the backbone under ``torch.no_grad()`` and only track the head.

        The backbone output is turned into a fresh leaf tensor that requires
        grad, so ``backward()`` stops at it and populates its ``.grad``.

        Returns:
            (z, y): the head output and the leaf tensor fed to the head.
        """
        with torch.no_grad():
            tmp = self.backbone(x)
        # detach() yields a new tensor object, so enabling requires_grad never
        # touches a tensor owned by the caller (a pass-through backbone would
        # otherwise hand back `x` itself and flip the caller's flag).
        y = tmp.detach().requires_grad_(True)
        z = self.head(y)
        return z, y

    def detach_backbone(self):
        """Set ``requires_grad=False`` on every backbone parameter."""
        self.backbone.requires_grad_(False)

    def attach_backbone(self):
        """Set ``requires_grad=True`` on every backbone parameter."""
        self.backbone.requires_grad_(True)
        
    
# Backbone: four 3x3 convolutions with padding 1 (c -> 128 -> 128 -> 128 -> 128),
# each followed by a ReLU. Layers are instantiated in the same order as a
# hand-written list, so weight initialisation consumes the RNG identically.
backbone = torch.nn.Sequential(
    *(
        layer
        for in_channels in (c, 128, 128, 128)
        for layer in (
            torch.nn.Conv2d(in_channels, 128, kernel_size=3, padding=1),
            torch.nn.ReLU(),
        )
    )
)

# Head: a single 3x3 convolution mapping the 128 feature channels back to c.
head = torch.nn.Conv2d(in_channels=128, out_channels=c, kernel_size=3, padding=1)

net = Net(backbone=backbone, head=head)

# All-zero input that itself requires grad, so backward() can reach it.
x = torch.zeros(batch, c, w, h, requires_grad=True)

### Run once in the standard way for reference

In [43]:
# Baseline: ordinary forward pass, so autograd records both backbone and head.
# perf_counter() is the monotonic, high-resolution clock intended for measuring
# elapsed intervals (time.time() is a wall clock and can jump).
tin = time.perf_counter()
out, intermediate = net.forward(x)
loss = (x - out).pow(2).mean()
print("forward ->",time.perf_counter()-tin)

tin = time.perf_counter()
# `intermediate` is a non-leaf tensor here; retain_grad() keeps its .grad
# available after backward().
intermediate.retain_grad()
loss.backward()
print("backward ->",time.perf_counter()-tin)

# Range and shape of the gradient of the loss w.r.t. the backbone output.
print(intermediate.grad.min(), intermediate.grad.max(), intermediate.grad.shape)

forward -> 4.206957817077637
backward -> 9.368711948394775
tensor(-1.8714e-08) tensor(3.0396e-08) torch.Size([64, 128, 80, 80])


### Run with the backbone forward pass inside `torch.no_grad()`

In [44]:
# Variant: the backbone runs under torch.no_grad(), so only the head is recorded
# by autograd and backward() stops at `intermediate`.
# perf_counter() is the monotonic, high-resolution clock intended for measuring
# elapsed intervals (time.time() is a wall clock and can jump).
tin = time.perf_counter()
out, intermediate = net.forward_no_grad(x)
loss = (x - out).pow(2).mean()
print("forward ->",time.perf_counter()-tin)

tin = time.perf_counter()
# `intermediate` is already a leaf that requires grad in this variant; the call
# is kept so the timed section matches the other experiments.
intermediate.retain_grad()
loss.backward()
print("backward ->",time.perf_counter()-tin)

# Range and shape of the gradient of the loss w.r.t. the backbone output.
print(intermediate.grad.min(), intermediate.grad.max(), intermediate.grad.shape)

forward -> 4.389677047729492
backward -> 0.425462007522583
tensor(-1.8714e-08) tensor(3.0396e-08) torch.Size([64, 128, 80, 80])


### What happens if I detach the backbone (set `requires_grad = False` on its parameters)?

In [45]:
# Variant: backbone parameters are frozen (requires_grad=False) but the forward
# pass is the ordinary one.
# NOTE(review): `x` was created with requires_grad=True, so backward() presumably
# still has to traverse the backbone to reach `x`; only the backbone weight
# gradients are skipped -- confirm.
net.detach_backbone()
try:
    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring elapsed intervals (time.time() is a wall clock and can jump).
    tin = time.perf_counter()
    out, intermediate = net.forward(x)
    loss = (x - out).pow(2).mean()
    print("forward ->",time.perf_counter()-tin)

    tin = time.perf_counter()
    intermediate.retain_grad()
    loss.backward()
    print("backward ->",time.perf_counter()-tin)

    # Range and shape of the gradient of the loss w.r.t. the backbone output.
    print(intermediate.grad.min(), intermediate.grad.max(), intermediate.grad.shape)
finally:
    # Undo the freeze so this cell leaves `net` as it found it; otherwise
    # re-running an earlier cell afterwards would silently measure a frozen backbone.
    net.attach_backbone()

forward -> 4.264872074127197
backward -> 4.869318962097168
tensor(-1.8714e-08) tensor(3.0396e-08) torch.Size([64, 128, 80, 80])
