In [1]:
import argparse
import os
import sys
import tabulate
import time
import torch
import torch.nn.functional as F

import curves
import data
import models
import utils

import numpy as np

import copy

In [2]:
torch.backends.cudnn.benchmark = True

loaders, num_classes = data.loaders(
    "CIFAR10",
    "data",
    128,
    1,
    "VGG",
    False)

Files already downloaded and verified
Using train (45000) + validation (5000)
Files already downloaded and verified


In [87]:
architecture = getattr(models, "VGG16")
model1 = architecture.base(num_classes=10, **architecture.kwargs)
model2 = architecture.base(num_classes=10, **architecture.kwargs)
model3 = architecture.base(num_classes=10, **architecture.kwargs)

model1.load_state_dict(torch.load("curves/curve51/checkpoint-100.pt")['model_state'])
model2.load_state_dict(torch.load("curves/middle_init5051_21/checkpoint-100.pt")['model_state'])
model3.load_state_dict(torch.load("curves/curve50/checkpoint-100.pt")['model_state'])

In [88]:
def dist(model1, model2):
    par1 = np.concatenate([p.data.cpu().numpy().ravel() for p in model1.parameters()])
    par2 = np.concatenate([p.data.cpu().numpy().ravel() for p in model2.parameters()])
    u = par2 - par1
    dx = np.linalg.norm(u)
    print('distance: %5.4f' % dx)
    return dx

In [89]:
distance = [dist(model3, model2)]
for i in range(2, 6):
    model1.load_state_dict(torch.load("curves/middle_init5051_2"+str(i-1)+"/checkpoint-100.pt")['model_state'])
    model2.load_state_dict(torch.load("curves/middle_init5051_2"+str(i)+"/checkpoint-100.pt")['model_state'])
    model3.load_state_dict(torch.load("curves/curve50/checkpoint-100.pt")['model_state'])
    distance.append(dist(model3, model2))

distance: 56.3710
distance: 31.5998
distance: 21.6775
distance: 18.2212
distance: 17.0778


In [91]:
# distance.append(dist(model1, model2))

In [78]:
dist(model2, model3)
# dist(model2, model1)

distance: 31.5998


In [79]:
dist(model1, model3)

distance: 63.1997


In [48]:
model1.cuda();
model2.cuda();
model3.cuda();

In [49]:
criterion = F.cross_entropy
regularizer = None 
model1.cuda();

In [42]:
test_res = utils.test(loaders['test'], model2, criterion, regularizer)
test_res

{'nll': 0.6022534687042236, 'loss': 0.6022534687042236, 'accuracy': 91.3}

In [27]:
architecture = getattr(models, "VGG16")
model1 = architecture.base(num_classes=10, **architecture.kwargs)
model2 = architecture.base(num_classes=10, **architecture.kwargs)

model1.load_state_dict(torch.load("curves/curve50/checkpoint-100.pt")['model_state'])
model2.load_state_dict(torch.load("curves/curve51/checkpoint-100.pt")['model_state'])

In [24]:
model3 = architecture.base(num_classes=10, **architecture.kwargs)
model3.cuda();

In [25]:
criterion = F.cross_entropy
regularizer = None 
optimizer = torch.optim.SGD(
    filter(lambda param: param.requires_grad, model3.parameters()),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0
)

In [30]:
test_res = utils.test(loaders['test'], model3, criterion, regularizer)

In [31]:
test_res

{'nll': 2.3029834384918213, 'loss': 2.3029834384918213, 'accuracy': 10.28}

In [31]:
def train(train_loader, model, optimizer, criterion, regularizer=None, lr_schedule=None):
    loss_sum = 0.0
    correct = 0.0

    num_iters = len(train_loader)
    model.train()
    for iter, (input, target) in enumerate(train_loader):
        if lr_schedule is not None:
            lr = lr_schedule(iter / num_iters)
            adjust_learning_rate(optimizer, lr)
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        output = model(input)
        loss = criterion(output, target)
        if regularizer is not None:
            loss += regularizer(model)

        optimizer.zero_grad()
        loss.backward()
        
        loss_sum += loss.item() * input.size(0)
        pred = output.data.argmax(1, keepdim=True)
        correct += pred.eq(target.data.view_as(pred)).sum().item()
        
    grad = np.concatenate([p.grad.data.cpu().numpy().ravel() for p in model.parameters()])
    
    return grad

## Computing gradients 

In [67]:
num_points = 15
l_grad = []
for j in range(0, num_points):
    for p1, p2, p3 in zip(model1.parameters(), model2.parameters(), model3.parameters()):
        alpha = j * 1.0 / (num_points-1)
        p3.data.copy_(alpha * p1.data + (1.0 - alpha) * p2.data)        
    grad = train(loaders['train'], model3, optimizer, criterion, )
    l_grad.append(grad)
        

In [37]:
grad = copy.copy(l_grad)

In [59]:
l_grad = copy.copy(grad[:])

In [60]:
for i, v in enumerate(l_grad):
    print(np.linalg.norm(v))
    l_grad[i] = v/np.linalg.norm(v)

0.0344283
3.3656862
1.7700907
3.5720978
1.4306158


In [61]:
s = 0
for v in l_grad:
    s += v
print(np.linalg.norm(s)/num_points)
s = s/np.linalg.norm(s)

0.5325227737426758


In [62]:
for v in l_grad:
    print((v*s).sum())

0.38430274
0.6172532
0.70602906
0.59019786
0.3648513


In [63]:
for i in range(len(l_grad)):
    l_grad[i] = l_grad[i]-(l_grad[i]*s).sum()*s
    l_grad[i] = l_grad[i]/np.linalg.norm(l_grad[i])
    print(np.linalg.norm(l_grad[i]))

1.0
1.0
1.0
1.0000001
0.9999999


In [64]:
s = 0
for v in l_grad:
    s += v
print(np.linalg.norm(s)/num_points)
s = s/np.linalg.norm(s)

0.059120625257492065


In [65]:
for v in l_grad:
    print((v*s).sum())

-0.60078305
0.41879392
0.83125174
0.27660877
-0.630264
