In [27]:
import argparse
import numpy as np
import os
import tabulate
import torch
import torch.nn.functional as F

import data
import models
import curves
import utils
import copy

parser = argparse.ArgumentParser(description='Computes values for plane visualization')
parser.add_argument('--dir', type=str, default='/tmp/plane', metavar='DIR',
                    help='training directory (default: /tmp/plane)')

parser.add_argument('--grid_points', type=int, default=21, metavar='N',
                    help='number of points in the grid (default: 21)')
parser.add_argument('--margin_left', type=float, default=0.2, metavar='M',
                    help='left margin (default: 0.2)')
parser.add_argument('--margin_right', type=float, default=0.2, metavar='M',
                    help='right margin (default: 0.2)')
parser.add_argument('--margin_bottom', type=float, default=0.2, metavar='M',
                    help='bottom margin (default: 0.)')
parser.add_argument('--margin_top', type=float, default=0.2, metavar='M',
                    help='top margin (default: 0.2)')

parser.add_argument('--curve_points', type=int, default=61, metavar='N',
                    help='number of points on the curve (default: 61)')

parser.add_argument('--dataset', type=str, default='CIFAR10', metavar='DATASET',
                    help='dataset name (default: CIFAR10)')
parser.add_argument('--use_test', action='store_true',
                    help='switches between validation and test set (default: validation)')
parser.add_argument('--transform', type=str, default='VGG', metavar='TRANSFORM',
                    help='transform name (default: VGG)')
parser.add_argument('--data_path', type=str, default=None, metavar='PATH',
                    help='path to datasets location (default: None)')
parser.add_argument('--batch_size', type=int, default=128, metavar='N',
                    help='input batch size (default: 128)')
parser.add_argument('--num_workers', type=int, default=4, metavar='N',
                    help='number of workers (default: 4)')

parser.add_argument('--model', type=str, default=None, metavar='MODEL',
                    help='model name (default: None)')
parser.add_argument('--curve', type=str, default=None, metavar='CURVE',
                    help='curve type to use (default: None)')
parser.add_argument('--num_bends', type=int, default=3, metavar='N',
                    help='number of curve bends (default: 3)')

parser.add_argument('--ckpt', type=str, default=None, metavar='CKPT',
                    help='checkpoint to eval (default: None)')

parser.add_argument('--wd', type=float, default=1e-4, metavar='WD',
                    help='weight decay (default: 1e-4)')

args = parser.parse_args(['--dir', 'plots/plot-normal-5354grad', '--data_path', 'data', '--model', 'VGG16', '--curve', 
                          'PolyChain', '--ckpt', 'points2plane/connect-normal-5556/checkpoint-100.pt'])



In [28]:
args.dir

'plots/plot-normal-5354grad'

In [33]:
os.makedirs(args.dir, exist_ok=True)

torch.backends.cudnn.benchmark = True

loaders, num_classes = data.loaders(
    args.dataset,
    args.data_path,
    args.batch_size,
    args.num_workers,
    args.transform,
    args.use_test,
    shuffle_train=False
)

architecture = getattr(models, args.model)
curve = getattr(curves, args.curve)

curve_model = curves.CurveNet(
    num_classes,
    curve,
    architecture.curve,
    args.num_bends,
    architecture_kwargs=architecture.kwargs,
)
curve_model.cuda()

checkpoint = torch.load(args.ckpt)
curve_model.load_state_dict(checkpoint['model_state'])

criterion = F.cross_entropy
regularizer = utils.l2_regularizer(args.wd)


def get_xy(point, origin, vector_x, vector_y):
    return np.array([np.dot(point - origin, vector_x), np.dot(point - origin, vector_y)])


w = list()
curve_parameters = list(curve_model.net.parameters())
for i in range(args.num_bends):
    w.append(np.concatenate([
        p.data.cpu().numpy().ravel() for p in curve_parameters[i::args.num_bends]
    ]))

print('Weight space dimensionality: %d' % w[0].shape[0])

u = w[2] - w[0]
dx = np.linalg.norm(u)
u /= dx

print('Computing gradient vector')

def train(train_loader, model, optimizer, criterion, regularizer=None, lr_schedule=None):
    loss_sum = 0.0
    correct = 0.0

    num_iters = len(train_loader)
    model.train()
    for iter, (input, target) in enumerate(train_loader):
        if lr_schedule is not None:
            lr = lr_schedule(iter / num_iters)
            utils.adjust_learning_rate(optimizer, lr)
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        output = model(input)
        loss = criterion(output, target)
        if regularizer is not None:
            loss += regularizer(model)

        optimizer.zero_grad()
        loss.backward()

        loss_sum += loss.item() * input.size(0)
        pred = output.data.argmax(1, keepdim=True)
        correct += pred.eq(target.data.view_as(pred)).sum().item()

    grad = np.concatenate([p.grad.data.cpu().numpy().ravel() for p in model.parameters()])

    return grad

model1 = architecture.base(num_classes=10, **architecture.kwargs)
model2 = architecture.base(num_classes=10, **architecture.kwargs)
model3 = architecture.base(num_classes=10, **architecture.kwargs)
model3.cuda()


def init_model(p, model):
    offset = 0
    for parameter in model.parameters():
        size = np.prod(parameter.size())
        value = p[offset:offset + size].reshape(parameter.size())
        parameter.data.copy_(torch.from_numpy(value))
        offset += size


init_model(w[0], model1)
init_model(w[2], model2)

criterion = F.cross_entropy
regularizer = None
optimizer = torch.optim.SGD(
    filter(lambda param: param.requires_grad, model3.parameters()),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0
)

num_points = 15
l_grad = []
for j in range(0, num_points):
    for p1, p2, p3 in zip(model1.parameters(), model2.parameters(), model3.parameters()):
        alpha = j * 1.0 / (num_points-1)
        p3.data.copy_(alpha * p1.data + (1.0 - alpha) * p2.data)
    grad = train(loaders['train'], model3, optimizer, criterion, )
    l_grad.append(grad)

for i, v in enumerate(l_grad):
    print(np.linalg.norm(v))
    l_grad[i] -= np.dot(u, v) * u
    l_grad[i] = v/np.linalg.norm(v)

s = 0
for v in l_grad:
    s += v
print(np.linalg.norm(s)/num_points)
s = s/np.linalg.norm(s)


v = w[1] - w[0]
v -= np.dot(u, v) * u
dy = np.linalg.norm(v)
v /= dy

# u2, v2 = copy.deepcopy(u), copy.deepcopy(v)
# u1, v1 = copy.deepcopy(u), copy.deepcopy(v)

Files already downloaded and verified
Using train (45000) + validation (5000)
Files already downloaded and verified
Weight space dimensionality: 15245130
Computing gradient vector
0.020508204
1.3164183
6.10122
6.648986
3.4395576
2.0869703
1.4800283
1.2656943
1.402981
1.9051266
3.0564978
5.6885357
7.898111
4.0841546
0.05878834
0.5266523679097493


In [34]:
# np.dot(u2, u1)

In [35]:
# np.dot(v2, v1)

In [36]:
print(np.dot(s, v))
for i, vec in enumerate(l_grad):
    print(i, np.dot(v, vec))

-0.029102946
0 -0.009224109
1 -0.037704706
2 -0.0581911
3 -0.04219137
4 -0.0033210842
5 0.010927529
6 0.013926087
7 0.013118408
8 0.013774304
9 0.011390868
10 -0.0018395763
11 -0.047116976
12 -0.062830776
13 -0.019998532
14 -0.010626069


In [19]:
for i, v in enumerate(l_grad):
#     print(np.linalg.norm(v))
    l_grad[i] -= np.dot(u, v) * u
    l_grad[i] = v/np.linalg.norm(v)

s = 0
for v in l_grad:
    s += v
print(np.linalg.norm(s)/num_points)
s = s/np.linalg.norm(s)


v = w[1] - w[0]
v -= np.dot(u, v) * u
dy = np.linalg.norm(v)
v /= dy

0.5295691808064779


In [20]:
np.dot(s, v) 

-0.027354201

In [21]:
print(np.dot(s, v))
for i, vec in enumerate(l_grad):
#     print(np.linalg.norm(vec))
    print(i, np.dot(v, vec))

0 -0.0037905849
1 -0.024135474
2 -0.057654627
3 -0.04594074
4 -0.0017186748
5 0.013093248
6 0.014068341
7 0.013736997
8 0.01338212
9 0.011975017
10 -0.0037048897
11 -0.04647499
12 -0.06424836
13 -0.02056173
14 -0.015314849
