In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np

In [211]:
def SV(n):
    q = n//2
    r = n%2
    if r == 1:
        sv = 3.1416
        for i in range(1, q+1):
            sv *= (2*i+1)/(2*i)
    else:
        sv = 2
        for i in range(1, q+1):
            sv *= 2*i/(2*i-1)
    return sv


def h(n):
    h = np.sqrt((n+1)/(2*n))
    return h


def LU(m, n):
    l = np.power((n+1)*SV(n)/(2*m), 1/n)
    u = l/h(n)
    return l, u



def ad_function(W, theta, target_norm=1):
    m = W.shape[0]
    WWT = W @ torch.t(W)
    norm2 = torch.diagonal(WWT, 0)
    with torch.no_grad():
        N = (torch.sqrt(norm2[:, None] @ norm2[None, :]) + 1e-8)*1.001
    if theta == 1.5708:
        M = torch.logical_not(torch.eye(m, dtype=bool).cuda())
        tloss = torch.sum(((torch.arccos(WWT[M]) - theta))**2)
    else:
        WWTN = WWT/N
#         WWTNN = WWTN/N
        Z = torch.logical_not(torch.eye(m, dtype=bool).cuda())
        M1 = (WWTN > np.cos(theta)) * (WWT < 0.99)*Z
        M2 = (WWTN < -np.cos(theta)) * (WWT > -0.99)*Z

        tloss = torch.sum((torch.arccos(WWTN[M1]) - theta)**2) + \
            torch.sum((torch.arccos(WWTN[M2]) - 3.1416 + theta)**2)   

#         tloss = torch.sum((torch.arccos(WWT[M1]) - theta)**2) + \
#             torch.sum((torch.arccos(WWT[M2]) - 3.1416 + theta)**2)

#         print(torch.sum(M1), torch.sum(M2), tloss)

    nloss = torch.sum((target_norm**2 - norm2)**2)
    return nloss, tloss



def ADK(weight, theta, double=False):
    if double:
#         nloss, tloss = ad_function(weight, 1.5708, target_norm=1)
#         n2, t2 = ad_function(torch.t(weight), theta, target_norm=0.01)
#         nloss += n2
#         tloss += t2
        nloss, tloss = ad_function(torch.t(weight), theta, target_norm=0.1)
    else:
        nloss, tloss = ad_function(weight, 1.5708, target_norm=1)
    return  nloss, tloss


def SO(weight):
    if isinstance(weight, tuple):
        weight = weight[0]
    m = weight.shape[0]
    W = weight.view(m, -1)
    loss = torch.sum((W @ torch.t(W) - torch.eye(m, dtype=float).cuda())**2)
    return loss


def DSO(weight):
    if isinstance(weight, tuple):
        weight = weight[0]
    m = weight.shape[0]
    W = weight.view(m, -1)
    n = W.shape[1]
    loss = torch.sum((W @ torch.t(W) - torch.eye(m, dtype=float).cuda())**2) + \
        torch.sum((torch.t(W) @ W - torch.eye(n, dtype=float).cuda())**2) 
    return loss

In [3]:
def angle_analysis(W):
    m = W.shape[0]
    WWT = W @ torch.t(W)
    norm2 = torch.diagonal(WWT, 0)
    N = (torch.sqrt(norm2[:, None] @ norm2[None, :]).detach() + 1e-8)*1.001  
    WWTN = WWT/N
    
    M = torch.logical_not(torch.eye(m))
    sp = torch.sort(1 - torch.abs(WWTN[M].view(m, -1)), dim=1)
    
    theta = torch.arccos(torch.abs(WWTN[torch.arange(m), sp.indices[:, 0]]))
    mean = torch.mean(theta)
    Max = torch.amax(theta)
    Min = torch.amin(theta)

    return mean, Min, Max

In [4]:
bsize = 512
num_worker = 4
epochs = 5

In [5]:
train_dataset = torchvision.datasets.CIFAR10(
                root='./DATA/', 
                transform=transforms.Compose(
                    [
                    transforms.RandomCrop(32, padding=4),
                    transforms.RandomHorizontalFlip(),
                    transforms.RandomRotation(15),
                    transforms.ToTensor(),
                    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
                    ]),
                train=True)

val_dataset = torchvision.datasets.CIFAR10(
                root='./DATA/', 
                transform=transforms.Compose(
                    [
                    transforms.ToTensor(),
                    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
                    ]),
                train=False) 

In [6]:
train_loader = DataLoader(
                    train_dataset, 
                    batch_size=bsize, 
                    shuffle=True, 
                    num_workers=num_worker, 
                    pin_memory=True, 
                    )

val_loader = DataLoader(
                    val_dataset, 
                    batch_size=bsize, 
                    shuffle=False, 
                    num_workers=num_worker, 
                    pin_memory=True, 
                    )

In [7]:
class TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(32*32*3, 1000)
        self.layer2 = nn.Linear(1000, 1000)
        self.layer3 = nn.Linear(1000, 10)
        
    def forward(self, x):
        x = x.view(-1, 32*32*3)
        x = self.layer1(x)
        x = F.relu(x)
        x = self.layer2(x)
        x = F.relu(x)
        x = self.layer3(x)
        return x

In [8]:
l, u = LU(1000, 10)
print(l, u)

0.7328738236546221 0.9882068622217131


In [9]:
def ad_function_update(W, theta, target_norm=1):
    m = W.shape[0]
    WWT = W @ torch.t(W)
    norm2 = torch.diagonal(WWT, 0)
    with torch.no_grad():
        N = (torch.sqrt(norm2[:, None] @ norm2[None, :]) + 1e-8)*1.001
    if theta == 1.5708:
        M = torch.logical_not(torch.eye(m, dtype=bool).cuda())
        tloss = torch.sum(((torch.arccos(WWT[M]) - theta))**2)
    else:
        WWTN = WWT/N
        WWTNN = WWTN/N
        Z = torch.logical_not(torch.eye(m, dtype=bool).cuda())
        M1 = (WWTN > np.cos(theta)) * (WWT < 0.99)*Z
        M2 = (WWTN < -np.cos(theta)) * (WWT > -0.99)*Z

        tloss = torch.sum((torch.arccos(WWTN[M1]) - theta)**2) + \
            torch.sum((torch.arccos(WWTN[M2]) - 3.1416 + theta)**2)   

#         tloss = torch.sum((torch.arccos(WWT[M1]) - theta)**2) + \
#             torch.sum((torch.arccos(WWT[M2]) - 3.1416 + theta)**2)

#         print(torch.sum(M1), torch.sum(M2), tloss)

#     nloss = torch.sum((target_norm**2 - norm2)**2)
    return tloss



def ADK_update(weight, theta, lamd, lr, target_norm):
#     if isinstance(weight, tuple):
#         weight = weight[0]
#     m = weight.shape[0]
#     W = weight.view(m, -1)
    weight1 = weight.clone()
    
    tloss = ad_function_update(torch.t(weight1), theta, target_norm=0.1)
    delW = torch.autograd.grad(tloss, weight1, retain_graph=True)[0]

    with torch.no_grad():
        newW = weight1 - lr*lamd*(delW/torch.norm(weight1, dim=[1], keepdim=True)).view(weight1.shape)
        weight.copy_(newW)
    m = weight.shape[0]
    WWT = weight @ torch.t(weight)
    norm2 = torch.diagonal(WWT, 0)
    nloss = torch.sum((target_norm**2 - norm2)**2)
    return nloss, tloss

In [205]:
# ADK UPDATE double
epochs = 50
lamd = 0.1
lr = 0.01
target_norm = 0.1
theta = 1.
rtn = 0.01

model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
       
        W = torch.t(model.layer3.weight)

        WWT = W @ torch.t(W)
        norm2 = torch.diagonal(WWT, 0)
        nloss = torch.sum((target_norm**2 - norm2)**2)        
        
        loss = criterion(out, y) + lamd*nloss
#         loss = nl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        
        weight1 = W.clone()
        tloss = ad_function(weight1, theta, target_norm=0.1)
        delW = torch.autograd.grad(tloss, weight1)[0]
#         print('norm2', norm2)
#         print('1', W)
        with torch.no_grad():
            newW = weight1 - lr*lamd*rtn*(delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape)
            model.layer3.weight.copy_(torch.t(newW))
        
#         print('2', W)
#         print('delW', delW)
#         print('norm', torch.norm(weight1, dim=[1], keepdim=True))
#         print('delW/norm', (delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape))
        
        
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.39 0.04 1.57 : 0.92 0.19 1.57
mean norm:  0.5639896392822266 0.05581842362880707
epoch: 0, loss: 2.1267154216766357, accruacy: 0.2675
1.37 0.05 1.56 : 0.93 0.20 1.57
mean norm:  0.5580971837043762 0.05517022684216499
epoch: 1, loss: 2.0740468502044678, accruacy: 0.2921
1.36 0.04 1.56 : 0.93 0.04 1.57
mean norm:  0.5505606532096863 0.05435553193092346
epoch: 2, loss: 2.0093839168548584, accruacy: 0.3098
1.35 0.05 1.57 : 0.92 0.04 1.57
mean norm:  0.5419009327888489 0.05344051867723465
epoch: 3, loss: 1.9270597696304321, accruacy: 0.322
1.33 0.05 1.56 : 0.92 0.04 1.57
mean norm:  0.5329955220222473 0.052503008395433426
epoch: 4, loss: 1.8606401681900024, accruacy: 0.3315
1.32 0.05 1.56 : 0.93 0.04 1.57
mean norm:  0.5245689749717712 0.05160930007696152
epoch: 5, loss: 1.9425522089004517, accruacy: 0.337
1.30 0.04 1.54 : 0.93 0.04 1.57
mean norm:  0.5177648663520813 0.05087217688560486
epoch: 6, loss: 1.8826148509979248, accruacy: 0.345
1.30 0.05 1.52 : 0.92 0.04 1.57
mean norm:  0.5107

In [206]:
# ADK UPDATE double
epochs = 50
lamd = 0.1
lr = 0.01
target_norm = 0.1
theta = 1.
rtn = 0.1

model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
       
        W = torch.t(model.layer3.weight)

        WWT = W @ torch.t(W)
        norm2 = torch.diagonal(WWT, 0)
        nloss = torch.sum((target_norm**2 - norm2)**2)        
        
        loss = criterion(out, y) + lamd*nloss
#         loss = nl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        
        weight1 = W.clone()
        tloss = ad_function(weight1, theta, target_norm=0.1)
        delW = torch.autograd.grad(tloss, weight1)[0]
#         print('norm2', norm2)
#         print('1', W)
        with torch.no_grad():
            newW = weight1 - lr*lamd*rtn*(delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape)
            model.layer3.weight.copy_(torch.t(newW))
        
#         print('2', W)
#         print('delW', delW)
#         print('norm', torch.norm(weight1, dim=[1], keepdim=True))
#         print('delW/norm', (delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape))
        
        
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.23 0.04 1.55 : 0.92 0.04 1.57
mean norm:  0.38339099287986755 0.03793233633041382
epoch: 0, loss: 2.1696174144744873, accruacy: 0.2496
1.35 0.05 1.55 : 0.93 0.25 1.57
mean norm:  0.27672240138053894 0.027248242869973183
epoch: 1, loss: 2.1227924823760986, accruacy: 0.264
1.33 0.04 1.55 : 0.90 0.24 1.57
mean norm:  0.20879574120044708 0.02034074440598488
epoch: 2, loss: 2.1085901260375977, accruacy: 0.2821
1.32 0.05 1.56 : 0.88 0.25 1.57
mean norm:  0.16490352153778076 0.01575537584722042
epoch: 3, loss: 2.0746278762817383, accruacy: 0.2788
1.30 0.05 1.57 : 0.86 0.05 1.57
mean norm:  0.1363113820552826 0.01265737134963274
epoch: 4, loss: 2.067530632019043, accruacy: 0.2792
1.25 0.05 1.52 : 0.86 0.20 1.57
mean norm:  0.11836904287338257 0.010626466944813728
epoch: 5, loss: 2.1170642375946045, accruacy: 0.2773
1.23 0.04 1.55 : 0.83 0.16 1.57
mean norm:  0.10809934139251709 0.009459488093852997
epoch: 6, loss: 2.061394691467285, accruacy: 0.2857
1.19 0.04 1.51 : 0.80 0.06 1.57
mean norm:

In [207]:
# ADK UPDATE double
epochs = 50
lamd = 0.1
lr = 0.01
target_norm = 0.1
theta = 1.
rtn = 1

model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
       
        W = torch.t(model.layer3.weight)

        WWT = W @ torch.t(W)
        norm2 = torch.diagonal(WWT, 0)
        nloss = torch.sum((target_norm**2 - norm2)**2)        
        
        loss = criterion(out, y) + lamd*nloss
#         loss = nl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        
        weight1 = W.clone()
        tloss = ad_function(weight1, theta, target_norm=0.1)
        delW = torch.autograd.grad(tloss, weight1)[0]
#         print('norm2', norm2)
#         print('1', W)
        with torch.no_grad():
            newW = weight1 - lr*lamd*rtn*(delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape)
            model.layer3.weight.copy_(torch.t(newW))
        
#         print('2', W)
#         print('delW', delW)
#         print('norm', torch.norm(weight1, dim=[1], keepdim=True))
#         print('delW/norm', (delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape))
        
        
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.35 0.05 1.57 : 0.91 0.29 1.57
mean norm:  0.018144696950912476 0.001708401134237647
epoch: 0, loss: 2.3048758506774902, accruacy: 0.1842
1.36 1.26 1.57 : 0.93 0.19 1.57
mean norm:  0.006043476518243551 0.0005289650289341807
epoch: 1, loss: 2.309612274169922, accruacy: 0.177
1.34 1.18 1.51 : 0.92 0.34 1.57
mean norm:  0.005311189219355583 0.00046188506530597806
epoch: 2, loss: 2.309718370437622, accruacy: 0.18
1.34 1.14 1.53 : 0.91 0.33 1.57
mean norm:  0.0054674046114087105 0.0004710395587608218
epoch: 3, loss: 2.3085503578186035, accruacy: 0.1576
1.36 1.16 1.56 : 0.92 0.33 1.57
mean norm:  0.005427140276879072 0.00045919031254015863
epoch: 4, loss: 2.308847665786743, accruacy: 0.1825
1.27 0.05 1.56 : 0.92 0.16 1.57
mean norm:  0.005384047981351614 0.0004577586951199919
epoch: 5, loss: 2.3108737468719482, accruacy: 0.1856
1.24 0.05 1.53 : 0.93 0.28 1.57
mean norm:  0.005299943964928389 0.0004467568069230765
epoch: 6, loss: 2.310016632080078, accruacy: 0.1374
1.32 1.11 1.53 : 0.92 0.3

In [208]:
# ADK UPDATE double
epochs = 50
lamd = 0.01
lr = 0.01
target_norm = 0.1
theta = 1.
rtn = 1

model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
       
        W = torch.t(model.layer3.weight)

        WWT = W @ torch.t(W)
        norm2 = torch.diagonal(WWT, 0)
        nloss = torch.sum((target_norm**2 - norm2)**2)        
        
        loss = criterion(out, y) + lamd*nloss
#         loss = nl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        
        weight1 = W.clone()
        tloss = ad_function(weight1, theta, target_norm=0.1)
        delW = torch.autograd.grad(tloss, weight1)[0]
#         print('norm2', norm2)
#         print('1', W)
        with torch.no_grad():
            newW = weight1 - lr*lamd*rtn*(delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape)
            model.layer3.weight.copy_(torch.t(newW))
        
#         print('2', W)
#         print('delW', delW)
#         print('norm', torch.norm(weight1, dim=[1], keepdim=True))
#         print('delW/norm', (delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape))
        
        
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.54 1.50 1.57 : 0.91 0.04 1.57
mean norm:  0.3846983015537262 0.0380522757768631
epoch: 0, loss: 2.168888807296753, accruacy: 0.2398
1.36 0.05 1.54 : 0.92 0.04 1.57
mean norm:  0.27690139412879944 0.027287045493721962
epoch: 1, loss: 2.1098759174346924, accruacy: 0.2542
1.33 0.05 1.54 : 0.93 0.30 1.57
mean norm:  0.20760364830493927 0.02031354233622551
epoch: 2, loss: 2.10550856590271, accruacy: 0.2798
1.32 0.04 1.56 : 0.90 0.22 1.57
mean norm:  0.1627831608057022 0.015678657218813896
epoch: 3, loss: 2.053070306777954, accruacy: 0.2722
1.29 0.05 1.55 : 0.85 0.15 1.57
mean norm:  0.1341269612312317 0.012579708360135555
epoch: 4, loss: 2.039233922958374, accruacy: 0.2715
1.25 0.05 1.55 : 0.83 0.05 1.57
mean norm:  0.11604940891265869 0.01051049679517746
epoch: 5, loss: 2.045306921005249, accruacy: 0.2719
1.21 0.05 1.54 : 0.82 0.05 1.57
mean norm:  0.1057608500123024 0.009276111610233784
epoch: 6, loss: 2.0622289180755615, accruacy: 0.2731
1.18 0.04 1.49 : 0.82 0.14 1.57
mean norm:  0.09

In [210]:
# ADK UPDATE double
epochs = 50
lamd = 0.01
lr = 0.01
target_norm = 0.1
theta = 1.57
rtn = 1

model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-4)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
       
        W = torch.t(model.layer3.weight)

        WWT = W @ torch.t(W)
        norm2 = torch.diagonal(WWT, 0)
        nloss = torch.sum((target_norm**2 - norm2)**2)        
        
        loss = criterion(out, y) + lamd*nloss
#         loss = nl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        
        weight1 = W.clone()
        tloss = ad_function(weight1, theta, target_norm=0.1)
        delW = torch.autograd.grad(tloss, weight1)[0]
#         print('norm2', norm2)
#         print('1', W)
        with torch.no_grad():
            newW = weight1 - lr*lamd*rtn*(delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape)
            model.layer3.weight.copy_(torch.t(newW))
        
#         print('2', W)
#         print('delW', delW)
#         print('norm', torch.norm(weight1, dim=[1], keepdim=True))
#         print('delW/norm', (delW*torch.norm(weight1, dim=[1], keepdim=True)**2).view(weight1.shape))
        
        
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.22 0.04 1.54 : 0.89 0.26 1.57
mean norm:  0.009849299676716328 0.0009317856165580451
epoch: 0, loss: 2.2972638607025146, accruacy: 0.1811
1.16 0.04 1.55 : 0.90 0.26 1.57
mean norm:  0.007223817054182291 0.0006416146643459797
epoch: 1, loss: 2.299117088317871, accruacy: 0.1865
1.28 0.98 1.55 : 0.90 0.22 1.57
mean norm:  0.007736501283943653 0.0006800131523050368
epoch: 2, loss: 2.2951574325561523, accruacy: 0.1601
1.11 0.05 1.53 : 0.89 0.20 1.57
mean norm:  0.007617245893925428 0.0006699476507492363
epoch: 3, loss: 2.296225070953369, accruacy: 0.217
1.24 0.87 1.54 : 0.87 0.17 1.57
mean norm:  0.008078524842858315 0.0007082631927914917
epoch: 4, loss: 2.2950332164764404, accruacy: 0.2241
1.16 0.90 1.39 : 0.88 0.19 1.57
mean norm:  0.008235028944909573 0.0007216797093860805
epoch: 5, loss: 2.2955429553985596, accruacy: 0.2412
1.12 0.05 1.54 : 0.88 0.21 1.57
mean norm:  0.008314087986946106 0.0007314324611797929
epoch: 6, loss: 2.293825149536133, accruacy: 0.222
1.09 0.05 1.49 : 0.87 0.1

In [None]:
# ADK UPDATE double
lamd = 1
lr = 0.01
target_norm = 0.1
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)
for e in range(epochs):
    for x, y in train_loader:
        print('######')
        x = x.cuda()
        y = y.cuda()
        out = model(x)
       
        W = model.layer3.weight
        WWT = W @ torch.t(W)
        norm2 = torch.diagonal(WWT, 0)
        nloss = torch.sum((target_norm**2 - norm2)**2)        
        
        loss = criterion(out, y) + lamd*nl 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        
        weight1 = W.clone()
        tloss = ad_function(torch.t(weight1), theta, target_norm=0.1)
        delW = torch.autograd.grad(tloss, weight1)[0]

        with torch.no_grad():
            newW = weight1 - lr*lamd*(delW/torch.norm(weight1, dim=[1], keepdim=True)).view(weight1.shape)
            W.copy_(newW)      
        
        
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

In [212]:
# ADK double
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        nl, tl = ADK(model.layer3.weight, 1.0, double=True)
        loss = criterion(out, y) + 0.1*nl + 0.*tl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.52 1.49 1.57 : 0.88 0.04 1.57
mean norm:  0.587649941444397 0.05807867646217346
epoch: 0, loss: 2.1342148780822754, accruacy: 0.2645
1.37 0.04 1.57 : 0.88 0.04 1.57
mean norm:  0.6067554354667664 0.05987248569726944
epoch: 1, loss: 1.9973318576812744, accruacy: 0.2867
1.35 0.04 1.57 : 0.89 0.04 1.57
mean norm:  0.6234175562858582 0.06141820177435875
epoch: 2, loss: 2.0336532592773438, accruacy: 0.3089
1.33 0.05 1.55 : 0.89 0.04 1.57
mean norm:  0.6380568146705627 0.06277431547641754
epoch: 3, loss: 1.9372048377990723, accruacy: 0.3159
1.32 0.05 1.56 : 0.89 0.04 1.57
mean norm:  0.6519581079483032 0.06406112760305405
epoch: 4, loss: 1.8804259300231934, accruacy: 0.3271


KeyboardInterrupt: 

In [142]:
# ADK double
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        nl, tl = ADK(model.layer3.weight, 1.0, double=True)
        loss = criterion(out, y) + 1*nl + 1*tl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

nan nan nan : nan nan nan
mean norm:  nan nan
epoch: 0, loss: nan, accruacy: 0.1
nan nan nan : nan nan nan
mean norm:  nan nan
epoch: 1, loss: nan, accruacy: 0.1
nan nan nan : nan nan nan
mean norm:  nan nan
epoch: 2, loss: nan, accruacy: 0.1
nan nan nan : nan nan nan
mean norm:  nan nan
epoch: 3, loss: nan, accruacy: 0.1
nan nan nan : nan nan nan
mean norm:  nan nan
epoch: 4, loss: nan, accruacy: 0.1


In [9]:
for i in range(1, 20):
    u = i*0.1
    print('###########  ', u, '  ###########')
    # ADK double
    model = TestModel().cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    for e in range(epochs):
        for x, y in train_loader:
            x = x.cuda()
            y = y.cuda()
            out = model(x)
            nl, tl = ADK(model.layer3.weight, u, double=True)
            loss = criterion(out, y) + 0.1*nl + 0.1*tl
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        W = model.layer3.weight
        Mean, Min, Max = angle_analysis(W)
        TMean, TMin, TMax = angle_analysis(torch.t(W))
        print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
        test_num = 0
        hit_num = 0
        for x, y in val_loader:
            x = x.cuda()
            y = y.cuda()
            test_num += len(y)
            out = model(x)
            pred = out.argmax(dim=1, keepdim=True)
            hit_num += pred.eq(y.view_as(pred)).sum().item()
        print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

#######################
###########   0.1   ###########
1.50 0.50
epoch: 0, loss: 2.1134190559387207, accruacy: 0.2617
1.46 0.50
epoch: 1, loss: 2.032808542251587, accruacy: 0.2879
1.43 0.50
epoch: 2, loss: 2.004826784133911, accruacy: 0.3085
1.41 0.50
epoch: 3, loss: 1.9810376167297363, accruacy: 0.3206
1.39 0.50
epoch: 4, loss: 1.9487046003341675, accruacy: 0.3276
1.38 0.50
epoch: 5, loss: 1.7997831106185913, accruacy: 0.3351
1.36 0.50
epoch: 6, loss: 1.8502875566482544, accruacy: 0.3411
1.35 0.50
epoch: 7, loss: 1.8180392980575562, accruacy: 0.3463
1.34 0.50
epoch: 8, loss: 1.8315154314041138, accruacy: 0.3521
1.34 0.50
epoch: 9, loss: 1.7746917009353638, accruacy: 0.3551
#######################
###########   0.2   ###########
1.51 0.50
epoch: 0, loss: 2.138826370239258, accruacy: 0.2582
1.47 0.50
epoch: 1, loss: 2.7714345455169678, accruacy: 0.2875
1.45 0.50
epoch: 2, loss: 2.692617654800415, accruacy: 0.3083
1.41 0.50
epoch: 3, loss: 2.5747790336608887, accruacy: 0.3204
1.34 0.50


0.35 0.07
epoch: 4, loss: 4.947579860687256, accruacy: 0.3229
0.33 0.07
epoch: 5, loss: 5.0513834953308105, accruacy: 0.2616
0.31 0.06
epoch: 6, loss: 4.8058390617370605, accruacy: 0.3462
0.30 0.06
epoch: 7, loss: 4.797784805297852, accruacy: 0.3461
0.28 0.06
epoch: 8, loss: 4.810888767242432, accruacy: 0.3527
0.27 0.06
epoch: 9, loss: 4.7766852378845215, accruacy: 0.3571
#######################
###########   1.5   ###########
0.78 0.20
epoch: 0, loss: 4.666886806488037, accruacy: 0.2857
0.67 0.14
epoch: 1, loss: 2.9216794967651367, accruacy: 0.3195
0.61 0.12
epoch: 2, loss: 2.6076865196228027, accruacy: 0.3225
0.57 0.11
epoch: 3, loss: 2.551128387451172, accruacy: 0.3415
0.54 0.10
epoch: 4, loss: 2.5298352241516113, accruacy: 0.33
0.52 0.10
epoch: 5, loss: 2.441100835800171, accruacy: 0.3544
0.50 0.09
epoch: 6, loss: 2.3321566581726074, accruacy: 0.3618
0.49 0.09
epoch: 7, loss: 2.44405460357666, accruacy: 0.3612
0.47 0.09
epoch: 8, loss: 2.3308355808258057, accruacy: 0.3746
0.46 0.08

KeyboardInterrupt: 

In [55]:
# base, weight_decay
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.0001)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.53 1.49 1.56 : 0.90 0.04 1.57
mean norm:  0.589784562587738 0.05836125463247299
epoch: 0, loss: 2.1580748558044434, accruacy: 0.2593
1.37 0.05 1.57 : 0.90 0.04 1.57
mean norm:  0.6068843007087708 0.05998790264129639
epoch: 1, loss: 2.070744514465332, accruacy: 0.2941
1.36 0.04 1.57 : 0.90 0.04 1.57
mean norm:  0.6219269037246704 0.06139586865901947
epoch: 2, loss: 1.9798004627227783, accruacy: 0.3113
1.35 0.04 1.56 : 0.90 0.04 1.57
mean norm:  0.6346355676651001 0.06257417052984238
epoch: 3, loss: 1.9303721189498901, accruacy: 0.3204
1.34 0.05 1.55 : 0.89 0.04 1.57
mean norm:  0.6468605399131775 0.06370298564434052
epoch: 4, loss: 1.8194307088851929, accruacy: 0.3311


In [57]:
# base
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.53 1.48 1.57 : 0.90 0.04 1.57
mean norm:  0.5919156074523926 0.058548636734485626
epoch: 0, loss: 2.119493246078491, accruacy: 0.2649
1.36 0.05 1.55 : 0.91 0.04 1.57
mean norm:  0.6089869737625122 0.0601668506860733
epoch: 1, loss: 2.0129971504211426, accruacy: 0.2912
1.33 0.05 1.53 : 0.92 0.04 1.57
mean norm:  0.6243717670440674 0.06160520017147064
epoch: 2, loss: 1.9601598978042603, accruacy: 0.3056
1.33 0.04 1.57 : 0.91 0.04 1.57
mean norm:  0.6373639106750488 0.06281175464391708
epoch: 3, loss: 1.9458551406860352, accruacy: 0.3209
1.32 0.04 1.54 : 0.91 0.04 1.57
mean norm:  0.6497540473937988 0.06395592540502548
epoch: 4, loss: 1.9300075769424438, accruacy: 0.331


In [56]:
# ADK not double
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        nl, tl = ADK(model.layer3.weight, l, double=False)
        loss = criterion(out, y) + 0.1*nl + 0.01*tl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.52 1.46 1.56 : 0.90 0.04 1.57
mean norm:  0.7327307462692261 0.07248667627573013
epoch: 0, loss: 2.2849724292755127, accruacy: 0.2619
1.52 1.44 1.57 : 0.89 0.04 1.57
mean norm:  0.8564189076423645 0.08467362821102142
epoch: 1, loss: 2.0351409912109375, accruacy: 0.303
1.52 1.43 1.57 : 0.90 0.04 1.57
mean norm:  0.9326834678649902 0.09217473864555359
epoch: 2, loss: 1.9080106019973755, accruacy: 0.322
1.52 1.44 1.57 : 0.90 0.04 1.57
mean norm:  0.972987949848175 0.09613069891929626
epoch: 3, loss: 1.9408897161483765, accruacy: 0.3352
1.52 1.45 1.57 : 0.90 0.04 1.57
mean norm:  0.992944061756134 0.0980745330452919
epoch: 4, loss: 1.8335623741149902, accruacy: 0.3461


In [58]:
# ADK not double
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        nl, tl = ADK(model.layer3.weight, l, double=False)
        loss = criterion(out, y) + 0.1*nl + 0.1*tl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.39 0.05 1.55 : 0.93 0.04 1.57
mean norm:  0.7329638600349426 0.0724828839302063
epoch: 0, loss: 2.3123581409454346, accruacy: 0.2677
1.54 1.52 1.56 : 0.92 0.04 1.57
mean norm:  0.8568193316459656 0.08468151092529297
epoch: 1, loss: 2.0182077884674072, accruacy: 0.3021
1.40 0.05 1.57 : 0.92 0.04 1.57
mean norm:  0.9329319000244141 0.09217444062232971
epoch: 2, loss: 1.901869773864746, accruacy: 0.3184
1.25 0.04 1.57 : 0.92 0.04 1.57
mean norm:  0.9734387397766113 0.09615301340818405
epoch: 3, loss: 1.9324917793273926, accruacy: 0.3376
1.26 0.04 1.57 : 0.92 0.04 1.57
mean norm:  0.9937114119529724 0.09812876582145691
epoch: 4, loss: 1.8722680807113647, accruacy: 0.3479


In [59]:
# SO
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        loss = criterion(out, y) + 0.1*SO(model.layer3.weight)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.39 0.04 1.57 : 0.90 0.04 1.57
mean norm:  0.7290999889373779 0.07216115295886993
epoch: 0, loss: 2.3027081883573137, accruacy: 0.2695
1.55 1.52 1.57 : 0.91 0.04 1.57
mean norm:  0.8540456891059875 0.08449258655309677
epoch: 1, loss: 2.039130000210295, accruacy: 0.3043
1.55 1.54 1.57 : 0.91 0.04 1.57
mean norm:  0.931410014629364 0.09212047606706619
epoch: 2, loss: 1.9808444433528196, accruacy: 0.3276
1.56 1.55 1.57 : 0.91 0.04 1.57
mean norm:  0.9727274775505066 0.09618840366601944
epoch: 3, loss: 1.9010257734667382, accruacy: 0.3421
1.56 1.55 1.57 : 0.91 0.04 1.57
mean norm:  0.9931061863899231 0.09818591922521591
epoch: 4, loss: 1.975194415642149, accruacy: 0.3525


In [60]:
# TSO
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        loss = criterion(out, y) + 0.1*SO(torch.t(model.layer3.weight))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.39 0.05 1.57 : 0.90 0.04 1.57
mean norm:  0.7337130308151245 0.07255121320486069
epoch: 0, loss: 101.31858130709698, accruacy: 0.2651
1.39 0.04 1.57 : 0.90 0.24 1.57
mean norm:  0.856711208820343 0.08468716591596603
epoch: 1, loss: 101.06299517101633, accruacy: 0.2997
1.40 0.04 1.57 : 0.90 0.24 1.57
mean norm:  0.9327279925346375 0.0921822190284729
epoch: 2, loss: 100.8942430170697, accruacy: 0.3207
1.40 0.04 1.57 : 0.91 0.24 1.57
mean norm:  0.9732166528701782 0.09616829454898834
epoch: 3, loss: 100.87449556708997, accruacy: 0.3375
1.41 0.04 1.57 : 0.91 0.24 1.57
mean norm:  0.9934465289115906 0.09814741462469101
epoch: 4, loss: 100.82878269491732, accruacy: 0.3434


In [61]:
# DSO
model = TestModel().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
for e in range(epochs):
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        out = model(x)
        loss = criterion(out, y) + 0.1*DSO(model.layer3.weight)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    W = model.layer3.weight
    Mean, Min, Max = angle_analysis(W)
    TMean, TMin, TMax = angle_analysis(torch.t(W))
    print(f'{Mean.item():3.2f} {Min.item():3.2f} {Max.item():3.2f} : {TMean.item():3.2f} {TMin.item():3.2f} {TMax.item():3.2f}')
    print('mean norm: ', torch.mean(torch.norm(W, dim=1)).item(), torch.mean(torch.norm(W, dim=0)).item())
    test_num = 0
    hit_num = 0
    for x, y in val_loader:
        x = x.cuda()
        y = y.cuda()
        test_num += len(y)
        out = model(x)
        pred = out.argmax(dim=1, keepdim=True)
        hit_num += pred.eq(y.view_as(pred)).sum().item()
    print(f'epoch: {e}, loss: {loss}, accruacy: {hit_num/test_num}')

1.40 0.05 1.56 : 0.91 0.04 1.57
mean norm:  0.8451782464981079 0.08361684530973434
epoch: 0, loss: 101.2794976826328, accruacy: 0.2709
1.41 0.04 1.57 : 0.91 0.04 1.57
mean norm:  0.9665595889091492 0.0956035703420639
epoch: 1, loss: 100.95623395687777, accruacy: 0.3169
1.26 0.05 1.57 : 0.91 0.04 1.57
mean norm:  0.9974905848503113 0.09864223003387451
epoch: 2, loss: 100.88144357974193, accruacy: 0.3349
1.26 0.04 1.57 : 0.91 0.04 1.57
mean norm:  1.003974437713623 0.09926684945821762
epoch: 3, loss: 100.83553403829364, accruacy: 0.3426
1.57 1.56 1.57 : 0.91 0.04 1.57
mean norm:  1.0052741765975952 0.09937497228384018
epoch: 4, loss: 100.82762528798462, accruacy: 0.3531
