# Суперсеть с Softmax

In [None]:
import torch as t
import torchvision
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pylab as plt
from torch.nn.utils import clip_grad_value_
%matplotlib inline


In [None]:
# ---- Experiment configuration ----
input_dim = 32 * 32 * 1  # length of one flattened input vector (32 * 32 * 1)
class_num = 10
n_epochs = 20
fine_tune_epochs = 10
batch_size = 256
random_seed = 42
# Validation is not used. We keep the split for the purity of the experiment.
valid_size = 0.1
trials = 10
# Candidate layer widths; '1' encodes the identity mapping.
search_space = [1, 16, 32, 64, 256, 512, 1024]

In [None]:
# One preprocessing pipeline shared by the train and test datasets:
# convert to tensor, normalise each of the three channels with mean 0.5 / std 0.5,
# then average over the channel axis and flatten to a 1-D vector.
preprocess = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    torchvision.transforms.Lambda(lambda img: img.mean(0).view(-1)),
])

train_data = torchvision.datasets.CIFAR10(
    './files/', train=True, download=True, transform=preprocess)
test_data = torchvision.datasets.CIFAR10(
    './files/', train=False, download=True, transform=preprocess)

# The first `split` indices of the training set form the validation subset,
# the remaining indices are used for training.
num_train = len(train_data)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))

valid_idx = indices[:split]
train_idx = indices[split:]
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader = t.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
    num_workers=0,
    pin_memory=True,
)
test_loader = t.utils.data.DataLoader(test_data, batch_size=batch_size)
valid_loader = t.utils.data.DataLoader(
    train_data, batch_size=batch_size, sampler=valid_sampler)


In [None]:
class MixedLayer(nn.Module):
    """Softmax-weighted mixture of candidate layers of different widths.

    All non-identity candidates share one ``nn.Linear(in_, max(dims))``; the
    candidate of width ``d`` uses the first ``d`` output units of that layer.
    A width of ``1`` denotes the identity mapping.

    Args:
        in_: size of the last dimension of the input.
        dims: list of candidate widths (``1`` means identity). ``forward``
            writes candidate ``d`` into ``result[:, :d]``, so every width must
            not exceed the input width.
        act: activation applied to the shared linear layer's output.
        device: device on which the weights, structural parameters and noise
            vectors are created. Defaults to ``'cuda'``.

    Attributes:
        s: structural parameters, one logit per candidate (initialised to 1).
        noises: one fixed random vector of length ``in_`` per candidate. These
            are plain tensors (not parameters or buffers), so ``Module.to()``
            and ``state_dict()`` do not touch them.
    """

    def __init__(self, in_, dims, act=F.tanh, device='cuda'):
        nn.Module.__init__(self)
        self.dims = dims
        self.layer = nn.Linear(in_, max(dims)).to(device)
        self.act = act
        # The zero-scaled randn term leaves the values at exactly 1 but is kept
        # so that the random-number stream is consumed as before.
        self.s = nn.Parameter(
            t.ones(len(dims), device=device)
            + t.randn(len(dims), device=device) * .0)
        # We add noise to the layers to reduce possible co-adaptation of one
        # layer to another. Otherwise it becomes more likely that the simplest
        # submodel is selected first and the others are "built on top" of it
        # with small values of the structural parameters.
        self.noises = [t.randn(in_).to(device) for _ in range(len(dims))]

    def forward(self, x, temp):
        """Return the mixture of all candidates, weighted by ``softmax(s / temp)``.

        Args:
            x: 2-D input; the output has the same shape as ``x``.
            temp: softmax temperature dividing the structural logits.
        """
        # `s` is 1-D, so dim=0 is what the implicit-dim call resolved to.
        gamma = F.softmax(self.s / temp, dim=0)
        var_result = self.act(self.layer(x))

        # Allocate directly on x's device instead of on the CPU and copying.
        result = t.zeros_like(x)
        for i, d in enumerate(self.dims):
            if d == 1:
                # identity candidate
                result = result + (x + self.noises[i]) * gamma[i]
            else:
                # candidate of width d only contributes to the first d columns
                result[:, :d] = result[:, :d] + (var_result[:, :d] + self.noises[i][:d]) * gamma[i]
        return result
  
        
    
class SuperNet(nn.Module):
    """A stack of ``layer_num`` MixedLayer blocks followed by a linear head.

    Every MixedLayer maps ``input_dim`` features to ``input_dim`` features and
    searches over the widths in ``dims``; the final ``nn.Linear`` produces 10
    outputs.
    """

    def __init__(self, dims, layer_num):
        super().__init__()
        blocks = [MixedLayer(input_dim, dims) for _ in range(layer_num)]
        head = nn.Linear(input_dim, 10).cuda()
        self.model = nn.Sequential(*blocks, head)

    def forward(self, x, temp):
        """Run ``x`` through all layers; ``temp`` is passed only to MixedLayer modules."""
        for module in self.model:
            x = module(x, temp) if isinstance(module, MixedLayer) else module(x)
        return x
    
     

In [None]:
def test_acc(model, loader, temp, func = lambda x:x):
    tp = 0
    cases = 0
    model.eval()
    for x,y in loader: 
            x = func(x)
            x = x.cuda()
            y = y.cuda()
            if temp is None:
                out = model(x).argmax(1)
            else:
                out = model(x, temp).argmax(1)
            tp+=(out==y).sum()
            cases+=len(y)
    model.train()
    return  tp.cpu().numpy()*1.0/cases

In [None]:
import pickle
# Train `trials` independent supernets and pickle each one to
# 'naive_supernet<trial>.pckl'.
t.manual_seed(random_seed)
step = 0  # global batch counter across all trials; only used to throttle logging
for trial in range(trials):
    net = SuperNet(search_space, 4)
    # Structural parameters (one logit vector per MixedLayer) get their own
    # optimizer with a larger learning rate than the ordinary weights.
    struct = [m.s for m in net.model if isinstance(m, MixedLayer)]
    # Compare by identity: tensors must not be compared with `==` here.
    weights = [p for p in net.parameters() if not any(p is s for s in struct)]
    optimizer1 = optim.Adam(weights, lr=0.001)
    optimizer2 = optim.Adam(struct, lr=0.01)
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(n_epochs):
        for x, y in train_loader:
            step += 1

            x = x.cuda()
            y = y.cuda()
            optimizer1.zero_grad()
            optimizer2.zero_grad()
            out = net(x, 1.0)
            # Batch loss multiplied by the number of training samples.
            loss = loss_fn(out, y) * len(train_idx) * 1.0
            if step % 100 == 0:
                print(net.model[0].s)
                print(loss.data)
                print('\n')

            loss.backward()
            clip_grad_value_(net.parameters(), 1.0)
            optimizer1.step()
            optimizer2.step()

        # Training runs at temperature 1.0; validation accuracy is reported at 0.2.
        acc = test_acc(net, valid_loader, 0.2)
        print('Trial {0}. Epoch {1}. Acc: {2}'.format(trial, epoch, acc))
    with open('naive_supernet{0}.pckl'.format(trial), 'wb') as out_file:
        pickle.dump(net, out_file)
    del net
    t.cuda.empty_cache()


In [None]:
class AddLayer(nn.Module):
    """Adds a fixed, non-trainable offset to its input.

    Args:
        bias: value added to every input. A tensor is registered as a buffer
            so that it follows ``.to()`` / ``.cuda()`` / dtype casts and is
            part of ``state_dict()``; it is never returned by ``parameters()``.
            Any other value is stored as a plain attribute.
    """

    def __init__(self, bias):
        nn.Module.__init__(self)
        if isinstance(bias, t.Tensor):
            self.register_buffer('bias', bias)
        else:
            self.bias = bias

    def forward(self, x):
        """Return ``x + bias``."""
        return x + self.bias
    
def supernet_structure_to_net(model):
    """Extract the sub-network selected by the structural parameters of a supernet.

    For every MixedLayer in ``model.model[:-1]`` the candidate with the largest
    structural logit ``s`` is chosen. A non-identity candidate of width ``d``
    becomes ``Linear(in_dim, d) -> Tanh -> AddLayer(noise[:d])``, with weights
    copied from the top-left ``d x in_dim`` corner of the shared linear layer.
    Identity candidates (width 1) add no layer. The final linear layer of the
    supernet is copied with its input truncated to the current width.

    Args:
        model: object whose ``model`` attribute is a sequence of MixedLayer
            modules followed by one ``nn.Linear``.

    Returns:
        ``nn.Sequential`` holding the extracted layers, on the same device as
        the layers they were copied from.
    """
    var_layers = []
    in_dim = input_dim
    for mixed_layer in model.model[:-1]:
        if not isinstance(mixed_layer, MixedLayer):
            continue
        best_layer_id = mixed_layer.s.argmax()
        width = mixed_layer.dims[int(best_layer_id)]
        if width == 1:
            # NOTE(review): MixedLayer.forward adds noises[i] for the identity
            # candidate as well, but no AddLayer is emitted here - confirm
            # that dropping it is intended.
            continue
        print(best_layer_id)
        source = mixed_layer.layer
        new_layer = nn.Linear(in_dim, width).to(source.weight.device)
        with t.no_grad():
            new_layer.weight.copy_(source.weight[:width, :in_dim])
            new_layer.bias.copy_(source.bias[:width])
        var_layers.append(new_layer)
        # Assumes the MixedLayer activation is tanh (its default).
        var_layers.append(nn.Tanh())
        var_layers.append(AddLayer(mixed_layer.noises[best_layer_id][:width]))

        in_dim = width

    sublayer = model.model[-1]
    new_layer = nn.Linear(in_dim, sublayer.out_features).to(sublayer.weight.device)
    with t.no_grad():
        new_layer.weight.copy_(sublayer.weight[:, :in_dim])
        # The bias has one entry per output unit, so it is copied whole.
        new_layer.bias.copy_(sublayer.bias)

    var_layers.append(new_layer)
    return nn.Sequential(*var_layers)
#subnet = supernet_structure_to_net(net)

## Дообучение

In [None]:
import pickle
# Fine-tune the sub-network extracted from every pickled supernet.
tuned_nets = []
for trial in range(trials):
    # NOTE: pickle.load can execute arbitrary code; only load the
    # 'naive_supernet<trial>.pckl' files written by this notebook.
    with open('naive_supernet{0}.pckl'.format(trial), 'rb') as inp:
        net = pickle.load(inp)
    subnet = supernet_structure_to_net(net)
    optimizer1 = optim.Adam(subnet.parameters(), lr=0.001)  # for the parameters
    loss_fn = nn.CrossEntropyLoss()
    step = 0  # batch counter within this trial; only used to throttle logging
    for epoch in range(fine_tune_epochs):
        for x, y in train_loader:
            step += 1

            x = x.cuda()
            y = y.cuda()
            optimizer1.zero_grad()
            out = subnet(x)
            # Batch loss multiplied by the number of training samples.
            loss = loss_fn(out, y) * len(train_idx) * 1.0
            if step % 100 == 0:
                print(loss.data)
                print('\n')

            loss.backward()
            # Clip the gradients of the network that is actually being
            # optimised (the subnet), not those of the source supernet.
            clip_grad_value_(subnet.parameters(), 1.0)
            optimizer1.step()

        acc = test_acc(subnet, valid_loader, None)
        print('Trial {0}. Epoch {1}. Acc: {2}'.format(trial, epoch, acc))
    tuned_nets.append(subnet)


In [None]:
# Persist the list of fine-tuned sub-networks as a single pickle file.
with open('./naive_supernet_tuned.pckl', 'wb') as tuned_file:
    pickle.dump(tuned_nets, tuned_file)

In [None]:
# Collects the statistics computed by the following cells, keyed by metric name.
stats = {}

In [None]:
# Total number of scalar parameters in each fine-tuned sub-network.
# numel() counts the elements of a parameter of any rank.
pn = [sum(p.numel() for p in subnet.parameters()) for subnet in tuned_nets]

stats['param number'] = pn
stats['param number']

In [None]:
def get_superposition_number():
    """Count, for each net in the global ``tuned_nets``, the submodules that own parameters.

    Returns:
        List with one count per net, in the order of ``tuned_nets``.
    """
    counts = []
    for candidate in tuned_nets:
        parametrised = [layer for layer in candidate if list(layer.parameters())]
        counts.append(len(parametrised))
    return counts


stats['superposition number'] = get_superposition_number()
stats['superposition number']

In [None]:
# Accuracy of every fine-tuned net on the test set while Gaussian noise of
# increasing scale is added to the inputs.
t.manual_seed(random_seed)
X, Y, Y_std, accs = [], [], [], []
for noise in np.linspace(0, 1.0, 10):
    X.append(noise)

    def perturb(batch):
        # Reads the current loop value of `noise` at call time.
        return batch + t.randn(batch.size()) * noise

    acc = [test_acc(subnet, test_loader, None, func=perturb) for subnet in tuned_nets]
    print(acc)
    Y.append(np.mean(acc))
    Y_std.append(np.std(acc))
    accs.append(acc)


In [None]:
# Store the input-noise sweep: [noise levels, mean accuracy, accuracy std, per-net accuracies].
stats['noise'] = [X,Y,Y_std, accs]

In [None]:
# Accuracy of every fine-tuned net on the test set while the inputs are
# blurred with averaging kernels of increasing size.
t.manual_seed(random_seed)
X = []
Y = []
Y_std = []
accs = []


def make_blur(size):
    """Return a function that box-blurs flattened 32x32 inputs with a size x size kernel.

    The unpadded convolution shrinks the image; it is resized back to 32x32
    with the default (nearest) interpolation and flattened again.
    """
    kernel = t.ones(1, 1, size, size) * 1.0 / size / size

    def blur(x):
        x_ = x.view(-1, 1, 32, 32)
        x_ = F.conv2d(x_, kernel, stride=1)
        # F.interpolate is the non-deprecated name for F.upsample.
        return F.interpolate(x_, size=(32, 32)).view(-1, 32 * 32)

    return blur


for noise in range(1, 10):
    blur = make_blur(noise)

    X.append(noise)
    acc = []
    for subnet in tuned_nets:
        acc.append(test_acc(subnet, test_loader, None, func=blur))
    print(acc)
    Y.append(np.mean(acc))
    Y_std.append(np.std(acc))
    accs.append(acc)
stats['blur'] = [X, Y, Y_std, accs]

In [None]:
# Accuracy of every fine-tuned net on the test set while Gaussian noise of
# increasing scale is added to its parameters (fresh noise for every batch).
t.manual_seed(random_seed)
X = []
Y = []
accs = []
Y_std = []
for noise in np.linspace(0, 0.1, 10):
    X.append(noise)
    acc = []
    for subnet in tuned_nets:
        m = subnet

        params = list(m.parameters())
        old_params = [p.data.clone() for p in params]

        tp = 0
        try:
            # Pure evaluation: no autograd graph is needed.
            with t.no_grad():
                for x, y in test_loader:
                    # Each perturbation starts from the saved clean weights.
                    for p, o in zip(params, old_params):
                        n = t.randn(p.data.shape) * noise
                        n = n.cuda()
                        p.data = o + n
                    x = x.cuda()
                    y = y.cuda()
                    out = m(x).argmax(1)
                    tp += (out == y).sum()
        finally:
            # Always restore the clean weights, even if a batch fails.
            for p, o in zip(params, old_params):
                p.data = o
        acc.append(tp.cpu().numpy() * 1.0 / len(test_data))
    print(acc)
    accs.append(acc)
    Y.append(np.mean(acc))
    Y_std.append(np.std(acc))
stats['params'] = [X, Y, Y_std, accs]

In [None]:
# Persist the collected statistics dictionary as a pickle file.
with open('./naive_supernet_stats.pckl', 'wb') as stats_file:
    pickle.dump(stats, stats_file)