In [1]:
!nvidia-smi

Mon Jul 20 16:26:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:3B:00.0 Off |                  N/A |
| 50%   56C    P2    52W / 250W |   2728MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:5E:00.0 Off |                  N/A |
| 32%   48C    P8    18W / 250W |     11MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:86:00.0 Off |                  N/A |
| 45%   

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms

In [3]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=128,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [4]:
torch.Tensor(8)

tensor([2.8925e-36, 0.0000e+00, 2.8964e-36, 0.0000e+00, 2.8925e-36, 0.0000e+00,
        2.8925e-36, 0.0000e+00])

In [5]:
from torch.nn.parallel.data_parallel import DataParallel
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import os

os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend='nccl',rank=0,world_size=1)

In [6]:
from proxyless import Proxyless

net = Proxyless(C=8,num_classes=10,blocks=[16,32,64],num_layers=4)



In [7]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# CUDA 기기가 존재한다면, 아래 코드가 CUDA 장치를 출력합니다:

print(device)

cuda:1


In [8]:
net.to(device)
if torch.cuda.device_count() > 1:
    net = DDP(net,device_ids = [1],find_unused_parameters=True)
    pass

In [9]:
# [x if 'arch' in x[0] else  for x in net.named_parameters()]
arch_params = []
weight_params = []
for x in net.named_parameters():
    if 'alpha' in x[0]:
        arch_params.append(x[1])
    else:
        weight_params.append(x[1])


In [10]:
import torch.optim as optim
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
optimizer_weight = optim.SGD(weight_params, lr=0.1, momentum=0.9)
optimizer_arch = optim.Adam(arch_params, lr=0.001)

In [None]:
from tqdm import tqdm
import torch.nn.functional as F

num_classes = 10
trn_loss_list = []
val_loss_list = []
acc_list = []
acc_list_aug = []
num_batches=len(trainloader)
total_epoch = 50

for epoch in range(total_epoch):   # 데이터셋을 수차례 반복합니다.
    net.train()
    running_loss = 0.0
    for i, data in tqdm(enumerate(trainloader, 0),total=len(trainloader)):
        if i%6 == 0 and epoch != 0:
            inputs, labels = data[0].to(device), data[1].to(device)

            optimizer_arch.zero_grad()
            optimizer_weight.zero_grad()
            net.module.binarize()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            net.module.set_arch_grad()
            optimizer_arch.step()
            running_loss += loss.item()
        else:
            # [inputs, labels]의 목록인 data로부터 입력을 받은 후;
            inputs, labels = data[0].to(device), data[1].to(device)

            # 변화도(Gradient) 매개변수를 0으로 만들고
            optimizer_weight.zero_grad()
            optimizer_arch.zero_grad()
            net.module.binarize()

            # 순전파 + 역전파 + 최적화를 한 후
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_weight.step()
            running_loss += loss.item()
#         print(loss.item())
        
    net.eval()
    with torch.no_grad(): # very very very very important!!!
        val_loss = 0.0
        class_correct = list(0. for i in range(num_classes))
        class_total = list(0. for i in range(num_classes))
        correct = 0
        total = 0
        for j,val in enumerate(testloader):
            net.module.binarize()
            v1 = val[0].to(device)
            val_labels = val[1].to(device)
            val_output = net(v1)
            v_loss = criterion(val_output, val_labels)
            val_loss += v_loss
            _, predicted = torch.max(val_output, 1)
            c = (predicted == val_labels).squeeze()
            total += val_labels.size(0)
            correct += (predicted == val_labels).sum().item()
            for i in range(len(val_labels)):
                val_label = val_labels[i]
                class_correct[val_label] += c[i].item()
                class_total[val_label] += 1

        print("epoch:",str(epoch))
        temp_acc = []
        for i in range(num_classes):
            if class_total[i]==0:
                print('class_total = 0',class_correct,class_total)
            else:
                print('Accuracy of %5s : %2d %%' % (i, 100 * class_correct[i] / class_total[i]))
                temp_acc.append(100 * class_correct[i] / class_total[i])
        acc_list.append(temp_acc)
    
    for alp in arch_params:
        print(torch.argmax(alp.detach().data).item(),end=',')
    print()
#     print('arch_param_normal')
#     for i in range(4):
#         for op in torch.max(F.softmax(net.module.arch_param_normal[i],dim=1),1)[1]:
#             print(OPS[op],end=' ')
#         print()
# #         print(torch.max(F.softmax(net.module.arch_param_normal[i],dim=1),1)[1])

#     print()
#     print('arch_param_reduc')
#     for i in range(4):
#         for op in torch.max(F.softmax(net.module.arch_param_reduc[i],dim=1),1)[1]:
#             print(OPS[op],end=' ')
#         print()

    print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
        epoch+1, total_epoch, i+1, num_batches, running_loss / len(trainloader), val_loss / len(testloader)
    ))        
    print('Accuracy of the network on the test images: %d %%' % (100 * correct / total))

    trn_loss_list.append(running_loss/1875)
    val_loss_list.append(val_loss/len(testloader))
    running_loss = 0.0

print('Finished Training')

100%|██████████| 391/391 [00:56<00:00,  6.98it/s]


epoch: 0
Accuracy of     0 : 11 %
Accuracy of     1 : 14 %
Accuracy of     2 : 17 %
Accuracy of     3 :  3 %
Accuracy of     4 : 29 %
Accuracy of     5 :  4 %
Accuracy of     6 :  8 %
Accuracy of     7 :  9 %
Accuracy of     8 : 11 %
Accuracy of     9 : 11 %
0,3,0,0,0,0,0,0,0,3,3,3,
epoch: 1/50 | step: 10/391 | trn loss: 2.3228 | val loss: 2.3965
Accuracy of the network on the test images: 12 %


 17%|█▋        | 66/391 [00:09<00:43,  7.54it/s]

In [None]:
torch.argmax(alp.detach()).item()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.title('val_acc')
plt.plot([np.mean(x) for x in acc_list])
plt.legend()
plt.show()
plt.title('val_loss')
plt.plot(val_loss_list)
plt.legend()
plt.show()

In [None]:
import torch.nn.functional as F

print('arch_param_normal')
for i in range(4):
    print(torch.max(F.softmax(net.module.arch_param_normal[i],dim=1),1)[1])
    
print()
print('arch_param_reduc')
for i in range(4):
    print(torch.max(F.softmax(net.module.arch_param_reduc[i],dim=1),1)[1])

In [None]:
net.module.arch_param_reduc