<a href="https://colab.research.google.com/github/bekku/deeplearning/blob/master/%5B%E4%BF%9D%E5%AD%98%E7%89%88%5DDARTS%E5%AE%9F%E8%A1%8C%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Fri Jan  8 12:23:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    24W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from torchsummary import summary

# **utils.py**

In [3]:
import os
import numpy as np
import torch
import shutil
import torchvision.transforms as transforms
from torch.autograd import Variable


class AvgrageMeter(object):

  def __init__(self):
    self.reset()

  def reset(self):
    self.avg = 0
    self.sum = 0
    self.cnt = 0

  def update(self, val, n=1):
    self.sum += val * n
    self.cnt += n
    self.avg = self.sum / self.cnt


def accuracy(output, target, topk=(1,)):
  maxk = max(topk)
  batch_size = target.size(0)

  _, pred = output.topk(maxk, 1, True, True)
  pred = pred.t()
  correct = pred.eq(target.view(1, -1).expand_as(pred))

  res = []
  for k in topk:
    correct_k = correct[:k].reshape(-1).float().sum(0)
    ########### view　→ reshape  ###########
    res.append(correct_k.mul_(100.0/batch_size))
  return res


class Cutout(object):
    def __init__(self, length):
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)
        y = np.random.randint(h)
        x = np.random.randint(w)

        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)

        mask[y1: y2, x1: x2] = 0.
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask
        return img


def _data_transforms_cifar10(args):
  CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
  CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

  train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
  ])
  if args.cutout:
    train_transform.transforms.append(Cutout(args.cutout_length))

  valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
  return train_transform, valid_transform


def count_parameters_in_MB(model):
  return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name)/1e6


def save_checkpoint(state, is_best, save):
  filename = os.path.join(save, 'checkpoint.pth.tar')
  torch.save(state, filename)
  if is_best:
    best_filename = os.path.join(save, 'model_best.pth.tar')
    shutil.copyfile(filename, best_filename)


def save(model, model_path):
  torch.save(model.state_dict(), model_path)


def load(model, model_path):
  model.load_state_dict(torch.load(model_path))


def drop_path(x, drop_prob):
  if drop_prob > 0.:
    keep_prob = 1.-drop_prob
    mask = Variable(torch.cuda.FloatTensor(x.size(0), 1, 1, 1).bernoulli_(keep_prob))
    x.div_(keep_prob)
    x.mul_(mask)
  return x


def create_exp_dir(path, scripts_to_save=None):
  if not os.path.exists(path):
    os.mkdir(path)
  print('Experiment dir : {}'.format(path))

  if scripts_to_save is not None:
    os.mkdir(os.path.join(path, 'scripts'))
    for script in scripts_to_save:
      dst_file = os.path.join(path, 'scripts', os.path.basename(script))
      shutil.copyfile(script, dst_file)



# **operations.py**

In [4]:
import torch
import torch.nn as nn

OPS = {
  'none' : lambda C, stride, affine: Zero(stride),
  'avg_pool_3x3' : lambda C, stride, affine: nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False),
  'max_pool_3x3' : lambda C, stride, affine: nn.MaxPool2d(3, stride=stride, padding=1),
  'skip_connect' : lambda C, stride, affine: Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine),
  'sep_conv_3x3' : lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine),
  'sep_conv_5x5' : lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine),
  'sep_conv_7x7' : lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine),
  'dil_conv_3x3' : lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine),
  'dil_conv_5x5' : lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine),
  'conv_7x1_1x7' : lambda C, stride, affine: nn.Sequential(
    nn.ReLU(inplace=False),
    nn.Conv2d(C, C, (1,7), stride=(1, stride), padding=(0, 3), bias=False),
    nn.Conv2d(C, C, (7,1), stride=(stride, 1), padding=(3, 0), bias=False),
    nn.BatchNorm2d(C, affine=affine)
    ),
}

class ReLUConvBN(nn.Module):

  def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
    super(ReLUConvBN, self).__init__()
    self.op = nn.Sequential(
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False),
      nn.BatchNorm2d(C_out, affine=affine)
    )

  def forward(self, x):
    return self.op(x)

class DilConv(nn.Module):

  def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
    super(DilConv, self).__init__()
    self.op = nn.Sequential(
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=C_in, bias=False),
      nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
      nn.BatchNorm2d(C_out, affine=affine),
      )

  def forward(self, x):
    return self.op(x)


class SepConv(nn.Module):

  def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
    super(SepConv, self).__init__()
    self.op = nn.Sequential(
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=C_in, bias=False),
      nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
      nn.BatchNorm2d(C_in, affine=affine),
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding, groups=C_in, bias=False),
      nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
      nn.BatchNorm2d(C_out, affine=affine),
      )

  def forward(self, x):
    return self.op(x)


class Identity(nn.Module):

  def __init__(self):
    super(Identity, self).__init__()

  def forward(self, x):
    return x


class Zero(nn.Module):

  def __init__(self, stride):
    super(Zero, self).__init__()
    self.stride = stride

  def forward(self, x):
    if self.stride == 1:
      return x.mul(0.)
    return x[:,:,::self.stride,::self.stride].mul(0.)

# conv1x1 (pointwise conv)を行うクラスと予想。
class FactorizedReduce(nn.Module):
  def __init__(self, C_in, C_out, affine=True):
    super(FactorizedReduce, self).__init__()
    assert C_out % 2 == 0
    # assert 条件式, 条件式がFalseの場合に出力するメッセージ
    # c_outは、CやC_curr(Cellに渡される時はCとなる)が代入されてこの関数が動く、reduction = True/False時に
    # 2がかけられるが、基本的に変化なし。Cのlayesごとの変動は[16,16,32,32,32,64,64,64]である。
    self.relu = nn.ReLU(inplace=False)
    self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
    self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
    # C_outは、2で確実に割り切れるために、c_outを2で割って問題なし。
    # catについて
    # https://qiita.com/Haaamaaaaa/items/709d774698082e9d342d
    self.bn = nn.BatchNorm2d(C_out, affine=affine)

  def forward(self, x):
    x = self.relu(x)
    out = torch.cat([self.conv_1(x), self.conv_2(x[:,:,1:,1:])], dim=1)
    # x(画像trainデータ) は四次元か。batch,chanel数、H,Wか
    # self.conv_2(x[:,:,1:,1:])は、、、
    # xのカーネルサイズが奇数以外の時は、成り立つ。恐らく奇数にはならない。
    out = self.bn(out)
    return out
    # 出力は、stride=2より、ダウンサンプリングされている。　１/２である。


# **genotypes.py**

In [5]:
from collections import namedtuple

Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

PRIMITIVES = [
    'none',
    'max_pool_3x3',
    'avg_pool_3x3',
    'skip_connect',
    'sep_conv_3x3',
    'sep_conv_5x5',
    'dil_conv_3x3',
    'dil_conv_5x5'
]

NASNet = Genotype(
  normal = [
    ('sep_conv_5x5', 1),
    ('sep_conv_3x3', 0),
    ('sep_conv_5x5', 0),
    ('sep_conv_3x3', 0),
    ('avg_pool_3x3', 1),
    ('skip_connect', 0),
    ('avg_pool_3x3', 0),
    ('avg_pool_3x3', 0),
    ('sep_conv_3x3', 1),
    ('skip_connect', 1),
  ],
  normal_concat = [2, 3, 4, 5, 6],
  reduce = [
    ('sep_conv_5x5', 1),
    ('sep_conv_7x7', 0),
    ('max_pool_3x3', 1),
    ('sep_conv_7x7', 0),
    ('avg_pool_3x3', 1),
    ('sep_conv_5x5', 0),
    ('skip_connect', 3),
    ('avg_pool_3x3', 2),
    ('sep_conv_3x3', 2),
    ('max_pool_3x3', 1),
  ],
  reduce_concat = [4, 5, 6],
)
    
AmoebaNet = Genotype(
  normal = [
    ('avg_pool_3x3', 0),
    ('max_pool_3x3', 1),
    ('sep_conv_3x3', 0),
    ('sep_conv_5x5', 2),
    ('sep_conv_3x3', 0),
    ('avg_pool_3x3', 3),
    ('sep_conv_3x3', 1),
    ('skip_connect', 1),
    ('skip_connect', 0),
    ('avg_pool_3x3', 1),
    ],
  normal_concat = [4, 5, 6],
  reduce = [
    ('avg_pool_3x3', 0),
    ('sep_conv_3x3', 1),
    ('max_pool_3x3', 0),
    ('sep_conv_7x7', 2),
    ('sep_conv_7x7', 0),
    ('avg_pool_3x3', 1),
    ('max_pool_3x3', 0),
    ('max_pool_3x3', 1),
    ('conv_7x1_1x7', 0),
    ('sep_conv_3x3', 5),
  ],
  reduce_concat = [3, 4, 6]
)

DARTS_V1 = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 0), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('avg_pool_3x3', 0)], reduce_concat=[2, 3, 4, 5])
DARTS_V2 = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), ('dil_conv_3x3', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5])

DARTS = DARTS_V2



# **model_search.py**

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class MixedOp(nn.Module):

  def __init__(self, C, stride):
    super(MixedOp, self).__init__()
    self._ops = nn.ModuleList()
    # あるCellの各オペレーション処理を全て包含するリストとなる。

    for primitive in PRIMITIVES:
      op = OPS[primitive](C, stride, False)
      # PRIMITIVES = ['none','max_pool_3x3','avg_pool_3x3','skip_connect','sep_conv_3x3','sep_conv_5x5','dil_conv_3x3','dil_conv_5x5']
      # PRIMITIVESは、各実行名称が入っているリストである。
      # OPSは各、名称をキーとしてlamda関数を値とする、辞書が作成されていてそれを利用することで関数を定義できる。
      if 'pool' in primitive:
        op = nn.Sequential(op, nn.BatchNorm2d(C, affine=False))
      # 'max_pool_3x3','avg_pool_3x3'のop処理後に、batchNorm2dを行う。
      # pool後のみ、batchnorm採用？

      self._ops.append(op)
      # 各、オペレーションを_opsに追加していく。

      ##### ちなみに、strideとCによって、このオペレーションたちは変更し得るので毎回作成してる。 #####

  def forward(self, x, weights):
    return sum(w * op(x) for w, op in zip(weights, self._ops))


class Cell(nn.Module):

  def __init__(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev):
    super(Cell, self).__init__()
    self.reduction = reduction
    # 現在の削減状態、PrevがTrue時にダウンサンプリングされる。カーネルサイズが小さくなる。1/2倍

    if reduction_prev:
      self.preprocess0 = FactorizedReduce(C_prev_prev, C, affine=False)
      # reduction_prev == True 時、実行。
      # skip層であり、stride=2より、dawnsamplingされる。カーネルサイズが1/2になる。
    else:
      self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0, affine=False)
      # reduction_prev == False 時、実行。
      # (C_in, C_out, kernel_size, stride, padding, affine=True)となるただのconv層である。
      # ただの1*1のconv層　で、stride=1よりdawnsamplingされない。

    self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0, affine=False)
    self._steps = steps
    self._multiplier = multiplier
    # Network作成時に、steps=4、multiplier=3 がデフォルトで設定されている。

    # preprocessは、optを適用させる上での前過程。

    self._ops = nn.ModuleList()
    self._bns = nn.ModuleList()
    for i in range(self._steps):
      for j in range(2+i):
        stride = 2 if reduction and j < 2 else 1
        # reduction==True かつ j<2の時、stride=1となる。
        # 12/9 07:08 何故、j が 2以上ならOKなの？ reductionがTrueの時は前処理で1/2される。
        #  j  = 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4
        # str = 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1
        # 2個分のの入力ノードを含みます
        # リダクションは入力ノードにのみ使用する必要があるため。
        # [解釈] : これは論文読むべき？
        # 理解 : 知らんかったけど、そもそもノードとセルっていう概念があって。
        # 恐らくセルは層と同じ意味。　しかしノードを包含しているのがセルであり、各ノードごとにオペレーションの選択肢がある。
        # そもそもstepsはノードの個数？というかノードのグラフの階差みたいなイメージだった。

        op = MixedOp(C, stride)
        # 各、Cとstride毎に、あるCellの各オペレーション処理を全て包含するリストとなる。
        self._ops.append(op)
        # len(_ops)14個となる。

  def forward(self, s0, s1, weights):
    s0 = self.preprocess0(s0)
    s1 = self.preprocess1(s1)

    states = [s0, s1]
    offset = 0
    for i in range(self._steps):
      s = sum(self._ops[offset+j](h, weights[offset+j]) for j, h in enumerate(states))
      offset += len(states)
      states.append(s)

    return torch.cat(states[-self._multiplier:], dim=1)


class Network(nn.Module):

  def __init__(self, C, num_classes, layers, criterion, steps=4, multiplier=4, stem_multiplier=3):
    super(Network, self).__init__()

    self._C = C
    # C は args.init_channelsであり、入力チャンネル数:16

    self._num_classes = num_classes
    self._layers = layers
    self._criterion = criterion
    self._steps = steps
    self._multiplier = multiplier

    C_curr = stem_multiplier*C
    #  48   = 3 * (C=16)

    self.stem = nn.Sequential(
      nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
      nn.BatchNorm2d(C_curr)
    )
    # 最初の層のconv層　チャンネル数が3
    # C_currはconvの出力フィルター数 → 上記より48
    # currentで今のフィルター出力数を表している

    C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
    # C_prev_prev = (C_curr = 48)
    # C_prev = (C_curr = 48)
    # C_curr = (C = 16)
    # 各初期、出力フィルター数

    self.cells = nn.ModuleList()
    # cells は cellを詰め込むリスト
    reduction_prev = False
    # 一つ前ののreductionである、 reduction_prevの初期は、１つ前がないからFalse

    for i in range(layers):
      if i in [layers//3, 2*layers//3]:
        # (layesの3の商　or　2*layersの3の商)時、実行
        # default = 8の時、i == 2 or 5 で実行される。
        C_curr *= 2
        reduction = True
        # つまり、i==2と i==5の 3番目と6番目の層は、reduction = Trueとなり、フィルター数も二倍
      else:
        reduction = False

      cell = Cell(steps, multiplier, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
      # Cell(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev)で作成
      # i==2と i==5の 3番目と6番目の層は、reductionをTrue(c_curr*2)/Falseとして、Cellを作成していく。

      reduction_prev = reduction
      # 使用したreductionは、reduction_prevとなる。

      self.cells += [cell]
      # 作成した　cell　をcellsに追加

      C_prev_prev, C_prev = C_prev, multiplier*C_curr
      # preはprev_prevとなり、prevは multiplier*C_curr = (default=4)*C_curr となる。

    self.global_pooling = nn.AdaptiveAvgPool2d(1)
    self.classifier = nn.Linear(C_prev, num_classes)

    self._initialize_alphas()

  def new(self):
    model_new = Network(self._C, self._num_classes, self._layers, self._criterion).cuda()
    for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
        x.data.copy_(y.data)
        # arch_parameters = self.alphas_normal, self.alphas_reduce
        # 現状のモデルを コピーして新しく作り直している。
    return model_new

  def forward(self, input):
    s0 = s1 = self.stem(input)
    for i, cell in enumerate(self.cells):
      if cell.reduction:
        weights = F.softmax(self.alphas_reduce, dim=-1)
      else:
        weights = F.softmax(self.alphas_normal, dim=-1)
      s0, s1 = s1, cell(s0, s1, weights)
    out = self.global_pooling(s1)
    logits = self.classifier(out.view(out.size(0),-1))
    return logits

  def _loss(self, input, target):
    logits = self(input)
    return self._criterion(logits, target)

  def _initialize_alphas(self):
    k = sum(1 for i in range(self._steps) for n in range(2+i))
    num_ops = len(PRIMITIVES)

    # kは各ノードの矢印の総本数

    # torch.autograd.variable(torch.tensor,requires_grad=True/False)によって「αnormal」と「αreduce」を勾配対象
    self.alphas_normal = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
    self.alphas_reduce = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)
    self._arch_parameters = [
      self.alphas_normal,
      self.alphas_reduce,
    ]
    # torch.randn(k, num_ops)は、{k × num_ops}のsizeとなるランダム値を生成。
    # 矢印の本数 × オペレーションの数
    # それぞれの矢印のオペレーションに適した、重みとなる。

  def arch_parameters(self):
    return self._arch_parameters

  def genotype(self):

    def _parse(weights):
      gene = []
      n = 2
      start = 0
      for i in range(self._steps):
        end = start + n
        W = weights[start:end].copy()
        # 1回目： start = 0 end = 2 n = 2 edge は0 ~ 1 
        # 2回目： start = 2 end = 5 n = 3 edge は2 ~ 4 
        # 3回目： start = 5 end = 9 n = 4 edge は5 ~ 8 
        # 4回目： start = 9 end = 14 n = 5 edge は9 ~ 13 
        edges = sorted(range(i + 2), key=lambda x: -max(W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none')))[:2]
        # sortのkeyが、14本のedgeから、それぞれ最大値のオペレーションが比較基準となる。
        # 各step毎に2本のedgeが選択されている。
        # stepsが進むにつれて(forによって)、rangeの範囲が一つずつ増えていく。steps = 4より　最大で0 ~ 4であり、14本見てなくない？

        # print(sorted(range(i + 2), key=lambda x: -max(W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none'))))
        # print(edges)

        for j in edges:
          k_best = None
          for k in range(len(W[j])):
            # 選ばれたedgeの各オペレーションを見ていく。
            if k != PRIMITIVES.index('none'):
              if k_best is None or W[j][k] > W[j][k_best]:
                k_best = k
              # 選んだedge内での最大のオペレーションを選んでいく。
          gene.append((PRIMITIVES[k_best], j))
          # PRIMITIVES[k_best]は選んだedgeの最大オペレーション、jはedge番号。
        start = end
        n += 1
      return gene

    gene_normal = _parse(F.softmax(self.alphas_normal, dim=-1).data.cpu().numpy())
    # print("----------------------------------------  gane_一覧 ----------------------------------------")
    # print(F.softmax(self.alphas_normal, dim=-1))
    # print(*gene_normal)
    # print("---------------------------------------- gane_一覧---------------------------------------- ")
    gene_reduce = _parse(F.softmax(self.alphas_reduce, dim=-1).data.cpu().numpy())

    concat = range(2+self._steps-self._multiplier, self._steps+2)
    genotype = Genotype(
      normal=gene_normal, normal_concat=concat,
      reduce=gene_reduce, reduce_concat=concat
    )
    return genotype


In [7]:
# model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
# model.genotype()

# **architect.py**

In [8]:
import torch
import numpy as np
import torch.nn as nn
from torch.autograd import Variable


def _concat(xs):
  return torch.cat([x.view(-1) for x in xs])


class Architect(object):

  def __init__(self, model, args):
    self.network_momentum = args.momentum
    # momentum = 0.9がdefault
    self.network_weight_decay = args.weight_decay
    # weight_decay = 3e-4 がdefault
    # 学習用の値を扱うため、trainで活用されそう。
    self.model = model
    self.optimizer = torch.optim.Adam(self.model.arch_parameters(),
        lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)

  # unrolled_model を 計算する関数？
  def _compute_unrolled_model(self, input, target, eta, network_optimizer):
    # https://qiita.com/tokkuman/items/1944c00415d129ca0ee9
    ########## モーメント と wight_decayを計算するよ。 ###########
    loss = self.model._loss(input, target)
    theta = _concat(self.model.parameters()).data
    # torch.cat([x.view(-1) for x in xs])
    # 重みパラメータを結合し続ける。
    try:
      moment = _concat(network_optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()).mul_(self.network_momentum)
      # optimizer.state の self.model.parameters()(パラメータ名) を キーとして、'momentum_buffer' のキーを更に選択して
      # 一列に並べてつなげる。
      # その後.mul_で　self.network_momentum　を掛け合わせる。
      # αΔwt　←モーメントで、それを求めている。
      # Δwt = optimizer.state[各パラメータ]['momentum_buffer']、　α = network_momentumである。

    except:
      moment = torch.zeros_like(theta)
      # thetaと同等のsizeの 0 テンソルを作成する。

    dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta
    #　weight_decay : ηλw
    # λ = network_weight_decay
    # W = theta (重みパラメータの一列)
    # δL/δw が　左。

    # dtheta = δL/δw + λw
    # moment = Δwt

    # 最終式 : w ← w -η(δL/δw) - ηλw + αΔwt

    unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment+dtheta))
    # torch.sub(a,b,c=1)
    # moment + dtheta　= Δwt - δL/δw - λw
    # a.sub(other,alpha) a - othre * alpha
    # w - eta(η) * (Δwt + δL/δw + λw)

    # unrolled_modelは、最終的に、現存モデル　から　新しい複製モデルに、パラメータを移行した状態となる。
    return unrolled_model

  def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled):
    # eta は学習率
    self.optimizer.zero_grad()
    if unrolled:
        # unrolled の　default は　Falseである。
        self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer)
    else:
        self._backward_step(input_valid, target_valid)
        # lossを計算して、backwardするだけ。
    self.optimizer.step()

  def _backward_step(self, input_valid, target_valid):
    loss = self.model._loss(input_valid, target_valid)
    loss.backward()

  def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer):
    unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer)
    unrolled_loss = unrolled_model._loss(input_valid, target_valid)

    unrolled_loss.backward()
    dalpha = [v.grad for v in unrolled_model.arch_parameters()]
    vector = [v.grad.data for v in unrolled_model.parameters()]
    # arch_parameters = self.alphas_normal, self.alphas_reduce
    # arch_parameters　を　vに入れて、grad →　tensor([6., 5.])みたいなテンソルで返す。
    # ちなみに、grad.data と grad の違いがわからない。

    implicit_grads = self._hessian_vector_product(vector, input_train, target_train)
    # (gradベクトル,input_train,target_train)

    for g, ig in zip(dalpha, implicit_grads):
      g.data.sub_(eta, ig.data)

    for v, g in zip(self.model.arch_parameters(), dalpha):
      if v.grad is None:
        v.grad = Variable(g.data)
      else:
        v.grad.data.copy_(g.data)

  def _construct_model_from_theta(self, theta):
    model_new = self.model.new()
    # 現存モデルをコピー
    model_dict = self.model.state_dict()
    # 現存モデルのstate_dict(conv1とかチャネル1の重みパラメータを取り出す)

    # theta : w - eta(η) * (Δwt + δL/δw + λw)

    params, offset = {}, 0
    for k, v in self.model.named_parameters():
      v_length = np.prod(v.size())
      # v.size()の要素積となる。 vの総要素数

      params[k] = theta[offset: offset+v_length].view(v.size())
      # それぞれの勾配式、適用後の要素を一列化
      # params = [(0 : v_length), (v_length : 2 * v_length), (2 * v_length : 3 * v_length),,,]
      # 各forwardの1操作の重みパラメータが取り出される。

      offset += v_length

    assert offset == len(theta)
    model_dict.update(params)
    model_new.load_state_dict(model_dict)
    return model_new.cuda()

  def _hessian_vector_product(self, vector, input, target, r=1e-2):
    # ヘッセ行列の関数ー数値微分の近似部分？  正解
    # (gradベクトル,input_train,target_train)
    R = r / _concat(vector).norm()
    # grad ベクトル一列 を 2ノルム化したもので r を割る。
    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(R, v)
    loss = self.model._loss(input, target)
    grads_p = torch.autograd.grad(loss, self.model.arch_parameters())

    for p, v in zip(self.model.parameters(), vector):
      p.data.sub_(2*R, v)
    loss = self.model._loss(input, target)
    grads_n = torch.autograd.grad(loss, self.model.arch_parameters())

    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(R, v)

    return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]



# **train_search.pyの前半**

In [11]:
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
  objs = AvgrageMeter()
  top1 = AvgrageMeter()
  top5 = AvgrageMeter()
  #len(train_queue) 1563

  for step, (input, target) in enumerate(train_queue):
    # !nvidia-smi
    model.train()
    n = input.size(0)

    input = Variable(input, requires_grad=False).cuda()
    # target = Variable(target, requires_grad=False).cuda(async=True)
    target = Variable(target, requires_grad=False).cuda(non_blocking=True)
    # Variable  Tensorをラップしたクラス。
    # data grad creatorを保有するクラス。

    input_search, target_search = next(iter(valid_queue))
    # iter(リスト)でイテレータを作成。
    #nextで、イテレータを一つずつ取り出す。

    input_search = Variable(input_search, requires_grad=False).cuda()
    # target_search = Variable(target_search, requires_grad=False).cuda(async=True)
    target_search = Variable(target_search, requires_grad=False).cuda(non_blocking=True)

    architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled)
    # step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled)
    # unrolled は　(default=False)であり、展開するかしないの意味とは？

    optimizer.zero_grad()
    logits = model(input)
    loss = criterion(logits, target)

    loss.backward()
    nn.utils.clip_grad_norm(model.parameters(), args.grad_clip)
    optimizer.step()

    prec1, prec5 = accuracy(logits, target, topk=(1, 5))

    objs.update(loss.data, n)
    top1.update(prec1.data, n)
    top5.update(prec5.data, n)

    if step % args.report_freq == 0:
      logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)
      # summary(model,(3,32,32))
  
  return top1.avg, objs.avg


def infer(valid_queue, model, criterion):
  objs = AvgrageMeter()
  top1 = AvgrageMeter()
  top5 = AvgrageMeter()
  # summary(model,(3,32,32))
  model.eval()
  with torch.no_grad():
    for step, (input, target) in enumerate(valid_queue):
      input = Variable(input, volatile=True).cuda()
      target = Variable(target, volatile=True).cuda(async=True)

      logits = model(input)
      loss = criterion(logits, target)

      prec1, prec5 = accuracy(logits, target, topk=(1, 5))
      n = input.size(0)
      objs.update(loss.data, n)
      top1.update(prec1.data, n)
      top5.update(prec5.data, n)

      if step % args.report_freq == 0:
        logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

  return top1.avg, objs.avg

# **train_search.py**

In [12]:
import os
import sys
import time
import glob
import numpy as np
import torch

import logging
import argparse
import torch.nn as nn
import torch.utils
import torch.nn.functional as F
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn
import torch.utils

from torch.autograd import Variable
# from model_search import Network
# from architect import Architect

parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data', type=str, default='./data', help='location of the data corpus')
# parser.add_argument('--batch_size', type=int, default=64, help='batch size')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate')
parser.add_argument('--learning_rate_min', type=float, default=0.001, help='min learning rate')
parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay')
parser.add_argument('--report_freq', type=float, default=50, help='report frequency')
parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
parser.add_argument('--epochs', type=int, default=1, help='num of training epochs')
parser.add_argument('--init_channels', type=int, default=16, help='num of init channels')
parser.add_argument('--layers', type=int, default=8, help='total number of layers')
parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model')
parser.add_argument('--cutout', action='store_true', default=False, help='use cutout')
parser.add_argument('--cutout_length', type=int, default=16, help='cutout length')
parser.add_argument('--drop_path_prob', type=float, default=0.3, help='drop path probability')
parser.add_argument('--save', type=str, default='EXP', help='experiment name')
parser.add_argument('--seed', type=int, default=2, help='random seed')
parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
parser.add_argument('--unrolled', action='store_true', default=False, help='use one-step unrolled validation loss')
parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding')
parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding')
args = parser.parse_args(args=[])

args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
    format=log_format, datefmt='%m/%d %I:%M:%S %p')
fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)

CIFAR_CLASSES = 10

if not torch.cuda.is_available():
  logging.info('no gpu device available')
  sys.exit(1)
np.random.seed(args.seed)
torch.cuda.set_device(args.gpu)
cudnn.benchmark = True
torch.manual_seed(args.seed)
cudnn.enabled=True
torch.cuda.manual_seed(args.seed)
logging.info('gpu device = %d' % args.gpu)
logging.info("args = %s", args)

criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()

model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
# model.search.py の Network の初期引数(self, C, num_classes, layers, criterion, steps=4, multiplier=4, stem_multiplier=3):

model = model.cuda()
logging.info("param size = %fMB", count_parameters_in_MB(model))

optimizer = torch.optim.SGD(
    model.parameters(),
    args.learning_rate,
    momentum=args.momentum,
    weight_decay=args.weight_decay)

train_transform, valid_transform = _data_transforms_cifar10(args)
train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)

num_train = len(train_data)
indices = list(range(num_train))
split = int(np.floor(args.train_portion * num_train))
# split = args.train_portion(default=0.5) * num_train(len(train_data)) 30000
# 30000 train　30000 validationをしている。

train_queue = torch.utils.data.DataLoader(
    train_data, batch_size=args.batch_size,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
    pin_memory=True, num_workers=2)

valid_queue = torch.utils.data.DataLoader(
    train_data, batch_size=args.batch_size,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
    pin_memory=True, num_workers=2)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
      optimizer, float(args.epochs), eta_min=args.learning_rate_min)
# scheduler は、 args.epochs ごとに args.learning_rate_min まで下げる。下記のurlみればわかりやすいです。
# https://katsura-jp.hatenablog.com/entry/2019/01/30/183501#PyTorch%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%AA%E5%86%85%E3%81%AB%E3%81%82%E3%82%8Bscheduler

architect = Architect(model, args)
# architect.pyの Architect Class
#初期引数は(self, model, args)
# ~疑~　12/8 04:07 architectは何をするもの？

for epoch in range(args.epochs):
  scheduler.step()
  # epoch ごとに、scheduler.step()を実行する。
  lr = scheduler.get_lr()[0]
  logging.info('epoch %d lr %e', epoch, lr)

  genotype = model.genotype()
  # modelのgenotypeを目視、恐らく現在の構築されたモデル構造が明記
  logging.info('genotype = %s', genotype)

  print(F.softmax(model.alphas_normal, dim=-1))
  print(F.softmax(model.alphas_reduce, dim=-1))
  # modelの alphas_normal、 alphas_reduce　option の重み和となる項目のソフトマックス値を出力

  # training
  train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr)
  # !nvidia-smi
  # 下記のtrain関数を実行
  logging.info('train_acc %f', train_acc)

  # validation
  valid_acc, valid_obj = infer(valid_queue, model, criterion)
  # 下記のinfer関数を実行
  logging.info('valid_acc %f', valid_acc)

  # 各epochごとに構造が変化するので保存？
  save(model, os.path.join(args.save, 'weights.pt'))




Experiment dir : search-EXP-20210108-122419
01/08 12:24:19 PM gpu device = 0
01/08 12:24:19 PM args = Namespace(arch_learning_rate=0.0003, arch_weight_decay=0.001, batch_size=32, cutout=False, cutout_length=16, data='./data', drop_path_prob=0.3, epochs=1, gpu=0, grad_clip=5, init_channels=16, layers=8, learning_rate=0.025, learning_rate_min=0.001, model_path='saved_models', momentum=0.9, report_freq=50, save='search-EXP-20210108-122419', seed=2, train_portion=0.5, unrolled=False, weight_decay=0.0003)
01/08 12:24:19 PM param size = 1.930618MB




Files already downloaded and verified
01/08 12:24:20 PM epoch 0 lr 1.000000e-03
01/08 12:24:20 PM genotype = Genotype(normal=[('dil_conv_5x5', 1), ('sep_conv_3x3', 0), ('dil_conv_3x3', 2), ('dil_conv_5x5', 1), ('avg_pool_3x3', 1), ('dil_conv_5x5', 2), ('dil_conv_3x3', 0), ('max_pool_3x3', 3)], normal_concat=range(2, 6), reduce=[('avg_pool_3x3', 1), ('sep_conv_5x5', 0), ('dil_conv_5x5', 0), ('sep_conv_3x3', 1), ('max_pool_3x3', 0), ('avg_pool_3x3', 2), ('sep_conv_3x3', 4), ('sep_conv_3x3', 0)], reduce_concat=range(2, 6))
tensor([[0.1250, 0.1247, 0.1251, 0.1250, 0.1251, 0.1251, 0.1250, 0.1250],
        [0.1249, 0.1250, 0.1250, 0.1249, 0.1249, 0.1251, 0.1251, 0.1251],
        [0.1251, 0.1250, 0.1250, 0.1251, 0.1248, 0.1251, 0.1248, 0.1251],
        [0.1250, 0.1250, 0.1251, 0.1249, 0.1249, 0.1250, 0.1250, 0.1251],
        [0.1248, 0.1250, 0.1249, 0.1249, 0.1249, 0.1251, 0.1252, 0.1251],
        [0.1249, 0.1250, 0.1249, 0.1250, 0.1251, 0.1251, 0.1250, 0.1250],
        [0.1249, 0.1249, 0.125



01/08 12:24:22 PM train 000 2.360637e+00 9.375000 53.125000




01/08 12:25:38 PM train 050 2.195386e+00 18.995098 66.544121
01/08 12:26:53 PM train 100 2.093620e+00 21.905941 73.452972
01/08 12:28:09 PM train 150 2.008966e+00 25.062086 77.255798
01/08 12:29:25 PM train 200 1.947019e+00 27.114428 79.430969
01/08 12:30:39 PM train 250 1.902250e+00 28.847113 80.702194
01/08 12:31:54 PM train 300 1.864524e+00 30.409052 81.883301
01/08 12:33:07 PM train 350 1.830649e+00 31.677351 82.950500
01/08 12:34:22 PM train 400 1.805041e+00 32.567020 83.806114
01/08 12:35:37 PM train 450 1.781531e+00 33.508869 84.381927


Traceback (most recent call last):


01/08 12:36:51 PM train 500 1.760049e+00 34.593315 84.986275
01/08 12:38:06 PM train 550 1.740980e+00 35.299454 85.531990
01/08 12:39:22 PM train 600 1.722062e+00 36.137688 85.914101
01/08 12:40:37 PM train 650 1.705400e+00 36.770351 86.367126


Traceback (most recent call last):


01/08 12:41:52 PM train 700 1.687140e+00 37.388554 86.871437
01/08 12:43:07 PM train 750 1.673930e+00 37.812084 87.204559
01/08 12:43:53 PM train_acc 38.124001


	cuda(torch.device device, bool async, *, torch.memory_format memory_format)
Consider using one of the following signatures instead:
	cuda(torch.device device, bool non_blocking, *, torch.memory_format memory_format) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)


01/08 12:43:54 PM valid 000 1.763657e+00 37.500000 81.250000
01/08 12:44:02 PM valid 050 1.440385e+00 47.671570 91.911766
01/08 12:44:11 PM valid 100 1.424297e+00 48.483910 92.172028
01/08 12:44:20 PM valid 150 1.436282e+00 47.806290 91.928810
01/08 12:44:28 PM valid 200 1.434706e+00 47.932213 92.024254
01/08 12:44:37 PM valid 250 1.438071e+00 47.734066 92.131477
01/08 12:44:45 PM valid 300 1.434340e+00 47.996262 92.296509
01/08 12:44:54 PM valid 350 1.431325e+00 48.112534 92.200851
01/08 12:45:03 PM valid 400 1.427002e+00 48.316711 92.269333
01/08 12:45:12 PM valid 450 1.429189e+00 48.156872 92.308754
01/08 12:45:21 PM valid 500 1.432006e+00 48.128742 92.252991
01/08 12:45:30 PM valid 550 1.431860e+00 48.145416 92.235710
01/08 12:45:39 PM valid 600 1.431238e+00 48.018925 92.294090
01/08 12:45:48 PM valid 650 1.427326e+00 48.094276 92.343513
01/08 12:45:57 PM valid 700 1.427020e+00 47.953815 92.376961
01/08 12:46:06 PM valid 750 1.427519e+00 48.015144 92.347702
01/08 12:46:11 PM valid_

In [13]:
genotype

Genotype(normal=[('dil_conv_5x5', 1), ('sep_conv_3x3', 0), ('dil_conv_3x3', 2), ('dil_conv_5x5', 1), ('avg_pool_3x3', 1), ('dil_conv_5x5', 2), ('dil_conv_3x3', 0), ('max_pool_3x3', 3)], normal_concat=range(2, 6), reduce=[('avg_pool_3x3', 1), ('sep_conv_5x5', 0), ('dil_conv_5x5', 0), ('sep_conv_3x3', 1), ('max_pool_3x3', 0), ('avg_pool_3x3', 2), ('sep_conv_3x3', 4), ('sep_conv_3x3', 0)], reduce_concat=range(2, 6))

# **visualize.py**

In [14]:
import sys
# import genotypes
from graphviz import Digraph


def plot(genotype, filename):
  g = Digraph(
      format='pdf',
      edge_attr=dict(fontsize='20', fontname="times"),
      node_attr=dict(style='filled', shape='rect', align='center', fontsize='20', height='0.5', width='0.5', penwidth='2', fontname="times"),
      engine='dot')
  g.body.extend(['rankdir=LR'])

  g.node("c_{k-2}", fillcolor='darkseagreen2')
  g.node("c_{k-1}", fillcolor='darkseagreen2')
  assert len(genotype) % 2 == 0
  steps = len(genotype) // 2

  for i in range(steps):
    g.node(str(i), fillcolor='lightblue')

  for i in range(steps):
    for k in [2*i, 2*i + 1]:
      op, j = genotype[k]
      if j == 0:
        u = "c_{k-2}"
      elif j == 1:
        u = "c_{k-1}"
      else:
        u = str(j-2)
      v = str(i)
      g.edge(u, v, label=op, fillcolor="gray")

  g.node("c_{k}", fillcolor='palegoldenrod')
  for i in range(steps):
    g.edge(str(i), "c_{k}", fillcolor="gray")

  g.render(filename, view=True)
  g.view()


# if __name__ == '__main__':
#   if len(sys.argv) != 2:
#     print("usage:\n python {} ARCH_NAME".format(sys.argv[0]))
#     sys.exit(1)

#   genotype_name = sys.argv[1]
#   try:
#     genotype = eval("%s" % genotype_name)
#   except AttributeError:
#     print("{} is not specified in genotypes.py".format(genotype_name)) 
#     sys.exit(1)

#   plot(genotype.normal, "normal")
#   plot(genotype.reduce, "reduction")

plot(genotype.normal, "normal")
plot(genotype.reduce, "reduce")


# **model.py**

In [15]:
import torch
import torch.nn as nn
from torch.autograd import Variable


class Cell(nn.Module):

  def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev):
    super(Cell, self).__init__()
    print(C_prev_prev, C_prev, C)

    if reduction_prev:
      self.preprocess0 = FactorizedReduce(C_prev_prev, C)
    else:
      self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0)
    self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0)
    
    if reduction:
      op_names, indices = zip(*genotype.reduce)
      concat = genotype.reduce_concat
    else:
      op_names, indices = zip(*genotype.normal)
      concat = genotype.normal_concat
    self._compile(C, op_names, indices, concat, reduction)

  def _compile(self, C, op_names, indices, concat, reduction):
    assert len(op_names) == len(indices)
    self._steps = len(op_names) // 2
    self._concat = concat
    self.multiplier = len(concat)

    self._ops = nn.ModuleList()
    for name, index in zip(op_names, indices):
      stride = 2 if reduction and index < 2 else 1
      op = OPS[name](C, stride, True)
      self._ops += [op]
    self._indices = indices

  def forward(self, s0, s1, drop_prob):
    s0 = self.preprocess0(s0)
    s1 = self.preprocess1(s1)

    states = [s0, s1]
    for i in range(self._steps):
      h1 = states[self._indices[2*i]]
      h2 = states[self._indices[2*i+1]]
      op1 = self._ops[2*i]
      op2 = self._ops[2*i+1]
      h1 = op1(h1)
      h2 = op2(h2)
      if self.training and drop_prob > 0.:
        if not isinstance(op1, Identity):
          h1 = drop_path(h1, drop_prob)
        if not isinstance(op2, Identity):
          h2 = drop_path(h2, drop_prob)
      s = h1 + h2
      states += [s]
    return torch.cat([states[i] for i in self._concat], dim=1)


class AuxiliaryHeadCIFAR(nn.Module):

  def __init__(self, C, num_classes):
    """assuming input size 8x8"""
    super(AuxiliaryHeadCIFAR, self).__init__()
    self.features = nn.Sequential(
      nn.ReLU(inplace=True),
      nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False), # image size = 2 x 2
      nn.Conv2d(C, 128, 1, bias=False),
      nn.BatchNorm2d(128),
      nn.ReLU(inplace=True),
      nn.Conv2d(128, 768, 2, bias=False),
      nn.BatchNorm2d(768),
      nn.ReLU(inplace=True)
    )
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, x):
    x = self.features(x)
    x = self.classifier(x.view(x.size(0),-1))
    return x


class AuxiliaryHeadImageNet(nn.Module):

  def __init__(self, C, num_classes):
    """assuming input size 14x14"""
    super(AuxiliaryHeadImageNet, self).__init__()
    self.features = nn.Sequential(
      nn.ReLU(inplace=True),
      nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False),
      nn.Conv2d(C, 128, 1, bias=False),
      nn.BatchNorm2d(128),
      nn.ReLU(inplace=True),
      nn.Conv2d(128, 768, 2, bias=False),
      # NOTE: This batchnorm was omitted in my earlier implementation due to a typo.
      # Commenting it out for consistency with the experiments in the paper.
      # nn.BatchNorm2d(768),
      nn.ReLU(inplace=True)
    )
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, x):
    x = self.features(x)
    x = self.classifier(x.view(x.size(0),-1))
    return x


class NetworkCIFAR(nn.Module):

  def __init__(self, C, num_classes, layers, auxiliary, genotype):
    super(NetworkCIFAR, self).__init__()
    self._layers = layers
    self._auxiliary = auxiliary

    stem_multiplier = 3
    C_curr = stem_multiplier*C
    self.stem = nn.Sequential(
      nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
      nn.BatchNorm2d(C_curr)
    )
    
    C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
    self.cells = nn.ModuleList()
    reduction_prev = False
    for i in range(layers):
      if i in [layers//3, 2*layers//3]:
        C_curr *= 2
        reduction = True
      else:
        reduction = False
      cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
      reduction_prev = reduction
      self.cells += [cell]
      C_prev_prev, C_prev = C_prev, cell.multiplier*C_curr
      if i == 2*layers//3:
        C_to_auxiliary = C_prev

    if auxiliary:
      self.auxiliary_head = AuxiliaryHeadCIFAR(C_to_auxiliary, num_classes)
    self.global_pooling = nn.AdaptiveAvgPool2d(1)
    self.classifier = nn.Linear(C_prev, num_classes)

  def forward(self, input):
    logits_aux = None
    s0 = s1 = self.stem(input)
    for i, cell in enumerate(self.cells):
      s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
      if i == 2*self._layers//3:
        if self._auxiliary and self.training:
          logits_aux = self.auxiliary_head(s1)
    out = self.global_pooling(s1)
    logits = self.classifier(out.view(out.size(0),-1))
    return logits, logits_aux


class NetworkImageNet(nn.Module):

  def __init__(self, C, num_classes, layers, auxiliary, genotype):
    super(NetworkImageNet, self).__init__()
    self._layers = layers
    self._auxiliary = auxiliary

    self.stem0 = nn.Sequential(
      nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False),
      nn.BatchNorm2d(C // 2),
      nn.ReLU(inplace=True),
      nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False),
      nn.BatchNorm2d(C),
    )

    self.stem1 = nn.Sequential(
      nn.ReLU(inplace=True),
      nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False),
      nn.BatchNorm2d(C),
    )

    C_prev_prev, C_prev, C_curr = C, C, C

    self.cells = nn.ModuleList()
    reduction_prev = True
    for i in range(layers):
      if i in [layers // 3, 2 * layers // 3]:
        C_curr *= 2
        reduction = True
      else:
        reduction = False
      cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
      reduction_prev = reduction
      self.cells += [cell]
      C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr
      if i == 2 * layers // 3:
        C_to_auxiliary = C_prev

    if auxiliary:
      self.auxiliary_head = AuxiliaryHeadImageNet(C_to_auxiliary, num_classes)
    self.global_pooling = nn.AvgPool2d(7)
    self.classifier = nn.Linear(C_prev, num_classes)

  def forward(self, input):
    logits_aux = None
    s0 = self.stem0(input)
    s1 = self.stem1(s0)
    for i, cell in enumerate(self.cells):
      s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
      if i == 2 * self._layers // 3:
        if self._auxiliary and self.training:
          logits_aux = self.auxiliary_head(s1)
    out = self.global_pooling(s1)
    logits = self.classifier(out.view(out.size(0), -1))
    return logits, logits_aux



# **train.py**

In [16]:
import os
import sys
import time
import glob
import numpy as np
import torch
# import utils
import logging
import argparse
import torch.nn as nn
# import genotypes
import torch.utils
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn

from torch.autograd import Variable
# from model import NetworkCIFAR as Network


parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data', type=str, default='../data', help='location of the data corpus')
# parser.add_argument('--batch_size', type=int, default=96, help='batch size')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate')
parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay')
parser.add_argument('--report_freq', type=float, default=50, help='report frequency')
parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
# parser.add_argument('--epochs', type=int, default=600, help='num of training epochs')
# parser.add_argument('--epochs', type=int, default=30, help='num of training epochs')
parser.add_argument('--epochs', type=int, default=1, help='num of training epochs')
parser.add_argument('--init_channels', type=int, default=36, help='num of init channels')
parser.add_argument('--layers', type=int, default=20, help='total number of layers')
parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model')
parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower')
parser.add_argument('--auxiliary_weight', type=float, default=0.4, help='weight for auxiliary loss')
parser.add_argument('--cutout', action='store_true', default=False, help='use cutout')
parser.add_argument('--cutout_length', type=int, default=16, help='cutout length')
parser.add_argument('--drop_path_prob', type=float, default=0.2, help='drop path probability')
parser.add_argument('--save', type=str, default='EXP', help='experiment name')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use')
parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping')
args = parser.parse_args(args=[])

args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
    format=log_format, datefmt='%m/%d %I:%M:%S %p')
fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)

CIFAR_CLASSES = 10


def main():
  if not torch.cuda.is_available():
    logging.info('no gpu device available')
    sys.exit(1)

  np.random.seed(args.seed)
  torch.cuda.set_device(args.gpu)
  cudnn.benchmark = True
  torch.manual_seed(args.seed)
  cudnn.enabled=True
  torch.cuda.manual_seed(args.seed)
  logging.info('gpu device = %d' % args.gpu)
  logging.info("args = %s", args)

  # genotypesは、import genotypesによるもの。
  # evalは("計算式")を計算する関数。

  # %sは %演算子であり、文字列(str()で変換)、 % 以下の文字列をsに対して代入している。
  # 今回は args.arch = DARTSより　genotypes.DARTSという文字が入る。

  #genotypesは、namedtupleとなっており、

  #   >>> Car = namedtuple('Car', [
  # ...     'color',
  # ...     'mileage',
  # ... ])
  # >>> my_car = Car('red', 3812.4)
  # >>> my_car.color
  # 'red'
  # >>> my_car.mileage
  # 3812.4

  # Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')が設定されており,(恐らく)初期設定値が入っている。
  # genotypeに文字列が含まれないようにevalとして渡している。


  # genotype = eval("genotypes.%s" % args.arch)
  model = NetworkCIFAR(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype)
  model = model.cuda()

  logging.info("param size = %fMB", count_parameters_in_MB(model))

  criterion = nn.CrossEntropyLoss()
  criterion = criterion.cuda()
  optimizer = torch.optim.SGD(
      model.parameters(),
      args.learning_rate,
      momentum=args.momentum,
      weight_decay=args.weight_decay
      )

  train_transform, valid_transform = _data_transforms_cifar10(args)
  train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)
  valid_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=valid_transform)

  train_queue = torch.utils.data.DataLoader(
      train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=2)

  valid_queue = torch.utils.data.DataLoader(
      valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2)

  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs))

  for epoch in range(args.epochs):
    scheduler.step()
    logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
    model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

    train_acc, train_obj = train(train_queue, model, criterion, optimizer)
    logging.info('train_acc %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    save(model, os.path.join(args.save, 'weights.pt'))


def train(train_queue, model, criterion, optimizer):
  objs = AvgrageMeter()
  top1 = AvgrageMeter()
  top5 = AvgrageMeter()
  model.train()

  for step, (input, target) in enumerate(train_queue):
    input = Variable(input).cuda()
    target = Variable(target).cuda(async=True)

    optimizer.zero_grad()
    logits, logits_aux = model(input)
    loss = criterion(logits, target)
    if args.auxiliary:
      loss_aux = criterion(logits_aux, target)
      loss += args.auxiliary_weight*loss_aux
    loss.backward()
    nn.utils.clip_grad_norm(model.parameters(), args.grad_clip)
    optimizer.step()

    prec1, prec5 = accuracy(logits, target, topk=(1, 5))
    n = input.size(0)
    objs.update(loss.data.item(), n)
    top1.update(prec1.data.item(), n)
    top5.update(prec5.data.item(), n)

    if step % args.report_freq == 0:
      logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

  return top1.avg, objs.avg


def infer(valid_queue, model, criterion):
  objs = AvgrageMeter()
  top1 = AvgrageMeter()
  top5 = AvgrageMeter()
  model.eval()

  for step, (input, target) in enumerate(valid_queue):
    input = Variable(input, volatile=True).cuda()
    target = Variable(target, volatile=True).cuda(async=True)

    logits, _ = model(input)
    loss = criterion(logits, target)

    prec1, prec5 = accuracy(logits, target, topk=(1, 5))
    n = input.size(0)
    objs.update(loss.data.item(), n)
    top1.update(prec1.data.item(), n)
    top5.update(prec5.data.item(), n)

    if step % args.report_freq == 0:
      logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

  return top1.avg, objs.avg


if __name__ == '__main__':
  main()


Experiment dir : eval-EXP-20210108-124613
01/08 12:46:13 PM gpu device = 0
01/08 12:46:13 PM args = Namespace(arch='DARTS', auxiliary=False, auxiliary_weight=0.4, batch_size=32, cutout=False, cutout_length=16, data='../data', drop_path_prob=0.2, epochs=1, gpu=0, grad_clip=5, init_channels=36, layers=20, learning_rate=0.025, model_path='saved_models', momentum=0.9, report_freq=50, save='eval-EXP-20210108-124613', seed=0, weight_decay=0.0003)
108 108 36
108 144 36
144 144 36
144 144 36
144 144 36
144 144 36
144 144 72
144 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 144
288 576 144
576 576 144
576 576 144
576 576 144
576 576 144
576 576 144
01/08 12:46:13 PM param size = 2.967238MB




Files already downloaded and verified
Files already downloaded and verified
01/08 12:46:15 PM epoch 0 lr 0.000000e+00




01/08 12:46:15 PM train 000 2.199876e+00 3.125000 68.750000




01/08 12:46:26 PM train 050 2.370465e+00 11.458333 51.041667
01/08 12:46:36 PM train 100 2.378709e+00 10.550743 50.247525
01/08 12:46:47 PM train 150 2.375889e+00 11.175497 50.186258
01/08 12:46:57 PM train 200 2.371212e+00 11.069652 50.699627
01/08 12:47:08 PM train 250 2.374375e+00 10.943725 50.647410
01/08 12:47:18 PM train 300 2.373662e+00 10.714286 50.685216
01/08 12:47:28 PM train 350 2.375482e+00 10.630342 50.543091
01/08 12:47:39 PM train 400 2.375891e+00 10.629676 50.561097
01/08 12:47:49 PM train 450 2.376600e+00 10.636086 50.665188
01/08 12:47:59 PM train 500 2.376043e+00 10.610030 50.711078
01/08 12:48:10 PM train 550 2.376124e+00 10.690789 50.776996
01/08 12:48:20 PM train 600 2.377257e+00 10.570923 50.774750
01/08 12:48:31 PM train 650 2.377261e+00 10.613479 50.753648
01/08 12:48:42 PM train 700 2.376719e+00 10.614301 50.744472
01/08 12:48:52 PM train 750 2.376938e+00 10.602530 50.790613
01/08 12:49:03 PM train 800 2.376276e+00 10.537609 50.788077
01/08 12:49:14 PM train 



01/08 12:51:47 PM valid 050 2.443318e+00 10.110294 49.877451
01/08 12:51:50 PM valid 100 2.424396e+00 10.519802 50.185644
01/08 12:51:52 PM valid 150 2.415442e+00 9.850993 50.807119
01/08 12:51:55 PM valid 200 2.406560e+00 9.794776 50.824005
01/08 12:51:57 PM valid 250 2.396475e+00 10.209163 50.996016
01/08 12:51:59 PM valid 300 2.398997e+00 10.184801 50.384136
01/08 12:52:00 PM valid_acc 10.220000


In [17]:
genotype

Genotype(normal=[('dil_conv_5x5', 1), ('sep_conv_3x3', 0), ('dil_conv_3x3', 2), ('dil_conv_5x5', 1), ('avg_pool_3x3', 1), ('dil_conv_5x5', 2), ('dil_conv_3x3', 0), ('max_pool_3x3', 3)], normal_concat=range(2, 6), reduce=[('avg_pool_3x3', 1), ('sep_conv_5x5', 0), ('dil_conv_5x5', 0), ('sep_conv_3x3', 1), ('max_pool_3x3', 0), ('avg_pool_3x3', 2), ('sep_conv_3x3', 4), ('sep_conv_3x3', 0)], reduce_concat=range(2, 6))

# **test.py**

In [18]:
import os
import sys
import glob
import numpy as np
import torch

import logging
import argparse
import torch.nn as nn

import torch.utils
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn

from torch.autograd import Variable
# from model import NetworkCIFAR


parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data', type=str, default='./data', help='location of the data corpus')
# parser.add_argument('--batch_size', type=int, default=96, help='batch size')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--report_freq', type=float, default=50, help='report frequency')
parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
parser.add_argument('--init_channels', type=int, default=36, help='num of init channels')
parser.add_argument('--layers', type=int, default=20, help='total number of layers')
parser.add_argument('--model_path', type=str, default='eval-EXP-20210108-124613/weights.pt', help='path of pretrained model')
parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower')
parser.add_argument('--cutout', action='store_true', default=False, help='use cutout')
parser.add_argument('--cutout_length', type=int, default=16, help='cutout length')
parser.add_argument('--drop_path_prob', type=float, default=0.2, help='drop path probability')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use')
args = parser.parse_args(args=[])

log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
    format=log_format, datefmt='%m/%d %I:%M:%S %p')

CIFAR_CLASSES = 10

def main():
  if not torch.cuda.is_available():
    logging.info('no gpu device available')
    sys.exit(1)
  np.random.seed(args.seed)
  torch.cuda.set_device(args.gpu)
  cudnn.benchmark = True
  torch.manual_seed(args.seed)
  cudnn.enabled=True
  torch.cuda.manual_seed(args.seed)
  logging.info('gpu device = %d' % args.gpu)
  logging.info("args = %s", args)

  # genotype = genotype
  model = NetworkCIFAR(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype)
  model = model.cuda()
  load(model, args.model_path)

  logging.info("param size = %fMB", count_parameters_in_MB(model))
  criterion = nn.CrossEntropyLoss()
  criterion = criterion.cuda()

  _, test_transform = _data_transforms_cifar10(args)
  test_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=test_transform)

  test_queue = torch.utils.data.DataLoader(
      test_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2)

  model.drop_path_prob = args.drop_path_prob
  test_acc, test_obj = infer(test_queue, model, criterion)
  logging.info('test_acc %f', test_acc)


def infer(test_queue, model, criterion):
  objs = AvgrageMeter()
  top1 = AvgrageMeter()
  top5 = AvgrageMeter()
  model.eval()

  for step, (input, target) in enumerate(test_queue):
    input = Variable(input, volatile=True).cuda()
    target = Variable(target, volatile=True).cuda(async=True)

    logits, _ = model(input)
    loss = criterion(logits, target)

    prec1, prec5 = accuracy(logits, target, topk=(1, 5))
    n = input.size(0)
    objs.update(loss.data.item(), n)
    top1.update(prec1.data.item(), n)
    top5.update(prec5.data.item(), n)

    if step % args.report_freq == 0:
      logging.info('test %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

  return top1.avg, objs.avg


if __name__ == '__main__':
  main() 

01/08 12:53:06 PM gpu device = 0
01/08 12:53:06 PM args = Namespace(arch='DARTS', auxiliary=False, batch_size=32, cutout=False, cutout_length=16, data='./data', drop_path_prob=0.2, gpu=0, init_channels=36, layers=20, model_path='eval-EXP-20210108-124613/weights.pt', report_freq=50, seed=0)
108 108 36
108 144 36
144 144 36
144 144 36
144 144 36
144 144 36
144 144 72
144 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 72
288 288 144
288 576 144
576 576 144
576 576 144
576 576 144
576 576 144
576 576 144
01/08 12:53:06 PM param size = 2.967238MB




Files already downloaded and verified
01/08 12:53:07 PM test 000 2.456669e+00 6.250000 37.500000




01/08 12:53:09 PM test 050 2.443318e+00 10.110294 49.877451
01/08 12:53:12 PM test 100 2.424396e+00 10.519802 50.185644
01/08 12:53:14 PM test 150 2.415442e+00 9.850993 50.807119
01/08 12:53:16 PM test 200 2.406560e+00 9.794776 50.824005
01/08 12:53:18 PM test 250 2.396475e+00 10.209163 50.996016
01/08 12:53:21 PM test 300 2.398997e+00 10.184801 50.384136
01/08 12:53:21 PM test_acc 10.220000
