In [1]:
import argparse

from models import ImagenetRunConfig
from nas_manager import *
from models.super_nets.super_proxyless import SuperProxylessNASNets
import pickle

# Number of images used to derive the per-epoch iteration count.
# NOTE(review): looks like the 1,281,167 ImageNet-1k training images minus the
# default 50,000-image --valid_size hold-out -- confirm.
IMAGENET_TRAINING_SET_SIZE = 1231167

# Reference cost per hardware target. Inner keys are the width multiplier
# formatted with two decimals.
ref_values = {
    # table entries are given in millions and scaled by 1e6
    'flops': {
        width_key: mega_flops * 1e6
        for width_key, mega_flops in (
            ('0.35', 59),
            ('0.50', 97),
            ('0.75', 209),
            ('1.00', 475),
            ('1.30', 509),
            ('1.40', 582),
        )
    },
    # ms
    'mobile': {'1.00': 80},
    'cpu': {},
    'gpu8': {},
}


parser = argparse.ArgumentParser()

# ---- general ----
parser.add_argument('--gpu', default=0, type=int, help='local rank for distributed training')
parser.add_argument('--path', default='ABS', type=str)
parser.add_argument('--gpus', default='0,1', help='gpu available')
parser.add_argument('--resume', action='store_true')
parser.add_argument('--debug', action='store_true', help='freeze the weight parameters')
parser.add_argument('--manual_seed', default=0, type=int)

# ---- run config ----
parser.add_argument('--n_epochs', default=120, type=int)
parser.add_argument('--init_lr', default=0.025, type=float)
parser.add_argument('--lr_schedule_type', default='cosine', type=str)
# lr_schedule_param is filled in after parsing

parser.add_argument('--dataset', default='imagenet', type=str, choices=['imagenet'])
parser.add_argument('--train_batch_size', default=64, type=int)
parser.add_argument('--test_batch_size', default=250, type=int)
parser.add_argument('--valid_size', default=50000, type=int)

parser.add_argument('--opt_type', default='sgd', type=str, choices=['sgd'])
parser.add_argument('--momentum', default=0.9, type=float)  # goes into opt_param
parser.add_argument('--no_nesterov', action='store_true')  # goes into opt_param
parser.add_argument('--weight_decay', default=4e-5, type=float)
parser.add_argument('--label_smoothing', default=0.1, type=float)
parser.add_argument('--no_decay_keys', default=None, type=str, choices=[None, 'bn', 'bn#bias'])

parser.add_argument('--model_init', default='he_fout', type=str, choices=['he_fin', 'he_fout'])
parser.add_argument('--init_div_groups', action='store_true')
parser.add_argument('--validation_frequency', default=1, type=int)
parser.add_argument('--print_frequency', default=10, type=int)
parser.add_argument('--train_iters', default=-1, type=int)

parser.add_argument('--n_worker', default=32, type=int)
parser.add_argument('--resize_scale', default=0.08, type=float)
parser.add_argument(
    '--distort_color', default='normal', type=str, choices=['normal', 'strong', 'None']
)

# ---- net config ----
# previous default for --width_stages: '24,40,80,96,192,320'
parser.add_argument('--width_stages', default='32,56,112,128,256,432', type=str)
parser.add_argument('--n_cell_stages', default='4,4,4,4,4,1', type=str)
parser.add_argument('--stride_stages', default='2,2,2,1,2,1', type=str)
parser.add_argument('--width_mult', default=1.0, type=float)
parser.add_argument('--bn_momentum', default=0.1, type=float)
parser.add_argument('--bn_eps', default=1e-3, type=float)
parser.add_argument('--dropout', default=0, type=float)

# ---- architecture search: algorithm and warm-up ----
parser.add_argument('--arch_algo', default='grad', type=str, choices=['grad', 'rl'])
parser.add_argument('--warmup_epochs', default=0, type=int)

# ---- architecture search: shared hyper-parameters ----
parser.add_argument('--arch_init_type', default='normal', type=str, choices=['normal', 'uniform'])
parser.add_argument('--arch_init_ratio', default=1e-3, type=float)
parser.add_argument('--arch_opt_type', default='adam', type=str, choices=['adam'])
parser.add_argument('--arch_lr', default=1e-3, type=float)
parser.add_argument('--arch_adam_beta1', default=0, type=float)  # goes into arch_opt_param
parser.add_argument('--arch_adam_beta2', default=0.999, type=float)  # goes into arch_opt_param
parser.add_argument('--arch_adam_eps', default=1e-8, type=float)  # goes into arch_opt_param
parser.add_argument('--arch_weight_decay', default=0, type=float)
parser.add_argument(
    '--target_hardware', default='flops', type=str,
    choices=['mobile', 'cpu', 'gpu8', 'flops', None],
)

# ---- architecture search: gradient-based hyper-parameters ----
parser.add_argument('--grad_update_arch_param_every', default=5, type=int)
parser.add_argument('--grad_update_steps', default=1, type=int)
parser.add_argument(
    '--grad_binary_mode', default='full_v2', type=str, choices=['full_v2', 'full', 'two']
)
parser.add_argument('--grad_data_batch', default=None, type=int)
parser.add_argument(
    '--grad_reg_loss_type', default='mul#log', type=str, choices=['add#linear', 'mul#log']
)
parser.add_argument('--grad_reg_loss_lambda', default=1e-1, type=float)  # grad_reg_loss_params
parser.add_argument('--grad_reg_loss_alpha', default=0.2, type=float)  # grad_reg_loss_params
parser.add_argument('--grad_reg_loss_beta', default=0.3, type=float)  # grad_reg_loss_params

# ---- architecture search: RL hyper-parameters ----
parser.add_argument('--rl_batch_size', default=10, type=int)
parser.add_argument('--rl_update_per_epoch', action='store_true')
parser.add_argument('--rl_update_steps_per_epoch', default=300, type=int)
parser.add_argument('--rl_baseline_decay_weight', default=0.99, type=float)
parser.add_argument('--rl_tradeoff_ratio', default=0.1, type=float)

# ---- file-system locations ----
parser.add_argument(
    '--operations_path', default='../../SPOS/shrinking/shrunk_search_space.p', type=str,
    help='shrunk search space',
)
parser.add_argument(
    '--train_dir', default='/data/mzhang3/imagenet/train', type=str,
    help='path to training dataset',
)
# Kept as the final statement so the cell still echoes this action's repr.
parser.add_argument(
    '--test_dir', default='/data/mzhang3/imagenet/val', type=str,
    help='path to test dataset',
)


_StoreAction(option_strings=['--test_dir'], dest='test_dir', nargs=None, const=None, default='/data/mzhang3/imagenet/val', type=<class 'str'>, choices=None, help='path to test dataset', metavar=None)

In [2]:
# Parse an empty argv so that every option takes its declared default.
args = parser.parse_args([])

# Seed torch (CPU and all CUDA devices) and numpy from the same seed.
torch.manual_seed(args.manual_seed)
torch.cuda.manual_seed_all(args.manual_seed)
np.random.seed(args.manual_seed)

os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

# Iterations per epoch, derived from the training-set size and batch size.
# This replaces whatever value --train_iters carried.
args.train_iters = IMAGENET_TRAINING_SET_SIZE // args.train_batch_size
os.makedirs(args.path, exist_ok=True)

# Derived entries the run config expects alongside the parsed options.
args.lr_schedule_param = None
args.opt_param = dict(momentum=args.momentum, nesterov=not args.no_nesterov)
run_config = ImagenetRunConfig(**vars(args))

# Debug runs override the already-built run config with small settings.
if args.debug:
    run_config.train_batch_size = run_config.test_batch_size = run_config.valid_size = 256
    run_config.n_worker = 0

# Build the super-net: turn the comma-separated stage options into int lists,
# give every searchable layer its list of candidate operations, and
# instantiate SuperProxylessNASNets from them.

# Dash-joined copy of the raw width string, taken before args.width_stages is
# replaced by a list of ints; used further down in the warm-up checkpoint path.
width_stages_str = '-'.join(args.width_stages.split(','))
# build net from args
args.width_stages = [int(val) for val in args.width_stages.split(',')]
args.n_cell_stages = [int(val) for val in args.n_cell_stages.split(',')]
args.stride_stages = [int(val) for val in args.stride_stages.split(',')]
# Operation names in priority order. Only the first six are copied into the
# per-layer lists below, so 'Zero' is currently never offered to any layer.
candidates = [  '3x3_MBConv3',
                '3x3_MBConv6',
                '5x5_MBConv3',
                '5x5_MBConv6',
                '7x7_MBConv3', 
                '7x7_MBConv6',
                'Zero'
            ]

# load the shrunk search space ABS finds
#operations = pickle.load(open(args.operations_path, 'rb'))
#print('operations={}'.format(operations))

# One candidate list per layer: 21 lists, each holding candidates[0..5].
# NOTE(review): 21 equals the sum of the default --n_cell_stages
# (4+4+4+4+4+1) -- confirm whether it should follow args.n_cell_stages.
# NOTE: `i` and `op` are module-level names and stay bound after these loops
# finish (i == 20, op == 5).
args.conv_candidates = []
for i in range(21):
    args.conv_candidates.append([])
    for op in range(6):
        args.conv_candidates[i].append(candidates[op])
print('candidates={}'.format(args.conv_candidates))        

# bn_param is passed as a (momentum, eps) tuple; n_classes comes from the run config.
super_net = SuperProxylessNASNets(
    width_stages=args.width_stages, n_cell_stages=args.n_cell_stages, stride_stages=args.stride_stages,
    conv_candidates=args.conv_candidates, n_classes=run_config.n_classes, width_mult=args.width_mult,
    bn_param=(args.bn_momentum, args.bn_eps), dropout_rate=args.dropout
)

# Assemble the architecture-search configuration from the parsed options.

# Keyword options for the architecture optimizer (only Adam has any).
args.arch_opt_param = (
    {
        'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
        'eps': args.arch_adam_eps,
    }
    if args.arch_opt_type == 'adam'
    else None
)

# Reference cost for the selected hardware target, looked up by the width
# multiplier rendered with two decimals. No target means no reference value.
args.ref_value = (
    None
    if args.target_hardware is None
    else ref_values[args.target_hardware]['%.2f' % args.width_mult]
)

if args.arch_algo == 'rl':
    from nas_manager import RLArchSearchConfig
    arch_search_config = RLArchSearchConfig(**vars(args))
elif args.arch_algo == 'grad':
    from nas_manager import GradientArchSearchConfig
    # The regularisation-loss parameters depend on the selected loss form.
    if args.grad_reg_loss_type == 'mul#log':
        args.grad_reg_loss_params = {
            'alpha': args.grad_reg_loss_alpha,
            'beta': args.grad_reg_loss_beta,
        }
    elif args.grad_reg_loss_type == 'add#linear':
        args.grad_reg_loss_params = {'lambda': args.grad_reg_loss_lambda}
    else:
        args.grad_reg_loss_params = None
    arch_search_config = GradientArchSearchConfig(**vars(args))
else:
    raise NotImplementedError

# Echo both configurations, one tab-indented "key: value" line per entry.
for heading, config_owner in (
    ('Run config:', run_config),
    ('Architecture Search config:', arch_search_config),
):
    print(heading)
    for k, v in config_owner.config.items():
        print('\t%s: %s' % (k, v))

# The run manager ties together output path, super-net and both configs.
arch_search_run_manager = ArchSearchRunManager(args.path, super_net, run_config, arch_search_config)

# When resuming, try the manager's own checkpoint first. If that raises,
# fall back to a warm-up checkpoint under the user's home directory.
if args.resume:
    try:
        arch_search_run_manager.load_model()
    except Exception:
        from pathlib import Path
        home = str(Path.home())
        warmup_relative = 'Workspace/Exp/arch_search/%s_ProxylessNAS_%.2f_%s/warmup.pth.tar' % (
            run_config.dataset, args.width_mult, width_stages_str
        )
        warmup_path = os.path.join(home, warmup_relative)
        if not os.path.exists(warmup_path):
            print('fail to load models')
        else:
            print('load warmup weights')
            arch_search_run_manager.load_model(model_fname=warmup_path)

# Warm up only when the manager's `warmup` flag is set.
if arch_search_run_manager.warmup:
    arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs)

test_batch_size=250, valid_size=50000, val_iters=200
batch_size=250
dict_keys(['path', 'gpus', 'resume', 'debug', 'manual_seed', 'momentum', 'no_nesterov', 'width_stages', 'n_cell_stages', 'stride_stages', 'width_mult', 'bn_momentum', 'bn_eps', 'dropout', 'arch_algo', 'warmup_epochs', 'arch_init_type', 'arch_init_ratio', 'arch_opt_type', 'arch_lr', 'arch_adam_beta1', 'arch_adam_beta2', 'arch_adam_eps', 'arch_weight_decay', 'target_hardware', 'grad_update_arch_param_every', 'grad_update_steps', 'grad_binary_mode', 'grad_data_batch', 'grad_reg_loss_type', 'grad_reg_loss_lambda', 'grad_reg_loss_alpha', 'grad_reg_loss_beta', 'rl_batch_size', 'rl_update_per_epoch', 'rl_update_steps_per_epoch', 'rl_baseline_decay_weight', 'rl_tradeoff_ratio', 'operations_path'])
candidates=[['3x3_MBConv3', '3x3_MBConv6', '5x5_MBConv3', '5x5_MBConv6', '7x7_MBConv3', '7x7_MBConv6'], ['3x3_MBConv3', '3x3_MBConv6', '5x5_MBConv3', '5x5_MBConv6', '7x7_MBConv3', '7x7_MBConv6'], ['3x3_MBConv3', '3x3_MBConv6', '5x5_M

In [3]:
# Keep a handle on the `train_loader` exposed by the run manager's run config;
# the following cells pull batches from it.
data_loader = arch_search_run_manager.run_manager.run_config.train_loader

In [4]:
# `fix_net_weights` mirrors the --debug flag; nBatch is the number of
# iterations per epoch taken from the run config.
fix_net_weights = args.debug
nBatch = arch_search_run_manager.run_manager.run_config.train_iters

# With fixed weights, the loader is replaced by nBatch placeholder pairs.
if fix_net_weights:
    data_loader = [(0, 0)] * nBatch

# Count each parameter family by walking the iterable the network exposes.
arch_param_num = sum(1 for _ in arch_search_run_manager.net.architecture_parameters())
binary_gates_num = sum(1 for _ in arch_search_run_manager.net.binary_gates())
weight_param_num = sum(1 for _ in arch_search_run_manager.net.weight_parameters())
print(
    '#arch_params: %d\t#binary_gates: %d\t#weight_params: %d'
    % (arch_param_num, binary_gates_num, weight_param_num)
)

# Schedule object produced by the search config for an epoch of nBatch steps.
update_schedule = arch_search_run_manager.arch_search_config.get_update_schedule(nBatch)

#arch_params: 21	#binary_gates: 21	#weight_params: 1148


In [5]:
# Prologue of the first training epoch: banner, running meters, train mode.
epoch = 0
# strftime formats the current local time when no time tuple is supplied.
now = time.strftime('%Y-%m-%d %H:%M:%S')
print('now: {}'.format(now))
print('\n', '-' * 30, 'Train epoch: %d' % (epoch + 1), '-' * 30, '\n')

# Six independent running meters, created in this order.
batch_time, data_time, losses, top1, top5, entropy = (AverageMeter() for _ in range(6))

# switch to train mode
arch_search_run_manager.run_manager.net.train()

# Timestamp against which the data-loading time is later measured.
end = time.time()
# The per-batch loop is not run here; the following cells execute a single
# iteration by hand:
# for i, (images, labels) in enumerate(data_loader):

now: 2022-02-05 08:48:04

 ------------------------------ Train epoch: 1 ------------------------------ 



In [6]:
# Pull one (images, labels) pair from the loader and wrap both in Variables
# that do not track gradients.
images, labels = (
    Variable(tensor, requires_grad=False) for tensor in data_loader.next()
)


In [7]:
# Record the time elapsed since `end` was stamped (covers fetching the batch).
data_time.update(time.time() - end)

# Index of the batch being processed. This cell is one hand-unrolled iteration
# of `for i, (images, labels) in enumerate(data_loader)`, so the index is bound
# explicitly. Without this, `i` would silently resolve to whatever an earlier
# cell's loop left behind (20 after the candidate-building loop), and the
# learning rate would be computed for the wrong batch.
i = 0

# lr: let the run config set the optimizer's learning rate for this
# (epoch, batch) position within an epoch of nBatch iterations.
lr = arch_search_run_manager.run_manager.run_config.adjust_learning_rate(
    arch_search_run_manager.run_manager.optimizer, epoch, batch=i, nBatch=nBatch
)

# network entropy, averaged over the number of architecture parameters
net_entropy = arch_search_run_manager.net.entropy()
entropy.update(net_entropy.data.item() / arch_param_num, 1)

In [8]:
# Call the run manager's gradient_step_Synflow and unpack its two return values.
# NOTE(review): presumably performs one update of the architecture parameters
# and returns (loss, expected cost) -- confirm against nas_manager.
arch_loss, exp_value = arch_search_run_manager.gradient_step_Synflow()

In [9]:
# Bare expression: the notebook displays the repr of the optimizer held in
# the run manager's `arch_optimizer` attribute.
arch_search_run_manager.arch_optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [10]:
# Print every architecture parameter of the network, one per iteration.
for para in arch_search_run_manager.net.architecture_parameters():
    print(para)

Parameter containing:
tensor([-0.0019,  0.0006, -0.0036,  0.0011,  0.0009, -0.0016], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([-0.0008, -0.0016,  0.0019,  0.0003,  0.0004,  0.0019], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([ 0.0015, -0.0017,  0.0005, -0.0009, -0.0009,  0.0006], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([ 0.0014, -0.0004,  0.0002,  0.0013, -0.0018, -0.0003], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([-0.0013, -0.0004,  0.0016,  0.0007, -0.0010, -0.0020], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([ 0.0012, -0.0005, -0.0006,  0.0024, -0.0018,  0.0018], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([ 0.0002,  0.0006,  0.0003,  0.0026, -0.0006, -0.0013], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([-0.0008, -0.0004,  0.0002,  0.0016, -0.0023,  0.0007], device='cuda

In [11]:
# Print every binary gate of the network, one per iteration.
for gate in arch_search_run_manager.net.binary_gates():
    print(gate)

Parameter containing:
tensor([0., 0., 0., 0., 1., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 1., 0., 0., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 1., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 1., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 1.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 1., 0., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 1., 0., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 1., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 1., 0., 0., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 1., 0., 0., 0.], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 1.], device='cuda:0', requires

In [12]:
# For each architecture parameter, print the index of the entry with the
# largest |value * gradient|.
for para in arch_search_run_manager.net.architecture_parameters():
    print((para * para.grad).abs().argmax())

tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')
tensor(4, device='cuda:0')
tensor(2, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(4, device='cuda:0')
tensor(4, device='cuda:0')
tensor(3, device='cuda:0')
tensor(4, device='cuda:0')
tensor(4, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(5, device='cuda:0')
tensor(1, device='cuda:0')
tensor(0, device='cuda:0')
tensor(4, device='cuda:0')
tensor(3, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
