In [1]:
import argparse
import sys
sys.path.append('../') # append root directory
import os
import argparse
from utils import getLogger
from models import ResNet18_wby16
from config import Config
from admm.warmup_scheduler import GradualWarmupScheduler
from admm.cross_entropy import CrossEntropyLossMaybeSmooth
from admm.utils import mixup_data, mixup_criterion
import admm
import torch.optim as optim
import torch
import torchvision
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn
import time
import torch.nn.functional as F
from numpy import linalg as LA
import admm
import numpy as np
class AverageMeter(object):
    """Running statistics for one scalar metric.

    Attributes:
        val: the most recently recorded value.
        sum: weighted sum of every recorded value.
        count: total weight (number of samples) recorded so far.
        avg: ``sum / count``, refreshed on every ``update``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the mean of ``n`` new samples."""
        self.sum += val * n
        self.count += n
        self.val = val
        self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
    """Return the top-k accuracy (in percent) for every k in ``topk``.

    ``output`` holds one row of class scores per sample and ``target`` the
    matching class indices.  The result is a list with one single-element
    float tensor per requested k, in the same order as ``topk``.
    """
    with torch.no_grad():
        n_samples = target.size(0)

        # Indices of the max(topk) best-scoring classes, transposed so that
        # row r holds every sample's r-th best guess.
        ranked = output.topk(max(topk), dim=1, largest=True, sorted=True).indices.t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        percent_per_hit = 100.0 / n_samples
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True) * percent_per_hit
            for k in topk
        ]
def validate(val_loader, criterion, config):
    """Evaluate ``config.model`` on ``val_loader`` and report top-1 accuracy.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches; ``len()``
            is called on it for the progress line.
        criterion: callable ``criterion(output, target)`` returning a scalar
            loss tensor (``.item()`` is called on the result).
        config: object exposing ``model`` (the network to evaluate) and
            ``print_freq`` (a progress line is printed every ``print_freq``
            batches).

    Returns:
        The sample-weighted mean top-1 accuracy in percent.  The value is
        accumulated from the tensors returned by ``accuracy``, so it is a
        tensor rather than a Python float.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    config.model.eval()

    # Send each batch to wherever the model lives instead of assuming CUDA,
    # so the same code also runs on CPU-only machines.
    first_param = next(config.model.parameters(), None)
    if first_param is None:
        model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        model_device = first_param.device

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.to(model_device)
            target = target.to(model_device)

            # compute output
            output = config.model(images)
            loss = criterion(output, target)

            # Only top-1 is reported, so only top-1 is computed; asking for
            # top-5 as well would fail for models with fewer than 5 classes.
            (acc1,) = accuracy(output, target, topk=(1,))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      .format(
                          i, len(val_loader), batch_time=batch_time, loss=losses,
                          top1=top1))

        print(' * Acc@1 {top1.avg:.3f} '
              .format(top1=top1))

    return top1.avg


class AttackPGD(torch.nn.Module):
    """Wrap a classifier and attack it with iterated signed-gradient steps.

    Every forward pass perturbs the input for ``num_steps`` iterations: each
    step moves along the sign of the loss gradient by ``step_size`` and then
    projects the result back into the element-wise ``epsilon`` ball around
    the original input and into the ``[0, 1]`` range.

    Args:
        basic_model: the network under attack; called as ``basic_model(x)``
            and expected to return class logits.
        config: object exposing ``random_start`` (truthy to start from a
            uniformly random point inside the epsilon ball), ``step_size``
            and ``epsilon`` (both divided by 255 here) and ``num_steps``.

    NOTE(review): the ``/ 255`` scaling and the clamp to ``[0, 1]`` presume
    inputs in the raw ``[0, 1]`` pixel range, while the test transform in
    this file applies ``transforms.Normalize`` before the data reaches the
    model — confirm the attack is meant to run in normalized space.
    """

    def __init__(self, basic_model, config):
        super().__init__()
        self.basic_model = basic_model
        self.rand = config.random_start
        self.step_size = config.step_size / 255
        self.epsilon = config.epsilon / 255
        self.num_steps = config.num_steps
        print(f"PGD: step_size:{self.step_size} | epsilon:{self.epsilon} | num_steps:{self.num_steps}")

    def forward(self, input, target):
        """Return ``(natural_logits, adversarial_logits, adversarial_input)``.

        Args:
            input: batch fed to ``basic_model``.
            target: class indices used as the cross-entropy target while
                crafting the perturbation.
        """
        x = input.detach()

        if self.rand:
            x = x + torch.zeros_like(x).uniform_(-self.epsilon, self.epsilon)

        for _ in range(self.num_steps):
            x.requires_grad_()
            # Callers may wrap this module in torch.no_grad(); gradients are
            # re-enabled locally because the attack needs d(loss)/d(x).
            with torch.enable_grad():
                logits = self.basic_model(x)
                # reduction='sum' replaces the deprecated size_average=False.
                loss = F.cross_entropy(logits, target, reduction='sum')
            grad = torch.autograd.grad(loss, [x])[0]
            x = x.detach() + self.step_size * torch.sign(grad.detach())
            # Project back into the epsilon ball around the clean input ...
            x = torch.min(torch.max(x, input - self.epsilon), input + self.epsilon)
            # ... and into the [0, 1] value range.
            x = torch.clamp(x, 0, 1)

        return self.basic_model(input), self.basic_model(x), x

def validate_adv(val_loader, criterion, config):
    """Evaluate natural and adversarial accuracy of ``config.model``.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches; ``len()``
            is called on it for the progress line.
        criterion: callable ``criterion(output, target)`` returning a scalar
            loss tensor (``.item()`` is called on the result).
        config: object exposing ``print_freq`` and ``model``.  The model is
            called as ``model(input, target)`` and must return a 3-tuple
            ``(natural_output, adversarial_output, perturbed_input)``, which
            is what ``AttackPGD.forward`` produces.

    Returns:
        The sample-weighted mean adversarial top-1 accuracy in percent.  The
        value is accumulated from the tensors returned by ``accuracy``, so
        it is a tensor rather than a Python float.
    """
    batch_time = AverageMeter()
    nat_losses = AverageMeter()
    adv_losses = AverageMeter()
    nat_top1 = AverageMeter()
    adv_top1 = AverageMeter()

    # switch to evaluate mode
    config.model.eval()

    # Send each batch to wherever the model lives instead of assuming CUDA,
    # so the same code also runs on CPU-only machines.
    first_param = next(config.model.parameters(), None)
    if first_param is None:
        model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        model_device = first_param.device

    # AttackPGD re-enables gradients internally for the attack itself, so the
    # outer no_grad only covers the final forward passes and the bookkeeping.
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.to(model_device)
            target = target.to(model_device)

            # compute output; the perturbed inputs themselves are not needed
            nat_output, adv_output, _ = config.model(images, target)
            nat_loss = criterion(nat_output, target)
            adv_loss = criterion(adv_output, target)

            # Only top-1 is reported, so only top-1 is computed; asking for
            # top-5 as well would fail for models with fewer than 5 classes.
            (nat_acc1,) = accuracy(nat_output, target, topk=(1,))
            (adv_acc1,) = accuracy(adv_output, target, topk=(1,))
            nat_losses.update(nat_loss.item(), images.size(0))
            adv_losses.update(adv_loss.item(), images.size(0))
            nat_top1.update(nat_acc1[0], images.size(0))
            adv_top1.update(adv_acc1[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Nat_Loss {nat_loss.val:.4f} ({nat_loss.avg:.4f})\t'
                      'Nat_Acc@1 {nat_top1.val:.3f} ({nat_top1.avg:.3f})\t'
                      'Adv_Loss {adv_loss.val:.4f} ({adv_loss.avg:.4f})\t'
                      'Adv_Acc@1 {adv_top1.val:.3f} ({adv_top1.avg:.3f})\t'
                      .format(
                          i, len(val_loader), batch_time=batch_time, nat_loss=nat_losses,
                          nat_top1=nat_top1, adv_loss=adv_losses, adv_top1=adv_top1))

        print(' * Nat_Acc@1 {nat_top1.avg:.3f} *Adv_Acc@1 {adv_top1.avg:.3f}'
              .format(nat_top1=nat_top1, adv_top1=adv_top1))

    return adv_top1.avg

# Pick the GPU when one is visible, otherwise fall back to the CPU.  This is
# decided first so that everything below can honour the fallback.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Plain cross-entropy (label smoothing disabled), placed on the chosen device
# rather than unconditionally on CUDA.
criterion = CrossEntropyLossMaybeSmooth(smooth_eps=0.0).to(device)

# NOTE(review): inputs are normalized here, while AttackPGD clamps its
# perturbed inputs to [0, 1] and scales epsilon by 1/255 — confirm the attack
# is meant to operate on normalized tensors.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# CIFAR-10 test split, evaluated in fixed order in batches of 100.
testset = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=16)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified


# Naturally trained resnet18 with w=16

In [3]:
# Build the run configuration.  An empty argument list is parsed, so the
# defaults declared below are always the values handed to Config.
parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
parser.add_argument('--config_file', type=str, default='./prune/config_w16to8.yaml', help ="config file")
parser.add_argument('--stage', type=str, default='admm', help ="select the pruning stage")
args = parser.parse_args("")
config = Config(args)

print('==> Building model..')
model = ResNet18_wby16(config.w)
config.model = model

if device == 'cuda':
    if config.gpu is None:
        # No specific GPU requested: let DataParallel use its defaults.
        config.model.cuda()
        config.model = torch.nn.DataParallel(model)
    else:
        # Pin the run to the single GPU named in the config.
        torch.cuda.set_device(config.gpu)
        config.model = torch.nn.DataParallel(model, device_ids=[config.gpu])
    cudnn.benchmark = True

if config.load_model:
    # unlike resume, load model does not care optimizer status or start_epoch
    # Every 'w' in the checkpoint path is substituted with the width value;
    # this happens for all stages.
    config.load_model = config.load_model.replace('w', str(config.w))
    if config.stage != 'admm':
        # Outside the admm stage, the checkpoint names also carry the
        # conv1.weight prune ratio as a suffix.
        prune_alpha = config._prune_ratios['conv1.weight']
        config.load_model = f"{config.load_model.split('.pt')[0]}_{prune_alpha}.pt"
        config.save_model = f"{config.save_model.split('.pt')[0]}_{prune_alpha}.pt"
    print('==> Loading from {}'.format(config.load_model))

    checkpoint = torch.load(config.load_model)
    config.model.load_state_dict(checkpoint)

# Wrap the network in the PGD attacker so validate_adv receives both the
# natural and the adversarial outputs for every batch.
print('==> Adversarial Accuracy..')
config.model = AttackPGD(config.model, config)
validate_adv(testloader, criterion, config)

==> Building model..
==> Loading from ./trainedMod/resnet18_16by16_pretrained.pt
==> Adversarial Accuracy..
PGD: step_size:0.00784313725490196 | epsilon:0.03137254901960784 | num_steps:10




Test: [0/100]	Time 4.048 (4.048)	Nat_Loss 0.3306 (0.3306)	Nat_Acc@1 95.000 (95.000)	Adv_Loss 18.2532 (18.2532)	Adv_Acc@1 17.000 (17.000)	
Test: [10/100]	Time 0.501 (0.821)	Nat_Loss 0.1819 (0.5375)	Nat_Acc@1 97.000 (93.455)	Adv_Loss 17.3895 (18.5189)	Adv_Acc@1 19.000 (15.273)	
Test: [20/100]	Time 0.498 (0.668)	Nat_Loss 0.7931 (0.5728)	Nat_Acc@1 93.000 (93.143)	Adv_Loss 18.2467 (18.6935)	Adv_Acc@1 20.000 (14.714)	
Test: [30/100]	Time 0.500 (0.613)	Nat_Loss 0.5674 (0.6170)	Nat_Acc@1 92.000 (93.097)	Adv_Loss 17.9494 (18.6740)	Adv_Acc@1 14.000 (14.516)	
Test: [40/100]	Time 0.501 (0.586)	Nat_Loss 1.0210 (0.6508)	Nat_Acc@1 90.000 (92.659)	Adv_Loss 21.2271 (18.8756)	Adv_Acc@1 17.000 (14.829)	
Test: [50/100]	Time 0.501 (0.569)	Nat_Loss 0.2226 (0.6480)	Nat_Acc@1 96.000 (92.843)	Adv_Loss 18.9179 (18.6796)	Adv_Acc@1 13.000 (15.137)	
Test: [60/100]	Time 0.501 (0.558)	Nat_Loss 0.4355 (0.6285)	Nat_Acc@1 95.000 (93.049)	Adv_Loss 18.6501 (18.6832)	Adv_Acc@1 18.000 (14.951)	
Test: [70/100]	Time 0.502 (0

tensor(15.0200, device='cuda:0')

# Naturally trained resnet18 with w=8

In [6]:
# Evaluate the w=8 network under PGD.  This cell reuses the `config` object
# created in an earlier cell (attack settings, print_freq) and overwrites its
# `model` and `load_model` fields.
w = 8
print('==> Building model..')
model = ResNet18_wby16(w)
config.model = model

# Move to the GPU and wrap in DataParallel before loading the weights.
# NOTE(review): presumably the checkpoint was saved from a DataParallel-wrapped
# model, which is why it is loaded after wrapping — confirm.
config.model.cuda()
config.model = torch.nn.DataParallel(model)
cudnn.benchmark = True
config.load_model = f'./trainedMod/resnet18_{w}by16_pretrained.pt'
print('==> Loading from {}'.format(config.load_model))
config.model.load_state_dict(torch.load(config.load_model))  
print('==> Adversarial Accuracy..')
# AttackPGD returns (natural, adversarial, perturbed input), the triple that
# validate_adv unpacks for every batch.
config.model =  AttackPGD(config.model, config)
validate_adv(testloader, criterion, config)

==> Building model..
==> Loading from ./trainedMod/resnet18_8by16_pretrained.pt
==> Adversarial Accuracy..
PGD: step_size:0.00784313725490196 | epsilon:0.03137254901960784 | num_steps:10
Test: [0/100]	Time 1.154 (1.154)	Nat_Loss 0.4664 (0.4664)	Nat_Acc@1 93.000 (93.000)	Adv_Loss 17.9320 (17.9320)	Adv_Acc@1 16.000 (16.000)	
Test: [10/100]	Time 0.207 (0.278)	Nat_Loss 0.7127 (0.4825)	Nat_Acc@1 95.000 (93.000)	Adv_Loss 18.4954 (20.3697)	Adv_Acc@1 14.000 (12.182)	
Test: [20/100]	Time 0.187 (0.235)	Nat_Loss 1.3370 (0.6512)	Nat_Acc@1 85.000 (91.524)	Adv_Loss 20.9146 (20.6679)	Adv_Acc@1 14.000 (11.190)	
Test: [30/100]	Time 0.187 (0.219)	Nat_Loss 0.5593 (0.6837)	Nat_Acc@1 90.000 (91.419)	Adv_Loss 19.6131 (20.5044)	Adv_Acc@1 12.000 (11.032)	
Test: [40/100]	Time 0.188 (0.212)	Nat_Loss 1.1814 (0.7059)	Nat_Acc@1 88.000 (91.463)	Adv_Loss 23.3852 (20.7484)	Adv_Acc@1 8.000 (11.024)	
Test: [50/100]	Time 0.189 (0.207)	Nat_Loss 0.4618 (0.6872)	Nat_Acc@1 92.000 (91.510)	Adv_Loss 21.4839 (20.6831)	Adv_Acc@

tensor(10.8000, device='cuda:0')

# Naturally trained resnet18 with w=16, pruned to w=8

In [7]:
# Evaluate the w=16 network loaded from the `retrained_0.5` checkpoint under
# PGD.  This cell reuses the `config` object created in an earlier cell and
# overwrites its `model` and `load_model` fields.
w = 16
print('==> Building model..')
model = ResNet18_wby16(w)
config.model = model

# Move to the GPU and wrap in DataParallel before loading the weights.
# NOTE(review): presumably the checkpoint was saved from a DataParallel-wrapped
# model, which is why it is loaded after wrapping — confirm.
config.model.cuda()
config.model = torch.nn.DataParallel(model)
cudnn.benchmark = True
config.load_model = f'./trainedMod/resnet18_{w}by16_retrained_0.5.pt'
print('==> Loading from {}'.format(config.load_model))
config.model.load_state_dict(torch.load(config.load_model))  
print('==> Adversarial Accuracy..')
# AttackPGD returns (natural, adversarial, perturbed input), the triple that
# validate_adv unpacks for every batch.
config.model =  AttackPGD(config.model, config)
validate_adv(testloader, criterion, config)

==> Building model..
==> Loading from ./trainedMod/resnet18_16by16_retrained_0.5.pt
==> Adversarial Accuracy..
PGD: step_size:0.00784313725490196 | epsilon:0.03137254901960784 | num_steps:10
Test: [0/100]	Time 1.283 (1.283)	Nat_Loss 0.1662 (0.1662)	Nat_Acc@1 97.000 (97.000)	Adv_Loss 18.9075 (18.9075)	Adv_Acc@1 17.000 (17.000)	
Test: [10/100]	Time 0.496 (0.567)	Nat_Loss 0.2888 (0.5162)	Nat_Acc@1 96.000 (93.455)	Adv_Loss 18.8089 (19.3458)	Adv_Acc@1 14.000 (13.091)	
Test: [20/100]	Time 0.494 (0.533)	Nat_Loss 0.4950 (0.5747)	Nat_Acc@1 92.000 (92.810)	Adv_Loss 19.1343 (19.6161)	Adv_Acc@1 15.000 (12.333)	
Test: [30/100]	Time 0.496 (0.521)	Nat_Loss 0.5773 (0.6116)	Nat_Acc@1 92.000 (92.548)	Adv_Loss 19.4043 (19.6198)	Adv_Acc@1 12.000 (12.032)	
Test: [40/100]	Time 0.496 (0.515)	Nat_Loss 0.9862 (0.6497)	Nat_Acc@1 88.000 (92.171)	Adv_Loss 21.9048 (19.7843)	Adv_Acc@1 9.000 (11.878)	
Test: [50/100]	Time 0.497 (0.511)	Nat_Loss 0.2701 (0.6550)	Nat_Acc@1 96.000 (92.353)	Adv_Loss 19.9243 (19.5861)	Adv_

tensor(11.9200, device='cuda:0')

In [8]:
def test_sparsity(config):
    """
    Print the sparsity of every involved layer and the overall compression rate.

    Args:
        config: object exposing ``sparsity_type``, ``_prune_ratios``,
            ``prune_ratios`` (only read for the "filter" type) and ``model``
            (``named_parameters()`` is iterated).

    Behaviour by ``config.sparsity_type``:
        "irregular": every parameter whose name does not contain 'bias' is
            inspected element-wise.
        "filter": only parameters named in ``config.prune_ratios`` are
            inspected; each is flattened to ``(shape[0], -1)`` and a row
            counts as pruned when its L2 norm is exactly zero.
        anything else: only the two header lines are printed.

    The compression rate is ``total weights / non-zero weights``.  It is
    reported as ``nan`` when no layer was inspected and ``inf`` when every
    inspected weight is zero, instead of dividing by zero.

    Returns:
        None; all results are printed.
    """

    def _compression_rate(zeros, nonzeros):
        # Guarded ratio: the accumulators stay plain Python ints when no
        # layer is visited, so an unguarded division would raise.
        total = zeros + nonzeros
        if total == 0:
            return float('nan')
        if nonzeros == 0:
            return float('inf')
        return total / nonzeros

    total_zeros = 0
    total_nonzeros = 0

    print('<===sparsity type is {}'.format(config.sparsity_type))
    print('<===layers to be pruned are \n{}'.format(config._prune_ratios))
    if config.sparsity_type == "irregular":
        for name, W in config.model.named_parameters():
            if 'bias' in name:
                continue
            W = W.cpu().detach().numpy()
            zeros = np.sum(W == 0)
            nonzeros = W.size - zeros
            total_zeros += zeros
            total_nonzeros += nonzeros
            print("sparsity at layer {} is {}".format(name, zeros / (zeros + nonzeros)))
        print('overal compression rate is {}'.format(_compression_rate(total_zeros, total_nonzeros)))
    elif config.sparsity_type == "filter":
        print('inside if')
        print(config.prune_ratios)
        for name, W in config.model.named_parameters():
            if name not in config.prune_ratios:
                continue
            W = W.cpu().detach().numpy()
            # One row per slice along the first axis of the parameter.
            W2d = W.reshape(W.shape[0], -1)
            row_l2_norm = LA.norm(W2d, 2, axis=1)
            zero_row = np.sum(row_l2_norm == 0)
            nonzero_row = row_l2_norm.size - zero_row
            zeros = np.sum(W == 0)
            total_zeros += zeros
            total_nonzeros += W.size - zeros
            print("filter sparsity of layer {} is {}".format(name, zero_row / (zero_row + nonzero_row)))
        # The message says "conv layers", but the loop covers whichever
        # parameters are named in config.prune_ratios.
        print('only consider conv layers, compression rate is {}'.format(_compression_rate(total_zeros, total_nonzeros)))

In [11]:
# Prepare the pruning state on the config, construct the ADMM helper, then
# print the sparsity report for config.model.
# NOTE(review): config.model is whatever the most recently executed cell left
# in place (an AttackPGD wrapper in the cells shown, judging by the
# 'basic_model.module...' names in the output) — confirm this is intended.
# NOTE(review): `ADMM` is not used again in the visible cells; presumably it
# is built for its side effects — confirm.
config.prepare_pruning() 
ADMM = admm.ADMM(config, device)
test_sparsity(config)

['', 'basic_model', 'basic_model.module', 'basic_model.module.conv1', 'basic_model.module.bn1', 'basic_model.module.layer1', 'basic_model.module.layer1.0', 'basic_model.module.layer1.0.conv1', 'basic_model.module.layer1.0.bn1', 'basic_model.module.layer1.0.conv2', 'basic_model.module.layer1.0.bn2', 'basic_model.module.layer1.0.shortcut', 'basic_model.module.layer1.1', 'basic_model.module.layer1.1.conv1', 'basic_model.module.layer1.1.bn1', 'basic_model.module.layer1.1.conv2', 'basic_model.module.layer1.1.bn2', 'basic_model.module.layer1.1.shortcut', 'basic_model.module.layer2', 'basic_model.module.layer2.0', 'basic_model.module.layer2.0.conv1', 'basic_model.module.layer2.0.bn1', 'basic_model.module.layer2.0.conv2', 'basic_model.module.layer2.0.bn2', 'basic_model.module.layer2.0.shortcut', 'basic_model.module.layer2.0.shortcut.0', 'basic_model.module.layer2.0.shortcut.1', 'basic_model.module.layer2.1', 'basic_model.module.layer2.1.conv1', 'basic_model.module.layer2.1.bn1', 'basic_model.m

# Naturally trained resnet18 with w=1

In [12]:
# Evaluate the w=1 network under PGD.  This cell reuses the `config` object
# created in an earlier cell (attack settings, print_freq) and overwrites its
# `model` and `load_model` fields.
w = 1
print('==> Building model..')
model = ResNet18_wby16(w)
config.model = model

# Move to the GPU and wrap in DataParallel before loading the weights.
# NOTE(review): presumably the checkpoint was saved from a DataParallel-wrapped
# model, which is why it is loaded after wrapping — confirm.
config.model.cuda()
config.model = torch.nn.DataParallel(model)
cudnn.benchmark = True
config.load_model = f'./trainedMod/resnet18_{w}by16_pretrained.pt'
print('==> Loading from {}'.format(config.load_model))
config.model.load_state_dict(torch.load(config.load_model))  
print('==> Adversarial Accuracy..')
# AttackPGD returns (natural, adversarial, perturbed input), the triple that
# validate_adv unpacks for every batch.
config.model =  AttackPGD(config.model, config)
validate_adv(testloader, criterion, config)

==> Building model..
==> Loading from ./trainedMod/resnet18_1by16_pretrained.pt
==> Adversarial Accuracy..
PGD: step_size:0.00784313725490196 | epsilon:0.03137254901960784 | num_steps:10
Test: [0/100]	Time 0.920 (0.920)	Nat_Loss 0.9712 (0.9712)	Nat_Acc@1 70.000 (70.000)	Adv_Loss 4.7230 (4.7230)	Adv_Acc@1 6.000 (6.000)	
Test: [10/100]	Time 0.137 (0.210)	Nat_Loss 0.6861 (0.8973)	Nat_Acc@1 78.000 (70.273)	Adv_Loss 4.3154 (4.8665)	Adv_Acc@1 7.000 (6.182)	
Test: [20/100]	Time 0.137 (0.175)	Nat_Loss 0.9934 (0.8936)	Nat_Acc@1 71.000 (70.333)	Adv_Loss 4.8932 (4.9534)	Adv_Acc@1 11.000 (6.524)	
Test: [30/100]	Time 0.137 (0.164)	Nat_Loss 0.8345 (0.9065)	Nat_Acc@1 75.000 (70.387)	Adv_Loss 5.3006 (4.9888)	Adv_Acc@1 5.000 (6.613)	
Test: [40/100]	Time 0.137 (0.158)	Nat_Loss 1.0701 (0.9153)	Nat_Acc@1 71.000 (70.073)	Adv_Loss 5.1748 (5.0502)	Adv_Acc@1 9.000 (6.780)	
Test: [50/100]	Time 0.139 (0.154)	Nat_Loss 0.9353 (0.9031)	Nat_Acc@1 71.000 (70.627)	Adv_Loss 5.4839 (5.0262)	Adv_Acc@1 5.000 (6.804)	
Tes

tensor(6.7500, device='cuda:0')

# Naturally trained resnet18 with w=16, pruned to w=1

In [18]:
# Rebuild the configuration from ./prune/config_w16to1.yaml.  An empty
# argument list is parsed, so the defaults declared below are always used.
parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
parser.add_argument('--config_file', type=str, default='./prune/config_w16to1.yaml', help ="config file")
parser.add_argument('--stage', type=str, default='admm', help ="select the pruning stage")
args = parser.parse_args("")
config = Config(args)

# Evaluate the w=16 network loaded from the `retrained_0.9375` checkpoint.
# The status line is printed once (it was duplicated by copy-paste before).
w = 16
print('==> Building model..')
model = ResNet18_wby16(w)
config.model = model

# Move to the GPU and wrap in DataParallel before loading the weights.
# NOTE(review): presumably the checkpoint was saved from a DataParallel-wrapped
# model, which is why it is loaded after wrapping — confirm.
config.model.cuda()
config.model = torch.nn.DataParallel(model)
cudnn.benchmark = True
config.load_model = f'./trainedMod/resnet18_{w}by16_retrained_0.9375.pt'
print('==> Loading from {}'.format(config.load_model))
config.model.load_state_dict(torch.load(config.load_model))
print('==> Adversarial Accuracy..')
# AttackPGD returns (natural, adversarial, perturbed input), the triple that
# validate_adv unpacks for every batch.
config.model = AttackPGD(config.model, config)
validate_adv(testloader, criterion, config)

==> Building model..
==> Building model..
==> Loading from ./trainedMod/resnet18_16by16_retrained_0.9375.pt
==> Adversarial Accuracy..
PGD: step_size:0.00784313725490196 | epsilon:0.03137254901960784 | num_steps:10
Test: [0/100]	Time 1.286 (1.286)	Nat_Loss 1.1525 (1.1525)	Nat_Acc@1 61.000 (61.000)	Adv_Loss 3.4195 (3.4195)	Adv_Acc@1 8.000 (8.000)	
Test: [10/100]	Time 0.488 (0.561)	Nat_Loss 1.2681 (1.2239)	Nat_Acc@1 52.000 (55.818)	Adv_Loss 3.3332 (3.6798)	Adv_Acc@1 9.000 (7.545)	
Test: [20/100]	Time 0.488 (0.527)	Nat_Loss 1.4683 (1.2567)	Nat_Acc@1 49.000 (55.095)	Adv_Loss 3.7568 (3.7610)	Adv_Acc@1 8.000 (7.381)	
Test: [30/100]	Time 0.488 (0.515)	Nat_Loss 1.0839 (1.2597)	Nat_Acc@1 62.000 (54.645)	Adv_Loss 3.7214 (3.7729)	Adv_Acc@1 5.000 (7.742)	
Test: [40/100]	Time 0.489 (0.508)	Nat_Loss 1.2460 (1.2645)	Nat_Acc@1 56.000 (55.000)	Adv_Loss 3.9271 (3.8231)	Adv_Acc@1 7.000 (7.585)	
Test: [50/100]	Time 0.490 (0.505)	Nat_Loss 1.3535 (1.2560)	Nat_Acc@1 53.000 (55.216)	Adv_Loss 3.9928 (3.8187)	A

tensor(7.1000, device='cuda:0')

In [19]:
# Prepare the pruning state on the config, construct the ADMM helper, then
# print the sparsity report for config.model.
# NOTE(review): config.model is whatever the most recently executed cell left
# in place (an AttackPGD wrapper in the cells shown, judging by the
# 'basic_model.module...' names in the output) — confirm this is intended.
# NOTE(review): `ADMM` is not used again in the visible cells; presumably it
# is built for its side effects — confirm.
config.prepare_pruning() 
ADMM = admm.ADMM(config, device)
test_sparsity(config)

['', 'basic_model', 'basic_model.module', 'basic_model.module.conv1', 'basic_model.module.bn1', 'basic_model.module.layer1', 'basic_model.module.layer1.0', 'basic_model.module.layer1.0.conv1', 'basic_model.module.layer1.0.bn1', 'basic_model.module.layer1.0.conv2', 'basic_model.module.layer1.0.bn2', 'basic_model.module.layer1.0.shortcut', 'basic_model.module.layer1.1', 'basic_model.module.layer1.1.conv1', 'basic_model.module.layer1.1.bn1', 'basic_model.module.layer1.1.conv2', 'basic_model.module.layer1.1.bn2', 'basic_model.module.layer1.1.shortcut', 'basic_model.module.layer2', 'basic_model.module.layer2.0', 'basic_model.module.layer2.0.conv1', 'basic_model.module.layer2.0.bn1', 'basic_model.module.layer2.0.conv2', 'basic_model.module.layer2.0.bn2', 'basic_model.module.layer2.0.shortcut', 'basic_model.module.layer2.0.shortcut.0', 'basic_model.module.layer2.0.shortcut.1', 'basic_model.module.layer2.1', 'basic_model.module.layer2.1.conv1', 'basic_model.module.layer2.1.bn1', 'basic_model.m