<a href="https://colab.research.google.com/github/asalcedo31/CSC2516_project/blob/master/clean_pruning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# set up


In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import torch.utils.model_zoo as model_zoo
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.nn.modules import Module
import torchvision.models.vgg as tv_vgg
import time
import numpy as np
import torchvision
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from torch.autograd import Variable
import time
import os
import copy
import math

In [0]:
transform = transforms.Compose(
    [transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=5,
                                         shuffle=False, num_workers=0)
# _,trainset = torch.utils.data.random_split(trainset,(49200,800))
# _,trainset = torch.utils.data.random_split(trainset,(49995,5))
# print(trainset.__len__())

# train_data, val_data = torch.utils.data.random_split(trainset,(int(0.8*len(trainset)),int(0.2*len(trainset))))
# print(train_data.__len__(),val_data.__len__() )

# trainloader = torch.utils.data.DataLoader(train_data, batch_size=5,
#                                           shuffle=True, num_workers=0)
# valloader = torch.utils.data.DataLoader(val_data, batch_size=5,
#                                           shuffle=True, num_workers=0)


Files already downloaded and verified
Files already downloaded and verified


In [0]:
# image_datasets= {'train': train_data,'val': val_data}
# dataloaders = {'train': trainloader, 'val': valloader}

# dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# class_names = image_datasets['train'].classes


In [0]:
def freeze_layers(model_ft, exclude=[]):
#   children = list(model_ft.named_children())
  for name,param in model_ft.named_parameters():   
    if(name not in  exclude):
      param.requires_grad = False

In [0]:
def countNonZeroWeights(model):
    nonzeros = 0
    weights = 0
    for name,param in model.named_parameters():
        if param is not None:
            nonzeros += torch.sum((param != 0).int()).data[0]
            weights += torch.sum(param).data[0]
    
    return nonzeros, weights

In [0]:
def set_threshold(model,prop=0.05):
  for child in model.named_children():    
    for child in child[1].named_children():
#       print(child)
      if type(child[1]) == MaskedLinear or type(child[1]) == MaskedConv: 
        child[1].set_threshold(prop=prop)
        print("layer {}  new threshold {:.4f}".format(child[0], child[1].threshold))        

In [0]:
def train_model_prune(model, dloaders, dataset_sizes, criterion, optimizer, scheduler,prop=0.05, num_epochs=25, device='cuda',pruning='threshold'):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    print(len(dloaders['train']))
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
                data_idx = 0
            else:
                model.eval()   # Set model to evaluate mode
                data_idx = 1

            running_loss = 0.0
            running_corrects = 0
            i=0
      
            # Iterate over data.
            for inputs, labels in dloaders[phase]:               
#                 print("batch {} phase {}".format(i, phase))
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    if pruning == 'L0':
                      loss = criterion(outputs, labels,model)
                    else:
                      loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                        #HACKS REPLACE!!!!!!
                        model.classifier[0].constrain_parameters()
                        exp_flops, exp_l0 = model.classifier[0].count_expected_flops_and_l0()
                i+=1
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                           
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            if epoch % 5 == 0 and phase == 'train': 
              if pruning == 'threshold':
                set_threshold(model,prop=prop)
              elif pruning == 'L0':
                print(exp_flops.item(), exp_l0.item())
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [0]:
class Masked:
  def make_mask(self, threshold,mask=None):
    if mask is None:
      print("new mask",device)
      self.mask = torch.ones(self.weight.size(), requires_grad=False).to(device)
    else:
      self.mask = mask      
    self.zeros = torch.zeros(self.weight.size(), requires_grad=False).to(device)
    self.threshold = threshold
  def set_threshold(self,prop=0.05):
    unique_weights = torch.unique(self.weight*self.mask)
    mask_size = self.mask.reshape(-1).size()[0]
#     mask_size = mask_size[0]*mask_size[1]
    mask_nonzero = torch.sum(self.mask.view([mask_size]))
    mask_total = mask_size
    print('nonzero proportion: {:.4f}'.format(mask_nonzero/mask_total))
    self.threshold = torch.max(torch.topk(torch.abs(unique_weights),int(prop*unique_weights.size()[0]),largest=False)[0])    
  def make_threshold_mask(self):
    self.mask = torch.where(torch.abs(self.weight) >= self.threshold,self.mask,self.zeros).to(device)
#     self.mask.requires_grad_(requires_grad=False)
  def mask_weight(self):
    self.weight = torch.nn.Parameter(self.weight*self.mask).to(device) 

    

# L0 pruning


In [0]:
class MaskedLinear(torch.nn.Linear,Masked):
  def __init__(self, in_features, out_features, bias=True, threshold=0.001,mask=None):
    super(MaskedLinear, self).__init__(in_features,out_features)
    self.make_mask(threshold,mask)
  def forward(self, input):
    self.make_threshold_mask()
    self.mask_weight()
#     print(self.mask[125:135,125:135])
#     print(self.weight[125:135,125:135])
    return F.linear(input, self.weight, self.bias)

class MaskedConv(torch.nn.Conv2d,Masked):
  def __init__(self, in_channels, out_channels, kernel_size, stride,
                 padding, dilation, groups, bias=True,threshold=0.0001):
    super(MaskedConv,self).__init__(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias)
    self.make_mask(threshold)    
  def forward(self, input):
    self.mask_weight()
    return F.conv2d(input, self.weight, self.bias, self.stride,
                    self.padding, self.dilation, self.groups)

limit_a, limit_b, epsilon = -.1, 1.1, 1e-6
device='cuda'
class LinearL0(Module):
  """Implementation of L0 regularization for the input units of a fully connected layer"""
  def __init__(self, in_features, out_features, bias=True, weight_decay=1., droprate_init=0.5, temperature=2./3.,
                 lamba=1., local_rep=False, **kwargs):
        """
        :param in_features: Input dimensionality
        :param out_features: Output dimensionality
        :param bias: Whether we use a bias
        :param weight_decay: Strength of the L2 penalty
        :param droprate_init: Dropout rate that the L0 gates will be initialized to
        :param temperature: Temperature of the concrete distribution
        :param lamba: Strength of the L0 penalty
        :param local_rep: Whether we will use a separate gate sample per element in the minibatch
        """
        super(LinearL0, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.prior_prec = weight_decay
        self.weights = torch.nn.Parameter(torch.Tensor(in_features, out_features).to(device))
        self.qz_loga = torch.Tensor(in_features).to(device)
#         self.qz_loga = torch.nn.Parameter(torch.Tensor(in_features).to(device))
        self.temperature = temperature
        self.droprate_init = droprate_init if droprate_init != 0. else 0.5
        self.lamba = lamba
        self.use_bias = False
        self.local_rep = local_rep
        if bias:
            self.bias = torch.nn.Parameter(torch.Tensor(out_features))
            self.use_bias = True
        self.floatTensor = torch.FloatTensor if not torch.cuda.is_available() else torch.cuda.FloatTensor
        self.reset_parameters()
        print(self)
  def reset_parameters(self):
      torch.nn.init.kaiming_normal(self.weights, mode='fan_out')

      self.qz_loga.data.normal_(math.log(1 - self.droprate_init) - math.log(self.droprate_init), 1e-2)

      if self.use_bias:
          self.bias.data.fill_(0)

  def constrain_parameters(self, **kwargs):
      self.qz_loga.data.clamp_(min=math.log(1e-2), max=math.log(1e2))

  def cdf_qz(self, x):
      """Implements the CDF of the 'stretched' concrete distribution"""
      xn = (x - limit_a) / (limit_b - limit_a)
      logits = math.log(xn) - math.log(1 - xn)
      return F.sigmoid(logits * self.temperature - self.qz_loga).clamp(min=epsilon, max=1 - epsilon).to(device)

  def quantile_concrete(self, x):
      """Implements the quantile, aka inverse CDF, of the 'stretched' concrete distribution"""
      y = F.sigmoid((torch.log(x) - torch.log(1 - x) + self.qz_loga) / self.temperature).to(device)
      return y * (limit_b - limit_a) + limit_a

  def _reg_w(self):
      """Expected L0 norm under the stochastic gates, takes into account and re-weights also a potential L2 penalty"""
      logpw_col = torch.sum(- (.5 * self.prior_prec * self.weights.pow(2)) - self.lamba, 1).to(device)
      logpw = torch.sum((1 - self.cdf_qz(0)) * logpw_col).to(device)
      logpb = 0 if not self.use_bias else - torch.sum(.5 * self.prior_prec * self.bias.pow(2)).to(device)
      return logpw + logpb

  def regularization(self):
      return self._reg_w()

  def count_expected_flops_and_l0(self):
      """Measures the expected floating point operations (FLOPs) and the expected L0 norm"""
      # dim_in multiplications and dim_in - 1 additions for each output neuron for the weights
      # + the bias addition for each neuron
      # total_flops = (2 * in_features - 1) * out_features + out_features
      ppos = torch.sum(1 - self.cdf_qz(0))
      expected_flops = (2 * ppos - 1) * self.out_features
      expected_l0 = ppos * self.out_features
      if self.use_bias:
          expected_flops += self.out_features
          expected_l0 += self.out_features
#       return expected_flops.data[0], expected_l0.data[0]
      return expected_flops, expected_l0

  def get_eps(self, size):
      """Uniform random numbers for the concrete distribution"""
      eps = self.floatTensor(size).uniform_(epsilon, 1-epsilon).to(device)
      eps = Variable(eps)
      return eps

  def sample_z(self, batch_size, sample=True):
      """Sample the hard-concrete gates for training and use a deterministic value for testing"""
      if sample:
          eps = self.get_eps(self.floatTensor(batch_size, self.in_features))
          z = self.quantile_concrete(eps)
          return F.hardtanh(z, min_val=0, max_val=1).to(device)
      else:  # mode
          pi = F.sigmoid(self.qz_loga).view(1, self.in_features).expand(batch_size, self.in_features).to(device)
          return F.hardtanh(pi * (limit_b - limit_a) + limit_a, min_val=0, max_val=1).to(device)

  def sample_weights(self):
      z = self.quantile_concrete(self.get_eps(self.floatTensor(self.in_features)))
      mask = F.hardtanh(z, min_val=0, max_val=1).to(device)
      return mask.view(self.in_features, 1) * self.weights

  def forward(self, input):
      if self.local_rep or not self.training:
          z = self.sample_z(input.size(0), sample=self.training)
          xin = input.mul(z)
          output = xin.mm(self.weights)
      else:
          weights = self.sample_weights()
          output = input.mm(weights)
      if self.use_bias:
          output.add_(self.bias)
      return output

  def __repr__(self):
      s = ('{name}({in_features} -> {out_features}, droprate_init={droprate_init}, '
           'lamba={lamba}, temperature={temperature}, weight_decay={prior_prec}, '
           'local_rep={local_rep}')
      if not self.use_bias:
          s += ', bias=False'
      s += ')'
      return s.format(name=self.__class__.__name__, **self.__dict__)


In [0]:
def mask_network(network,layers_to_mask, threshold=0.002, linear_masking=None,random_init=False, bias=True,masks=None):
  """"
  replaces linear layers with masked linear layers
  network is the initial sequential container
  layers is a list of layers to mask
  random init is a logical indicating whether to preserve the initial weights or to modify them
  """
  for name,layer in network.named_children():   
    if int(name) in layers_to_mask:
      layer_mask = None
      if masks is not None:
        if name in masks:
          layer_mask = masks.get(name)      
      if type(layer)== torch.nn.Linear and linear_masking is None:
        masked_layer = MaskedLinear(layer.in_features, layer.out_features, bias=bias,threshold=threshold,mask=layer_mask)
      elif type(layer)== torch.nn.Linear and linear_masking =='L0':
        masked_layer = LinearL0(layer.in_features, layer.out_features, bias=bias)
      elif type(layer)== torch.nn.Conv2d:
        masked_layer = MaskedConv(layer.in_channels, layer.out_channels, layer.kernel_size, layer.stride, layer.padding, layer.dilation,layer.groups, bias=bias, threshold=threshold)
      if random_init != True:
        masked_layer.weight = copy.deepcopy(layer.weight)
        masked_layer.bias = copy.deepcopy(layer.bias)
      network[int(name)] = masked_layer

In [0]:
class VGG_L0(tv_vgg.VGG):
  def regularization(self):
    regularization = 0.
    for layer in self.layers:
        regularization += - (1. / self.N) * layer.regularization()
    if torch.cuda.is_available():
        regularization = regularization.cuda()
    return regularization
  
  def regularize(self, N):
    regularization = 0.
    for child in self.named_children():    
      for child in child[1].named_children():
        if type(child[1]) == LinearL0:
          regularization += - (1. / N) * child[1].regularization()          
    if torch.cuda.is_available():
        regularization = regularization.cuda()
    return regularization
  
          

def vgg16_L0(pretrained=False, **kwargs):
  """VGG 16-layer model (configuration "D")
  Args:
      pretrained (bool): If True, returns a model pre-trained on ImageNet
  """
  if pretrained:
      kwargs['init_weights'] = False
  model = VGG_L0(tv_vgg.make_layers(tv_vgg.cfg['D']), **kwargs)
  if pretrained:
      model.load_state_dict(model_zoo.load_url(tv_vgg.model_urls['vgg16']))
  return model

  

def run_normal_training_with_L0_pruning(this_trainset):
  print(this_trainset.__len__())  
  _,mytrainset = torch.utils.data.random_split(this_trainset,(49200,800))
  # _,trainset = torch.utils.data.random_split(trainset,(49995,5))
  print(mytrainset.__len__())

  mytrain_data, myval_data = torch.utils.data.random_split(mytrainset,(int(0.8*len(mytrainset)),int(0.2*len(mytrainset))))
  print(mytrain_data.__len__(),myval_data.__len__() )

  mytrainloader = torch.utils.data.DataLoader(mytrain_data, batch_size=5,
                                            shuffle=True, num_workers=0)
  myvalloader = torch.utils.data.DataLoader(myval_data, batch_size=5,
                                            shuffle=True, num_workers=0)
  mydataloaders = {'train': mytrainloader, 'val': myvalloader}
  image_datasets= {'train': mytrain_data,'val': myval_data}
  dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}  

  model_ft = vgg16_L0(pretrained=True)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_ft = model_ft.to(device)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_ft = model_ft.to(device)

  freeze_layers(model_ft.features, exclude=[])
  mask_network(model_ft.classifier,[0],linear_masking="L0")

  set_threshold(model_ft)

  criterion = nn.CrossEntropyLoss()

  def loss_function(outputs,targets, model):
    loss = criterion(outputs,targets)
    loss += model.regularize(640)
    return loss


  # Observe that all parameters are being optimized
  optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
#   [print(p) for p in model_ft.parameters()]
#   return

  # Decay LR by a factor of 0.1 every 7 epochs
  exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

  model_ft = train_model_prune(model_ft, mydataloaders,dataset_sizes, loss_function, optimizer_ft, exp_lr_scheduler,
                       num_epochs=20, pruning="L0")
  
run_normal_training_with_L0_pruning(trainset)

50000
800
640 160




LinearL0(25088 -> 4096, droprate_init=0.5, lamba=1.0, temperature=0.6666666666666666, weight_decay=1.0, local_rep=False)
128
Epoch 0/19
----------
train Loss: 133570.0876 Acc: 0.0813
170955968.0 85482080.0
val Loss: 133567.1133 Acc: 0.1375

Epoch 1/19
----------
train Loss: 133567.0071 Acc: 0.1922
val Loss: 133566.5557 Acc: 0.3125

Epoch 2/19
----------
train Loss: 133566.5800 Acc: 0.3328
val Loss: 133566.4395 Acc: 0.3250

Epoch 3/19
----------
train Loss: 133566.4303 Acc: 0.3641
val Loss: 133566.2407 Acc: 0.4062

Epoch 4/19
----------
train Loss: 133566.2543 Acc: 0.4172
val Loss: 133566.2256 Acc: 0.3875

Epoch 5/19
----------
train Loss: 133566.2024 Acc: 0.4344
170955968.0 85482080.0
val Loss: 133566.2197 Acc: 0.3750

Epoch 6/19
----------
train Loss: 133566.0262 Acc: 0.4859
val Loss: 133566.3208 Acc: 0.3875

Epoch 7/19
----------
train Loss: 133565.9509 Acc: 0.4969
val Loss: 133566.0093 Acc: 0.4563

Epoch 8/19
----------
train Loss: 133565.8813 Acc: 0.5281
val Loss: 133566.0034 Acc: 

# run

In [0]:
def run_normal_training_with_pruning(this_trainset):
  _,mytrainset = torch.utils.data.random_split(this_trainset,(49200,800))

  mytrain_data, myval_data = torch.utils.data.random_split(mytrainset,(int(0.8*len(mytrainset)),int(0.2*len(mytrainset))))
  print(mytrain_data.__len__(),myval_data.__len__() )

  mytrainloader = torch.utils.data.DataLoader(mytrain_data, batch_size=5,
                                            shuffle=True, num_workers=0)
  myvalloader = torch.utils.data.DataLoader(myval_data, batch_size=5,
                                            shuffle=True, num_workers=0)
  mydataloaders = {'train': mytrainloader, 'val': myvalloader}
  image_datasets= {'train': mytrain_data,'val': myval_data}
  dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

  model_ft = models.vgg16(pretrained=True)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_ft = model_ft.to(device)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_ft = model_ft.to(device)

  freeze_layers(model_ft.features, exclude=[])
  mask_network(model_ft.classifier,[0],threshold=0.0001)
  set_threshold(model_ft)
  
  criterion = nn.CrossEntropyLoss()  
     
  # Observe that all parameters are being optimized
  optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

  # Decay LR by a factor of 0.1 every 7 epochs
  exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
  
  
  model_ft = train_model_prune(model_ft, mydataloaders,dataset_sizes, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=2)
  
# run_normal_training_with_pruning(trainset)

In [0]:
def train_meta_prune(model,trainset, outer_steps, num_samples=800, device='cuda'):
  mask_dict = {'0':torch.ones(model.classifier[0].weight.size()).to(device)}
  shuffled_train = torch.utils.data.RandomSampler(trainset)
  train_sample_list = list(torch.utils.data.BatchSampler(shuffled_train,num_samples,False))
  shuffled_train = [x for x in shuffled_train]
  for i in range(outer_steps):
#     train_sample = [trainset[j] for j in train_sample_list[i]] 
    
#     print(len(train_sample))
    _,train_sample = torch.utils.data.random_split(trainset,(49200,800))
    train_data, val_data = torch.utils.data.random_split(train_sample,(int(0.8*num_samples),int(0.2*num_samples)))

    trainloader = torch.utils.data.DataLoader(train_data, batch_size=5,
                                            shuffle=True, num_workers=0)
    valloader = torch.utils.data.DataLoader(val_data, batch_size=5,
                                            shuffle=True, num_workers=0)
    
    subdataloaders = {'train': trainloader, 'val': valloader}
    image_datasets= {'train': train_data,'val': val_data}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    
    model_ft = models.vgg16(pretrained=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    # freeze_layers(model_ft.features, exclude=['28.weight'])
    freeze_layers(model_ft.features)   
    mask_network(model_ft.classifier,[0],threshold=0.0001,masks=mask_dict)
    model_ft = train_model_prune(model_ft, subdataloaders, dataset_sizes, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=10, prop=0.1)
    mask_dict = {'0':model_ft.classifier[0].mask}
#     set_threshold(model_ft)

#     cost = meta_objective({'train':trainloader, 'val':valoader}, model, optimizer, inner_epochs)


# model_ft = models.vgg16(pretrained=True)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_ft = model_ft.to(device)

# train_meta_prune(model_ft,trainset,15)