In [None]:
import torch.nn as nn
import torch
import torchvision
import torchvision.transforms as transforms
import os
import math
# Make CUDA kernel launches synchronous (debugging aid).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Preprocessing applied to every CIFAR-10 image: resize to 224x224, convert to
# a tensor, then normalise each channel with mean 0.5 and std 0.5.
# The pipeline is bound to `transform` (singular) so that the
# `torchvision.transforms` module alias imported above is not overwritten;
# overwriting it made this cell fail with AttributeError when run a second time.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Training split (shuffled) and test split (fixed order), both in batches of 64.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified



In [None]:
import torchvision.models as models
# Three separately constructed VGG-16 networks, built in this order:
#   vgg16            - the reference network,
#   compressed_vgg16 - later receives the linearised weights,
#   encoding_vgg16   - later receives the per-layer encoding tensors.
vgg16, compressed_vgg16, encoding_vgg16 = (models.vgg16() for _ in range(3))

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = 'Vgg16.pth'
path = F"/content/drive/My Drive/Colab Notebooks/{model_name}"
# path = F"/Downloads/Vgg16.pth"

# Read the checkpoint from disk once and remap its tensors onto `device`, so a
# checkpoint saved from a GPU session can still be loaded on a CPU-only runtime.
# load_state_dict copies the values into each model's own parameters, so the
# same dictionary can safely initialise both networks.
state_dict = torch.load(path, map_location=device)
vgg16.load_state_dict(state_dict)
compressed_vgg16.load_state_dict(state_dict)
del state_dict  # the models hold their own copies; release the extra one

vgg16.to(device)
compressed_vgg16.to(device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [None]:
# Inspect the kernel tensor of the first convolution.
first_conv_weight = vgg16.features[0].weight
dim0, dim1, dim2, dim3 = first_conv_weight.size()
print(first_conv_weight.size())
# Same data viewed as a 2-D matrix of (dim0 * dim2) rows by (dim1 * dim3)
# columns; for the printed size [64, 3, 3, 3] that is 192 x 9.
print(first_conv_weight.view(dim0 * dim2, dim1 * dim3))

torch.Size([64, 3, 3, 3])
tensor([[ 0.0469,  0.0367,  0.0944,  ..., -0.0233,  0.1195,  0.0220],
        [-0.0737, -0.0697,  0.0038,  ..., -0.0780, -0.0652,  0.0009],
        [-0.0079,  0.0420,  0.0043,  ..., -0.0429,  0.0675, -0.0482],
        ...,
        [ 0.0579,  0.2846,  0.2503,  ..., -0.0722,  0.1032,  0.2381],
        [-0.2223, -0.0271,  0.0845,  ..., -0.2082, -0.2243, -0.0729],
        [-0.1516,  0.0750,  0.0907,  ..., -0.1377, -0.1458, -0.0125]],
       device='cuda:0', grad_fn=<ViewBackward>)


In [None]:
# Linearization L(x)=f(a)+f'(a)(x-a)
def Linearization (model, p, verbose=True):
  """Replace the non-zero weights of a tensor by piecewise-linear approximations.

  The non-zero entries of ``model`` are sorted in ascending order and treated
  as samples f(0), f(1), ... of a function over their rank. Each value is
  replaced by L(x) = f(a) + f'(a) * (x - a), where ``a`` is the first rank of
  the current segment (its anchor), f'(a) is the finite-difference slope at
  the anchor and (x - a) is the distance in ranks from the anchor. A new
  segment starts right after any point whose own slope differs from the
  segment slope by more than ``p`` times the segment slope.

  Entries that are exactly zero are left out of the fit and stay zero.

  Args:
    model: weight tensor of any shape (a tensor, not a module). It is not
      modified.
    p: relative slope tolerance used to decide when a segment ends.
    verbose: when True (default) print the same diagnostics as before: a
      slice [2000:2020] of the approximated values, slopes and distances in
      sorted order, then the mean absolute error and the estimated error.

  Returns:
    A tuple ``(weights, interval, delta_x, all_fixed_points, all_derivatives)``:
      weights: tensor of shape (1, N) with the approximated values placed
        back at their original flat positions.
      interval: int64 CPU tensor of length N with the segment id of every
        position (-1 where the weight is exactly zero).
      delta_x: int64 tensor of length N with the distance of every position
        from its segment anchor (0 where the weight is exactly zero).
      all_fixed_points: anchor value f(a) of every segment.
      all_derivatives: anchor slope f'(a) of every segment.
    Apart from ``interval`` the tensors live on the device of ``model``.
  """
  # Detached flat copy: neither the caller's tensor nor its autograd graph is touched.
  flat = model.detach().clone().reshape(1, -1)
  target_device = flat.device
  total = flat.size(1)

  sorted_values, order = torch.sort(flat[0])
  keep = sorted_values != 0
  nonzero_value = sorted_values[keep]  # non-zero values, ascending
  nonzero_index = order[keep]          # their original flat positions
  count = nonzero_value.numel()

  # Finite-difference slope at every sorted point: one-sided at both ends,
  # central in the interior. Computed in one shot instead of per element.
  if count >= 2:
    slope_tensor = torch.empty_like(nonzero_value)
    slope_tensor[0] = nonzero_value[1] - nonzero_value[0]
    slope_tensor[1:-1] = 0.5 * (nonzero_value[2:] - nonzero_value[:-2])
    slope_tensor[-1] = nonzero_value[-1] - nonzero_value[-2]
    slope = slope_tensor.tolist()
  else:
    slope = [0.0] * count
  # Plain Python floats from here on: avoids one device round-trip per weight.
  values = nonzero_value.tolist()

  interval = []
  new_value = []
  delta_x = []  # x - a
  all_fixed_points = []
  all_derivatives = []
  approximation_error = 0.0  # true mean absolute error
  average_error = 0.0        # estimated error

  if count:
    derivative = slope[0]
    fixed_point = values[0]
    all_derivatives.append(derivative)    # f'(0)
    all_fixed_points.append(fixed_point)  # f(0)
    interval_index = 0
    distance = 0
    last = count - 1

    # Partition the sorted weights into segments.
    for j in range(count):
      approximated = fixed_point + distance * derivative
      interval.append(interval_index)
      new_value.append(approximated)
      delta_x.append(distance)
      approximation_error += abs(values[j] - approximated)
      average_error += distance * p * derivative

      if j < last and abs(derivative - slope[j]) > p * derivative:
        # Slope drifted too far: point j closes this segment and the next
        # point becomes the anchor of a new one.
        interval_index += 1
        derivative = slope[j + 1]
        fixed_point = values[j + 1]
        all_derivatives.append(derivative)
        all_fixed_points.append(fixed_point)
        distance = 0
      else:
        distance += 1

    approximation_error /= count
    average_error /= count

  # Convert the per-point lists (still in sorted order) to tensors.
  new_value = torch.tensor(new_value, dtype=flat.dtype, device=target_device)
  delta_sorted = torch.tensor(delta_x, dtype=torch.long, device=target_device)
  interval_sorted = torch.tensor(interval, dtype=torch.long)
  all_fixed_points = torch.tensor(all_fixed_points, device=target_device)
  all_derivatives = torch.tensor(all_derivatives, device=target_device)

  if verbose:
    print(new_value[2000:2020])
    print('slope:'+str(slope[2000:2020]))
    print('distance:'+str(delta_sorted[2000:2020]))
    print('approximation error:'+str(approximation_error))
    print('average error:'+str(average_error))

  # Scatter everything back to the original positions. Separate output
  # buffers are used because writing a tensor into itself through an index
  # is order-dependent, and only the non-zero positions are written.
  flat[0, nonzero_index] = new_value
  delta_out = torch.zeros(total, dtype=torch.long, device=target_device)
  delta_out[nonzero_index] = delta_sorted
  interval_out = torch.full((total,), -1, dtype=torch.long)
  interval_out[nonzero_index.cpu()] = interval_sorted
  return flat, interval_out, delta_out, all_fixed_points, all_derivatives

In [None]:
# Linearization L(x)=f(a)+f'(a)(x-a)
def Linearization2 (model, p, chunk_size=1 << 20):
  """Memory-bounded piecewise-linear approximation that returns only the weights.

  The non-zero entries of ``model`` are sorted in ascending order and each one
  is replaced by L(x) = f(a) + f'(a) * (x - a), where ``a`` is the first rank
  of the current segment, f'(a) the finite-difference slope at that rank and
  (x - a) the distance in ranks from it. A new segment starts right after any
  point whose own slope differs from the segment slope by more than ``p``
  times the segment slope. Entries that are exactly zero stay zero.

  No per-weight interval or distance bookkeeping is kept, and the sorted
  values are converted to Python numbers ``chunk_size`` at a time.

  The mean absolute error and the estimated error are printed, in that order.

  Args:
    model: weight tensor of any shape (a tensor, not a module). It is not
      modified.
    p: relative slope tolerance used to decide when a segment ends.
    chunk_size: number of sorted values processed per Python-level batch.

  Returns:
    Tensor of shape (1, N) holding the approximated values at their original
    flat positions, on the device of ``model``.
  """
  flat = model.detach().clone().reshape(1, -1)
  sorted_values, order = torch.sort(flat[0])
  keep = sorted_values != 0
  nonzero_value = sorted_values[keep]  # non-zero values, ascending
  nonzero_index = order[keep]          # their original flat positions
  count = nonzero_value.numel()

  approximation_error = 0.0
  average_error = 0.0
  if count == 0:
    # Nothing to approximate.
    print(approximation_error)
    print(average_error)
    return flat

  # Finite-difference slope at every sorted point: one-sided at both ends,
  # central in the interior.
  slope = torch.zeros_like(nonzero_value)
  if count >= 2:
    slope[0] = nonzero_value[1] - nonzero_value[0]
    slope[1:-1] = 0.5 * (nonzero_value[2:] - nonzero_value[:-2])
    slope[-1] = nonzero_value[-1] - nonzero_value[-2]

  approximated = torch.empty_like(nonzero_value)
  fixed_point = float(nonzero_value[0])
  derivative = float(slope[0])
  distance = 0
  last = count - 1
  chunk_size = max(1, int(chunk_size))

  for start in range(0, count, chunk_size):
    stop = min(start + chunk_size, count)
    # One extra element of look-ahead so that a segment opened at the very
    # end of this chunk can anchor on the first point of the next chunk.
    chunk_values = nonzero_value[start:stop + 1].tolist()
    chunk_slopes = slope[start:stop + 1].tolist()
    chunk_result = []
    for k in range(stop - start):
      estimate = fixed_point + distance * derivative
      chunk_result.append(estimate)
      approximation_error += abs(chunk_values[k] - estimate)
      average_error += distance * p * derivative
      if start + k < last and abs(derivative - chunk_slopes[k]) > p * derivative:
        # Slope drifted too far: re-anchor on the next point.
        derivative = chunk_slopes[k + 1]
        fixed_point = chunk_values[k + 1]
        distance = 0
      else:
        distance += 1
    approximated[start:stop] = torch.tensor(
        chunk_result, dtype=approximated.dtype, device=approximated.device)

  approximation_error = approximation_error / count
  average_error = average_error / count
  # Put the approximations back at the original positions of the weights.
  flat[0, nonzero_index] = approximated
  print(approximation_error)
  print(average_error)
  return flat

In [None]:
import datetime
# Indices inside vgg16.features of the convolution layers to linearise.
# NOTE(review): index 0 (the first convolution) is not in this list - confirm
# that leaving it uncompressed is intentional.
features = [2,5,7,10,12,14,17,19,21,24,26,28]
starttime = datetime.datetime.now()
for f in features:
  model, interval, distance, all_fixed_points, all_derivatives = Linearization(vgg16.features[f].weight,0.95)
  model = model.view(vgg16.features[f].weight.size())
  # Swap the dense weight of the compressed model for its linearised version.
  del compressed_vgg16.features[f].weight
  compressed_vgg16.features[f].register_parameter('weight', nn.Parameter(model))
  # The encoding model stores only what is needed to rebuild the weights:
  # per-segment anchors and slopes plus per-weight segment id and offset.
  del encoding_vgg16.features[f].weight
  encoding_vgg16.features[f].register_parameter('fixed_points',nn.Parameter(all_fixed_points))
  encoding_vgg16.features[f].register_parameter('derivatives',nn.Parameter(all_derivatives))
  # int32 instead of uint8: segment ids and offsets can exceed 255, and an
  # 8-bit cast would silently wrap them around.
  encoding_vgg16.features[f].register_parameter('interval',nn.Parameter(interval.type(torch.int32),False))
  encoding_vgg16.features[f].register_parameter('distance',nn.Parameter(distance.type(torch.int32),False))

endtime = datetime.datetime.now()
# Report the elapsed time as a number of seconds so the unit label is accurate.
print(str((endtime - starttime).total_seconds())+' seconds')

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  """


tensor([-0.0998, -0.0998, -0.0997, -0.0997, -0.0997, -0.0997, -0.0996, -0.0996,
        -0.0996, -0.0996, -0.1010, -0.1010, -0.1009, -0.1009, -0.1009, -0.1008,
        -0.1008, -0.1008, -0.1007, -0.1007])
slope:[1.920759677886963e-05, 1.5810132026672363e-05, 4.503875970840454e-06, 2.5369226932525635e-06, 4.462897777557373e-06, 5.122274160385132e-06, 5.5655837059021e-06, 6.92903995513916e-06, 3.866851329803467e-06, 4.6193599700927734e-07, 3.855675458908081e-06, 3.4052878618240356e-05, 3.836676478385925e-05, 1.6003847122192383e-05, 1.4152377843856812e-05, 1.6279518604278564e-05, 1.944601535797119e-05, 1.6383826732635498e-05, 9.961426258087158e-06, 7.3909759521484375e-06]
distance:tensor([140, 141, 142, 143, 144, 145, 146, 147, 148, 149,   0,   1,   0,   1,
          2,   3,   4,   5,   6,   7])
approximation error:0.0003930379839874752
average error:0.0007317308072047937
tensor([-0.0876, -0.0878, -0.0878, -0.0878, -0.0878, -0.0877, -0.0877, -0.0877,
        -0.0877, -0.0877, -0.0877, -0.

In [None]:
import datetime
# Indices inside vgg16.classifier of the fully connected layers to linearise.
classifier=[3,6]
starttime = datetime.datetime.now()
for c in classifier:
  model, interval, distance, all_fixed_points, all_derivatives = Linearization(vgg16.classifier[c].weight,0.95)
  model = model.view(vgg16.classifier[c].weight.size())
  # Swap the dense weight of the compressed model for its linearised version.
  del compressed_vgg16.classifier[c].weight
  compressed_vgg16.classifier[c].register_parameter('weight', nn.Parameter(model))
  # The encoding model stores only what is needed to rebuild the weights:
  # per-segment anchors and slopes plus per-weight segment id and offset.
  del encoding_vgg16.classifier[c].weight
  encoding_vgg16.classifier[c].register_parameter('fixed_points',nn.Parameter(all_fixed_points))
  encoding_vgg16.classifier[c].register_parameter('derivatives',nn.Parameter(all_derivatives))
  # int32 instead of uint8: segment ids and offsets can exceed 255, and an
  # 8-bit cast would silently wrap them around.
  encoding_vgg16.classifier[c].register_parameter('interval',nn.Parameter(interval.type(torch.int32),False))
  encoding_vgg16.classifier[c].register_parameter('distance',nn.Parameter(distance.type(torch.int32),False))
endtime = datetime.datetime.now()
# Report the elapsed time as a number of seconds so the unit label is accurate.
print(str((endtime - starttime).total_seconds())+' seconds')

tensor([-0.0414, -0.0414, -0.0414, -0.0414, -0.0414, -0.0414, -0.0414, -0.0414,
        -0.0414, -0.0414, -0.0414, -0.0413, -0.0413, -0.0413, -0.0413, -0.0413,
        -0.0413, -0.0413, -0.0413, -0.0413])
slope:[1.860782504081726e-06, 1.780688762664795e-06, 8.847564458847046e-07, 7.413327693939209e-07, 1.259148120880127e-06, 1.0170042514801025e-06, 4.824250936508179e-07, 5.6438148021698e-07, 1.0058283805847168e-06, 1.1101365089416504e-06, 3.632158041000366e-07, 1.2777745723724365e-06, 1.1492520570755005e-06, 1.1529773473739624e-06, 1.642853021621704e-06, 5.885958671569824e-07, 2.1979212760925293e-06, 3.602355718612671e-06, 2.034008502960205e-06, 1.1771917343139648e-06]
distance:tensor([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        29, 30])
approximation error:2.833114195639464e-07
average error:5.273976124620357e-07
tensor([-0.0426, -0.0426, -0.0426, -0.0426, -0.0426, -0.0426, -0.0425, -0.0425,
        -0.0425, -0.0425, -0.0425, -0.0425, -0.0425, -0.04

In [None]:
import datetime
# Time Linearization2 on the first fully connected layer (classifier[0]).
starttime = datetime.datetime.now()
c = 0
model = Linearization2(vgg16.classifier[c].weight,0.95)
# Restore the layer's original weight shape.
model = model.view(vgg16.classifier[c].weight.size())
# del encoding_vgg16.classifier[c].weight
# compressed_vgg16.classifier[c].register_parameter('weight', nn.Parameter(model))
endtime = datetime.datetime.now()
# Report the elapsed time as a number of seconds so the unit label is accurate.
print(str((endtime - starttime).total_seconds())+' seconds')

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  """


In [None]:
import torch.optim as optim
# Loss function for training (cross-entropy).
# Alternative that was tried: nn.MSELoss()
criterion = nn.CrossEntropyLoss()

# Training schedule.
epochs = 10
learning_rate = 0.01

# SGD with momentum over every parameter of the reference network.
optimizer = optim.SGD(
    vgg16.parameters(),
    lr=learning_rate,
    momentum=0.9,
)

In [None]:
# Fine-tune vgg16 on the training set, dividing the learning rate by 10 after
# every epoch.
vgg16.train()  # make sure layers such as Dropout are in training mode
for epoch in range(epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = vgg16(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        loss_value = loss.item()
        print('epoch: %d iteration: %d loss: %.3f' % (epoch + 1, i + 1, loss_value))
        # print statistics
        running_loss += loss_value
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    learning_rate = learning_rate / 10
    # The optimiser keeps its own copy of the learning rate; updating the
    # Python variable alone has no effect, so push the new value into every
    # parameter group.
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
print('Finished Training')

epoch: 1 iteration: 1 loss: 0.009
epoch: 1 iteration: 2 loss: 0.003
epoch: 1 iteration: 3 loss: 0.008
epoch: 1 iteration: 4 loss: 0.002
epoch: 1 iteration: 5 loss: 0.050
epoch: 1 iteration: 6 loss: 0.006
epoch: 1 iteration: 7 loss: 0.006
epoch: 1 iteration: 8 loss: 0.002
epoch: 1 iteration: 9 loss: 0.008
epoch: 1 iteration: 10 loss: 0.042
epoch: 1 iteration: 11 loss: 0.045
epoch: 1 iteration: 12 loss: 0.045
epoch: 1 iteration: 13 loss: 0.004
epoch: 1 iteration: 14 loss: 0.005
epoch: 1 iteration: 15 loss: 0.031
epoch: 1 iteration: 16 loss: 0.005
epoch: 1 iteration: 17 loss: 0.086
epoch: 1 iteration: 18 loss: 0.065
epoch: 1 iteration: 19 loss: 0.021
epoch: 1 iteration: 20 loss: 0.176
epoch: 1 iteration: 21 loss: 0.003
epoch: 1 iteration: 22 loss: 0.001
epoch: 1 iteration: 23 loss: 0.070
epoch: 1 iteration: 24 loss: 0.022
epoch: 1 iteration: 25 loss: 0.013
epoch: 1 iteration: 26 loss: 0.198
epoch: 1 iteration: 27 loss: 0.033
epoch: 1 iteration: 28 loss: 0.006
epoch: 1 iteration: 29 loss: 

KeyboardInterrupt: ignored

In [None]:
import time
# Top-1 accuracy of the compressed network on the test set.
correct = 0
total = 0
# Switch to inference mode so that Dropout is disabled while measuring
# accuracy. The model stays in eval mode after this cell.
compressed_vgg16.eval()
with torch.no_grad():
    for data in testloader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        # output
        outputs = compressed_vgg16(images)
        # outputs = vgg16(images)
        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %.2f %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 80.65 %


In [None]:
# One line per parameter tensor of vgg16: "<name>:<number of elements>".
for name, para in vgg16.named_parameters():
  element_count = para.numel()
  print(f"{name}:{element_count}")

features.0.weight:1728
features.0.bias:64
features.2.weight:36864
features.2.bias:64
features.5.weight:73728
features.5.bias:128
features.7.weight:147456
features.7.bias:128
features.10.weight:294912
features.10.bias:256
features.12.weight:589824
features.12.bias:256
features.14.weight:589824
features.14.bias:256
features.17.weight:1179648
features.17.bias:512
features.19.weight:2359296
features.19.bias:512
features.21.weight:2359296
features.21.bias:512
features.24.weight:2359296
features.24.bias:512
features.26.weight:2359296
features.26.bias:512
features.28.weight:2359296
features.28.bias:512
classifier.0.weight:102760448
classifier.0.bias:4096
classifier.3.weight:16777216
classifier.3.bias:4096
classifier.6.weight:40960
classifier.6.bias:10


In [None]:
# Write the current weights of vgg16 to a checkpoint file in the working directory.
PATH = './Vgg16.pth'
weights_to_save = vgg16.state_dict()
torch.save(weights_to_save, PATH)