In [None]:
import torch.nn as nn
import torch
import torchvision
import torchvision.transforms as transforms
import os
import math
# Ask CUDA to run kernel launches synchronously (debugging aid).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Preprocessing shared by the train and test splits: resize to 224x224, convert
# to a tensor, then normalise every channel with mean 0.5 and std 0.5.
# The pipeline gets its own name so that the imported `transforms` module is not
# overwritten; rebinding `transforms` made this cell fail when run a second time.
cifar_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# CIFAR-10 is downloaded into ./data on first use.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=cifar_transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=cifar_transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified



In [None]:
import torchvision.models as models
# Three independently constructed ResNet-152 instances, created in this order:
# the reference model, the copy whose weights get compressed, and the container
# that will hold the encoding.
Resnet152, compressed_Resnet152, encoding_Resnet152 = (
    models.resnet152() for _ in range(3)
)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = 'Resnet152.pth'
path = F"/content/drive/My Drive/Colab Notebooks/{model_name}"

# Read the checkpoint from disk once. `map_location` remaps the stored tensors to
# the selected device, so a checkpoint saved from a GPU session still loads when
# the notebook falls back to the CPU.
checkpoint_state = torch.load(path, map_location=device)
Resnet152.load_state_dict(checkpoint_state)
compressed_Resnet152.load_state_dict(checkpoint_state)
# The encoding container receives the checkpoint too: the entries that are not
# replaced by the compression cells (BatchNorm tensors, biases) would otherwise
# be saved with their freshly initialised values.
encoding_Resnet152.load_state_dict(checkpoint_state)

Resnet152.to(device)
compressed_Resnet152.to(device)
encoding_Resnet152.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# Gather every convolutional and fully-connected layer, in module traversal
# order, from the model being compressed ...
conv_layers = [
    module for module in compressed_Resnet152.modules()
    if isinstance(module, torch.nn.Conv2d)
]
fc_layers = [
    module for module in compressed_Resnet152.modules()
    if isinstance(module, torch.nn.Linear)
]

# ... and the layers at the same positions in the model that stores the encoding.
encoding_conv_layers = [
    module for module in encoding_Resnet152.modules()
    if isinstance(module, torch.nn.Conv2d)
]
encoding_fc_layers = [
    module for module in encoding_Resnet152.modules()
    if isinstance(module, torch.nn.Linear)
]

In [None]:
# Linearization L(x)=f(a)+f'(a)(x-a)
def Linearization(model, p, verbose=True):
  """Approximate the non-zero entries of a weight tensor by piecewise-linear segments.

  The non-zero weights are sorted in ascending order and treated as samples
  f(0), f(1), ... of a curve.  A finite-difference slope is computed at every
  sample (one-sided at the two ends, central in the interior).  The samples are
  then walked from left to right: each sample is replaced by
  ``fixed_point + distance * derivative`` of the current segment, and a new
  segment (anchored at the next sample, using the next sample's slope) is
  started whenever ``abs(derivative - slope[j]) > p * derivative``.

  Args:
    model: weight tensor (or ``nn.Parameter``) of any shape.  It is not modified.
    p: relative slope tolerance used in the segment-break test above.
    verbose: when True (default) print the same diagnostics as before
      (a slice of the approximations, slopes and distances, and both errors).

  Returns:
    A tuple ``(model, interval, delta_x, all_fixed_points, all_derivatives)``:
      * model: copy of the input flattened to shape ``(1, N)``, with every
        non-zero weight replaced by its approximation, in original order.
        Exactly-zero weights are left at zero.
      * interval: int64 tensor of length N giving the segment id of each weight,
        in original order; ``-1`` marks exactly-zero weights, which are not encoded.
      * delta_x: int64 tensor of length N giving the offset (x - a) of each
        weight inside its segment, in original order; 0 for zero weights.
      * all_fixed_points: anchor value f(a) of every segment.
      * all_derivatives: slope f'(a) of every segment.
    All returned tensors live on the device of ``model``.
  """
  # Work on a detached, contiguous copy so neither the caller's tensor nor the
  # autograd graph is touched.
  flat = model.detach().reshape(1, -1).clone()
  target_device = flat.device
  total = flat.size(1)

  sorted_values, sort_index = torch.sort(flat[0])
  keep = sorted_values != 0
  nonzero_value = sorted_values[keep]   # sorted non-zero values
  nonzero_index = sort_index[keep]      # their positions in the flattened input

  # One device-to-host transfer; the sequential scan below then runs on plain
  # Python floats instead of synchronising with the device for every element.
  values = nonzero_value.tolist()
  count = len(values)

  # Evaluate slopes at every point.
  if count >= 2:
    slope = [values[1] - values[0]]                      # left end-point
    slope.extend(0.5 * (values[i + 1] - values[i - 1])   # interior points
                 for i in range(1, count - 1))
    slope.append(values[-1] - values[-2])                # right end-point
  else:
    # With zero or one sample there is no neighbour to difference against.
    slope = [0.0] * count

  interval = []
  new_value = []
  delta_x = []  # x-a
  all_fixed_points = []
  all_derivatives = []
  approximation_error = 0.0  # True error
  average_error = 0.0        # Estimated error

  if count:
    derivative = slope[0]
    fixed_point = values[0]
    all_derivatives.append(derivative)    # f'(0)
    all_fixed_points.append(fixed_point)  # f(0)
    interval_index = 0
    distance = 0

    # Partition weights to intervals.
    for j in range(count):
      # Linearization to approximate values on the same interval; distance is (x-a).
      approximation = fixed_point + distance * derivative
      interval.append(interval_index)
      new_value.append(approximation)
      delta_x.append(distance)
      approximation_error += abs(values[j] - approximation)
      average_error += distance * p * derivative

      if j + 1 < count and abs(derivative - slope[j]) > p * derivative:
        # Slope drifted too far: sample j closes this segment and the next
        # sample becomes the anchor of a new one.
        interval_index += 1
        derivative = slope[j + 1]    # Assign the next slope as f'(a)
        fixed_point = values[j + 1]  # Assign the next point as f(a)
        all_derivatives.append(derivative)
        all_fixed_points.append(fixed_point)
        distance = 0
      else:
        distance += 1

    approximation_error /= count
    average_error /= count

  # Convert lists to tensors on the same device as the weights.
  new_value = torch.tensor(new_value, dtype=flat.dtype, device=target_device)
  delta_sorted = torch.tensor(delta_x, dtype=torch.long, device=target_device)
  interval_sorted = torch.tensor(interval, dtype=torch.long, device=target_device)
  all_fixed_points = torch.tensor(all_fixed_points, dtype=flat.dtype, device=target_device)
  all_derivatives = torch.tensor(all_derivatives, dtype=flat.dtype, device=target_device)

  if verbose:
    print(new_value[2000:2020])
    print('slope:'+str(slope[2000:2020]))
    print('distance:'+str(delta_sorted[2000:2020]))
    print('approximation error:'+str(approximation_error))
    print('average error:'+str(average_error))

  # Undo the sort.  Only the positions of non-zero weights are written, and each
  # result goes into a fresh tensor rather than being assigned into itself
  # through an index.
  flat[0, nonzero_index] = new_value
  delta_out = torch.zeros(total, dtype=torch.long, device=target_device)
  delta_out[nonzero_index] = delta_sorted
  interval_out = torch.full((total,), -1, dtype=torch.long, device=target_device)
  interval_out[nonzero_index] = interval_sorted
  return flat, interval_out, delta_out, all_fixed_points, all_derivatives

In [None]:
# compress convolutional layers
import datetime
starttime = datetime.datetime.now()

# Integer dtypes tried from narrowest to widest when storing the per-weight codes.
# A fixed uint8 cast silently wraps any value above 255, which corrupts the
# interval ids of layers that are split into more than 256 segments.
code_dtypes = (torch.uint8, torch.int16, torch.int32, torch.int64)

assert len(conv_layers) == len(encoding_conv_layers), "layer lists must pair up one-to-one"
for conv_layer, encoding_layer in zip(conv_layers, encoding_conv_layers):
  model, interval, distance, all_fixed_points, all_derivatives = Linearization(conv_layer.weight, 0.92)
  model = model.view(conv_layer.weight.size())

  # Swap the dense weight of the compressed model for its approximation ...
  del conv_layer.weight
  conv_layer.register_parameter('weight', nn.Parameter(model))

  # ... and replace the weight of the encoding model by the segment description.
  del encoding_layer.weight
  encoding_layer.register_parameter('fixed_points', nn.Parameter(all_fixed_points))
  encoding_layer.register_parameter('derivatives', nn.Parameter(all_derivatives))
  for code_name, codes in (('interval', interval), ('distance', distance)):
    lowest = int(codes.min()) if codes.numel() else 0
    highest = int(codes.max()) if codes.numel() else 0
    # Narrowest dtype whose range covers every code; uint8 whenever it suffices.
    code_dtype = next(dt for dt in code_dtypes
                      if torch.iinfo(dt).min <= lowest and highest <= torch.iinfo(dt).max)
    # Integer codes are stored as non-trainable parameters (requires_grad=False).
    encoding_layer.register_parameter(code_name, nn.Parameter(codes.to(code_dtype), False))

endtime = datetime.datetime.now()
print(str(endtime - starttime)+'seconds')

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  """


tensor([-0.4373, -0.4370, -0.4366, -0.4363, -0.4360, -0.4357, -0.4362, -0.4357,
        -0.4352, -0.4347, -0.4342, -0.4337, -0.4332, -0.4326, -0.4321, -0.4316,
        -0.4311, -0.4306, -0.4301, -0.4296], device='cuda:0')
slope:[0.00011044740676879883, 0.00013151764869689941, 4.531443119049072e-05, 6.355345249176025e-05, 0.00047966837882995605, 0.000707775354385376, 0.0005022138357162476, 0.00021335482597351074, 4.437565803527832e-05, 7.328391075134277e-05, 7.180869579315186e-05, 5.431473255157471e-05, 0.000157088041305542, 0.00034587085247039795, 0.000509411096572876, 0.00032539665699005127, 9.503960609436035e-05, 0.00026462972164154053, 0.00034894049167633057, 0.0004679560661315918]
distance:tensor([ 8,  9, 10, 11, 12, 13,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,
        12, 13], device='cuda:0')
approximation error:0.021287658029620685
average error:0.04400931560129901
tensor([-0.0047, -0.0046, -0.0044, -0.0043, -0.0041, -0.0043, -0.0039, -0.0035,
        -0.0030, -0.0026, -0

In [None]:
# compress fully-connected layers
import datetime
starttime = datetime.datetime.now()

# Integer dtypes tried from narrowest to widest when storing the per-weight codes.
# A fixed uint8 cast silently wraps any value above 255, which corrupts the
# interval ids of layers that are split into more than 256 segments.
code_dtypes = (torch.uint8, torch.int16, torch.int32, torch.int64)

assert len(fc_layers) == len(encoding_fc_layers), "layer lists must pair up one-to-one"
for fc_layer, encoding_layer in zip(fc_layers, encoding_fc_layers):
  model, interval, distance, all_fixed_points, all_derivatives = Linearization(fc_layer.weight, 0.92)
  model = model.view(fc_layer.weight.size())

  # Swap the dense weight of the compressed model for its approximation ...
  del fc_layer.weight
  fc_layer.register_parameter('weight', nn.Parameter(model))

  # ... and replace the weight of the encoding model by the segment description.
  del encoding_layer.weight
  encoding_layer.register_parameter('fixed_points', nn.Parameter(all_fixed_points))
  encoding_layer.register_parameter('derivatives', nn.Parameter(all_derivatives))
  for code_name, codes in (('interval', interval), ('distance', distance)):
    lowest = int(codes.min()) if codes.numel() else 0
    highest = int(codes.max()) if codes.numel() else 0
    # Narrowest dtype whose range covers every code; uint8 whenever it suffices.
    code_dtype = next(dt for dt in code_dtypes
                      if torch.iinfo(dt).min <= lowest and highest <= torch.iinfo(dt).max)
    # Integer codes are stored as non-trainable parameters (requires_grad=False).
    encoding_layer.register_parameter(code_name, nn.Parameter(codes.to(code_dtype), False))

endtime = datetime.datetime.now()
print(str(endtime - starttime)+'seconds')

tensor([-0.0453, -0.0453, -0.0453, -0.0453, -0.0452, -0.0452, -0.0452, -0.0453,
        -0.0453, -0.0453, -0.0453, -0.0453, -0.0453, -0.0453, -0.0453, -0.0452,
        -0.0452, -0.0452, -0.0452, -0.0452], device='cuda:0')
slope:[2.5091692805290222e-05, 9.378418326377869e-06, 5.602836608886719e-06, 1.1382624506950378e-05, 1.4716759324073792e-05, 6.487593054771423e-06, 1.821666955947876e-06, 3.557652235031128e-06, 2.4978071451187134e-06, 1.6633421182632446e-06, 4.8335641622543335e-06, 7.76536762714386e-06, 4.176050424575806e-06, 2.512708306312561e-06, 1.6946345567703247e-05, 1.7542392015457153e-05, 1.0600313544273376e-05, 1.3899058103561401e-05, 1.35079026222229e-05, 7.826834917068481e-06]
distance:tensor([0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 0, 1, 2, 0, 1, 2, 3, 4],
       device='cuda:0')
approximation error:3.013833128716878e-06
average error:6.3323389372864865e-06
0:03:05.812732seconds


In [None]:
import torch.optim as optim
# Training hyper-parameters.
epochs, learning_rate = 20, 0.01

# Cross-entropy loss, optimised with SGD plus momentum over the parameters of
# the uncompressed reference model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    Resnet152.parameters(),
    lr=learning_rate,
    momentum=0.9,
)

In [None]:
# Report the mean loss of the last `log_every` mini-batches.  The previous report
# fired only at i % 2000 == 1999, which is never reached: CIFAR-10's 50,000
# training images at batch size 64 give 782 batches per epoch.  The per-iteration
# print is dropped because it flooded the cell output.
log_every = 100

Resnet152.train()  # make the training mode explicit (BatchNorm uses batch statistics)
for epoch in range(epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    batches_in_window = 0
    for i, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = Resnet152(inputs)
        loss = criterion(outputs, labels)  # already on the device of its inputs
        loss.backward()
        optimizer.step()
        # accumulate statistics; .item() converts the loss to a Python float once
        running_loss += loss.item()
        batches_in_window += 1
        if batches_in_window == log_every:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / batches_in_window))
            running_loss = 0.0
            batches_in_window = 0
    if batches_in_window:
        # flush the partial window left at the end of the epoch
        print('[%d, %5d] loss: %.3f' %
              (epoch + 1, i + 1, running_loss / batches_in_window))
    # NOTE(review): this only rebinds the Python variable; the optimizer was built
    # with the initial value and keeps using it, so training runs at a constant
    # learning rate.  If a decay is intended, use torch.optim.lr_scheduler
    # (a tenfold cut every epoch would shrink the rate very quickly).
    learning_rate = learning_rate /10
print('Finished Training')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
epoch: 14 iteration: 476 loss: 0.064
epoch: 14 iteration: 477 loss: 0.199
epoch: 14 iteration: 478 loss: 0.087
epoch: 14 iteration: 479 loss: 0.076
epoch: 14 iteration: 480 loss: 0.095
epoch: 14 iteration: 481 loss: 0.205
epoch: 14 iteration: 482 loss: 0.036
epoch: 14 iteration: 483 loss: 0.119
epoch: 14 iteration: 484 loss: 0.070
epoch: 14 iteration: 485 loss: 0.108
epoch: 14 iteration: 486 loss: 0.223
epoch: 14 iteration: 487 loss: 0.226
epoch: 14 iteration: 488 loss: 0.189
epoch: 14 iteration: 489 loss: 0.131
epoch: 14 iteration: 490 loss: 0.092
epoch: 14 iteration: 491 loss: 0.152
epoch: 14 iteration: 492 loss: 0.059
epoch: 14 iteration: 493 loss: 0.124
epoch: 14 iteration: 494 loss: 0.250
epoch: 14 iteration: 495 loss: 0.092
epoch: 14 iteration: 496 loss: 0.032
epoch: 14 iteration: 497 loss: 0.164
epoch: 14 iteration: 498 loss: 0.108
epoch: 14 iteration: 499 loss: 0.187
epoch: 14 iteration: 500 loss: 0.234
epoch: 14 

In [None]:
import time
correct = 0
total = 0
# Switch to inference mode before measuring accuracy.  A freshly constructed
# module is in training mode, in which BatchNorm normalises with the statistics
# of the current batch and keeps updating its running averages, even inside
# torch.no_grad().
compressed_Resnet152.eval()
with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        # output
        outputs = compressed_Resnet152(images)
        # outputs = Resnet152(images)
        # predicted class = index of the largest logit in each row
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %.2f %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 79.75 %


In [None]:
# Print one "name:element_count" line for every parameter of the encoding model.
for param_name, param in encoding_Resnet152.named_parameters():
  print(f"{param_name}:{param.numel()}")

conv1.fixed_points:661
conv1.derivatives:661
conv1.interval:9408
conv1.distance:9408
bn1.weight:64
bn1.bias:64
layer1.0.conv1.fixed_points:305
layer1.0.conv1.derivatives:305
layer1.0.conv1.interval:4096
layer1.0.conv1.distance:4096
layer1.0.bn1.weight:64
layer1.0.bn1.bias:64
layer1.0.conv2.fixed_points:2549
layer1.0.conv2.derivatives:2549
layer1.0.conv2.interval:36864
layer1.0.conv2.distance:36864
layer1.0.bn2.weight:64
layer1.0.bn2.bias:64
layer1.0.conv3.fixed_points:1246
layer1.0.conv3.derivatives:1246
layer1.0.conv3.interval:16384
layer1.0.conv3.distance:16384
layer1.0.bn3.weight:256
layer1.0.bn3.bias:256
layer1.0.downsample.0.fixed_points:1194
layer1.0.downsample.0.derivatives:1194
layer1.0.downsample.0.interval:16384
layer1.0.downsample.0.distance:16384
layer1.0.downsample.1.weight:256
layer1.0.downsample.1.bias:256
layer1.1.conv1.fixed_points:1202
layer1.1.conv1.derivatives:1202
layer1.1.conv1.interval:16384
layer1.1.conv1.distance:16384
layer1.1.bn1.weight:64
layer1.1.bn1.bias:6

In [None]:
# Write the state dict of the encoding model into the working directory.
PATH = './Resnet152.pth'
encoding_state = encoding_Resnet152.state_dict()
torch.save(encoding_state, PATH)