## 1、获取模型

In [1]:
# Import the required libraries
import torch
import torch.nn as nn

import torch.nn.functional as F



class Net(nn.Module):
    """Small CNN: two conv -> BatchNorm -> ReLU -> 2x2 max-pool stages, then one linear layer.

    The linear layer expects 16 * 5 * 5 flattened features, which is what the two
    stages produce for a single-channel 32x32 input (32 -> 28 -> 14 -> 10 -> 5).
    The network returns 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 1 input channel -> 6 output channels, 5x5 kernels.
        # The conv bias is disabled; a BatchNorm layer is applied right after it in forward().
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, bias=False)
        self.bn1 = nn.BatchNorm2d(num_features=6)
        # Stage 2: 6 -> 16 channels, again 5x5 kernels without bias.
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, bias=False)
        self.bn2 = nn.BatchNorm2d(num_features=16)
        # Affine head y = Wx + b; the 5 * 5 factor is the spatial size left after both stages.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=10)

    def forward(self, x):
        """Run the two conv stages and the linear head on a batch ``x``."""
        # Stage 1, pooled over a 2x2 window.
        stage1 = F.relu(self.bn1(self.conv1(x)))
        stage1 = F.max_pool2d(stage1, kernel_size=(2, 2))
        # Stage 2; a square pooling window can be given as a single number.
        stage2 = F.relu(self.bn2(self.conv2(stage1)))
        stage2 = F.max_pool2d(stage2, kernel_size=2)
        # Collapse every dimension except the batch dimension into one.
        features = stage2.flatten(start_dim=1)
        return self.fc1(features)


# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell no longer crashes on machines without CUDA.
# NOTE(review): the recorded outputs further down show tensors on cuda:0; whether the
# project helpers used in later cells also work on CPU is not visible here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net().to(device)

In [2]:
# Forward pass.
# Allocate the input directly on the model's device. Calling `.cuda()` on a tensor
# created with requires_grad=True returns a non-leaf copy whose `.grad` is never
# populated, and it also hard-codes the presence of a GPU.
device = next(net.parameters()).device
# NOTE(review): the name `input` shadows the Python builtin; it is kept so that any
# other cell referring to it keeps working.
input = torch.randn(1, 1, 32, 32, device=device, requires_grad=True)
out = net(input)
print(out)

# Backward pass: clear old gradients, then back-propagate a random upstream gradient
# with the same shape as the output (1 sample x 10 scores).
net.zero_grad()
out.backward(torch.randn(1, 10, device=device))

tensor([[ 0.4945, -1.1050, -0.1268,  0.3931, -0.5763, -0.3752,  0.2148,  1.0043,
         -0.1475, -0.2972]], device='cuda:0', grad_fn=<AddmmBackward>)


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


## 2、对模型进行聚类

In [3]:
# NOTE(review): wildcard import. Later cells call get_layer_idx_to_clusters,
# generate_merge_matrix_for_kernel, add_vecs_to_merge_mat_dicts and
# generate_decay_matrix_for_kernel_and_vecs, presumably provided through this line;
# switch to explicit names once their defining modules are confirmed.
from utils.model_utils import *

# Wrap the network in the project helper and register it as the tracked model.
model_utils = ModelUtils(local_rank=0)
model_utils.register_state(model=net)

# List every conv kernel together with its index, parameter name and shape.
kernel_namedvalue_list = model_utils.get_all_conv_kernel_namedvalue_as_list()
for i, kernel in enumerate(kernel_namedvalue_list):
    print(i, kernel.name, kernel.value.shape)

0 conv1.weight (6, 1, 5, 5)
1 conv2.weight (16, 6, 5, 5)


In [7]:
# Cluster the filters of each conv layer.
# target_deps holds one entry per conv layer; in the recorded output layer 0 ends up
# with 3 clusters and layer 1 with 8 clusters.
target_deps = [3, 8]
pacesetter_dict = None
layer_idx_to_clusters = get_layer_idx_to_clusters(
    kernel_namedvalue_list=kernel_namedvalue_list,
    target_deps=target_deps,
    pacesetter_dict=pacesetter_dict,
)
print(layer_idx_to_clusters)

{0: [[0, 2], [1, 3, 5], [4]], 1: [[3, 14, 15], [0, 4, 12, 13], [2], [9, 10], [6], [1, 7], [5, 8], [11]]}


## 3、生成 merge matrix

In [13]:
# Original number of filters per conv layer (conv1 has 6 output channels, conv2 has 16).
deps = [6, 16]
param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
    deps=deps,
    layer_idx_to_clusters=layer_idx_to_clusters,
    kernel_namedvalue_list=kernel_namedvalue_list,
)
# In the recorded output each row averages over the filters in the same cluster,
# e.g. rows 0 and 2 both hold 0.5 at columns 0 and 2.
print(param_name_to_merge_matrix['conv1.weight'])

tensor([[0.5000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3333, 0.0000, 0.3333, 0.0000, 0.3333],
        [0.5000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3333, 0.0000, 0.3333, 0.0000, 0.3333],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
        [0.0000, 0.3333, 0.0000, 0.3333, 0.0000, 0.3333]], device='cuda:0')


In [14]:
# Extend the merge-matrix dict in place. The return value is not used, and the recorded
# output shows additional keys for the conv biases and the BatchNorm weight/bias vectors.
add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix)
print(param_name_to_merge_matrix.keys())

dict_keys(['conv1.weight', 'conv2.weight', 'conv1.bias', 'bn1.weight', 'bn1.bias', 'conv2.bias', 'bn2.weight', 'bn2.bias'])


### （举例）获取梯度，并和 merge matrix 相乘

In [15]:
# Take only the first entry yielded by named_parameters() (conv1.weight in the
# recorded output) and keep a handle on its gradient.
name, params = next(iter(net.named_parameters()))
print(name, params.shape)
conv1_grad = params.grad

conv1.weight torch.Size([6, 1, 5, 5])


In [16]:
p_size = conv1_grad.shape
# Flatten the gradient to one row per filter: (out_channels, everything else).
# The leading dimension is the number of filters, not a batch size.
g_mat = conv1_grad.flatten(start_dim=1)
print(g_mat.shape)


torch.Size([6, 25])


In [17]:
# Left-multiply the flattened gradient by the conv1 merge matrix.
# This makes the gradients of all filters inside one cluster identical
# (rows 0 and 2 of the recorded output are equal).
param_name_to_merge_matrix['conv1.weight'] @ g_mat

tensor([[-2.0396,  0.8630, -0.4818, -1.2639,  0.5858, -0.2540,  0.7740, -1.3581,
          1.4525, -0.7023, -1.0456, -1.3194, -0.2839, -0.9921, -0.0654, -1.5389,
         -0.2749,  1.3421, -0.9044,  1.1340, -0.0627, -1.2317, -2.0808, -0.2170,
         -0.2554],
        [ 1.7174, -0.8448,  0.9213, -0.1279, -0.7719,  2.0492,  0.0197, -0.2615,
         -0.6791, -2.4863,  0.3439, -1.4839,  0.2268,  1.2236,  0.3676, -0.4354,
          0.4534,  2.8289, -0.3710,  1.1506, -0.1032, -2.1530,  0.9907,  0.6596,
          0.5187],
        [-2.0396,  0.8630, -0.4818, -1.2639,  0.5858, -0.2540,  0.7740, -1.3581,
          1.4525, -0.7023, -1.0456, -1.3194, -0.2839, -0.9921, -0.0654, -1.5389,
         -0.2749,  1.3421, -0.9044,  1.1340, -0.0627, -1.2317, -2.0808, -0.2170,
         -0.2554],
        [ 1.7174, -0.8448,  0.9213, -0.1279, -0.7719,  2.0492,  0.0197, -0.2615,
         -0.6791, -2.4863,  0.3439, -1.4839,  0.2268,  1.2236,  0.3676, -0.4354,
          0.4534,  2.8289, -0.3710,  1.1506, -0.1032

In [18]:
# Show the raw (un-merged) flattened conv1 gradient for comparison with the merged result.
print(g_mat)

tensor([[-1.6329, -3.1539,  0.5518, -1.4467,  0.6566,  0.1471,  0.6453, -0.7972,
          2.2775,  0.0403, -2.8020, -0.5291,  0.5153,  0.1309,  1.2363, -2.9589,
         -0.9837, -0.5219,  0.6431,  1.9916,  1.1198, -1.8530, -1.0454, -0.3382,
          0.6545],
        [ 2.5459, -0.2484,  0.0402,  1.8763, -3.3801,  3.6180,  0.1552, -1.0768,
          1.2706, -1.5935,  0.6328, -0.8717,  1.2098,  1.0271,  3.0224,  1.5558,
          0.6416,  4.4933, -0.9768,  1.6233,  0.7888, -1.8414,  1.1348,  0.9133,
          0.1807],
        [-2.4463,  4.8799, -1.5154, -1.0811,  0.5150, -0.6551,  0.9028, -1.9190,
          0.6275, -1.4449,  0.7108, -2.1098, -1.0830, -2.1150, -1.3670, -0.1189,
          0.4339,  3.2061, -2.4520,  0.2764, -1.2452, -0.6104, -3.1162, -0.0959,
         -1.1653],
        [ 1.6050, -2.1427,  2.6661, -3.1526, -1.6934,  1.6422, -0.6681, -1.2742,
         -2.5014, -2.5893, -1.2991, -0.4299, -2.5529,  2.6336, -0.2178,  0.5499,
          1.1212,  3.8123,  1.8353,  0.6225,  0.8001

## 4、生成 decay matrix

In [30]:
# Hyper-parameters for the decay matrix.
weight_decay = 0.1        # decay applied to kernels
weight_decay_bias = 0     # decay applied to bias-like vectors
centri_strength = 0.06    # strength of the pull towards the cluster centre

param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
    deps=deps,
    layer_idx_to_clusters=layer_idx_to_clusters,
    kernel_namedvalue_list=kernel_namedvalue_list,
    weight_decay=weight_decay,
    weight_decay_bias=weight_decay_bias,
    centri_strength=centri_strength,
)

### （举例）
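前面对第一层卷积的聚类结果为 0: [[0, 2], [1, 3, 5], [4]]。
在之前的推导中，需要将权重衰减项替换为下式（n 为 weight_decay，y 为 centri_strength，count 为当前卷积核所在类的大小，a 为当前卷积核，b、c、… 为同一类中的其他卷积核）：

$$-n \cdot a \;\longrightarrow\; -\left[(n + y) - \frac{y}{count}\right] \cdot a \;+\; \frac{y}{count} \cdot (b + c + \dots)$$

例如对于类 [0, 2]：a 的系数为 0.1 + 0.06 - 0.06/2 = 0.13，同类其他卷积核的系数为 0.06/2 = 0.03；decay matrix 中存放的是上式各系数取相反数后的值，即对角元 0.13、同类非对角元 -0.03，与下面打印出的矩阵一致。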


In [31]:
# Inspect the decay matrix built for conv1: first its shape, then its values.
print(param_name_to_decay_matrix['conv1.weight'].shape)
print(param_name_to_decay_matrix['conv1.weight'])

torch.Size([6, 6])
tensor([[ 0.1300,  0.0000, -0.0300,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.1400,  0.0000, -0.0200,  0.0000, -0.0200],
        [-0.0300,  0.0000,  0.1300,  0.0000,  0.0000,  0.0000],
        [ 0.0000, -0.0200,  0.0000,  0.1400,  0.0000, -0.0200],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.1000,  0.0000],
        [ 0.0000, -0.0200,  0.0000, -0.0200,  0.0000,  0.1400]],
       device='cuda:0')


In [32]:
# Take only the first entry yielded by named_parameters() (conv1.weight in the
# recorded output); this time the parameter itself is kept, not its gradient.
name, params = next(iter(net.named_parameters()))
print(name, params.shape)
conv1_params = params

p_size = conv1_params.shape
# Flatten the weights to one row per filter: (out_channels, everything else).
param_mat = conv1_params.flatten(start_dim=1)
print(param_mat.shape)


conv1.weight torch.Size([6, 1, 5, 5])
torch.Size([6, 25])


In [33]:
# Left-multiply the flattened conv1 weights by the conv1 decay matrix.
param_name_to_decay_matrix['conv1.weight'] @ param_mat

tensor([[-1.1169e-02,  1.7027e-02, -1.1201e-02,  8.1528e-03, -2.3723e-02,
         -2.1395e-02, -2.4254e-02,  1.7864e-02,  1.3272e-03, -2.5312e-02,
         -2.1429e-02,  1.5881e-02, -6.5480e-03, -4.0760e-03,  1.2942e-02,
          1.1402e-03, -1.3104e-02, -2.0617e-02,  3.5007e-03,  1.6871e-02,
          1.8786e-02, -1.4906e-04,  1.6725e-02,  1.0890e-02, -2.0234e-02],
        [-3.0975e-03, -3.1905e-02,  1.7929e-03, -1.8728e-02, -1.3165e-02,
         -2.3963e-03, -1.7731e-02, -1.1764e-02,  3.0402e-03,  1.5733e-02,
          1.7867e-02, -2.3607e-02,  1.2589e-02, -1.9807e-03,  2.4690e-02,
         -2.5431e-02, -3.6354e-03, -1.2870e-02, -1.5690e-02,  2.6999e-03,
         -4.9478e-03,  3.3441e-03, -2.9080e-02,  2.9334e-04, -2.3720e-02],
        [-2.0614e-02,  8.6912e-04, -8.5209e-03,  1.9991e-02,  2.9166e-02,
          6.8650e-03,  4.5499e-03, -1.4297e-02, -1.9930e-02,  2.5329e-02,
          4.2086e-03,  1.7595e-02, -1.4575e-02,  7.2445e-03,  1.7970e-03,
         -2.1384e-02,  1.4028e-04, -

In [34]:
# Show the raw flattened conv1 weights for comparison with the decayed result.
print(param_mat)

tensor([[-0.1294,  0.1400, -0.1070,  0.1037, -0.1381, -0.1610, -0.1885,  0.1183,
         -0.0266, -0.1582, -0.1662,  0.1620, -0.0805, -0.0195,  0.1085, -0.0308,
         -0.1062, -0.1752,  0.0188,  0.1108,  0.1113,  0.0160,  0.1302,  0.0631,
         -0.1392],
        [ 0.0048, -0.1944,  0.0136, -0.1053, -0.1101, -0.0606, -0.0925, -0.1279,
         -0.0088,  0.1003,  0.1422, -0.1815,  0.1174, -0.0113,  0.1807, -0.1841,
          0.0052, -0.0680, -0.0700,  0.0480, -0.0666,  0.0580, -0.1892, -0.0487,
         -0.1824],
        [-0.1884,  0.0390, -0.0902,  0.1777,  0.1925,  0.0157, -0.0085, -0.0827,
         -0.1594,  0.1583, -0.0060,  0.1727, -0.1307,  0.0512,  0.0389, -0.1716,
         -0.0234, -0.0719, -0.0351, -0.0822, -0.1438,  0.0743,  0.0067, -0.0897,
          0.0711],
        [ 0.1257,  0.0701,  0.0974,  0.1320, -0.0351, -0.1650,  0.1465, -0.1165,
         -0.1245, -0.0870,  0.1929, -0.1080,  0.0062, -0.0061,  0.1779,  0.0834,
          0.1538,  0.1329,  0.1454,  0.0545, -0.1064

## 5、更新梯度

In [35]:
# Replacement gradient for conv1: merge_matrix @ gradient + decay_matrix @ weights.
# param_mat is derived from an nn.Parameter, so without no_grad() the result would be
# recorded in the autograd graph. This is a manual gradient rewrite and must not be tracked.
with torch.no_grad():
    csgd_gradient = (
        param_name_to_merge_matrix['conv1.weight'].matmul(g_mat)
        + param_name_to_decay_matrix['conv1.weight'].matmul(param_mat)
    )

In [37]:
# Check that the replacement gradient can be reshaped back to the kernel shape held in p_size.
print(csgd_gradient.reshape(p_size).shape)

torch.Size([6, 1, 5, 5])


In [38]:
# Overwrite the gradient of the first parameter yielded by named_parameters()
# (conv1.weight in the recorded output) with the replacement gradient.
name, params = next(iter(net.named_parameters()))
print(name, params.shape)
# Run the in-place copy under no_grad() so `.grad` stays a plain tensor even when
# csgd_gradient carries autograd history. Reshape to the parameter's own shape
# instead of relying on `p_size` left over from an earlier cell.
with torch.no_grad():
    params.grad.copy_(csgd_gradient.reshape(params.shape))

conv1.weight torch.Size([6, 1, 5, 5])
