## 演示一下 k-means 聚类的代码

In [4]:
import numpy as np
from sklearn.cluster import KMeans


# Toy input: 16 random samples, each of shape (3, 24, 24).
data = np.random.rand(16, 3, 24, 24)
print("data shape is: ", data.shape)

# KMeans expects one row per sample, so flatten everything after the first axis.
data_input = data.reshape(data.shape[0], -1)
print("input data shape is: ", data_input.shape)

n_clusters = 4
print("number of clusters is: ", n_clusters)

# fit() returns the estimator itself, so construction and fitting can be chained.
km = KMeans(n_clusters=n_clusters).fit(data_input)
print("result of km: ", km.labels_)

# Invert the label array: result[k] lists the sample indices assigned to cluster k.
result = [
    [sample_idx for sample_idx, label in enumerate(km.labels_) if label == cluster_id]
    for cluster_id in range(n_clusters)
]
print("cluster result: ", result)

data shape is:  (16, 3, 24, 24)
input data shape is:  (16, 1728)
number of clusters is:  4
result of km:  [0 0 3 0 2 0 2 1 0 1 0 2 2 0 0 2]
cluster result:  [[0, 1, 3, 5, 8, 10, 13, 14], [7, 9], [4, 6, 11, 12, 15], [2]]


## 以 vgg16 举例子。
## 获取卷积的参数，存放字典容器中

In [2]:
import torchvision
from torchsummary import summary
from collections import namedtuple

# Small record pairing a parameter's state_dict key with its value.
NamedParamValue = namedtuple('NamedParamValue', ('name', 'value'))

# VGG16 from torchvision, requested with pretrained weights.
model = torchvision.models.vgg16(pretrained=True)

# summary(model.cuda(), input_size=(3,64,64))

In [6]:
def get_all_conv_kernel_namedvalue_as_list():
    """Collect every 4-D tensor of the module-level ``model`` as a NamedParamValue.

    Walks ``model.state_dict()`` in order, keeps the entries whose tensor has
    four dimensions (for the VGG16 used here these are the conv kernels),
    prints each key with its shape, and stores the value as a NumPy array.

    Returns:
        list of NamedParamValue(name, value), in state_dict order.
    """
    collected = []
    for param_name, tensor in model.state_dict().items():
        if tensor.dim() != 4:
            continue
        print("key & value shape: ", param_name, " ", tensor.shape)
        # Move to host memory first so the tensor can be turned into an ndarray.
        collected.append(NamedParamValue(name=param_name, value=tensor.cpu().numpy()))
    return collected

# Materialise the list of 4-D kernels once; later cells index it by layer position.
kernel_namedvalue_list = get_all_conv_kernel_namedvalue_as_list() 
# print(kernel_namedvalue_list)

key & value shape:  features.0.weight   torch.Size([64, 3, 3, 3])
key & value shape:  features.2.weight   torch.Size([64, 64, 3, 3])
key & value shape:  features.5.weight   torch.Size([128, 64, 3, 3])
key & value shape:  features.7.weight   torch.Size([128, 128, 3, 3])
key & value shape:  features.10.weight   torch.Size([256, 128, 3, 3])
key & value shape:  features.12.weight   torch.Size([256, 256, 3, 3])
key & value shape:  features.14.weight   torch.Size([256, 256, 3, 3])
key & value shape:  features.17.weight   torch.Size([512, 256, 3, 3])
key & value shape:  features.19.weight   torch.Size([512, 512, 3, 3])
key & value shape:  features.21.weight   torch.Size([512, 512, 3, 3])
key & value shape:  features.24.weight   torch.Size([512, 512, 3, 3])
key & value shape:  features.26.weight   torch.Size([512, 512, 3, 3])
key & value shape:  features.28.weight   torch.Size([512, 512, 3, 3])


## 获取目标通道列表

In [7]:
import numpy as np 
from constants import *

# Original per-layer channel counts (one entry per conv layer).
VGG_ORIGIN_DEPS_FLATTENED = [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512]
deps = VGG_ORIGIN_DEPS_FLATTENED

# Three candidate targets: keep 13/16, 11/16 and 5/8 of every layer's channels.
# Integer floor division, so each result is rounded down.
_KEEP_RATIOS = ((13, 16), (11, 16), (5, 8))
target_deps = [[d * numer // denom for d in deps] for numer, denom in _KEEP_RATIOS]

# NOTE(review): this list has 14 entries while `deps` has 13, and nothing in
# this cell reads it -- confirm whether the trailing 256 is intentional.
target_deps_1 = [48, 48, 64, 64, 128, 128, 128, 256, 256, 256, 256, 256, 256, 256]

print("origin channels: \n", deps, '\n')
print("target channels: \n", target_deps[2], '\n')



origin channels: 
 [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512] 

target channels: 
 [40, 40, 80, 80, 160, 160, 160, 320, 320, 320, 320, 320, 320] 



## 利用 kernel_namedvalue_list 和 target_deps，进行聚类操作，生成每层聚类后的通道情况

In [5]:
def _is_follower(layer_idx, pacesetter_dict):
    """Return True when ``layer_idx`` is mapped to a layer other than itself.

    A layer that is absent from ``pacesetter_dict``, or that maps to its own
    index, is not a follower.
    """
    if layer_idx not in pacesetter_dict:
        return False
    return pacesetter_dict[layer_idx] != layer_idx

def cluster_by_kmeans(kernel_value, num_cluster):
    """Group the filters of one 4-D kernel into ``num_cluster`` clusters.

    Args:
        kernel_value: 4-D array; the first axis indexes the filters.
        num_cluster: number of clusters to produce.

    Returns:
        A list of ``num_cluster`` lists; entry ``k`` holds the filter indices
        assigned to cluster ``k`` in ascending order.

    Raises:
        AssertionError: if ``kernel_value`` is not 4-D, or if k-means leaves
            any cluster empty.
    """
    assert kernel_value.ndim == 4  # n,c,h,w
    flat = np.reshape(kernel_value, (kernel_value.shape[0], -1))  # n, c*h*w
    num_filters = flat.shape[0]

    if num_cluster == num_filters:
        # One cluster per filter: nothing to merge, so skip k-means entirely.
        return [[idx] for idx in range(num_cluster)]

    print('cluster {} filters into {} clusters'.format(num_filters, num_cluster))
    km = KMeans(n_clusters=num_cluster)
    km.fit(flat)

    # Invert the label array into per-cluster member lists.
    groups = [[] for _ in range(num_cluster)]
    for filter_idx, label in enumerate(km.labels_):
        groups[label].append(filter_idx)
    for members in groups:
        assert len(members) > 0
    return groups

def get_layer_idx_to_clusters(kernel_namedvalue_list, target_deps, pacesetter_dict):
    """Cluster the filters of every layer that has to shrink.

    Args:
        kernel_namedvalue_list: sequence of records with a ``value`` array whose
            first axis is the filter axis; the position in the list is the layer index.
        target_deps: desired filter count per layer, indexed by layer index.
        pacesetter_dict: optional mapping used to skip follower layers
            (see ``_is_follower``); ``None`` disables the check.

    Returns:
        dict mapping layer index -> clusters, only for layers whose current
        filter count exceeds the target. Layers already at the target are omitted.

    Raises:
        ValueError: if a layer has fewer filters than its target.
    """
    clusters_by_layer = {}
    for layer_idx, named_kv in enumerate(kernel_namedvalue_list):
        num_filters = named_kv.value.shape[0]
        if pacesetter_dict is not None and _is_follower(layer_idx, pacesetter_dict):
            continue
        wanted = target_deps[layer_idx]
        if num_filters < wanted:
            print(num_filters, wanted)
            raise ValueError('wrong target dep')
        if num_filters > wanted:
            clusters_by_layer[layer_idx] = cluster_by_kmeans(kernel_value=named_kv.value,
                                                             num_cluster=wanted)
    return clusters_by_layer

# Cluster every layer down to the third target list (index 2); no pacesetter constraints.
layer_idx_to_clusters = get_layer_idx_to_clusters(
    kernel_namedvalue_list=kernel_namedvalue_list,
    target_deps=target_deps[2],
    pacesetter_dict=None,
)

cluster 64 filters into 40 clusters
cluster 64 filters into 40 clusters
cluster 128 filters into 80 clusters
cluster 128 filters into 80 clusters
cluster 256 filters into 160 clusters
cluster 256 filters into 160 clusters
cluster 256 filters into 160 clusters
cluster 512 filters into 320 clusters
cluster 512 filters into 320 clusters
cluster 512 filters into 320 clusters
cluster 512 filters into 320 clusters
cluster 512 filters into 320 clusters
cluster 512 filters into 320 clusters


In [6]:
# Inspect the clusters computed for layer index 0.
layer_idx_to_clusters[0]

[[11],
 [33],
 [12, 37, 44],
 [3, 26, 27, 43],
 [10, 17],
 [34],
 [53],
 [25],
 [45, 47, 61],
 [7, 52],
 [0],
 [58],
 [9],
 [36],
 [8],
 [51],
 [40, 46],
 [2],
 [55],
 [42],
 [59],
 [39],
 [19],
 [57],
 [63],
 [1],
 [32],
 [29, 56],
 [14],
 [20],
 [5, 13, 15, 24, 30, 41, 50, 54, 62],
 [21],
 [35, 60],
 [23],
 [6],
 [38, 48],
 [16, 28],
 [18],
 [4],
 [22, 31, 49]]

## 通过上面这几步骤，我们得到了每层通道聚类后的结果。
## 接下来要利用这结果更新权重。

利用以下三项信息，为每一层生成合并矩阵（merge matrix）：
1. 模型原本每层的卷积核（输出通道）数：deps
2. 每层聚类的结果：layer_idx_to_clusters
3. 模型原本每层的卷积参数：kernel_namedvalue_list


In [7]:
import torch

def generate_merge_matrix_for_kernel(deps, layer_idx_to_clusters, kernel_namedvalue_list, device=None):
    """Build, for every clustered layer, the matrix that averages within each cluster.

    For a layer with ``n`` filters the result is an ``n x n`` float32 matrix in
    which entry ``(i, j)`` equals ``1 / len(cluster)`` when filters ``i`` and
    ``j`` belong to the same cluster and ``0`` otherwise (a single-member
    cluster therefore contributes a ``1`` on the diagonal).
    Example: clustering 16 filters into 4 could give
    ``[[1, 10, 11, 12, 14], [3, 6], [0, 4, 7, 8, 9, 13], [2, 5, 15]]``.

    Args:
        deps: filter count per layer, indexed by layer index.
        layer_idx_to_clusters: dict mapping layer index -> list of clusters,
            each cluster being a list of filter indices.
        kernel_namedvalue_list: sequence of records with a ``name`` attribute;
            the name of entry ``layer_idx`` becomes the key in the result.
        device: where to place the tensors. ``None`` (default) selects CUDA
            when it is available and falls back to the CPU otherwise.

    Returns:
        dict mapping kernel parameter name -> ``torch.Tensor`` merge matrix.
    """
    if device is None:
        # Previously an unconditional .cuda(), which fails on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    result = {}
    for layer_idx, clusters in layer_idx_to_clusters.items():
        num_filters = deps[layer_idx]
        merge_trans_mat = np.zeros((num_filters, num_filters), dtype=np.float32)
        for clst in clusters:
            if not clst:
                continue  # an empty cluster contributes nothing (and would divide by zero)
            # Fill the whole (cluster x cluster) sub-block in one assignment.
            merge_trans_mat[np.ix_(clst, clst)] = 1 / len(clst)
        result[kernel_namedvalue_list[layer_idx].name] = torch.from_numpy(merge_trans_mat).to(device)
    return result

# Filter count of each conv layer, indexed by layer position.
deps = [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512]

# One merge matrix per clustered layer, keyed by the kernel's parameter name.
param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
    deps,
    layer_idx_to_clusters,
    kernel_namedvalue_list,
)

In [8]:
# Show the merge matrix stored under the key 'features.0.weight'.
param_name_to_merge_matrix['features.0.weight']

tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.3333, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1111, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]],
       device='cuda:0')

## 上面获得了每层卷积权重的一个 matrix，然后让卷积 bias、bn 的 weight、bn 的 bias 也各自对应到这个 matrix（在字典中新增三个键，指向同一个对象，并非真正拷贝）。

In [41]:
# Substring of a kernel's parameter name that gets swapped out to derive sibling names.
KERNEL_KEYWORD = 'weight'


def add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix):
    """Register each kernel's merge matrix under three derived parameter names.

    For every key currently in the dict, ``KERNEL_KEYWORD`` is replaced by
    ``'conv.bias'``, ``'bn.weight'`` and ``'bn.bias'`` and the same matrix
    object (no copy) is stored under each derived key. The dict is modified in
    place; nothing is returned.

    NOTE(review): confirm that the derived names match the real parameter names
    of the model the matrices are later applied to.
    """
    # Snapshot the keys first because the dict grows inside the loop.
    for kernel_name in set(param_name_to_merge_matrix):
        merge_mat = param_name_to_merge_matrix[kernel_name]
        for replacement in ('conv.bias', 'bn.weight', 'bn.bias'):
            derived_name = kernel_name.replace(KERNEL_KEYWORD, replacement)
            param_name_to_merge_matrix[derived_name] = merge_mat

# Extend the dict in place with the derived bias / BN keys.
add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix)


In [42]:
# Print the shape of the matrix registered under the kernel name and under each derived name.
print(param_name_to_merge_matrix['features.0.weight'].shape)
print(param_name_to_merge_matrix['features.0.conv.bias'].shape)
print(param_name_to_merge_matrix['features.0.bn.weight'].shape)
print(param_name_to_merge_matrix['features.0.bn.bias'].shape)

torch.Size([64, 64])
torch.Size([64, 64])
torch.Size([64, 64])
torch.Size([64, 64])


## 接下来是生成权重衰减的 matrix

In [44]:
# Base decay placed on the diagonal of the kernel decay matrices.
weight_decay = 1e-4
# Base decay placed on the diagonal of the vector-parameter (bias / BN) decay matrices.
weight_decay_bias = 0
# Centripetal strength; the vector parameters use 0.1x this value.
centri_strength = 3e-3

def _build_decay_matrix(num_filters, clusters, base_decay, strength):
    """Return the float32 ``num_filters x num_filters`` decay matrix for one layer."""
    decay_trans_mat = np.zeros((num_filters, num_filters), dtype=np.float32)
    for clst in clusters:
        if not clst:
            continue  # nothing to fill, and avoids dividing by zero below
        members = sorted(clst)
        pull = -strength / len(clst)
        for ee in members:
            # Diagonal starts at base_decay + strength; the inner loop then adds
            # `pull` to every same-cluster entry, the diagonal one included.
            decay_trans_mat[ee, ee] = base_decay + strength
            for p in members:
                decay_trans_mat[ee, p] += pull
    return decay_trans_mat


def generate_decay_matrix_for_kernel_and_vecs(deps, layer_idx_to_clusters, kernel_namedvalue_list,
                                              weight_decay, weight_decay_bias, centri_strength,
                                              device=None):
    """Build the decay matrices for kernels and for their bias / BN vectors.

    Within a cluster of size ``k`` the kernel matrix holds
    ``weight_decay + centri_strength - centri_strength / k`` on the diagonal and
    ``-centri_strength / k`` between distinct members; entries across clusters
    are zero. The vector-parameter matrix follows the same pattern with
    ``weight_decay_bias`` and ``0.1 * centri_strength``.

    Args:
        deps: filter count per layer, indexed by layer index.
        layer_idx_to_clusters: dict mapping layer index -> list of clusters
            (lists of filter indices).
        kernel_namedvalue_list: sequence of records with a ``name`` attribute.
        weight_decay: base decay for kernels.
        weight_decay_bias: base decay for the vector parameters.
        centri_strength: centripetal strength.
        device: where to place the tensors. ``None`` (default) selects CUDA
            when it is available and falls back to the CPU otherwise.

    Returns:
        dict mapping parameter name -> ``torch.Tensor``. Each clustered layer
        contributes its kernel name plus the names obtained by replacing
        ``KERNEL_KEYWORD`` with ``'bn.weight'``, ``'bn.bias'`` and
        ``'conv.bias'``; those three share one tensor object.
    """
    if device is None:
        # Previously an unconditional .cuda(), which fails on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    result = {}
    #   for the kernel
    for layer_idx, clusters in layer_idx_to_clusters.items():
        kernel_mat = _build_decay_matrix(deps[layer_idx], clusters, weight_decay, centri_strength)
        result[kernel_namedvalue_list[layer_idx].name] = torch.from_numpy(kernel_mat).to(device)

    #   for the vec params (bias, beta and gamma), we use 0.1 * centripetal strength
    # Note: using smaller centripetal strength on the scaling factor of BN improve the performance in some of the cases
    for layer_idx, clusters in layer_idx_to_clusters.items():
        vec_np = _build_decay_matrix(deps[layer_idx], clusters, weight_decay_bias, centri_strength * 0.1)
        vec_mat = torch.from_numpy(vec_np).to(device)
        kernel_name = kernel_namedvalue_list[layer_idx].name
        result[kernel_name.replace(KERNEL_KEYWORD, 'bn.weight')] = vec_mat
        result[kernel_name.replace(KERNEL_KEYWORD, 'bn.bias')] = vec_mat
        result[kernel_name.replace(KERNEL_KEYWORD, 'conv.bias')] = vec_mat

    return result


# Decay matrices for every clustered layer, built from the hyper-parameters defined above.
param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
    deps=deps,
    layer_idx_to_clusters=layer_idx_to_clusters,
    kernel_namedvalue_list=kernel_namedvalue_list,
    weight_decay=weight_decay,
    weight_decay_bias=weight_decay_bias,
    centri_strength=centri_strength,
)

In [48]:
# List the parameter names covered by the decay and merge dictionaries.
print(param_name_to_decay_matrix.keys())
print(param_name_to_merge_matrix.keys())

dict_keys(['features.0.weight', 'features.2.weight', 'features.5.weight', 'features.7.weight', 'features.10.weight', 'features.12.weight', 'features.14.weight', 'features.17.weight', 'features.19.weight', 'features.21.weight', 'features.24.weight', 'features.26.weight', 'features.28.weight', 'features.0.bn.weight', 'features.0.bn.bias', 'features.0.conv.bias', 'features.2.bn.weight', 'features.2.bn.bias', 'features.2.conv.bias', 'features.5.bn.weight', 'features.5.bn.bias', 'features.5.conv.bias', 'features.7.bn.weight', 'features.7.bn.bias', 'features.7.conv.bias', 'features.10.bn.weight', 'features.10.bn.bias', 'features.10.conv.bias', 'features.12.bn.weight', 'features.12.bn.bias', 'features.12.conv.bias', 'features.14.bn.weight', 'features.14.bn.bias', 'features.14.conv.bias', 'features.17.bn.weight', 'features.17.bn.bias', 'features.17.conv.bias', 'features.19.bn.weight', 'features.19.bn.bias', 'features.19.conv.bias', 'features.21.bn.weight', 'features.21.bn.bias', 'features.21.c

## 上面这些步骤的操作，是为了拿到要聚类的通道的 index
## 然后按照论文中的公式再更新对应的参数

In [None]:
def train_one_step(net, data, label, optimizer, criterion, param_name_to_merge_matrix, param_name_to_decay_matrix):
    """Run forward/backward once and rewrite the gradients of the listed parameters.

    After the ordinary loss computation and ``loss.backward()``, every
    parameter whose name appears in ``param_name_to_merge_matrix`` has its
    gradient replaced in place by

        merge_matrix @ grad + decay_matrix @ param

    where ``grad`` and ``param`` are viewed as 2-D matrices with one row per
    entry along the parameter's first axis.

    Args:
        net: the model; called on ``data`` and queried via ``named_parameters()``.
        data: input batch passed to ``net``.
        label: targets passed to ``criterion``.
        optimizer: not used by this function.
            NOTE(review): no ``optimizer.step()`` / ``zero_grad()`` happens
            here -- presumably the caller does it; confirm.
        criterion: callable ``criterion(pred, label)`` returning the loss.
        param_name_to_merge_matrix: dict, parameter name -> merge matrix.
        param_name_to_decay_matrix: dict, parameter name -> decay matrix; must
            contain every name present in the merge dict that is updated.

    Raises:
        AssertionError: if a listed parameter is not 1-, 2- or 4-dimensional.
    """
    # Ordinary loss computation and back-propagation.
    pred = net(data)
    loss = criterion(pred, label)
    loss.backward()

    #TODO note: C-SGD works here
    for name, param in net.named_parameters():
        # Strip a 'module.' prefix (presumably added by a DataParallel-style wrapper).
        name = name.replace('module.', '')
        if name not in param_name_to_merge_matrix:
            continue
        if param.grad is None:
            # Frozen / unused parameter: there is no gradient to rewrite.
            continue

        p_dim = param.dim()
        p_size = param.size()
        assert p_dim in (1, 2, 4)
        # (n, -1) yields (n, c*h*w) for 4-D, (n, 1) for 1-D and leaves 2-D unchanged.
        # detach() keeps the rewrite out of the autograd graph, so the stored
        # gradient stays a plain tensor.
        param_mat = param.detach().reshape(p_size[0], -1)
        g_mat = param.grad.reshape(p_size[0], -1)

        # Matrix form of the update: merged gradient plus the decay term.
        csgd_gradient = (param_name_to_merge_matrix[name].matmul(g_mat)
                         + param_name_to_decay_matrix[name].matmul(param_mat))
        # Write the result back into the parameter's gradient.
        param.grad.copy_(csgd_gradient.reshape(p_size))



