reference:

https://github.com/liuzechun/AdamBNN

https://github.com/lopuhin/tpu-imagenet

二值化网络：普通网络一般为实值化的，二值化在模型压缩等方面有着作用

In [1]:
import os
import tensorflow
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf

In [2]:
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # detect GPUs
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)

Number of accelerators:  8


# reactnet

In [3]:
CONV_KER_INIT = 'glorot_uniform'
CONV_BIAS_INIT = 'zeros'
STAGE_OUT_CHANNEL = [32]+[64]+[128]*2+[256]*2+[512]*6+[1024]*2
WEIGHT_DECAY = 0.01

In [4]:
def conv3_3(out_planes, stride=1):
    """3*3 conv with padding"""
    return keras.layers.Conv2D(
        filters=out_planes,
        kernel_size=3,
        strides=stride,
        use_bias=False,
        kernel_initializer=CONV_KER_INIT,
#         bias_initializer=CONV_BIAS_INIT,
#         kernel_regularizer=keras.regularizers.l2(WEIGHT_DECAY),
        padding = 'same',
#         activation='relu',
    )

In [5]:
def conv1_1(out_planes, stride=1):
    return keras.layers.Conv2D(
        filters=out_planes,
        kernel_size=1,
        strides=stride,
        use_bias=False,
        kernel_initializer=CONV_KER_INIT,
#         bias_initializer=CONV_KER_INIT,
#         kernel_regularizer=keras.regularizers.l2(WEIGHT_DECAY),
#         activation='relu',
    )

In [6]:
class firstconv3_3(keras.layers.Layer):
    def __init__(self, oup, stride):
        super(firstconv3_3, self).__init__()
        self.conv1 = keras.layers.Conv2D(
            filters=oup,
            kernel_size=3,
            strides=stride,
            use_bias=False,
            kernel_initializer=CONV_KER_INIT,
#             bias_initializer=CONV_BIAS_INIT,
#             kernel_regularizer=keras.regularizers.l2(WEIGHT_DECAY),
            padding = 'same',
#             activation='relu',
        )
        self.bn = keras.layers.BatchNormalization()
    
    def call(self, x):
        out = self.conv1(x)
        out = self.bn(out)
        return out

In [7]:
class BinaryActivation(keras.layers.Layer):
    def __init__(self, ):
        super(BinaryActivation, self).__init__()

    def call(self, x):
        out_forward = tf.math.sign(x)
        mask1 = x < -1
        mask2 = x < 0
        mask3 = x < 1
        out1 = (-1)*tf.cast(mask1, tf.float32) + (x*x+2*x)*(1-tf.cast(mask1, tf.float32))
        out2 = out1*tf.cast(mask2, tf.float32) + (-x*x+2*x)*(1-tf.cast(mask2, tf.float32))
        out3 = out2*tf.cast(mask3, tf.float32) + 1*(1-tf.cast(mask3, tf.float32))
        out = out_forward-out3+out3
        return out

In [8]:
class LearnableBias(keras.layers.Layer):
    def __init__(self, out_chn, size=None):
        super(LearnableBias, self).__init__()
        self.size = size
        self.out_chn = out_chn
        self.b = self.add_weight(
            shape=(1, 1, self.out_chn),
            initializer='zeros',
            trainable=True,
        )
    
    def call(self, x):
        out = tf.add(x, self.b)
        return out

In [9]:
'''test LearnableBias'''
# import torch
# import torch.nn as nn
# import torch.utils.model_zoo as model_zoo
# import torch.nn.functional as F
# import numpy as np
# class ptLearnableBias(nn.Module):
#     def __init__(self, out_chn):
#         super(ptLearnableBias, self).__init__()
#         self.bias = nn.Parameter(torch.zeros(1,out_chn,1,1), requires_grad=True)

#     def forward(self, x):
#         out = x + self.bias.expand_as(x)
#         return out
# img = np.random.normal(size=(2, 3, 24, 24))
# label = np.random.normal(size=(2, 3, 24, 24))

# pt_img = torch.from_numpy(img)
# pt_label = torch.from_numpy(label)
# pt_layer = ptLearnableBias(3)
# pt_out = pt_layer(pt_img)
# pt_loss = nn.MSELoss()
# pt_optimizer = torch.optim.Adam(params=pt_layer.parameters(), lr=0.9, betas=(0.9,0.999), eps=1e-08)

# pt_output = pt_loss(pt_out, pt_label)
# print(pt_output)
# pt_optimizer.zero_grad()
# pt_output.backward()
# pt_optimizer.step()
# print(pt_layer.bias)
# print('------------\n')


# tf_img = tf.convert_to_tensor(img)
# tf_img = tf.transpose(tf_img, perm=[0, 2, 3, 1])
# tf_label = tf.convert_to_tensor(label)
# tf_label = tf.transpose(tf_label, perm=[0, 2, 3, 1])
# tf_layer = LearnableBias(3)
# tf_loss = tf.keras.losses.MeanSquaredError()
# tf_optimizer = tf.keras.optimizers.Adam(learning_rate=0.9, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# with tf.GradientTape() as tape:
#     tf_out = tf_layer(tf_img)
#     tf_output = tf_loss(tf_label, tf_out)
# grads = tape.gradient(tf_output, tf_layer.trainable_weights)
# tf_optimizer.apply_gradients(list(zip(grads, tf_layer.trainable_weights)))
# print(tf_output)
# print(tf_layer.trainable_weights)

'test LearnableBias'

In [10]:
class BasicBlock(keras.layers.Layer):
    def __init__(self, inplanes, planes, size=None, stride=1):
        super(BasicBlock, self).__init__()
        self.move11 = LearnableBias(inplanes, size)
        self.binary_3_3 = conv3_3(inplanes, stride=stride)
        self.bn1 = keras.layers.BatchNormalization()

        self.move12 = LearnableBias(inplanes, size)
        self.prelu1 = keras.layers.PReLU()
        self.move13 = LearnableBias(inplanes, size)

        self.move21 = LearnableBias(inplanes, size)

        if inplanes == planes:
            self.binary_pw = conv1_1(planes, stride=stride)
            self.bn2 = keras.layers.BatchNormalization()
        else:
            self.binary_pw_down1 = conv1_1(inplanes)
            self.binary_pw_down2 = conv1_1(inplanes)
            self.bn2_1 = keras.layers.BatchNormalization()
            self.bn2_2 = keras.layers.BatchNormalization()
        
        self.move22 = LearnableBias(planes, size)
        self.prelu2 = keras.layers.PReLU()
        self.move23 = LearnableBias(planes, size)

        self.binary_activation = BinaryActivation()
        self.stride = stride
        self.inplanes = inplanes
        self.planes = planes

        if self.inplanes != self.planes:
            self.pooling = keras.layers.AveragePooling2D(pool_size=(2, 2))

    def call(self, x):
        out1 = self.move11(x)

        out1 = self.binary_activation(out1)
        out1 = self.binary_3_3(out1)
        out1 = self.bn1(out1)

        if self.stride == 2:
            x = self.pooling(x)
        
        out1 = x+out1
        out1 = self.move12(out1)
        out1 = self.prelu1(out1)
        out1 = self.move13(out1)

        out2 = self.move21(out1)
        out2 = self.binary_activation(out2)

        if self.inplanes == self.planes:
            out2 = self.binary_pw(out2)
            out2 = self.bn2(out2)
            out2 += out1
        else:
            assert self.planes == self.inplanes * 2
            out2_1 = self.binary_pw_down1(out2)
            out2_2 = self.binary_pw_down2(out2)
            out2_1 = self.bn2_1(out2_1)
            out2_2 = self.bn2_2(out2_2)
            out2_1 += out1
            out2_2 += out1
            out2 = tf.concat([out2_1, out2_2], axis=-1)
        
        out2 = self.move22(out2)
        out2 = self.prelu2(out2)
        out2 = self.move23(out2)
        return out2

In [11]:
class reactnet(keras.Model):
    def __init__(self, num_classes, stage_out_channel,):
        super(reactnet, self).__init__()
        self.feature = keras.Sequential()
        
        for i in range(len(stage_out_channel)):
            if i==0:
                self.feature.add(firstconv3_3(stage_out_channel[i], 2))
            elif stage_out_channel[i-1] != stage_out_channel[i] and stage_out_channel[i] != 64:
                self.feature.add(
                    BasicBlock(stage_out_channel[i-1], stage_out_channel[i], stride=2)
                )
            else:
                self.feature.add(
                    BasicBlock(stage_out_channel[i-1], stage_out_channel[i], stride=1)
                )
        self.pool1 = keras.layers.GlobalAveragePooling2D()
        self.fc = keras.layers.Dense(num_classes)
    
    def call(self, inputs):
        x = self.feature(inputs)
        x = self.pool1(x)
        x = self.fc(x)
        return x

In [12]:
def model(image_size, num_classes, stage_out_channel):
    inputs = keras.Input((image_size, image_size, 3))
    outputs = reactnet(num_classes, stage_out_channel)(inputs)
    model = keras.Model(inputs, outputs)
    return model

In [13]:
react = model(224, 1000, STAGE_OUT_CHANNEL)
react.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
reactnet (reactnet)          (None, 1000)              33995848  
Total params: 33,995,848
Trainable params: 33,973,960
Non-trainable params: 21,888
_________________________________________________________________


In [14]:
# for layer in react.layers[-1].feature.layers:
#     if 'basic_block' in layer.name:
#         for i in range(len(layer.trainable_weights)):
#             print(layer.trainable_weights[i].name)
#             print(layer.trainable_weights[i].shape)
#         print('------')
#         print(len(layer.trainable_weights))
#         break

# Loss

In [15]:
with strategy.scope():
    class myLoss(keras.losses.Loss):
        def __init__(self):
            super(myLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE)

        def call(self, y_true, y_pred):
            model_output_log_prob = tf.math.log(keras.activations.softmax(y_pred))
            real_output_soft = keras.activations.softmax(y_true)

            real_output_soft = tf.expand_dims(real_output_soft, axis=1)
            model_output_log_prob = tf.expand_dims(model_output_log_prob, axis=-1)
            cross_entropy_loss = -tf.einsum('bij, bji->bi', real_output_soft, model_output_log_prob)

            return cross_entropy_loss

#  Learning rate schedule

In [16]:
class MyLRSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, epochs, step_epoch):
        self.initial_learning_rate = initial_learning_rate
        self.all_step = epochs * step_epoch
        
    def __call__(self, step):
        return self.initial_learning_rate*(1.0 - (step/(self.all_step)))

# Dataset

In [17]:
def read_tfrecord(example):
    features = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'class': tf.io.FixedLenFeature([], tf.int64),
        'filename': tf.io.FixedLenFeature([], tf.string)
    }
    example = tf.io.parse_single_example(example, features)
    image = tf.image.decode_jpeg(example['image'])
    class_num = example['class']
    filename = example['filename']
    return image, class_num, filename

In [18]:
def image_hw(image):
    shape = tf.shape(image)
    return shape[0], shape[1]

def resize_and_crop_image(image, target_size):
    h, w = image_hw(image)
    th, tw = target_size
    image = tf.cond(
        (w*th)/(h*tw) <1,
        lambda: tf.image.resize(image, [h * tw/w, w* tw/w]),
        lambda: tf.image.resize(image, [h * th/h, w*th/h])
    )
    nh, nw = image_hw(image)
    image = tf.image.crop_to_bounding_box(image, (nh-th)//2, (nw-tw)//2, th, tw)
    image = tf.reshape(image, [*target_size, 3])
    return image

def normalize(image, dtype):
    image = tf.cast(image, dtype) / 255.0
    return image

In [19]:
def dataset(tfrec_roots, image_size, is_train, dtype=tf.float32, batch_size=None, cache=False, drop_filename=True):
    AUTO = tf.data.experimental.AUTOTUNE
    pattern = '/train-*.tfrec' if is_train else '/val.tfrec'
    tfrec_paths = []
    for tfrec_root in tfrec_roots:
        tfrec_paths.extend(tf.io.gfile.glob(tfrec_root.rstrip('/')+pattern))
#     print('tfrec paths', tfrec_paths)
    dataset = tf.data.TFRecordDataset(tfrec_paths, num_parallel_reads=AUTO)
    options_no_order = tf.data.Options()
    options_no_order.experimental_deterministic = False
    dataset = dataset.with_options(options_no_order)

    def process(filename):
        image, label, filename = read_tfrecord(filename)
        image = resize_and_crop_image(image, target_size=image_size)
        image = tf.image.random_flip_left_right(image)
        image = normalize(image, dtype=dtype)
        result = (image, label)
        if not drop_filename:
            result += (filenmae,)
        return result
        
    dataset = dataset.map(process, num_parallel_calls=AUTO)
    if cache:
        dataset = dataset.cache()
    if is_train:
        dataset = dataset.shuffle(4096)
    if batch_size is not None:
        dataset = dataset.batch(batch_size)
    dataset = dataset.repeat()
    dataset = dataset.prefetch(AUTO)
    return dataset

# train

In [20]:
from pathlib import Path
from kaggle_datasets import KaggleDatasets
gcs_path = [KaggleDatasets().get_gcs_path(p.name) for p in Path('/kaggle/input/').iterdir()]
IMAGE_SIZE = 224
N_CLASSES = 1000
XLA = 0
MIXED = 1
BATCH_SIZE = 256
EPOCHS = 256
LEARNING_RATE = 1.25e-3
MOMENTUM = 0.9
LABEL_SMOOTH =0.1
TEACHER = 'ResNet50'

In [21]:
if MIXED:
    dtype = tf.bfloat16 if tpu else tf.float16
NUM_TRAIN_IMAGES = 1281167
NUM_VAL_IMAGES = 50000
STEP_PER_EPOCH = NUM_TRAIN_IMAGES // BATCH_SIZE
VAL_PER_EPOCH = NUM_VAL_IMAGES // BATCH_SIZE
PER_REPICE_BATCH_SIZE = BATCH_SIZE // strategy.num_replicas_in_sync
SAVE_PATH = './model.h5'

train_dataset = strategy.experimental_distribute_datasets_from_function(
    lambda _:dataset(
            gcs_path,
            is_train=True, 
            image_size=(IMAGE_SIZE, IMAGE_SIZE), 
            cache=False, 
            batch_size=PER_REPICE_BATCH_SIZE,
            drop_filename=True,
            dtype=dtype,
            ))
val_dataset = strategy.experimental_distribute_datasets_from_function(lambda _:dataset(
            gcs_path,
            is_train=False, 
            image_size=(IMAGE_SIZE, IMAGE_SIZE), 
            cache=True, 
            batch_size=PER_REPICE_BATCH_SIZE,
            drop_filename=True,
            dtype=dtype,
            ))
train_iterator = iter(train_dataset)
val_iterator = iter(val_dataset)

In [22]:
"""load model"""
with strategy.scope():
    model_teacher = keras.applications.__dict__[TEACHER](weights='imagenet', include_top=True)
    model_teacher.trainable = False

    model_student = model(224, 1000, STAGE_OUT_CHANNEL)

    lr_schedule = MyLRSchedule(LEARNING_RATE, EPOCHS, STEP_PER_EPOCH)
    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999)
    criterion = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    criterion_smooth = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0)
    criterion_kd = myLoss()
    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('training_accuracy', dtype=tf.float32)
    training_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
    val_accuracy =tf.keras.metrics.SparseCategoricalAccuracy('val_accuracy', dtype=tf.float32)


@tf.function
def train_step(iterator):
    def train_fn(inputs):
        x, y = inputs
        with tf.GradientTape() as tape:
            logits_student = model_student(x, training=True)
            logits_teacher = model_teacher(x, training=False)
            loss = criterion_kd(logits_teacher, logits_student)
            loss = tf.nn.compute_average_loss(loss, global_batch_size=BATCH_SIZE)
            
        grads = tape.gradient(loss, model_student.trainable_weights)
        optimizer.apply_gradients(list(zip(grads, model_student.trainable_weights)))
        training_accuracy.update_state(y, logits_student)
        training_loss.update_state(loss)
    
    strategy.run(train_fn, args=(next(iterator),))
    
@tf.function
def test_step(iterator):
    def test_fn(inputs):
        x, y = inputs
        val_logits = model_student(x, training=False)
        val_accuracy.update_state(y, val_logits)
    strategy.run(test_fn, args=(next(iterator),))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
for epoch in range(EPOCHS):
    print('\nstart of epoch %d'%(epoch,))
    for step in range(STEP_PER_EPOCH):
        train_step(train_iterator)    
    train_acc = training_accuracy.result()
    print('training acc over epoch: %.4f, %4f'%(float(train_acc), float(training_loss.result())))
    training_accuracy.reset_states()
    training_loss.reset_states()

    for step in range(VAL_PER_EPOCH):
        test_step(val_iterator)
    val_acc = val_accuracy.result()
    print('validation acc over epoch: %.4f'%(float(val_acc),))
    val_accuracy.reset_states()


start of epoch 0
training acc over epoch: 0.0010, 0.864433
validation acc over epoch: 0.0010

start of epoch 1
training acc over epoch: 0.0010, 0.863488
validation acc over epoch: 0.0010

start of epoch 2
training acc over epoch: 0.0010, 0.863487
validation acc over epoch: 0.0010

start of epoch 3


In [None]:
model_student.save_weights(SAVE_PATH)