# Classification with Virtual Branching

In [1]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
from vbranch.callbacks import classification_acc
from vbranch.applications.fcn import FCN
from vbranch.applications.cnn import CNN, CNNCifar10
from vbranch.applications.resnet import ResNet18
from vbranch.applications.densenet import DenseNet3
from vbranch.losses import softmax_cross_entropy_with_logits

from vbranch.utils import TFSessionGrow, restore_sess
from vbranch.utils.training import *
from vbranch.utils.generic import get_path, save_results
from vbranch.datasets import cifar10

Using TensorFlow backend.


In [3]:
# --- Experiment identity (DATASET / ARCHITECTURE feed get_data and get_path) ---
DATASET = 'cifar10'
ARCHITECTURE = 'densenet'
MODEL_ID = 1

# --- Data settings (passed to get_data / bag_samples further down) ---
NUM_CLASSES = 10
NUM_FEATURES = None
SAMPLES_PER_CLASS = None
TRAIN_FRAC = 1.0
BAGGING_SAMPLES = 1.0

# --- Optimisation settings ---
BATCH_SIZE = 64
EPOCHS = 100
# T_0 is handed to lr_exp_decay_scheduler together with EPOCHS in build_model.
T_0 = 50
# STEPS_PER_EPOCH = 390

## Data

In [4]:
# Load the train/test splits once; everything below reads these globals.
train_split, test_split = get_data(
    DATASET, ARCHITECTURE, NUM_CLASSES, NUM_FEATURES, SAMPLES_PER_CLASS,
    train_frac=TRAIN_FRAC, preprocess=True)
X_train, y_train = train_split
X_test, y_test = test_split

# Placeholder shapes with a free leading dimension.
x_shape = (None, *X_test.shape[1:])
y_shape = (None, NUM_CLASSES)

def create_generator(batch_size):
    """Return a new ``cifar10.DataGeneratorTrain`` yielding ``batch_size`` samples.

    The generator is configured with one-hot labels, preprocessing, flipping
    and a padding of 4; the image size is taken from ``X_test.shape[1]``.
    """
    generator_options = dict(one_hot=True, preprocess=True, flip=True, padding=4)
    return cifar10.DataGeneratorTrain(batch_size=batch_size,
                                      im_size=X_test.shape[1],
                                      **generator_options)

In [5]:
# Sanity check: display the shapes of the training arrays.
(X_train.shape, y_train.shape)
# X_test.shape, y_test.shape

((50000, 32, 32, 3), (50000, 10))

In [6]:
# Range, mean and standard deviation of the preprocessed training inputs.
print(X_train.min(), X_train.max(), X_train.mean(), X_train.std())
# print(X_test.min(), X_test.max())

-2.4290657439446366 2.7537313432835817 -1.8789740505556917e-05 1.248454173223828


## Training

In [7]:
def path(n_branches, shared_frac):
    """Return the results sub-path for one experiment configuration.

    Args:
        n_branches: number of virtual branches (forwarded as ``B``).
        shared_frac: shared fraction (forwarded as ``S``); ``None`` selects
            the plain, non-virtual-branching path.

    Returns:
        Whatever ``get_path`` returns for the current ``DATASET`` and
        ``ARCHITECTURE`` under the ``sensitivity-Ba<BATCH_SIZE>`` experiment.
    """
    experiment = f'sensitivity-Ba{BATCH_SIZE}'

    if shared_frac is not None:
        return get_path(DATASET, ARCHITECTURE, experiment, vb=True,
                        B=n_branches, S=shared_frac)

    return get_path(DATASET, ARCHITECTURE, experiment)

In [8]:
# lr_scheduler = lr_step_scheduler((150, 0.1), (0.75*300, 0.01), (300, 0.001))

In [9]:
# lr_steps = [lr_scheduler(e + 1) for e in range(EPOCHS)]
# plt.plot(lr_steps)
# plt.title('Learning Rate')
# plt.show()

In [10]:
def build_model(n_branches, shared_frac, name='model', compile_loss=True):
    """Build (and optionally compile) the model in the default TF graph.

    Args:
        n_branches: number of branches; one data generator is created per
            branch, each with batch size ``BATCH_SIZE // n_branches``.
        shared_frac: forwarded to ``DenseNet3`` as ``shared_frac``.
        name: used both as the variable scope and as the model name.
        compile_loss: when true, the model is compiled with an Adam optimizer,
            softmax cross-entropy loss, an accuracy callback and a learning
            rate scheduler bound to the ``lr:0`` placeholder.

    Returns:
        The constructed model.

    Raises:
        ValueError: if ``ARCHITECTURE`` is not ``'densenet'``.
    """
    # Generator-backed input pipeline (this replaced an earlier
    # get_data_iterator(x_shape, y_shape, ...) based pipeline).
    generators = [create_generator(BATCH_SIZE // n_branches)
                  for _ in range(n_branches)]
    (inputs, labels), train_init_op, test_init_op = get_data_iterator_from_generator(
        generators, (x_shape, y_shape), n=n_branches, labels=True)

    print(inputs, labels)

    # Created outside the variable scope so the tensor is addressable as 'lr:0'.
    lr = tf.placeholder('float32', name='lr')
    lr_scheduler = lr_exp_decay_scheduler(0.001, T_0, EPOCHS, 0.001)

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        if ARCHITECTURE != 'densenet':
            raise ValueError('invalid model type')

        model = DenseNet3(depth=100, growth_rate=12,
                          inputs=inputs, classes=NUM_CLASSES, name=name,
                          shared_frac=shared_frac)

        if not compile_loss:
            return model

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        accuracy_callback = classification_acc(n_branches, batch_size=250)
        model.compile(optimizer, softmax_cross_entropy_with_logits(),
                      train_init_op, test_init_op, labels=labels,
                      callbacks={'acc': accuracy_callback},
                      schedulers={'lr:0': lr_scheduler})

    return model

In [11]:
def train(n_branches, shared_frac, model_id=1):
    """Train one configuration from scratch and save its training history.

    Args:
        n_branches: number of branches to build and train.
        shared_frac: shared fraction forwarded to ``path`` and ``build_model``.
        model_id: index used in the ``model_<id>`` directory name and in the
            ``train_<id>.csv`` results file.

    Returns:
        The history object returned by ``model.fit``.
    """
    dirpath = path(n_branches, shared_frac)
    model_path = os.path.join('models', dirpath, 'model_{}'.format(model_id))
    # Create the directory through the standard library rather than by
    # concatenating the path into a shell command (no quoting issues, no
    # dependency on a POSIX `mkdir`).
    os.makedirs(model_path, exist_ok=True)
    print(model_path)

    tf.reset_default_graph()
    model = build_model(n_branches, shared_frac)
    model.summary()

    # NOTE: the previous bag_samples(X_train, y_train, n_branches,
    # max_samples=BAGGING_SAMPLES) call was removed: its result was only
    # consumed by per-branch 'vb{i}_x:0' / 'vb{i}_y:0' feeds that are disabled,
    # so it just materialised unused copies of the training set.
    train_dict = {'x:0': X_train, 'y:0': y_train, 'batch_size:0': BATCH_SIZE}
    val_dict = {'x:0': X_test, 'y:0': y_test, 'batch_size:0': 250}

    # A throwaway generator with the per-branch batch size is only used to
    # ask how many steps make up one epoch.
    steps_per_epoch = create_generator(BATCH_SIZE // n_branches).get_steps_per_epoch()

    history = model.fit(EPOCHS, steps_per_epoch,
                        train_dict=train_dict,
                        val_dict=val_dict, log_path=model_path, verbose=1)
    save_results(history, dirpath, f'train_{model_id}.csv')

    return history

In [12]:
# for n_branches in range(3, 5):
#     for shared_frac in [0.5, 0.75, 1.]:
#         for t in range(4):
#             train(n_branches, shared_frac, model_id=t+1)
# history = train(n_branches=1, shared_frac=None, model_id=1)
# history = train(n_branches=2, shared_frac=0.0, model_id=2)

In [None]:
# Sweep: one model (model_id=1) per (n_branches, shared_frac) combination.
for n_branches in (2, 3, 4):
    for shared_frac in (0.25, 0.5, 0.75, 1.):
        for trial in (1,):
            train(n_branches, shared_frac, model_id=trial)

models/sensitivity-Ba64/vb-cifar10-densenet/B2/S0.25/model_1
('float32', 'float32') ((None, 32, 32, 3), (None, 10))
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    
Instructions for updating:
Colocations handled automatically by placer.
('float32', 'float32') ((None, 32, 32, 3), (None, 10))
[<tf.Tensor 'input_1:0' shape=(?, 32, 32, 3) dtype=float32>, <tf.Tensor 'input_2:0' shape=(?, 32, 32, 3) dtype=float32>] [<tf.Tensor 'input_1:1' shape=(?, 10) dtype=float32>, <tf.Tensor 'input_2:1' shape=(?, 10) dtype=float32>]
Instructions for updating:
Use tf.cast instead.
i    Layer name                      Output sha

Epoch 1/100


## Evaluation

In [None]:
# from vbranch.utils.generic import get_model_path, get_vb_model_path
from vbranch.utils.test import baseline_classification, compute_correlation_strength, compute_acc_from_logits
import json

### Correlation and Strength

For classification, we can compute the correlation between models and their strength. The formulas used are from the Random Forest paper:

https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf

In [None]:
def correlation_strength(n_branches, shared_frac, model_id):
    """Restore a saved model and compute correlation and strength on the test set.

    Args:
        n_branches: number of branches in the saved model.
        shared_frac: shared fraction used to locate the saved model.
        model_id: index of the ``model_<id>`` directory to restore.

    Returns:
        The result of ``compute_correlation_strength`` on the per-branch
        outputs and ``y_test``.
    """
    model_path = os.path.join('models', path(n_branches, shared_frac),
                              'model_{}'.format(model_id))
    print(model_path)

    # Branch-specific graph names are 1-based.
    branch_ids = range(1, n_branches + 1)
    test_init_ops = ['test_init_op_{}'.format(b) for b in branch_ids]
    tensors = ['model/output/vb{}/output:0'.format(b) for b in branch_ids]

    # Feed the whole test set as a single batch.
    test_feed = {'x:0': X_test, 'y:0': y_test, 'batch_size:0': len(X_test)}

    with TFSessionGrow() as sess:
        restore_sess(sess, model_path)
        sess.run(test_init_ops, feed_dict=test_feed)
        outputs = sess.run(tensors)

    return compute_correlation_strength(outputs, y_test, NUM_CLASSES, n_branches)

In [None]:
# Sweep settings
shared_frac_list = [0., 0.25, 0.5, 0.75, 1.]
n_trials = 4

# results[n_branches][shared_frac] = [mean over trials, std over trials]
correlation_results = {}
strength_results = {}

for b in range(2, 3):
    correlation_results[b] = {}
    strength_results[b] = {}

    for shared in shared_frac_list:
        correlation_list, strength_list = [], []

        # Trials are the saved models model_1 ... model_<n_trials>.
        for model_id in range(1, n_trials + 1):
            tf.reset_default_graph()
            trial_corr, trial_strength = correlation_strength(b, shared, model_id)
            correlation_list.append(trial_corr)
            strength_list.append(trial_strength)

        correlation_results[b][shared] = [np.mean(correlation_list),
                                          np.std(correlation_list)]
        strength_results[b][shared] = [np.mean(strength_list),
                                       np.std(strength_list)]

In [None]:
# open(..., 'w') does not create missing parent directories, so make sure the
# results folder exists before writing.
os.makedirs('results/sensitivity-3', exist_ok=True)

with open(f'results/sensitivity-3/correlation-{DATASET}-{ARCHITECTURE}.json', 'w') as f:
    json.dump(correlation_results, f, indent=4)
with open(f'results/sensitivity-3/strength-{DATASET}-{ARCHITECTURE}.json', 'w') as f:
    json.dump(strength_results, f, indent=4)

In [None]:
architecture = ['cnn'] #, 'cnnx'] #, 'fcn2', 'fcn3', 'fcn2A', 'fcn3A']
correlation, strength = [], []


def _read_json(json_path):
    """Load and return the JSON document stored at ``json_path``."""
    with open(json_path, 'r') as f:
        return json.load(f)


# One entry per architecture, in the same order as `architecture`.
for arch in architecture:
    correlation.append(_read_json(f'results/sensitivity-3/correlation-{DATASET}-{arch}.json'))
    strength.append(_read_json(f'results/sensitivity-3/strength-{DATASET}-{arch}.json'))

In [None]:
def plot_corr_strength(n_branches):
    """Plot correlation and strength against shared fraction, per architecture.

    Reads the notebook globals ``architecture``, ``correlation``, ``strength``,
    ``shared_frac_list`` and ``n_trials``. The left panel shows correlation,
    the right panel strength; error bars are ``2 * std / sqrt(n_trials)``.

    Args:
        n_branches: branch count whose results are plotted (looked up as
            ``str(n_branches)`` in the loaded JSON results).
    """
    def mean_std(data):
        # Keys of the loaded JSON are strings, hence the str(frac) lookup.
        entries = [data[str(frac)] for frac in shared_frac_list]
        mean = np.array([entry[0] for entry in entries])
        std = np.array([entry[1] for entry in entries])
        return mean, std

    def draw_panel(ax, results, title):
        # One error-bar curve per architecture on the given axes.
        for i, arch in enumerate(architecture):
            mean, std = mean_std(results[i][str(n_branches)])
            ax.errorbar(shared_frac_list, mean, 2*std / np.sqrt(n_trials), label=arch)
        ax.set_title(title)
        ax.set_xlabel('shared frac')
        ax.set_ylabel(title.lower())
        # Build the legend once, and only if something was actually drawn.
        if architecture:
            ax.legend()

    fig, (ax_corr, ax_strength) = plt.subplots(1, 2, figsize=(10, 4))
    draw_panel(ax_corr, correlation, 'Correlation')
    draw_panel(ax_strength, strength, 'Strength')
    fig.tight_layout()

    plt.show()

In [None]:
plot_corr_strength(2)

### Model Parameters

In [None]:
# Vbranch params
shared_frac_list = [0.] #, 0.25, 0.5, 0.75, 1.]
num_branches = 1

vbranch_params = []
for frac in shared_frac_list:
    tf.reset_default_graph()
    inputs = tf.placeholder('float32', [None, 32,32,3])
    model = build_model(num_branches, frac, compile_loss=False)
    model.summary()
    vbranch_params.append(model.count_parameters())

In [None]:
# Parameter counts relative to the last entry of the sweep.
reference_count = vbranch_params[-1]
param_ratio = [count / reference_count for count in vbranch_params]

# Reference curve: B - s^2 * (B - 1) for B = num_branches, s = shared fraction.
shared_frac_array = np.array(shared_frac_list)
ideal_ratio = num_branches - shared_frac_array**2 * (num_branches - 1)

In [None]:
# Measured ratios (points) against the reference curve (line).
plt.scatter(shared_frac_list, param_ratio, color='orange')
# plt.plot(shared_frac_list, [1]*len(shared_frac_list))
plt.plot(shared_frac_list, ideal_ratio)

plt.xlabel('shared frac')
plt.ylabel('params / baseline')
plt.title('{} parameter count'.format(ARCHITECTURE))

# Name the file after the architecture shown in the title (the path used to be
# hard-coded to 'cnn-small'), and make sure the output folder exists.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/{}-parameter-count.png'.format(ARCHITECTURE))
plt.show()