In [1]:
import math
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import logit

from IPython.display import display

from keras.layers import (Input, Dense, Lambda, Flatten, Reshape, BatchNormalization, Layer,
                          Activation, Dropout, Conv2D, Conv2DTranspose,
                          Concatenate, add, Add, Multiply)
from keras.engine import InputSpec
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import RMSprop, Adam
from keras.models import Model
from keras import metrics
from keras import backend as K
from keras_tqdm import TQDMNotebookCallback
from keras.datasets import cifar10

from realnvp_helpers import Mask, FlowBatchNorm


%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
batch_size = 10
shape = (4, 4, 3)
batch_shape = (batch_size,) + shape
samples = 100
print(batch_shape)

train_data = np.random.normal(0.5, 3, size=(samples,) + (shape))
print(train_data.shape)
train_data[0, :, :, :]

(10, 4, 4, 3)
(100, 4, 4, 3)


array([[[ 7.91728386, -3.74211666, -1.39937365],
        [-4.54491778, -0.23326053, -0.77910284],
        [ 0.25611969,  0.44484699,  2.62382611],
        [ 4.17528109, -0.27181473,  2.78194168]],

       [[-0.47577941,  0.99883384,  1.33806874],
        [-2.13918905,  0.50927928, -5.57648648],
        [ 2.16347973,  1.40567808, -2.43587834],
        [-6.35771616, -1.08612996,  3.1987612 ]],

       [[-2.35902617, -0.46983846, -2.36385492],
        [-1.73405912,  0.54416087, -6.9883723 ],
        [-4.30888353, -3.84526239, -0.74205784],
        [-4.63604267,  1.15458543, -3.07438176]],

       [[ 1.83255414, -1.19189994,  1.09591406],
        [-2.32309297, -2.32885619,  5.81224811],
        [-5.00856445,  5.85198909,  3.97918849],
        [ 4.03387848,  3.81555843, -2.92012491]]])

In [3]:
def conv_block(input_shape, kernel_size, filters, stage, block, use_resid=True):
    ''' Adapted from resnet50 implementation in Keras '''
    filters1, filters2, filters3 = filters
    if K.image_data_format() == 'channels_last':
        bn_axis = 3
    else:
        bn_axis = 1
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'
    
    input_tensor = Input(batch_shape=input_shape)
    x = Conv2D(filters1, (1, 1),
               kernel_initializer='he_normal',
               name=conv_name_base + '2a')(input_tensor)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
    x = Activation('relu')(x)

    x = Conv2D(filters2, kernel_size,
               padding='same',
               kernel_initializer='he_normal',
               name=conv_name_base + '2b')(x)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
    x = Activation('relu')(x)

    x = Conv2D(filters3, (1, 1),
               kernel_initializer='he_normal',
               name=conv_name_base + '2c')(x)
    x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)

    if use_resid:
        x = add([x, input_tensor])
    x = Activation('relu')(x)
    
    return Model(input_tensor, x, name='conv_block' + stage + block)

In [16]:
def coupling_step(input_shape, mask_type, stage):
    ''' Implements (as per paper):
        y = b * x + (1 - b) * [x * exp(s(b * x)) + t(b * x)]
    '''
    assert mask_type in ['check_even', 'check_odd', 'channel_even', 'channel_odd']
    mask_prefix = 'check' if mask_type.startswith('check') else 'channel'
    mask_opposite = 'odd' if mask_type.endswith('even') else 'even'
    
    input_tensor = Input(batch_shape=input_shape)
    
    # Raw operations for step
    b0 = Mask(mask_type)
    b1 = Mask(mask_prefix + '_' + mask_opposite)
    s_ = conv_block(input_shape, (3, 3), (32, 32, 3), stage, '_s', use_resid=True)
    t_ = conv_block(input_shape, (3, 3), (32, 32, 3), stage, '_t', use_resid=True)
    batch = FlowBatchNorm()
       
    # Forward
    masked_input = b1(input_tensor)
    s = s_(masked_input)
    t = t_(masked_input)
    coupling = Lambda(lambda ins:  ins[0] * K.exp(ins[1]) + ins[2])([input_tensor, s, t])
    coupling_mask = b0(coupling)
    out1, out2 = Add()([masked_input, coupling_mask]), b0(s)
    out1_norm, mean, var = batch(out1)
    batch_loss = Lambda(lambda x: 0.5 * K.log(x + batch.epsilon), output_shape=(None, var.shape,))(var)
    
    # Reverse
   
    # Return result + masked scale for loss function
    #return Model(input_tensor, [out1_norm, out2, batch_loss], name='_'.join(['coupling', mask_type, stage]))
    return Model(input_tensor, [out1_norm, out2], name='_'.join(['coupling', mask_type, stage]))

In [18]:
def coupling_layer(input_tensor, steps, mask_type, stage):
    name_mapping = dict(enumerate(string.ascii_lowercase))
    
    # TODO: Only need check/channel, not even/odd right?
    assert mask_type in ['check_even', 'check_odd', 'channel_even', 'channel_odd']
    mask_prefix = 'check' if mask_type.startswith('check') else 'channel'
    
    input_shape = tuple(x.value for x in input_tensor.shape)
    x = input_tensor
    s_losses = []
    batch_losses = []
    for i in range(3):
        mask_type = mask_prefix + ('_even' if i % 2 == 0 else '_odd')
        step = coupling_step(input_shape, mask_type, stage=str(stage) + name_mapping[i])
        #x, s, batch_loss = step(x)
        x, s = step(x)
        s_losses.append(s)
        #batch_losses.append(batch_loss)
    
    #return x, s_losses, batch_losses
    return x, s_losses

In [19]:
def realnvp_loss(target, output, shape):
    # Extract x's and s's
    z = output[0]
    s_losses = output[1]
    #batch_losses = output[2]
    #z = output[:, :, :, :shape[-1]]
    #s = output[:, :, :, shape[-1]:]
   
    # log(p_X(x)) = log(p_Z(f(x))) + log(|det(\partial f(x) / \partial X^T)|)
    # Prior is standard normal(mu=0, sigma=1)
    z_loss = -0.5 * np.log(math.pi) - 0.5 * z**2
   
    # Determinant is just sum of "s" params (already log-space)
    det_loss = K.sum(s_losses)
    
    return -z_loss - det_loss

In [24]:
input_tensor = Input(batch_shape=batch_shape)
#x, s_losses, batch_losses = coupling_layer(input_tensor, steps=3, mask_type='check_even', stage=1)
x, s_losses = coupling_layer(input_tensor, steps=3, mask_type='check_even', stage=1)
s_losses = Concatenate()(s_losses)
#batch_losses = concatenate()(batch_losses)

forward_model = Model(inputs=input_tensor, outputs=[x, s_losses])
optimizer = Adam(lr=0.001)
forward_model.compile(optimizer=optimizer, 
                      loss=lambda target, output: realnvp_loss(target, output, shape=shape))
forward_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_51 (InputLayer)           (10, 4, 4, 3)        0                                            
__________________________________________________________________________________________________
coupling_check_even_1a (Model)  [(10, 4, 4, 3), (10, 19498       input_51[0][0]                   
__________________________________________________________________________________________________
coupling_check_odd_1b (Model)   [(10, 4, 4, 3), (10, 19498       coupling_check_even_1a[1][0]     
__________________________________________________________________________________________________
coupling_check_even_1c (Model)  [(10, 4, 4, 3), (10, 19498       coupling_check_odd_1b[1][0]      
__________________________________________________________________________________________________
concatenat

In [32]:
#early_stopping = keras.callbacks.EarlyStopping('val_loss', min_delta=50.0, patience=5)
#reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.0001)
s = [int(x) for x in s_losses.shape]
s[0] = int(train_data.shape[0])
history = forward_model.fit(
    train_data, [train_data, np.zeros(s)],
    batch_size=batch_size,
    epochs=20,
    callbacks=[TQDMNotebookCallback()], #, early_stopping, reduce_lr],
    verbose=0
)

HBox(children=(IntProgress(value=0, description='Training', max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0'), HTML(value='')))

InvalidArgumentError: Incompatible shapes: [4] vs. [10]
	 [[{{node training_4/Adam/gradients/loss_4/coupling_check_even_1c_loss/mul_1_grad/BroadcastGradientArgs}} = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@train...1_grad/Sum"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](training_4/Adam/gradients/loss_4/coupling_check_even_1c_loss/mul_1_grad/Shape, training_4/Adam/gradients/loss_4/coupling_check_even_1c_loss/mul_1_grad/Shape_1)]]
	 [[{{node loss_4/coupling_check_even_1c_loss/Mean_2/_6777}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_7089_loss_4/coupling_check_even_1c_loss/Mean_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
df = pd.DataFrame(history.history)
display(df.describe(percentiles=[0.25 * i for i in range(4)] + [0.95, 0.99]))
col = 'val_loss' if 'val_loss' in df else 'loss'
df[col][-25:].plot(figsize=(8, 6))

# 2019-07-28

* Got some framework up to do coupling layers but having trouble passing the scale parameter to the loss function, getting some weird tensorflow error, needs more debugging
* Without the determinant in the loss function, it looks like loss goes down, so maybe on the right track?
    * It's actually weird that we're not using the image in the output, but I guess that's what's great about this reversible model!
* TODO:
    * Debug scale function in loss
    * Add reverse (generator) network to functions above.

# 2019-07-29

* Explanation of how to estimate probability of continuous variables (relevant for computing bits/pixel without an explicit discrete distribution): https://math.stackexchange.com/questions/2818318/probability-that-a-sample-is-generated-from-a-distribution
* Idea for a post, explain likelihood estimation of discrete vs. continuous distributions (like pixels), include:
  * Probability of observing a value from continuous distribution = 0
     * https://math.stackexchange.com/questions/2818318/probability-that-a-sample-is-generated-from-a-distribution
  * Probability of observing a value from a set of discrete hypthesis (models) is non-zero using epsilon trick (see above link):
     * https://math.stackexchange.com/questions/920241/can-an-observed-event-in-fact-be-of-zero-probability
  * Explain Equation 3 from "A NOTE ON THE EVALUATION OF GENERATIVE MODELS"
     * Also include an example using a simpler case, like a bernoulli variable that we're estimating using a continuous distribution
  * Bring it back to modelling pixels and how they usually do it

# 2020-03-30

* To make reversible network, build forward and backward network at the same time using `Model()` to have components that I can use in both networks
* Looks like I have some instability here, depending on the run I can get an exact fit (-100s loss) or a poor a fit (+10):
    * Turning off residual networks helps
    * Adjusting the learning rate, batch size helps but hard to pinpoint a methodology
* Most likely it's the instability of using a scale parameter (RealNVP paper Section 3.7), might need to implement their batch norm for more stable results, especially when adding more layers:
    * Reimplement `BatchNorm`: https://github.com/keras-team/keras/blob/master/keras/layers/normalization.py
    * Except return regular result AND (variance + eps) term
    * Use the (var + eps) term to compute Jacobian for loss function (should just be log-additive)
* Once this is done, add back the other stuff:
    * Turn on residual shortcuts
    * Change batch size to reasonable number and learning rate=0.01
* If this still doesn't work, might want to implement "Running average over recent minibatches" in Appendix E

# 2020-03-31

* Fixed a bug (I think) in the network where the coupling layer was wrong.  However, it still sometimes get stuck at around a loss of 5 but more often than not (on another training run) get to -10 (after 20 iters).
* Trying to get FlowBatchNorm worknig but having some issues passing the determinant batch loss as an output because the `batch_size` is not getting passed (it has dimension (3,) but should have dimension (None, 3)).  Need to figure out how to tranlate a tensor to Layer that includes batch.

# 2020-04-05

* Reminder: BatchNormalization on conv layers only need to normalize across [B, W, H, :] layers, not the "C" layer because the filter is identical across a channel (so it uses the same mean/var to normalize).  This is nice because it's the same axis (-1) you would normalize across in a Dense layer. See: https://intellipaat.com/community/3872/batch-normalization-in-convolutional-neural-network
* I think I figured out how to return the batchnorm weights back but now I'm hitting a roadblock when I try to merge them together to put as part of the output loss -- maybe I should just forget it and use the tensors directly in the output loss?
* Now that I switched to an explicit batch size, it doesn't run anymore... get this error "Incompatible shapes: [4] vs. [32]", probably some assumption that I had, got to work backwards and fix it I think.


In [None]:
from scipy.stats import norm

for i in range(-10, 10):
    eps = i / 1000
    l = norm.cdf(0 - eps)
    r = norm.cdf(0 + eps)
    print(eps, '\t', l - r)