In [1]:
from os import path as osp
import numpy as np
import tensorflow as tf
import sonnet as snt
from attrdict import AttrDict

from evaluation import make_fig, make_logger

from data import load_data, tensors_from_data
from mnist_model import SeqAIRonMNIST

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
% matplotlib inline

In [2]:
# Optimiser step size, and the step budget later passed to the model as
# `max_steps`.
learning_rate = 1e-4
n_steps = 3

# Everything written by this run (event files, checkpoints) goes under
# <results_dir>/<run_name>.
results_dir = '../results/seq'
run_name = 'test'
logdir = osp.join(results_dir, run_name)
checkpoint_name = osp.join(logdir, 'model.ckpt')

# One axis index per data field, consumed by `tensors_from_data`.
# NOTE(review): presumably the axis along which each field is batched --
# confirm against data.tensors_from_data.
axes = dict(imgs=0, labels=0, nums=1)

In [3]:
# Number of examples per batch drawn by `tensors_from_data`.
batch_size = 64

# Settings for the prior on the number of steps, forwarded to
# `air.train_step`.
# NOTE(review): the field names suggest an exponentially annealed value going
# from `init` to `final`; their exact meaning is defined by the model code --
# confirm there before tuning.
num_steps_prior = AttrDict(
    anneal='exp',
    init=1.0,
    final=0.1,
    steps_div=1e4,
    steps=1e5,
    hold_init=1e3,
    analytic=False,
)
# num_steps_prior = None

# Location/scale settings for the remaining priors, also forwarded to
# `air.train_step`.
appearance_prior = AttrDict(loc=0.0, scale=1.0)
where_scale_prior = AttrDict(loc=0.0, scale=1.0)
where_shift_prior = AttrDict(loc=0.0, scale=1.0)

# `use_reinforce` is forwarded to `air.train_step`; `sample_presence` is not
# read anywhere in the visible part of the notebook.
use_reinforce = True
sample_presence = True

# Forwarded unchanged to the SeqAIRonMNIST constructor.
step_bias = 0.75
transform_var_bias = 0.5
output_multiplier = 0.5

# L2 regularisation weight passed to `air.train_step` (0 disables it; 1e-5 was
# the previously tried value).
l2_weight = 0.0  # 1e-5

In [4]:
valid_data = load_data('mnist_validation.pickle')
train_data = load_data('mnist_train.pickle')

In [5]:
# Start from an empty default graph so that re-running this cell does not
# accumulate duplicate ops.
tf.reset_default_graph()

# Input pipelines: shuffled batches for training, ordered ones for validation.
train_tensors = tensors_from_data(train_data, batch_size, axes, shuffle=True)
valid_tensors = tensors_from_data(valid_data, batch_size, axes, shuffle=False)
x, valid_x = train_tensors['imgs'], valid_tensors['imgs']
y, test_y = train_tensors['nums'], valid_tensors['nums']

# Hidden-layer sizes shared by the input encoder, glimpse encoder/decoder and
# the transform estimator: `n_layers` layers of `n_hidden_units` units each.
n_hidden_units = 32 * 8
n_layers = 2
n_hiddens = [n_hidden_units] * n_layers

# Each flat batch is folded into `seq_len` slices of `seq_batch_size` images.
# The sizes are derived from `batch_size` so the reshape keeps working when
# the batch size is changed (64 -> 4 x 16, as before).
# NOTE(review): the leading axis is presumably the time axis of the sequence
# model -- confirm in SeqAIRonMNIST.
seq_len = 4
img_size = (50, 50)
assert batch_size % seq_len == 0, 'batch_size must be divisible by seq_len'
seq_batch_size = batch_size // seq_len

seq_x = tf.reshape(x, (seq_len, seq_batch_size) + img_size)

# Non-trainable graph variable; the first `offset` slices along the leading
# axis are dropped from both the images and the targets.
# NOTE: an unused placeholder that was also named 'offset' used to be created
# just before this variable, which made TensorFlow rename the variable to
# `offset_1`; checkpoints written by that revision store it under that key.
offset = tf.Variable(0, dtype=tf.int32, name='offset', trainable=False)
seq_x = seq_x[offset:]
print(seq_x.get_shape())
seq_y = tf.reshape(y, (seq_len, seq_batch_size, -1))[offset:]

air = SeqAIRonMNIST(
    seq_x,
    max_steps=n_steps,
    inpt_encoder_hidden=n_hiddens,
    glimpse_encoder_hidden=n_hiddens,
    glimpse_decoder_hidden=n_hiddens,
    transform_estimator_hidden=n_hiddens,
    steps_pred_hidden=[128, 64],
    baseline_hidden=[256, 128],
    transform_var_bias=transform_var_bias,
    step_bias=step_bias,
    output_multiplier=output_multiplier
)

(?, 16, 50, 50)


In [6]:
# Build the optimisation op and the global-step counter. The priors and
# weights come from the hyper-parameter cell; `seq_y` is passed as `nums`.
train_step, global_step = air.train_step(
    learning_rate,
    l2_weight,
    appearance_prior,
    where_scale_prior,
    where_shift_prior,
    num_steps_prior,
    use_reinforce=use_reinforce,
    decay_rate=None,
    nums=seq_y,
)

() (?, 16) (?, 16) (?, 16)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [7]:
# List every trainable variable with its static shape, one per line.
# A single pre-formatted string is printed so the output is the same under
# Python 2 and Python 3.
for var in tf.trainable_variables():
    print('{} {}'.format(var.name, var.get_shape()))

SeqAIRonMNIST/lstm_initial_state_0/w:0 (1, 256)
SeqAIRonMNIST/lstm_initial_state_1/w:0 (1, 256)
SeqAIRonMNIST/SeqAIRCell/Encoder/MLP/linear/w:0 (2500, 256)
SeqAIRonMNIST/SeqAIRCell/Encoder/MLP/linear/b:0 (256,)
SeqAIRonMNIST/SeqAIRCell/Encoder/MLP/linear_1/w:0 (256, 256)
SeqAIRonMNIST/SeqAIRCell/Encoder/MLP/linear_1/b:0 (256,)
lstm/w_gates:0 (512, 1024)
lstm/b_gates:0 (1024,)
SeqAIRonMNIST/SeqAIRCell/StochasticTransformParam/MLP/linear/w:0 (256, 256)
SeqAIRonMNIST/SeqAIRCell/StochasticTransformParam/MLP/linear/b:0 (256,)
SeqAIRonMNIST/SeqAIRCell/StochasticTransformParam/MLP/linear_1/w:0 (256, 256)
SeqAIRonMNIST/SeqAIRCell/StochasticTransformParam/MLP/linear_1/b:0 (256,)
SeqAIRonMNIST/SeqAIRCell/StochasticTransformParam/MLP/linear_2/w:0 (256, 8)
SeqAIRonMNIST/SeqAIRCell/StochasticTransformParam/MLP/linear_2/b:0 (8,)
SeqAIRonMNIST/SeqAIRCell/StepsPredictor/MLP/linear/w:0 (256, 128)
SeqAIRonMNIST/SeqAIRCell/StepsPredictor/MLP/linear/b:0 (128,)
SeqAIRonMNIST/SeqAIRCell/StepsPredictor/MLP/l

In [8]:
# Let TensorFlow allocate GPU memory on demand rather than reserving it all
# when the session starts.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

# Open the session on the default graph and initialise every global variable.
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

In [9]:
# step_prob = air.num_steps_distrib.prob()
# mean_prob = tf.reduce_mean(step_prob, 0)
# # step_entropy = step_prob * tf.log(step_prob)
# # step_entropy = -tf.reduce_sum(step_entropy)

In [10]:
# def compute_grad(y, x):
#     g = tf.reduce_mean(tf.gradients(y, x)[0], 0)
#     norm = 1.#tf.sqrt(tf.reduce_sum(tf.square(g), -1, keep_dims=True))
#     return g / norm

# total_grad = compute_grad(air.opt_loss, step_prob)
# kl_grad = compute_grad(air.prior_loss.value, step_prob)

# grads = [total_grad, kl_grad]
# names = ['total', 'kl']

# if use_reinforce:
#     reinforce_grad = compute_grad(air.reinforce_loss, step_prob)
#     grads.append(reinforce_grad)
#     names.append('reinforce')
    
# grads.append(mean_prob)
# names.append('prob')


# for g, name in zip(grads, names):
#     step_grads = tf.unstack(g, axis=-1)
#     for i, step_grad in enumerate(step_grads):
#         tf.summary.scalar('prob_grad/{}_{}'.format(name, i), step_grad)

In [11]:
# print grads
# print names

In [12]:
# a, d = sess.run([grads, mean_prob])
# for i in a:
#     print i
# print d

In [13]:
# Single op that evaluates every summary registered in the graph so far.
# NOTE: `tf.summary.merge_all()` returns None when no summaries have been
# registered.
all_summaries = tf.summary.merge_all()
# Event-file writer for TensorBoard; passing `sess.graph` stores the graph too.
summary_writer = tf.summary.FileWriter(logdir, sess.graph)
# Saver used by the training loop to write checkpoints.
saver = tf.train.Saver()

In [14]:
# Number of whole batches in each split (integer division, so any remainder
# smaller than `batch_size` is not counted).
n_train_examples = train_data['imgs'].shape[0]
n_valid_examples = valid_data['imgs'].shape[0]
train_batches = n_train_examples // batch_size
valid_batches = n_valid_examples // batch_size

# `log(step)` is the evaluation callback invoked from the training loop.
log = make_logger(
    air,
    sess,
    summary_writer,
    train_tensors,
    train_batches,
    valid_tensors,
    valid_batches,
)

In [15]:
# tensors = {
#     'loss': air.loss.value,
#     'imp_weight': .5*tf.reduce_mean((air.reinforce_imp_weight)**2),
#     'baseline_loss': air.baseline_loss,
#     'baseline': tf.reduce_mean(air.baseline)
# }

In [None]:
# output = sess.run(tensors)
# for o, v in output.iteritems():
#     print '{}: {}'.format(o, v)

In [None]:


* add sequential AIR model and its MNIST version
* adjust STN and BaselineMLP to work with timeseries
* move anneal_weight fun to ops.py
* implement 'prob' method of NumStepsDistribution for 
* add 'opt_loss' to logging


In [None]:
# Training schedule; every value is a number of optimiser iterations.
max_train_iters = int(1e6)   # stop once the global step reaches this value
summary_every = 50           # write TensorBoard summaries
log_every = 1000             # run the `log` evaluation callback
checkpoint_every = 5000      # save a checkpoint

# Read the global step first so the loop carries on from the session's
# current counter instead of assuming it is zero.
train_itr = sess.run(global_step)
print('Starting training at iter = {}'.format(train_itr))

if train_itr == 0:
    # Fresh run only: execute the model's baseline step once and log the
    # initial metrics.
    # NOTE(review): `_baseline_tran_step` is the attribute name as spelled by
    # the model; presumably a baseline-only update -- confirm in the model code.
    sess.run(air._baseline_tran_step)
    log(0)

while train_itr < max_train_iters:

    # NOTE(review): `global_step` is fetched in the same run call that
    # executes `train_step`; whether the returned value is read before or
    # after the increment should be confirmed.
    train_itr, _ = sess.run([global_step, train_step])

    # `tf.summary.merge_all()` yields None when the graph has no summaries;
    # skip the write in that case instead of failing inside `sess.run`.
    if all_summaries is not None and train_itr % summary_every == 0:
        summaries = sess.run(all_summaries)
        summary_writer.add_summary(summaries, train_itr)

    if train_itr % log_every == 0:
        log(train_itr)

    if train_itr % checkpoint_every == 0:
        saver.save(sess, checkpoint_name, global_step=train_itr)
        # Figure dumping is currently disabled:
        # make_fig(air, sess, logdir, train_itr)

Starting training at iter = 0
Step 0, Data train prior_loss = 33.9371, loss = -66.1596, baseline_loss = 110608.9784, kl_what = 0.3690, imp_weight = -67.3129, opt_loss = 24.3343, kl_num_steps = 33.1958, reinforce_loss = 90.4939, rec_loss = -100.0966, num_step = 1.4717, num_step_acc = 0.2478, kl_where = 0.3724, eval time = 1.691s
Step 0, Data test prior_loss = 33.9803, loss = 5.4206, baseline_loss = 106089.4497, kl_what = 0.4070, imp_weight = 4.1189, opt_loss = -0.1373, kl_num_steps = 33.1956, reinforce_loss = -5.5579, rec_loss = -28.5596, num_step = 1.4965, num_step_acc = 0.2257, kl_where = 0.3777, eval time = 0.4598s

Step 1000, Data train prior_loss = 65.4491, loss = -338.1859, baseline_loss = 2496.4797, kl_what = 21.1333, imp_weight = -17.4861, opt_loss = -314.7169, kl_num_steps = 33.1946, reinforce_loss = 23.4691, rec_loss = -403.6351, num_step = 1.2769, num_step_acc = 0.2818, kl_where = 11.1213, eval time = 1.634s
Step 1000, Data test prior_loss = 68.2813, loss = -281.3978, baselin