Learning-to-learn is a way to train a neural network to learn. Instead of using "AdaDelta", "ADAM", or "RMSProp", give the responsiblity to a neural network to find a suitable optimizer for you. 

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import os

import tensorflow as tf

from tensorflow.contrib.learn.python.learn import monitored_session as ms

from tensorflow.contrib.learn.python.learn.datasets import mnist as mnist_dataset

import meta
import util

import nn

_nn_initializers = {
    "w": tf.random_normal_initializer(mean=0, stddev=0.01),
    "b": tf.random_normal_initializer(mean=0, stddev=0.01),
}

In [None]:
save_path = "meta_optimizer"
num_epochs = 1000
log_period = 100
evaluation_period = 1000
evaluation_epochs = 20

problem = "simple"
num_steps = 100 # Number of optimization steps per epoch
unroll_length = 20 # Meta-optimizer unroll length
learning_rate = 0.01
second_derivatives = False

### Main Learning for a simple problem of optimizing a square function

In [None]:
num_unrolls = num_steps // unroll_length
if os.path.isdir(save_path):
    os.rmdir(save_path)
os.makedirs(save_path)

### Problems

### Simple
In this problem, we have only one parameter $x$. We are trying to minimize $x^2$.

In [None]:
def simple():
  """Simple problem: f(x) = x^2."""

  def build():
    """Builds loss graph."""
    x = tf.get_variable(
        "x",
        shape=[],
        dtype=tf.float32,
        initializer=tf.ones_initializer())
    return tf.square(x, name="x_squared")

  return build

### Simple-Multi-Optimizer

In this problem, we have two parameters $x_1$ and $x_2$. We are trying to optimize $x_1^2 + x_2^2$.

In [None]:
def simple_multi_optimizer(num_dims=2):
  """Multidimensional simple problem."""

  def get_coordinate(i):
    return tf.get_variable("x_{}".format(i),
                           shape=[],
                           dtype=tf.float32,
                           initializer=tf.ones_initializer())

  def build():
    coordinates = [get_coordinate(i) for i in xrange(num_dims)]
    x = tf.concat(0, [tf.expand_dims(c, 0) for c in coordinates])
    return tf.reduce_sum(tf.square(x, name="x_squared"))

  return build


In this problem, we have total $128 * 10$ parameters. We are trying to minimize $\sum_{i=1}^{128}||W_i x_i - y_i||$. Here $W_i$ is a fixed matrix of size $10 \times 10$ and $y_i$ is a fixed column-vector of dimension $10$.

In [None]:
def quadratic(batch_size=128, num_dims=10, stddev=0.01, dtype=tf.float32):
  """Quadratic problem: f(x) = ||Wx - y||."""

  def build():
    """Builds loss graph."""

    # Trainable variable.
    x = tf.get_variable(
        "x",
        shape=[batch_size, num_dims],
        dtype=dtype,
        initializer=tf.random_normal_initializer(stddev=stddev))

    # Non-trainable variables.
    w = tf.get_variable("w",
                        shape=[batch_size, num_dims, num_dims],
                        dtype=dtype,
                        initializer=tf.random_uniform_initializer(),
                        trainable=False)
    y = tf.get_variable("y",
                        shape=[batch_size, num_dims],
                        dtype=dtype,
                        initializer=tf.random_uniform_initializer(),
                        trainable=False)

    product = tf.squeeze(tf.batch_matmul(w, tf.expand_dims(x, -1)))
    return tf.reduce_mean(tf.reduce_sum((product - y) ** 2, 1))

  return build

### mnist

In this problem, we will train MNIST problem using "learning-to-learn".

In [None]:
def _xent_loss(output, labels):
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(output, labels)
    return tf.reduce_mean(loss)

def mnist(layers,  # pylint: disable=invalid-name
          activation="sigmoid",
          batch_size=128,
          mode="train"):
    """Mnist classification with a multi-layer perceptron."""

    if activation == "sigmoid":
        activation_op = tf.sigmoid
    elif activation == "relu":
        activation_op = tf.nn.relu
    else:
        raise ValueError("{} activation not supported".format(activation))

    # Data.
    data = mnist_dataset.load_mnist()
    data = getattr(data, mode)
    images = tf.constant(data.images, dtype=tf.float32, name="MNIST_images")
    images = tf.reshape(images, [-1, 28, 28, 1])
    labels = tf.constant(data.labels, dtype=tf.int64, name="MNIST_labels")

    # Network.
    mlp = nn.MLP(list(layers) + [10],
               activation=activation_op,
               initializers=_nn_initializers)
    network = nn.Sequential([nn.BatchFlatten(), mlp])

    def build():
        indices = tf.random_uniform([batch_size], 0, data.num_examples, tf.int64)
        batch_images = tf.gather(images, indices)
        batch_labels = tf.gather(labels, indices)
        output = network(batch_images)
        return _xent_loss(output, batch_labels)

    return build

To train an optimizer, we essentially need three things
1. We should be able to give loss function.
2. We should have a LSTM network as meta-optimizer.
3. We should be able to tell our optimizer, what kind of network that we want for each variables. 

### An important observation

When I was training a simple_mulit_optimizer, the adam optimizer was not able to optimize its parameter, while the CW-DEEP-LSTM was able to minimize it. In particular, both parameters should be zero after optimization but the parameter that was optimized using adam was saturating around 0.005 and was not going below that. When I changed the update rule to CW-DEEP-LSTM, both the parameters went close to zero.  

## Problems

### Simple

In [None]:
problem_simple = simple()  # return a function "build" that gives square-loss when called 
net_config_simple = {"cw": {
        "net": "CoordinateWiseDeepLSTM",
        "net_options": {"layers": (), "initializer": "zeros"},
        "net_path": None
    }}
net_assignments_simple = None

### simple-multi-dimensional

In [None]:
problem_simple_multi_optimizer = simple_multi_optimizer()
net_config_simple_multi_optimizer = {
    "cw": {
        "net": "CoordinateWiseDeepLSTM",
        "net_options": {"layers": (), "initializer": "zeros"},
        "net_path": None
    },
    "adam": {
        "net": "Adam",
        "net_options": {"learning_rate": 0.1}
    }
}
net_assignments_simple_multi_optimizer = [("cw", ["x_0", "x_1"])]

### quadratic

In [None]:
problem_quadratic = quadratic(batch_size=2, num_dims=10)
net_config_quadratic = {"cw": {
    "net": "CoordinateWiseDeepLSTM",
    "net_options": {"layers": (20, 20)},
    "net_path": None
}}
net_assignments_quadratic = None

### mnist

In [None]:
problem_mnist = mnist(layers=(5,), mode="train")
net_config_mnist = {"cw": {
                    "net": "CoordinateWiseDeepLSTM",
                    "net_options": {
                          "layers": (20, ),
                          "preprocess_name": "LogAndSign",
                          "preprocess_options": {"k": 5},
                          "scale": 0.01,
                                    },
                    "net_path": None
                    }
              }

net_assignments_mnist = None

In [None]:
# Optimizer setup.
optimizer = meta.MetaOptimizer(**net_config_mnist)

In [None]:
minimize = optimizer.meta_minimize(
      problem_mnist, unroll_length,
      learning_rate=learning_rate,
      net_assignments=net_assignments_mnist)

In [None]:
step, update, reset, cost_op, _ = minimize

In [None]:
sess = ms.MonitoredSession()

In [None]:
tf.get_default_graph().finalize()

In [None]:
best_evaluation = float("inf")
total_time = 0
total_cost = 0

In [None]:
for e in xrange(num_epochs):
    # Training.
    time, cost = util.run_epoch(sess, cost_op, [update, step], reset,
                              num_unrolls)
    total_time += time
    total_cost += cost

    # Logging.
    if (e + 1) % log_period == 0:
        util.print_stats("Epoch {}".format(e + 1), total_cost, total_time,
                         log_period)
        total_time = 0
        total_cost = 0


### Sequential Data

In [1]:
from basic_rnn_using_tensorflow_api import BasicRNN
from sequential_data import SequentialData
import numpy as np
import tensorflow as tf

In [2]:
num_epochs = 1
data_size = 1000000
batch_size = 200
num_steps = 10
num_classes = 2
learning_rate = 0.001
state_size = 5

In [3]:
data = SequentialData(data_size=data_size, batch_size=batch_size,
                      num_steps=num_steps, num_classes=num_classes)

In [4]:
t = list(data.gen_epoch(num_epochs).next())

In [5]:
basic_rnn = BasicRNN(state_size=state_size, num_steps=num_steps, num_classes=num_classes,
                    learning_rate=learning_rate)

In [6]:
i, o = t[0]

In [7]:
init_state = np.zeros((batch_size, state_size))

In [8]:
feed = {basic_rnn.input: i, basic_rnn.target: o, basic_rnn.init_state: init_state}

In [9]:
sess = tf.Session()

In [44]:
basic_rnn.input = i
basic_rnn.target = o
basic_rnn.init_state = init_state

In [10]:
sess.run(tf.global_variables_initializer())

In [11]:
variables = tf.get_collection("variables")

In [12]:
[_.name for _ in tf.get_collection("variables")]

[u'rnn/W:0',
 u'rnn/b:0',
 u'softmax/W_softmax:0',
 u'softmax/b_softmax:0',
 u'RNN/BasicRNNCell/Linear/Matrix:0',
 u'RNN/BasicRNNCell/Linear/Bias:0',
 u'optimization/beta1_power:0',
 u'optimization/beta2_power:0',
 u'optimization/softmax/W_softmax/Adam:0',
 u'optimization/softmax/W_softmax/Adam_1:0',
 u'optimization/softmax/b_softmax/Adam:0',
 u'optimization/softmax/b_softmax/Adam_1:0',
 u'optimization/RNN/BasicRNNCell/Linear/Matrix/Adam:0',
 u'optimization/RNN/BasicRNNCell/Linear/Matrix/Adam_1:0',
 u'optimization/RNN/BasicRNNCell/Linear/Bias/Adam:0',
 u'optimization/RNN/BasicRNNCell/Linear/Bias/Adam_1:0']

In [42]:
sess.run(variables[0])

array([[-0.63520825, -0.28813177,  0.07283157, -0.13720179,  0.4340862 ],
       [ 0.54079354, -0.51890314,  0.43846619, -0.30666414,  0.62554479],
       [ 0.22436571, -0.05976206,  0.35196531,  0.05456895, -0.55082071],
       [ 0.21429145,  0.04938388,  0.07652867,  0.26680684,  0.10104239],
       [-0.42012298,  0.02139503,  0.5290997 ,  0.31906343, -0.42870903],
       [-0.63955635, -0.27138782,  0.42434478, -0.2647545 , -0.19300362],
       [ 0.24836046, -0.14170069, -0.28564963, -0.46438271, -0.13674998]], dtype=float32)

In [46]:
a, b = sess.run([basic_rnn.loss, basic_rnn.final_state])

InvalidArgumentError: You must feed a value for placeholder tensor 'input' with dtype int32
	 [[Node: input = Placeholder[dtype=DT_INT32, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op u'input', defined at:
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 498, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-78b5d3ae6555>", line 2, in <module>
    learning_rate=learning_rate)
  File "basic_rnn_using_tensorflow_api.py", line 25, in __init__
    self.create_graph()
  File "basic_rnn_using_tensorflow_api.py", line 108, in create_graph
    self.create_placeholders()
  File "basic_rnn_using_tensorflow_api.py", line 37, in create_placeholders
    self.input = tf.placeholder(tf.int32, shape=(None, self.num_steps), name="input")
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 1512, in placeholder
    name=name)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 2043, in _placeholder
    name=name)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/drl/anaconda2/envs/keras/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'input' with dtype int32
	 [[Node: input = Placeholder[dtype=DT_INT32, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


In [31]:
i = 1

In [32]:
def x():
    def y():
        global i
        print i
        i += 1
    return y

In [33]:
a = x()

In [40]:
a()

7


In [18]:
basic_rnn.

In [None]:
num_epochs = 1
data_size = 1000000
batch_size = 200
num_steps = 10
num_classes = 2
state_size = 5
learning_rate = 0.001
index = 0
initial_state = np.zeros((batch_size, state_size))
def sequence(batch_size=128):
    """Sequence classification with RNN."""
    # Data.

    data = SequentialData(data_size=data_size, batch_size=batch_size,
                      num_steps=num_steps, num_classes=num_classes)
    data = list(data.gen_epoch(num_epochs).next())
    # Network
    network = BasicRNN(state_size=state_size, num_steps=num_steps, num_classes=num_classes,
                    learning_rate=learning_rate)

    def build(): 
        global index, initial_state
        input_, target = data[index]
        output = network(batch_images)
        return _xent_loss(output, batch_labels)
    

    return build

In [None]:
[var.name for var in tf.get_collection("variables")]

In [None]:
sess.run(tf.get_collection("variables", scope="vars_optimizer")[0]).shape

In [None]:
for var in tf.get_collection("variables", scope="states"):
    print(var.name)

In [None]:
784 * 5 + 5 +  5 * 10 + 10

In [None]:
w1, b1, w2, b2 = sess.run(tf.get_collection("variables")[0:4])

In [None]:
import numpy as np

In [None]:
def sigmoid(x):
    return 1./(1+np.exp(-x))

In [None]:
def softmax(x):
    x = x - np.max(x, axis=1, keepdims=True)
    x = np.exp(x)
    return x/np.sum(x, axis=1, keepdims=True)

In [None]:
def test_mnist(x):
    l1 = sigmoid(np.dot(x, w1) + b1)
    return softmax(np.dot(l1, w2) + b2).argmax(axis=1)

In [None]:
data = mnist_dataset.load_mnist()

In [None]:
test_mnist(data.test.images[0:20])

In [None]:
data.test.labels[0:20]