In [1]:
import tensorflow as tf
# Turn on eager execution in the first cell: the code below uses
# `tf.GradientTape` imperatively and reads sampled actions with `.numpy()`.
tf.enable_eager_execution()

In [2]:
import numpy as np
import gym
import tensorflow_probability as tfp
tfd = tfp.distributions  # short alias; Policy samples actions from tfd.Categorical

import trfl
import sonnet as snt

TODOs

- Test each independently.
    - tabular version of action decoding
    - ?
- Extend to partial info

In [3]:
class Policy():
    """
    Vanilla policy with A2C.

    A two-layer MLP maps its input to ``n_actions + 1`` outputs. The first
    ``n_actions`` columns are action logits; the last column is a scalar
    state-value estimate that serves as the baseline for the advantage.

    Calling the policy returns a sampled action together with a closure
    ``(r, tape) -> (loss, variables)`` that scores that action once the
    reward ``r`` is known.
    """
    def __init__(self, n_actions):
        self.fn = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation=tf.nn.selu),
            tf.keras.layers.Dense(n_actions+1)
        ])
        self.n_actions = n_actions

    def __call__(self, x):
        """Sample one action per row of ``x`` and return ``(action, loss_callback)``."""
        z = self.fn(x)
        a = tfd.Categorical(logits=z[:,:self.n_actions]).sample()
        return a, lambda r, tape: self.loss(z, a, r, tape)

    def loss(self, z, a, r, tape):
        """
        Actor-critic loss for the sampled actions.

        Args:
            z: network output, ``[batch, n_actions + 1]``.
            a: sampled actions, ``[batch]``.
            r: reward / return estimate for the actions. Any shape that
                flattens to ``[batch]`` or ``[1]`` (a scalar is broadcast).
            tape: unused here; kept so every loss callback shares one signature.

        Returns:
            ``(loss, variables)`` where ``loss`` has shape ``[batch]``.
        """
        # TODO add entropy regularisation
        logits = z[:, :self.n_actions]
        v = z[:, -1]

        # Flatten so a scalar or [batch, 1] reward lines up with the rank-1 value head.
        # The reward is treated as a constant: no gradient flows back into whatever produced it.
        r = tf.stop_gradient(tf.reshape(tf.cast(r, v.dtype), [-1]))

        # Actor: weight the log-likelihood of the action by the advantage r - V(s).
        # The advantage is a fixed signal for the actor, so gradients through it are blocked.
        advantage = tf.stop_gradient(r - v)
        policy_loss = trfl.discrete_policy_gradient(logits, a, advantage)

        # Critic: regress the value head onto the observed reward so the baseline is learned.
        value_loss = 0.5 * tf.square(r - v)

        return policy_loss + value_loss, self.fn.variables

In [4]:
class Encoder(): # aka perception!?
    """
    Maps an observation to a vector-quantized latent code.

    An MLP produces an ``n_hidden``-dimensional embedding which is snapped to
    one of 32 codebook vectors by a Sonnet ``VectorQuantizer``. Calling the
    encoder returns the quantized code and a closure
    ``(x_tp1, tape) -> (loss, variables)`` to be invoked once the next
    observation is available.

    Args:
        n_hidden: dimensionality of the latent code.
        n_inputs: dimensionality of the observation (defaults to 8).
    """
    # WANT VQ.
    # opt for;
    # - high entropy clustering
    # - disentanglement
    # - sparsity
    # - local/multiscale
    def __init__(self, n_hidden, n_inputs=8):
        self.fn = tf.keras.Sequential([
            # could add some memory in here. lSTM or DNC
            tf.keras.layers.InputLayer(input_shape=[n_inputs]),
            tf.keras.layers.Dense(64, activation=tf.nn.selu),
            tf.keras.layers.Dense(n_hidden)
        ])

        self.vq = snt.nets.VectorQuantizer(embedding_dim=n_hidden, num_embeddings=32, commitment_cost=1)

    def __call__(self, x):
        """Encode ``x`` and return ``(quantized_code, loss_callback)``."""
        h = self.fn(x)
        # Second positional argument is the quantizer's is_training flag.
        z = self.vq(h, True)
        return z['quantize'], lambda x_tp1, tape: self.loss(z, x_tp1, tape)

    def loss(self, z_t, x_tp1, tape):
        """
        Encoder loss for one transition.

        Args:
            z_t: the full output dict of the quantizer at time t.
            x_tp1: next observation; it is encoded again inside this method.
            tape: unused here; kept so every loss callback shares one signature.

        Returns:
            ``(loss, variables)`` with a scalar ``loss``.
        """
        z_tp1, _ = self(x_tp1)

        # optimise for temporal similarity
        loss_sim = tf.losses.mean_squared_error(z_t['quantize'], z_tp1)

        # The quantizer's own loss (codebook + commitment terms) is what trains the
        # codebook; without it `self.vq.embeddings` gets no gradient.
        loss_vq = z_t['loss']

        # and high perplexity
        # NOTE(review): perplexity looks like it is derived from hard code assignments,
        # so this term probably only shifts the logged value and adds no gradient - confirm.
        loss = loss_sim + loss_vq - z_t['perplexity']
        return loss, self.fn.variables + [self.vq.embeddings]

In [None]:
class Transition(): # aka reasoning!?
    """
    Latent transition model driven by an energy function and a value function.

    The predicted next state is one gradient step on ``value - energy`` taken
    from the current latent state. Calling the model returns that prediction
    and a closure ``(x_tp1, r, tape) -> (losses, variables)``.

    Args:
        n_hidden: dimensionality of the latent state fed to the energy
            network (defaults to 16).
    """
    # could add another 'intrinsic' value fn here.
    # to encourage exploration.

    # also should explore energy via distribution vs not.

    # seems weird. how I am training this. the energy fn will never be able to
    # achieve high next step prediction accuracy unless it has a model of V!?
    def __init__(self, n_hidden=16):
        self.energy_fn = tf.keras.Sequential([
            # could add some memory in here. lSTM or DNC
            tf.keras.layers.InputLayer(input_shape=[n_hidden]),
            tf.keras.layers.Dense(64, activation=tf.nn.selu),
            tf.keras.layers.Dense(1)
        ])

        self.value_fn = tf.keras.Sequential([
            # could add some memory in here. lSTM or DNC
            tf.keras.layers.Dense(64, activation=tf.nn.selu),
            tf.keras.layers.Dense(1)
        ])

        self.gamma = 0.99

    def __call__(self, x_t, step_size=0.1):
        """
        Predict the next latent state.

        Args:
            x_t: current latent state tensor, one row per batch element.
            step_size: size of the gradient step taken in latent space.

        Returns:
            ``(x_hat_tp1, loss_callback)``.
        """
        with tf.GradientTape() as tape:
            tape.watch(x_t)
            e = self.energy_fn(x_t)
            v = self.value_fn(x_t)

            cost = v - e

        # For a single source tensor the tape returns a single tensor shaped like x_t,
        # so every batch row moves along its own gradient.
        grad = tape.gradient(cost, x_t)
        # ascend value and descend energy
        x_hat_tp1 = x_t + step_size*grad
        return x_hat_tp1, lambda x_tp1, r, tape: self.loss(x_t, x_tp1, x_hat_tp1, v, r, tape)

    def loss(self, x_t, x_tp1, x_hat_tp1, v_t, r_t, tape):
        """
        Losses for one observed transition.

        Args:
            x_t: latent state at time t (unused in the computation).
            x_tp1: observed latent state at time t+1.
            x_hat_tp1: predicted latent state at time t+1.
            v_t: value estimate at time t.
            r_t: reward received for the transition.
            tape: unused here; kept so every loss callback shares one signature.

        Returns:
            ``([loss_value, loss_acc], [value_fn.variables, energy_fn.variables])``.
        """
        # observations should have low energy
        # not sure how to optimise for that!?

        # for now. optimise E, V for accuracy
        loss_acc = tf.losses.mean_squared_error(x_tp1, x_hat_tp1)

        # value should predict future rewards
        # NOTE(review): the bootstrap target is not wrapped in stop_gradient and ignores
        # episode termination - confirm this is intended.
        v_tp1 = self.value_fn(x_tp1)
        loss_value = tf.losses.mean_squared_error(v_t, r_t+self.gamma*v_tp1)
        # could split out the value fn as another class. as will need for policy as well.

        losses = [loss_value,loss_acc]
        variables = [self.value_fn.variables, self.energy_fn.variables]

        return losses, variables

In [None]:
class Net():
    """
    Agent that wires together an Encoder, a Transition model and a Policy and
    trains all three online with a single Adam optimizer.

    Calling the agent on an observation returns the chosen action and a
    closure ``callback(x_tp1, r, tape)`` that computes every loss for the
    transition, logs them, and applies one optimizer step.
    """
    def __init__(self, n_actions):
        self.encoder = Encoder(16)
        self.transition = Transition()
        # This policy has little to do with achieving 'extrinsic value'.
        # Its main task is reachability. I want to go to X. This policy should make it happen.
        self.policy_fn = Policy(n_actions)
        # does this need memory?
        # the ability to integrate the deltas?
        # the ability to remember the past?
        # will be a pain for training...

        self.opt = tf.train.AdamOptimizer()
        self.step = tf.train.get_or_create_global_step()

        # Summaries written anywhere after this go to this writer by default.
        self.writer = tf.contrib.summary.create_file_writer('/tmp/net/0')
        self.writer.set_as_default()

    def __call__(self, x_t):
        """
        Handles training and prediction.

        Args:
            x_t: current observation tensor.

        Returns:
            ``(action, callback)``. ``callback(x_tp1, r, tape)`` must be given
            the next observation, the reward, and the gradient tape under
            which this call (and the callback itself) is being recorded.
        """
        # and/or could use a worker to collect data and train offline...
        # BUT. how to combine both!? will need to aggregate params somehow. or stop playing.

        # key to good online learning is a good exploration policy that
        # produces a uniform distribution of ...

        s_t, encoder_callback = self.encoder(x_t)
        s_hat_tp1, transition_callback = self.transition(s_t)
        a, policy_callback = self.policy_fn(s_t - s_hat_tp1)  # is this enough info? or do we need abs info?

        def callback(x_tp1, r, tape):
            # The forward results for time t are captured by the sub-callbacks, so only
            # the next observation is encoded here (the encoder callback encodes it again).
            s_tp1, _ = self.encoder(x_tp1)

            encoder_loss, encoder_vars = encoder_callback(x_tp1, tape)
            transition_losses, transition_vars = transition_callback(s_tp1, r, tape)

            # use the transition loss as the reward for the policy.
            # but high loss means the policy could not achieve its target.
            # but is also good as maybe we experienced something novel!? what is the difference?
            policy_loss, policy_vars = policy_callback(tf.stop_gradient(-transition_losses[1]), tape)

            # Order matters: it must line up with the summary names below.
            losses = [encoder_loss, policy_loss] + list(transition_losses)
            variables = [encoder_vars, policy_vars] + list(transition_vars)

            with tf.contrib.summary.record_summaries_every_n_global_steps(10):
                names = ['enc', 'policy', 'value', 'acc']
                for name, loss in zip(names, losses):
                    tf.contrib.summary.scalar(name, loss)

            # A list of targets is differentiated as their sum; the result mirrors the
            # nested structure of `variables` (one list of gradients per variable group).
            grads = tape.gradient(losses, variables)

            # Flatten to (gradient, variable) pairs and skip variables that are not
            # connected to any loss instead of feeding all-zero gradients to Adam.
            gnvs = [(g, v)
                    for G, V in zip(grads, variables)
                    for g, v in zip(G, V)
                    if g is not None]

            self.opt.apply_gradients(gnvs, global_step=self.step)

        return a, callback

In [None]:
# Create the Gym environment shared by the cells below.
env = gym.make('LunarLander-v2')
obs = env.reset()  # initial observation; run_episode() resets the environment itself

  result = entry_point.load(False)


In [None]:
# Build the agent; its policy head is sized by the environment's action count.
player = Net(env.action_space.n)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.


In [None]:
def run_episode():
    """
    Play one episode in the module-level `env` with `player`, training after
    every environment step, and return the episode's total reward.

    The running total is also written as the 'R' summary after each step.
    """
    def as_batch(observation):
        # Wrap a raw observation as a float32 batch of one 8-dimensional row.
        return tf.constant(observation, dtype=tf.float32, shape=[1, 8])

    total_reward = 0
    observation = env.reset()
    finished = False

    while not finished:
        # Everything the agent computes for this step, including the learning
        # callback, is recorded on one tape.
        with tf.GradientTape() as tape:
            action, learn = player(as_batch(observation))
            env.render()
            observation, step_reward, finished, _ = env.step(action.numpy()[0])

            learn(
                as_batch(observation),
                tf.constant(step_reward, dtype=tf.float32, shape=[1, 1]),
                tape,
            )
            total_reward += step_reward

        with tf.contrib.summary.record_summaries_every_n_global_steps(1):
            tf.contrib.summary.scalar('R', total_reward)

    return total_reward

In [None]:
# Train online for 10000 episodes; each call plays one episode and updates
# `player` along the way. The per-episode return value is discarded here.
for _ in range(10000):
    run_episode()