In [1]:
import tensorflow as tf
import faiss                   # make faiss available
import numpy as np

In [2]:
# Switch TensorFlow to eager mode so that ops execute immediately.
tf.enable_eager_execution()

- Remember everything.
- When asked to choose an action, look up the value of each candidate action from past experiences.
- Distill the expensive memoriser into a policy NN.

See also: https://arxiv.org/pdf/1702.08833.pdf

In [3]:
# Seed the global NumPy RNG so the synthetic data (and everything drawn
# after it, e.g. the LSH projection) is identical on every Restart & Run All.
seed = 0
np.random.seed(seed)

d_state = 8        # dimensionality of a state vector
d_hidden = 4       # dimensionality of the hashed (state, action) code
n_actions = 4      # actions are the integers 0 .. n_actions-1
n_samples = 10000  # number of synthetic experience tuples

# Synthetic experience, one (state, action, reward) tuple per step:
#   state  - float32 array of shape (1, d_state), standard normal
#   action - integer array of shape (1, 1), uniform over the actions
#   reward - float array of shape (1, 1), uniform in [0, 1)
data = [(np.random.standard_normal((1, d_state)).astype(np.float32),
        np.random.randint(0, n_actions, size=(1,1)),
        np.random.random((1,1)))
        for _ in range(n_samples)]

In [4]:
class LSH():
    """Random linear projection used as a cheap hash of input vectors.

    A Gaussian matrix ``W`` of shape (input_dims, output_dims) is drawn once
    from the global NumPy RNG at construction time and stored as float32.
    Calling the object multiplies its argument by that matrix.
    """

    def __init__(self, input_dims, output_dims):
        gaussian = np.random.standard_normal(size=(input_dims, output_dims))
        self.W = gaussian.astype(np.float32)

    def __call__(self, x):
        """Return ``np.dot(x, W)``, i.e. ``x`` projected to ``output_dims``."""
        projected = np.dot(x, self.W)
        return projected

In [5]:


# Smoke test: project the first stored state through a freshly drawn hash.
lsh = LSH(d_state, d_hidden)
lsh(data[0][0])

array([[ 0.07962756,  3.997297  ,  1.6272243 , -2.9108546 ]],
      dtype=float32)

In [6]:
def softmax(x, axis=1):
    """Numerically stable softmax of ``x`` along ``axis``.

    Args:
        x: array-like of scores.
        axis: axis along which the outputs sum to one (default 1).

    Returns:
        Array of the same shape as ``x`` holding
        ``exp(x) / sum(exp(x), axis)``.

    The maximum along ``axis`` is subtracted before exponentiating. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the ratio from becoming NaN) on large inputs.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would reject a zero-length axis.
        return np.exp(x)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [7]:
class Memorizer():
    """Episodic memory that chooses actions by nearest-neighbour value lookup.

    Every stored (state, action) pair is bundled into one vector, hashed with
    ``LSH`` and inserted into a flat L2 faiss index. The value observed for
    that pair is kept in ``self.values`` under the row id that faiss assigned
    to it. To act, each candidate action is paired with the query state and
    the values of its nearest stored neighbours are combined into a score.

    Args:
        input_dims: dimensionality of a state vector.
        n_actions: number of discrete actions (0 .. n_actions-1).
        hidden_dims: dimensionality of the hashed code stored in the index.
        k: number of nearest neighbours consulted per action (default 5).
    """

    def __init__(self, input_dims, n_actions, hidden_dims, k=5):
        self.n_actions = n_actions
        self.k = k

        # f: (S, A) -> hashed code
        self.lsh = LSH(input_dims+n_actions, hidden_dims)

        # build the index
        self.index = faiss.IndexFlatL2(hidden_dims)

        # to store the values, keyed by the row id inside self.index
        self.values = dict()

    def _embed(self, s, a):
        """Hash bundled (state, action) rows into the layout faiss expects."""
        x = bundle(s, a, self.n_actions)
        # faiss only accepts C-contiguous float32 matrices.
        return np.ascontiguousarray(self.lsh(x), dtype=np.float32)

    def __call__(self, s):
        """Return the index of the best-scoring action for a single state ``s``.

        Raises:
            KeyError: if nothing has been stored yet.
        """
        n_stored = self.index.ntotal
        if n_stored == 0:
            raise KeyError('Memorizer is empty: call add() before asking for an action')
        # Asking faiss for more neighbours than it holds yields id -1, which
        # has no entry in self.values, so never request more than are stored.
        k = min(self.k, n_stored)

        # calculate hash for all state-action combinations
        a = np.arange(self.n_actions)
        s = np.vstack([s for _ in range(self.n_actions)])
        h = self._embed(s, a)

        # find the nearest neighbors
        # both results are n_actions x k
        dist, indexes = self.index.search(h, k)

        # Look up the stored value of every neighbour: n_actions x k.
        # Built row by row (rather than via squeeze) so the shape stays 2-D
        # even when k == 1 or n_actions == 1.
        # TODO want a better way to index the values
        vs = np.array([[np.asarray(self.values[int(idx)]).item() for idx in row]
                       for row in indexes], dtype=np.float64)

        # softmax over neighbors. the neighbor with the;
        # - smallest dist exponentially dominates
        # - largest value exponentially dominates
        # Each action's value weights are paired with that same action's
        # distance weights ('ij,ij->i'); contracting with 'kj' instead would
        # sum the distance weights over all actions.
        vs_adj = np.einsum('ij,ij->i', softmax(vs), softmax(-dist))
        return np.argmax(vs_adj)

    def add(self, traj):
        """Store a trajectory of (state, action, reward) tuples.

        Args:
            traj: iterable of (s, a, r) tuples. An empty trajectory is a no-op.
        """
        traj = list(traj)
        if not traj:
            return

        s, a, r = tuple([np.vstack(val) for val in zip(*traj)])
        # v = discounted_rewards(r)
        v = r
        h = self._embed(s, a)

        # faiss numbers rows sequentially, so the new rows get the ids
        # init_idx .. final_idx-1.
        init_idx = self.index.ntotal
        self.index.add(h)
        final_idx = self.index.ntotal

        for i, v_t in zip(range(init_idx, final_idx), v):
            self.values[i] = v_t

    def compress(self):
        """Placeholder: summarise dense regions of the memory (not implemented)."""
        # find dense regions and summarise them.

        # could do average. not sure it makes sense.
        # take value to be the max so far
        # self.values[idx, a_t] = np.maximum(self.values[idx, a_t], v_t)
        pass

    def balance(self, gen):
        """Rebuild the memory from ``gen``, retraining the hash when possible.

        Args:
            gen: iterable of (s, a, r) tuples. It is materialised into a list
                first, because it has to be traversed twice (train, then add)
                and a generator can be consumed only once.
        """
        traj = list(gen)

        # The hash lives in self.lsh. Only retrain it when the hash object
        # actually offers a train() method.
        train = getattr(self.lsh, 'train', None)
        if callable(train):
            train(traj)

        # reset value memory.
        # TODO but only if we have actually changed the hash...
        # when balancing. want to hashes with low churn!
        self.index.reset()
        self.values.clear()
        self.add(traj)

In [8]:
def onehot(x, N):
    """Encode integer labels ``x`` as float32 one-hot vectors of width ``N``.

    The rows of the N x N identity matrix are selected by indexing with
    ``x``, so integer input gains one trailing axis of length ``N``.
    """
    identity = np.eye(N)
    selected_rows = identity[x]
    return selected_rows.astype(np.float32)

def bundle(s, a, N):
    """Join states with one-hot encoded actions into one feature matrix.

    ``a`` is one-hot encoded to width ``N``, reshaped to one row per row of
    ``s``, and appended to ``s`` along axis 1.
    """
    batch_size = s.shape[0]
    action_codes = onehot(a, N).reshape((batch_size, N))
    return np.concatenate([s, action_codes], axis=1)

In [9]:
# Memorizer over d_state-dimensional states, 4 actions and a d_hidden-dimensional hash.
player = Memorizer(d_state, 4, d_hidden)

In [10]:
# Store all synthetic (state, action, reward) tuples as a single trajectory.
player.add(data)

In [11]:
# Draw one random float32 query state of shape (1, d_state).
s = np.random.standard_normal((1, d_state)).astype(np.float32)

In [25]:

# Ask the memorizer which action it picks for the query state.
player(s)

2

In [28]:
class BehavourialCloner():
    """Policy network trained to imitate target actions (behavioural cloning).

    Args:
        opt: optimizer whose ``apply_gradients`` is used by ``train``.
            Defaults to ``tf.train.AdamOptimizer()``.

    NOTE(review): ``make_NN`` and ``sample`` are not defined in the visible
    part of this notebook; they must be provided before this class is used.
    """
    # Can use behavioural cloning techniques here?!
    # is there some fancy stuff?!
    def __init__(self, opt=None):
        self.policy = make_NN()
        # train() applies gradients through self.opt, so it has to exist.
        self.opt = opt if opt is not None else tf.train.AdamOptimizer()

    def __call__(self, s):
        """Sample an action from the policy's output for state(s) ``s``."""
        return sample(self.policy(s))

    def loss_fn(self, s, a_star):
        """Cross-entropy between the policy logits for ``s`` and targets ``a_star``."""
        logits = self.policy(s)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=a_star, logits=logits)
        return tf.reduce_mean(loss)

    def train(self, generator):
        """Take one gradient step per ``(s, a)`` pair yielded by ``generator``."""
        for s, a in generator:
            # Record the forward pass so the loss can be differentiated
            # with respect to the policy's variables.
            with tf.GradientTape() as tape:
                loss = self.loss_fn(s, a)
            grads = tape.gradient(loss, self.policy.variables)
            self.opt.apply_gradients(zip(grads, self.policy.variables))

In [None]:
class Player():
    """Agent that acts with a cloned policy and learns from its memorizer.

    Args:
        input_dims: state dimensionality; defaults to the notebook-level
            ``d_state``.
        n_actions: number of discrete actions (default 4, as used for the
            standalone memorizer in this notebook).
        hidden_dims: hash dimensionality; defaults to the notebook-level
            ``d_hidden``.

    NOTE(review): ``Buffer`` is not defined in the visible part of this
    notebook; ``sleep`` only requires it to offer ``get_states()``.
    """

    def __init__(self, input_dims=None, n_actions=4, hidden_dims=None):
        # Resolved lazily so the class can be defined before the config cell runs.
        if input_dims is None:
            input_dims = d_state
        if hidden_dims is None:
            hidden_dims = d_hidden

        self.buffer = Buffer()
        # Memorizer has three required constructor arguments.
        self.memorizer = Memorizer(input_dims, n_actions, hidden_dims)
        self.policy = BehavourialCloner()

    def choose_action(self, s):
        """Return the policy's action for state ``s``."""
        return self.policy(s)

    def sleep(self):
        """Distil the memorizer into the policy using the buffered states."""
        s = self.buffer.get_states()
        # The memorizer scores one state at a time and returns a single
        # action, so query it per state to get one target action per state.
        a = [self.memorizer(s_t) for s_t in s]
        self.policy.train(zip(s, a))