In [29]:
import tensorflow as tf
import gym
import numpy as np
from scipy import signal

class BatchHandler:
    
    def __init__(self,
                 obs_space: gym.spaces.Box,
                 action_space: gym.spaces.Discrete,
                 batch_size,
                 dtype=tf.float32):
        
        self.action_space = action_space
        
        # Convert dtype from tf to numpy
        if dtype == tf.float32:
            dtype = np.float32
        elif dtype == tf.float16:
            dtype = np.float16
        
        
        self.b_obs = np.zeros(
            shape=(batch_size, ) + obs_space.shape,
            dtype=dtype
        )

        self.b_act = np.zeros(
            shape=(batch_size,),
            dtype=dtype
        )
        
        self.b_act_logits = np.zeros(
            shape=(batch_size, action_space),
            dtype=dtype
        )
        
        self.b_rew = np.zeros(
            shape=(batch_size,),
            dtype=dtype
        )
        
        self.batch_size = batch_size
        self.counter = 0
    
    def add(self, obs, action, action_logits, reward):
        self.b_obs[self.counter] = obs
        self.b_act[self.counter] = action
        self.b_act_logits[self.counter] = action_logits
        self.b_rew[self.counter] = reward
        
        if self.counter == self.batch_size - 1:
            self.counter = 0
            return True
            
        self.counter += 1
        return False    

class PGPolicy(tf.keras.models.Model):
    
    def __init__(self, action_space: gym.spaces.Discrete, dtype=tf.float32):
        super().__init__('pg_policy')
        self._dtype = dtype
        
        self.h_1 = tf.keras.layers.Dense(32, activation='relu', dtype=self._dtype)
        self.h_2 = tf.keras.layers.Dense(32, activation='relu', dtype=self._dtype)
        self.h_3 = tf.keras.layers.Dense(23, activation='relu', dtype=self._dtype)
        
        # Probabilties of each action
        self.logits = tf.keras.layers.Dense(action_space, activation=None, name='policy_logits', dtype=self._dtype)
        
        self.optimizer = tf.keras.optimizers.RMSprop(lr=0.0007) # TODO dytnamic
        self.compile(
                    optimizer=self.optimizer,
                    loss=[self._loss_logits]
                )

    
    def call(self, inputs):
        x = tf.convert_to_tensor(inputs, dtype=self._dtype) 
        x = self.h_1(x)
        #x = self.h_2(x)
        #x = self.h_3(x)
        out = self.logits(x)
        return out
        
    def _loss_logits(self, actual_y, pred_y):
        # http://inoryy.com/post/tensorflow2-deep-reinforcement-learning/
        actions, advantages = tf.split(actual_y, 2, axis=-1)
        
        weighted_sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, pred_y, sample_weight=advantages)
        entropy_loss = tf.keras.losses.categorical_crossentropy(pred_y, pred_y, from_logits=True)
        return policy_loss - 0.001 *entropy_loss
    
    def train(self, observations, actions, actions_logits, discounted_rewards):
  
        acts_and_advs = np.concatenate([actions[:, None], discounted_rewards[:, None]], axis=-1)
     
        losses = self.train_on_batch(observations, [acts_and_advs])
        print(losses)
        return losses
            
            

class PGAgent:
    
    def __init__(self, 
                 obs_space: gym.spaces.Box,
                 action_space: gym.spaces.Discrete,
                 gamma=0.99, # Discount factor
                 batch_size=1,
                 dtype=tf.float32):
        
        self.gamma = gamma
        
        self.action_space = action_space
        
        self.policy = PGPolicy(
            action_space=action_space,
            dtype=dtype
        )
        
        self.batch = BatchHandler(
            obs_space=obs_space,
            action_space=action_space,
            batch_size=batch_size,
            dtype=dtype
        )
        
        self.last_observation = None
        self.last_action = None
        self.last_action_logits = None
        
    def reset(self):
        self.last_observation = None
        self.last_action = None
        self.last_action_logits = None
        self.batch.counter = 0
        
    def predict(self, observation):
        action_logits = self.policy.predict(observation)
        self.last_action_logits = action_logits
        
        action_sample = tf.squeeze(tf.random.categorical(action_logits, 1)).numpy()
        
        self.last_observation = observation
        self.last_action = action_sample
        
        
        return action_sample
    
    def observe(self, reward):
        
        is_full = self.batch.add(
            obs=self.last_observation, 
            action=self.last_action, 
            action_logits=self.last_action_logits,
            reward=reward
        )
        
        # Batch is full.
        if is_full:
            
            #actions_and_logits = np.concatenate([
            #    self.batch.b_act[:, None], 
            #    self.batch.b_act_logits[:, None]
            #], axis=-1)
            
            observations = self.batch.b_obs
            actions = self.batch.b_act
            actions_logits = self.batch.b_act_logits
            rewards = self.batch.b_rew
            
            losses = self.policy.train(observations, actions, actions_logits, self._discounted_rewards(rewards))
            return losses

    def _discounted_rewards(self, episode_rewards):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(episode_rewards)
        running_add = 0
        for t in reversed(range(0, episode_rewards.size)):
            #if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
            running_add = running_add * self.gamma + episode_rewards[t]
            discounted_r[t] = running_add
            
        discounted_r -= discounted_r.mean()
        discounted_r /- discounted_r.std()
        
        return discounted_r


In [30]:

env = gym.make('CartPole-v0')
agent = PG(
    obs_space=env.observation_space,
    action_space=env.action_space.n,
    batch_size=32
)



for e in range(300):
    steps = 0
    terminal = False
    obs = env.reset()
    #agent.reset()
    cum_loss = 0
    while not terminal:
        action = agent.predict(obs[None, :])
        obs, reward, terminal, info = env.step(action)
        reward = 0 if terminal else reward
        
        losses = agent.observe(reward)
        if losses is not None:
            cum_loss += losses
        steps += 1



-0.0013617369
-0.016735207
-0.07514664
0.06602718
0.06956212
0.06020675
-0.10450711
-0.30911234
-0.051262133
-0.034170356
0.34022716
0.054224692
0.021173744
0.08969717
0.100440025
0.009390388
-0.05487892
0.12053683
0.0030912957
0.01026321
-0.023374163
-0.12600274
0.056313083
0.14557771
-0.03539244
0.017897598
-0.05423072
-0.31782568
-0.06807476
0.051415935
0.016246816
0.03580606
0.040626023
-0.053464696
0.04489786
-0.008301324
0.119335845
0.088634536
0.008831598
0.027109846
0.10294019
-0.07586465
-0.023696706
0.05657626
-0.064632565
0.059798557
0.043465413
-0.26420546
-0.14404768
-0.021309298
0.13886853
-0.045178823
0.0030867024
0.08863833
0.009941473
-0.22023854
0.058307275
-0.070489995
0.024035871
0.07700421
0.052212834
0.0023313025
-0.008762917
0.09426248
-0.0984613
0.030175935
-0.20274293
-0.25919795
-0.3177393
0.03745797
0.20695809
-0.033254694
0.059724838
0.009446939
-0.2577966
0.004890954
-0.11574559
-0.02174506
0.02593891
0.049329508
-0.018100388
0.054605585
0.07056083
0.059746