In [1]:
import tensorflow.keras.losses as kloss
import tensorflow.keras.optimizers as opt

In [2]:
import sys
sys.path.append("../src/")

from config import *
from pong_wrapper import *
from process_image import *
from utilities import *
from plugin_write_and_run import *
from a2c_networks import *

In [3]:
# Build the environment wrapper used for both training and evaluation.
# NOTE(review): `PongWrapper` and `ENV_NAME` are not defined in this notebook;
# they presumably arrive via the star imports (pong_wrapper / config) — confirm.
pw = PongWrapper(ENV_NAME)

In [4]:
class A2CAgent:
    """Advantage actor-critic (A2C) trainer around a two-output Keras model.

    The model is compiled with two losses, in order: a policy loss applied to
    the logits output and a value loss applied to the value-estimate output.
    Besides the Keras API (`compile`, `train_on_batch`), the model must expose
    `action_value(obs_batch)` returning an `(action, value)` pair.
    """

    def __init__(self, model, lr=1e-4, gamma=0.99, value_c=0.5, entropy_c=1e-4):
        """Store the hyperparameters and compile `model` for A2C training.

        Args:
            model: Model providing `compile`, `train_on_batch` and `action_value`.
            lr: Learning rate handed to the RMSprop optimizer.
            gamma: Discount factor used when computing returns.
            value_c: Coefficient that scales the value (critic) loss.
            entropy_c: Coefficient that scales the entropy term of the policy loss.
        """
        # `gamma` is the discount factor.
        self.gamma = gamma
        # Coefficients are used for the loss terms.
        self.value_c = value_c
        self.entropy_c = entropy_c

        self.model = model
        self.model.compile(
            # `learning_rate` is the supported keyword; the `lr` alias is
            # deprecated and rejected by newer Keras optimizers.
            optimizer=opt.RMSprop(learning_rate=lr),
            # Define separate losses for policy logits and value estimate.
            loss=[self._logits_loss, self._value_loss])

    def train(self, wrapper, batch_sz=64, updates=1000, input_shape=(84, 84, 4)):
        """Run the collect-then-update training loop.

        Args:
            wrapper: Environment wrapper exposing `reset()`,
                `step(action, render_mode)` and a `state` attribute holding
                the observation that is fed to the model.
            batch_sz: Number of environment steps collected per update.
            updates: Number of optimizer updates to perform.
            input_shape: Shape of a single observation stored in the batch.

        Returns:
            List of per-episode reward totals; the last entry belongs to the
            episode still in progress when training stopped.
        """
        # Storage helpers for a single batch of data.
        actions = np.empty((batch_sz,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_sz))
        observations = np.empty((batch_sz,) + input_shape)

        # Training loop: collect samples, send to optimizer, repeat updates times.
        ep_rewards = [0.0]
        next_obs = wrapper.reset()
        for update in range(updates):
            for step in range(batch_sz):
                observations[step] = next_obs.copy()
                actions[step], values[step] = self.model.action_value(next_obs[None, :])
                # The observation returned by `step` is discarded: the model
                # input is taken from `wrapper.state` instead.
                _, rewards[step], dones[step], _ = wrapper.step(actions[step], "rgb_array")
                next_obs = wrapper.state
                ep_rewards[-1] += rewards[step]
                if dones[step]:
                    ep_rewards.append(0.0)
                    next_obs = wrapper.reset()
                    print("Episode: %03d, Reward: %03d" % (
                        len(ep_rewards) - 1, ep_rewards[-2]))

            # Bootstrap value of the state that follows the last collected step.
            _, next_value = self.model.action_value(next_obs[None, :])

            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # A trick to input actions and advantages through same API.
            acts_and_advs = np.concatenate([actions[:, None], advs[:, None]], axis=-1)

            # Performs a full training step on the collected batch.
            # Note: no need to mess around with gradients, Keras API handles it.
            losses = self.model.train_on_batch(observations, [acts_and_advs, returns])

            print("[%d/%d] Losses: %s" % (update + 1, updates, losses))

        return ep_rewards

    def _returns_advantages(self, rewards, dones, values, next_value):
        """Compute discounted returns and advantages for one batch.

        Args:
            rewards: 1-D array of per-step rewards.
            dones: 1-D array of episode-termination flags (non-zero = done).
            values: 1-D array of value estimates for the visited states.
            next_value: Bootstrap value estimate for the state after the last
                step; may be a scalar or a single-element array.

        Returns:
            Tuple `(returns, advantages)` of arrays shaped like `rewards`.
        """
        # `next_value` is the bootstrap value estimate of the future state (critic).
        # Flatten it first so a 0-d scalar is accepted as well as a (1,) array.
        returns = np.append(np.zeros_like(rewards), np.ravel(next_value), axis=-1)

        # Returns are calculated as discounted sum of future rewards.
        # A `done` flag zeroes the bootstrap so returns do not leak across episodes.
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.gamma * returns[t + 1] * (1 - dones[t])
        returns = returns[:-1]

        # Advantages are equal to returns - baseline (value estimates in our case).
        advantages = returns - values

        return returns, advantages

    def test(self, pw, render=True):
        """Play one episode with the current policy and return its total reward.

        Args:
            pw: Environment wrapper exposing `reset()`,
                `step(action, render_mode)` and `state`.
            render: Currently unused; kept so existing callers keep working.

        Returns:
            Sum of the rewards collected until the episode terminates.
        """
        obs, done, ep_reward = pw.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(obs[None, :])
            # As in `train`, the model input comes from `pw.state`.
            _, reward, done, _ = pw.step(action, "rgb_array")
            obs = pw.state
            ep_reward += reward
        return ep_reward

    def _value_loss(self, returns, value):
        """Critic loss: MSE between returns and value estimates, scaled by `value_c`."""
        # Value loss is typically MSE between value estimates and returns.
        return self.value_c * kloss.mean_squared_error(returns, value)

    def _logits_loss(self, actions_and_advantages, logits):
        """Actor loss: advantage-weighted cross-entropy minus an entropy term.

        Args:
            actions_and_advantages: Tensor whose last axis packs
                `[action, advantage]` for every sample.
            logits: Unnormalized policy outputs of the model.

        Returns:
            Policy loss with `entropy_c * entropy` subtracted from it.
        """
        # A trick to input actions and advantages through the same API.
        actions, advantages = tf.split(actions_and_advantages, 2, axis=-1)

        # Sparse categorical CE loss obj that supports sample_weight arg on `call()`.
        # `from_logits` argument ensures transformation into normalized probabilities.
        weighted_sparse_ce = kloss.SparseCategoricalCrossentropy(from_logits=True)

        # Policy loss is defined by policy gradients, weighted by advantages.
        # Note: we only calculate the loss on the actions we've actually taken.
        actions = tf.cast(actions, tf.int32)
        policy_loss = weighted_sparse_ce(actions, logits, sample_weight=advantages)

        # Entropy loss can be calculated as cross-entropy over itself.
        probs = tf.nn.softmax(logits)
        entropy_loss = kloss.categorical_crossentropy(probs, probs)

        # We want to minimize policy and maximize entropy losses.
        # Here signs are flipped because the optimizer minimizes.
        return policy_loss - self.entropy_c * entropy_loss

In [5]:
# Build the network, sizing its action output from the environment's action space.
# NOTE(review): `Model` and `hidden` are not defined in this notebook; they
# presumably arrive via the star imports (a2c_networks / config) — confirm.
model = Model(num_actions=pw.env.action_space.n, hidden=hidden)

In [6]:
# Wrap the model in an A2C agent with the default hyperparameters
# (lr=1e-4, gamma=0.99, value_c=0.5, entropy_c=1e-4); this also compiles the model.
agent = A2CAgent(model)

In [7]:
# Train with the defaults (1000 updates of 64 environment steps each) and keep
# the list of per-episode reward totals returned by `train`.
rewards_history = agent.train(pw)

[1/1000] Losses: [-1.0427892208099365, -1.3785712718963623, 0.3357820510864258]
[2/1000] Losses: [2750892.5, 0.0, 2750892.5]
[3/1000] Losses: [16.36139678955078, -3.1221742630004883, 19.483570098876953]
[4/1000] Losses: [804.3336791992188, -8.756754065638442e-33, 804.3336791992188]
[5/1000] Losses: [28.43353843688965, -2.6816821098327637, 31.11522102355957]
[6/1000] Losses: [2.273188591003418, -7.602491677971557e-06, 2.273196220397949]
[7/1000] Losses: [1.8732295036315918, 0.4600639343261719, 1.41316556930542]
[8/1000] Losses: [0.7222954034805298, -1.3408609333964705e-07, 0.7222955226898193]
[9/1000] Losses: [0.6029934883117676, 3.5943463672083453e-07, 0.6029931306838989]
[10/1000] Losses: [0.19441978633403778, -1.5859325230849208e-07, 0.1944199502468109]
[11/1000] Losses: [0.05819389224052429, -4.365978867326703e-08, 0.058193936944007874]
[12/1000] Losses: [0.04497600719332695, -2.4780888452369254e-08, 0.04497603327035904]
[13/1000] Losses: [2.604947805404663, 7.97214397607604e-07, 2.