In [1]:
import tensorflow as tf
import numpy as np
import time
import wandb
import os
from tqdm import tqdm

In [2]:
import sys
sys.path.append("../src/")
import wandb
import os
from config import *
from pong_wrapper import *
from process_image import *
from utilities import *
from ppo_network import *

In [20]:
class PPOAgent:
    """Rollout collector for an actor-critic Keras model used with PPO.

    The agent steps a game wrapper with actions sampled from the model,
    stores the transitions in fixed-size buffers and turns them into
    discounted returns and advantages. The optimisation step itself is
    still disabled inside `train` (see the TODO there).
    """

    def __init__(self, model, save_path=None, load_path=None, gamma=GAMMA, max_updates=MAX_UPDATES, batch_size=BATCH_SIZE, input_shape=INPUT_SHAPE+(HISTORY_LENGHT, )):
        """Store the model and the rollout hyper-parameters.

        Args:
            model: Callable invoked as `model([states, advantages, predictions])`
                that returns a `(prob, value)` pair.
            save_path: Stored on the instance; not used by this class yet.
            load_path: Stored on the instance; not used by this class yet.
            gamma: Discount factor applied to future rewards.
            max_updates: Number of rollouts collected by `train`.
            batch_size: Number of environment steps per rollout.
            input_shape: Shape of a single observation, used to allocate
                the observation buffer.
        """
        # `gamma` is the discount factor
        self.gamma = gamma
        self.max_updates = max_updates
        self.batch_size = batch_size
        self.input_shape = input_shape
        self.save_path = save_path
        self.load_path = load_path

        self.model = model

        # TODO: restoring a checkpoint is disabled; this class defines no
        # `load_model` method yet. The intended logic was:
        #
        # if load_path is not None:
        #     print("loading model in {}".format(load_path))
        #     self.load_model(load_path)
        #     print("model loaded")

    def get_prob_value_action(self, state, advantages, predictions):
        """Evaluate the model on one state and sample an action from its output.

        Args:
            state: A single observation; a leading batch axis is added here.
            advantages: Advantage input forwarded to the model unchanged.
            predictions: Prediction input forwarded to the model unchanged.

        Returns:
            Tuple `(prob, value, action)`: the two model outputs and the
            sampled action with the sample axis squeezed away.
        """
        prob, value = self.model([state[None, :], advantages, predictions])
        # NOTE(review): tf.random.categorical treats its first argument as
        # unnormalised log-probabilities (logits). If the policy head already
        # applies a softmax, this should sample from tf.math.log(prob) instead.
        # Confirm against build_ppo_network.
        action = tf.squeeze(tf.random.categorical(prob, 1), axis=-1)
        return prob, value, action

    def get_returns_advantages(self, rewards, dones, values, next_value):
        """Compute discounted returns and advantages for one rollout.

        Args:
            rewards: 1-D sequence of per-step rewards.
            dones: 1-D sequence of episode-end flags, aligned with `rewards`;
                a set flag stops bootstrapping across the episode boundary.
            values: Value estimates aligned with `rewards`, used as baseline.
            next_value: Value estimate of the state that follows the last
                step. Any single-element input is accepted (scalar, `(1,)`,
                `(1, 1)`, ...).

        Returns:
            Tuple `(returns, advantages)`, each with one entry per step.

        Raises:
            ValueError: If `next_value` holds more than one element.
        """
        rewards = np.asarray(rewards)
        # Normalise the bootstrap value to shape (1,) so that np.append along
        # the last axis also works for scalars and (1, 1) model outputs.
        bootstrap = np.asarray(next_value).reshape(1)
        returns = np.append(np.zeros_like(rewards), bootstrap, axis=-1)
        # Returns are calculated as discounted sum of future rewards.
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.gamma * returns[t + 1] * (1 - dones[t])
        # Drop the bootstrap slot again: one return per collected step.
        returns = returns[:-1]

        # Advantages are equal to returns - baseline (value estimates in our case).
        advantages = returns - values

        return returns, advantages

    def train(self, game_wrapper):
        """Collect `max_updates` rollouts of `batch_size` steps each.

        The rollout buffers are allocated once and overwritten by every
        update, so the returned arrays describe the last rollout only.
        The model is not optimised here: the `train_on_batch` call is
        still commented out.

        Args:
            game_wrapper: Environment wrapper exposing `env.action_space.n`,
                `reset()`, `step(action, "rgb_array")` (returning four
                values) and a `state` attribute.

        Returns:
            Tuple `(probs, values, actions, observations, dones, ep_rewards,
            rewards, next_value, returns, advs, X, y)`. `next_value`,
            `returns`, `advs`, `X` and `y` are `None` when `max_updates`
            is 0, because no rollout was collected.
        """
        n_actions = game_wrapper.env.action_space.n

        # Placeholder inputs for the model's extra heads while acting.
        dummy_preds, dummy_adv = np.zeros((1, n_actions)), np.zeros((1, 1))

        probs = np.empty((self.batch_size, n_actions))
        actions = np.empty((self.batch_size,), dtype=np.int32)
        rewards, dones, values = np.empty((3, self.batch_size))
        observations = np.empty((self.batch_size,) + self.input_shape)

        # Bound up front so the final `return` cannot hit unbound locals
        # when the update loop runs zero times.
        next_value = returns = advs = X = y = None

        ep_rewards = [0.0]
        next_obs = game_wrapper.reset()

        for update in tqdm(range(self.max_updates)):
            for step in range(self.batch_size):
                observations[step] = next_obs.copy()
                prob, value, action = self.get_prob_value_action(next_obs, dummy_adv, dummy_preds)
                actions[step] = action
                probs[step] = prob
                values[step] = value
                # The processed frame and the info value returned by the
                # wrapper are not needed; the stacked state is read below.
                _, rewards[step], dones[step], _ = game_wrapper.step(actions[step], "rgb_array")
                next_obs = game_wrapper.state
                ep_rewards[-1] += rewards[step]
                if dones[step]:
                    # Start accumulating a new episode total.
                    ep_rewards.append(0.0)
                    next_obs = game_wrapper.reset()

            # Bootstrap from the value of the state after the last step.
            _, next_value = self.model([next_obs[None, :], dummy_adv, dummy_preds])
            next_value = np.squeeze(next_value, -1)

            returns, advs = self.get_returns_advantages(rewards, dones, values, next_value)
            X = [observations, advs, probs]

            onehot_actions = np.eye(n_actions)[actions]

            #y = [onehot_actions, values]

            y = {'values': np.array(values), 'probs': np.array(onehot_actions)}

            # TODO: re-enable once the loss compiles.
            #self.model.train_on_batch(X, y)

        return probs, values, actions, observations, dones, ep_rewards, rewards, next_value, returns, advs, X, y

In [21]:
# Build the network and the Pong environment wrapper used for the smoke test.
# NOTE(review): the literal 6 presumably is the number of actions (the recorded
# shape of X[2] below is (64, 6)); consider reading it from the environment.
ppo_network = build_ppo_network(6)
# history_length=4 matches the last axis of the recorded observation shape (64, 84, 84, 4).
pw = PongWrapper(ENV_NAME, history_length=4)

In [22]:
# Wrap the network in an agent; all other constructor arguments keep their defaults.
test_agent = PPOAgent(ppo_network)

In [23]:
# Collect rollouts and unpack everything train() returns so the cells below can inspect it.
probs, values, actions, observations, dones, ep_rewards, rewards, next_value, returns, advs, X, y = test_agent.train(pw)

100%|██████████| 100/100 [00:22<00:00,  4.49it/s]


In [24]:
# Discounted returns: one entry per rollout step.
returns.shape

(64,)

In [25]:
# Advantages (returns minus value estimates): same length as the returns.
advs.shape

(64,)

In [26]:
# First model input: the observation buffer.
X[0].shape

(64, 84, 84, 4)

In [27]:
# Second model input: the advantages.
# NOTE(review): this is 1-D, while the placeholder fed during the rollout has shape (1, 1) - confirm the model accepts both.
X[1].shape

(64,)

In [28]:
# Third model input: the model's first output (`prob`) recorded at each rollout step.
X[2].shape

(64, 6)

In [30]:
# Try one optimisation step on the collected rollout.
# NOTE(review): the recorded run of this cell raised
# "TypeError: Cannot convert a symbolic Keras input/output to a numpy array"
# from inside the compiled loss. Presumably the loss built in build_ppo_network
# closes over symbolic Input tensors - confirm there.
ppo_network.train_on_batch(X, y)

TypeError: in user code:

    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:755 train_step
        loss = self.compiled_loss(
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:213 __call__
        metric_obj.update_state(loss_metric_value, sample_weight=batch_dim)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/utils/metrics_utils.py:90 decorated
        update_op = update_state_fn(*args, **kwargs)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py:177 update_state_fn
        return ag_update_state(*args, **kwargs)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py:363 update_state  **
        sample_weight = weights_broadcast_ops.broadcast_weights(
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/ops/weights_broadcast_ops.py:155 broadcast_weights
        values = ops.convert_to_tensor(values, name="values")
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/profiler/trace.py:163 wrapped
        return func(*args, **kwargs)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:1540 convert_to_tensor
        ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/framework/constant_op.py:339 _constant_tensor_conversion_function
        return constant(v, dtype=dtype, name=name)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/framework/constant_op.py:264 constant
        return _constant_impl(value, dtype, shape, name, verify_shape=False,
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/framework/constant_op.py:281 _constant_impl
        tensor_util.make_tensor_proto(
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/framework/tensor_util.py:435 make_tensor_proto
        values = np.asarray(values)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/numpy/core/_asarray.py:83 asarray
        return array(a, dtype, copy=False, order=order)
    /Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/keras_tensor.py:273 __array__
        raise TypeError(

    TypeError: Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.


In [None]:
# Displays the whole input list, including the full observation array.
# NOTE(review): this cell was never executed; prefer showing shapes or a small slice.
X