# Export PPOAgent - ActorCriticNetwork

In [35]:
import tensorflow as tf
import numpy as np
from tf_agents.agents.ppo.ppo_kl_penalty_agent import PPOKLPenaltyAgent
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.value_network import ValueNetwork
from tf_agents.environments import tf_py_environment
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from rl_env.DoomEnviroment import DoomEnvironment
import matplotlib.pyplot as plt
import collections
import joblib

In [36]:
# --- Notebook configuration -------------------------------------------------
# Directory that tf.train.get_checkpoint_state() is pointed at further down.
CPT_DIR = "/home/jupyter/train_data/agentV2.9/train/policy"
# Step number used to pick one checkpoint out of those listed in CPT_DIR.
CHECKPOINT_NO = 19000
# Number of environment steps rolled out (one recorded data point per step).
NO_OF_DATA_POINTS = 100

In [3]:
# Keyword arguments for the project-local DoomEnvironment, collected in one
# dict so the whole configuration can be inspected or logged before the
# environment is built. The meaning of each setting is defined by
# DoomEnvironment itself (not visible from this notebook).
doom_env_settings = dict(
    config_name='rl_env/custom.cfg',
    frame_skip=4,
    episode_timeout=2000,
    obs_shape=(60, 100),
    start_ammo=6,
    living_reward=3,
    kill_imp_reward=100,
    kill_demon_reward=10,
    ammo_reward=5,
    health_reward=0.5,
)
env = DoomEnvironment(**doom_env_settings)

# Wrap the Python environment so it can be driven with TensorFlow tensors.
tfpy_env = tf_py_environment.TFPyEnvironment(env)

In [4]:
# Specs are read once from the wrapped environment and shared by both networks
# and the agent.
observation_spec = tfpy_env.observation_spec()
action_spec = tfpy_env.action_spec()

# Both networks use the same two fully connected layer widths.
hidden_layer_sizes = (200, 100)

actor_net = ActorDistributionNetwork(
    input_tensor_spec=observation_spec,
    output_tensor_spec=action_spec,
    fc_layer_params=hidden_layer_sizes,
)

value_net = ValueNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=hidden_layer_sizes,
)

# PPO hyper-parameters. NOTE(review): presumably these have to mirror the
# configuration used during training so the checkpoint restores cleanly --
# confirm against the training script.
ppo_settings = dict(
    entropy_regularization=0.0,
    kl_cutoff_factor=2.0,
    kl_cutoff_coef=100,
    initial_adaptive_kl_beta=1.0,
    adaptive_kl_target=0.01,
    adaptive_kl_tolerance=0.3,
    normalize_observations=True,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=25,
)

agent = PPOKLPenaltyAgent(
    time_step_spec=tfpy_env.time_step_spec(),
    action_spec=action_spec,
    actor_net=actor_net,
    value_net=value_net,
    **ppo_settings
)

In [5]:
# Resolve the path of the checkpoint whose step number is CHECKPOINT_NO.
#
# get_checkpoint_state() returns None when CPT_DIR holds no checkpoint index,
# so that case is reported explicitly instead of failing later with an
# AttributeError on None.
checkpoint_state = tf.train.get_checkpoint_state(CPT_DIR)
if checkpoint_state is None:
    raise FileNotFoundError(
        'No checkpoint state found in {!r}'.format(CPT_DIR))
checkpoint_paths = list(checkpoint_state.all_model_checkpoint_paths)

# Compare only the numeric suffix after the last '-' (paths look like
# '.../ckpt-19000'). A plain substring test on the whole path would also
# accept e.g. 'ckpt-190000', or match digits that occur in a directory name.
matching_paths = [
    path for path in checkpoint_paths
    if path.rsplit('-', 1)[-1] == str(CHECKPOINT_NO)
]
if not matching_paths:
    raise ValueError(
        'No checkpoint numbered {} in {!r}; available: {}'.format(
            CHECKPOINT_NO, CPT_DIR, checkpoint_paths))
checkpoint = matching_paths[0]
checkpoint

'/home/jupyter/train_data/agentV2.9/train/policy/ckpt-19000'

In [6]:
# Track the agent's policy under the key 'policy' and restore its variables
# from the checkpoint path selected above.
policy_checkpoint = tf.train.Checkpoint(policy=agent.policy)
# NOTE(review): load_status is never inspected in this notebook, so a partial
# or mismatched restore would go unnoticed -- consider asserting on it.
load_status = policy_checkpoint.restore(checkpoint)

In [7]:
# Roll the environment forward for NO_OF_DATA_POINTS steps, sampling every
# action from the actor network, and record (observation, action) pairs.
#
# NOTE(review): the agent is built with normalize_observations=True, but the
# actor network is called here on the raw environment observation. If the
# policy normalises observations before the network, the actions (and the
# activations derived from these observations later) will not match what the
# trained policy would compute -- confirm.
time_step = tfpy_env.reset()
observations = []
actions = []
for _ in range(NO_OF_DATA_POINTS):
    # Store the observation *before* stepping so that observations[i] is the
    # input actions[i] was sampled from. Appending after step() would pair
    # each action with the frame it produced, shifting the data by one step.
    observations.append(time_step.observation)
    # call() returns a tuple; element 0 is the action distribution.
    action_obj = agent.actor_net.call(time_step.observation, time_step.step_type, network_state=())[0]
    # Sample one action and drop the leading (batch) dimension.
    action = action_obj.sample().numpy()[0]
    actions.append(action)
    time_step = tfpy_env.step(action=action)

In [8]:
# Print the layer/parameter overview of the actor network to find the
# sub-networks that are inspected in the following cells.
agent.actor_net.summary()

Model: "ActorDistributionNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncodingNetwork (EncodingNet multiple                  7220300   
_________________________________________________________________
CategoricalProjectionNetwork multiple                  404       
Total params: 7,220,704
Trainable params: 7,220,704
Non-trainable params: 0
_________________________________________________________________


In [9]:
# The first child layer of the actor network is its EncodingNetwork (see the
# summary printed above); its layers are the ones probed for activations.
encoding_net = agent.actor_net.layers[0]
encoding_net.summary()

Model: "EncodingNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  7200200   
_________________________________________________________________
dense_1 (Dense)              multiple                  20100     
Total params: 7,220,300
Trainable params: 7,220,300
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Grab the three layers of the encoding network in execution order
# (flatten -> dense -> dense), as listed by encoding_net.summary().
#
# The layers are taken by position rather than by their auto-generated Keras
# names ('flatten', 'dense', 'dense_1'): those names come from a kernel-wide
# counter, so re-running the network-construction cell would rename them and
# make a name-based lookup fail.
encoding_layers = list(encoding_net.layers)
if len(encoding_layers) != 3:
    raise ValueError(
        'Expected exactly 3 layers in the encoding network, found {}: {}'.format(
            len(encoding_layers), [layer.name for layer in encoding_layers]))
flat, dense_0, dense_1 = encoding_layers

In [11]:
# Feed every recorded observation through the encoding layers one at a time
# and keep the output of each layer.
flat_acts, dense_0_acts, dense_1_acts = [], [], []

# Each layer is paired with the list that collects its outputs; the order is
# the forward order of the network.
layer_sinks = (
    (flat, flat_acts),
    (dense_0, dense_0_acts),
    (dense_1, dense_1_acts),
)

for observation in observations:
    hidden = observation
    for layer, sink in layer_sinks:
        hidden = layer(hidden)
        # Drop the leading (batch) dimension before storing.
        sink.append(hidden.numpy()[0])

In [13]:
# Stack the per-step Python lists into arrays whose first axis is the step.
flat_acts = np.array(flat_acts)
dense_0_acts = np.array(dense_0_acts)
dense_1_acts = np.array(dense_1_acts)
# `observations` starts out as a list of tensors, each with a leading batch
# dimension that is stripped here. The guard keeps the cell idempotent: once
# converted, the rows are plain ndarrays without a .numpy() method, so running
# the conversion a second time would raise AttributeError.
if not isinstance(observations, np.ndarray):
    observations = np.array([ob.numpy()[0] for ob in observations])
actions = np.array(actions)

In [14]:
# Sanity check: show the shape of every exported array (first axis = step).
tuple(
    array.shape
    for array in (flat_acts, dense_0_acts, dense_1_acts, observations, actions)
)

((100, 36000), (100, 200), (100, 100), (100, 60, 100, 6), (100,))

In [30]:
class ExperienceData(
    collections.namedtuple('ExperienceData', [
        'observations',
        'actions',
        'activations_lyr0',
        'activations_lyr1',
        'activations_lyr2',
    ])):
  """Immutable record bundling a rollout with the recorded layer outputs.

  Fields:
    observations: observations collected during the rollout.
    actions: actions sampled during the rollout.
    activations_lyr0: outputs of the first encoding layer (flatten).
    activations_lyr1: outputs of the second encoding layer (first dense).
    activations_lyr2: outputs of the third encoding layer (second dense).

  The underlying namedtuple is named after this class (it used to be called
  'Trajectory', a leftover that did not match the class).
  """
  # Empty __slots__ keeps instances as lightweight as a plain tuple
  # (no per-instance __dict__).
  __slots__ = ()

In [31]:
# Map every ExperienceData field to the array that fills it, then build the
# record in one go. Layer 0/1/2 are the flatten, first dense and second dense
# outputs respectively.
experience_fields = {
    'observations': observations,
    'actions': actions,
    'activations_lyr0': flat_acts,
    'activations_lyr1': dense_0_acts,
    'activations_lyr2': dense_1_acts,
}
exps = ExperienceData(**experience_fields)

In [37]:
# Persist the collected experience record to disk.
# NOTE(review): `exps` is an instance of a class defined in this notebook, so
# code that loads this file presumably needs an equivalent ExperienceData
# definition available under the same module name -- confirm with consumers.
export_path = '/home/jupyter/train_data/agentV2.9/AgentExperienceData.pkl'
joblib.dump(exps, export_path)

['/home/jupyter/train_data/agentV2.9/AgentExperienceData.pkl']