# Export PPOAgent - ActorCriticNetwork

In [32]:
import tensorflow as tf
import numpy as np
from tf_agents.agents.ppo.ppo_kl_penalty_agent import PPOKLPenaltyAgent
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.value_network import ValueNetwork
from tf_agents.environments import tf_py_environment
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from rl_env.DoomEnviroment import DoomEnvironment
import matplotlib.pyplot as plt
import collections
import joblib
from tqdm import tqdm

## Create Agent, Load Checkpoints, Flatten Model and Save it

In [29]:
# Notebook-wide settings.
# AGENT_DIR is the run folder that contains the policy checkpoints below.
AGENT_DIR = '/home/jupyter/train_data/agentV2.9'
# Directory searched for the policy checkpoint state.
CPT_DIR = AGENT_DIR + '/train/policy'
# Number used to pick one checkpoint path out of the checkpoint state.
CHECKPOINT_NO = 19000
# Number of iterations of the data-collection loop.
NO_OF_DATA_POINTS = 10000

In [35]:
# Keyword arguments forwarded unchanged to DoomEnvironment.
doom_env_kwargs = dict(
    config_name='rl_env/custom.cfg',
    frame_skip=4,
    episode_timeout=2000,
    obs_shape=(60, 100),
    start_ammo=6,
    living_reward=3,
    kill_imp_reward=100,
    kill_demon_reward=10,
    ammo_reward=5,
    health_reward=0.5,
)
env = DoomEnvironment(**doom_env_kwargs)

# Wrap the Python environment in the TF-Agents TensorFlow environment adapter.
tfpy_env = tf_py_environment.TFPyEnvironment(env)

In [4]:
# Specs and hidden-layer sizes shared by the actor and the value network.
observation_spec = tfpy_env.observation_spec()
action_spec = tfpy_env.action_spec()
hidden_layers = (200, 100)

# Policy head: maps observations to an action distribution.
actor_net = ActorDistributionNetwork(
    input_tensor_spec=observation_spec,
    output_tensor_spec=action_spec,
    fc_layer_params=hidden_layers,
)

# Value head with the same fully connected layout.
value_net = ValueNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=hidden_layers,
)

# The agent is only built here so that a checkpoint can be restored into it.
# NOTE(review): presumably these hyperparameters mirror the training run —
# confirm against the training script.
agent = PPOKLPenaltyAgent(
    time_step_spec=tfpy_env.time_step_spec(),
    action_spec=action_spec,
    actor_net=actor_net,
    value_net=value_net,
    entropy_regularization=0.0,
    kl_cutoff_factor=2.0,
    kl_cutoff_coef=100,
    initial_adaptive_kl_beta=1.0,
    adaptive_kl_target=0.01,
    adaptive_kl_tolerance=0.3,
    normalize_observations=True,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=25,
)

In [5]:
# Resolve the checkpoint path whose trailing "-<number>" equals CHECKPOINT_NO.
checkpoint_state = tf.train.get_checkpoint_state(CPT_DIR)
if checkpoint_state is None:
    # get_checkpoint_state returns None when the directory has no checkpoint state.
    raise FileNotFoundError('No checkpoint state found in {}'.format(CPT_DIR))

checkpoint_paths = list(checkpoint_state.all_model_checkpoint_paths)

# Compare the numeric suffix exactly. A plain substring test would also accept
# e.g. "ckpt-119000" or "ckpt-190000" for 19000, or digits that merely occur
# somewhere in the directory part of the path.
matching_paths = [
    path for path in checkpoint_paths
    if path.rsplit('-', 1)[-1] == str(CHECKPOINT_NO)
]
if not matching_paths:
    raise ValueError(
        'Checkpoint {} not found in {}; available: {}'.format(
            CHECKPOINT_NO, CPT_DIR, checkpoint_paths))

checkpoint = matching_paths[0]
checkpoint

'/home/jupyter/train_data/agentV2.9/train/policy/ckpt-19000'

In [6]:
# Restore the agent's policy from the selected checkpoint path. The policy is
# registered under the key "policy" in the checkpoint object.
# NOTE(review): `load_status` is never inspected in the visible cells, so a
# restore that matched nothing would go unnoticed and the exported network
# would carry freshly initialised weights — consider asserting on the status.
policy_checkpoint = tf.train.Checkpoint(policy=agent.policy)
load_status = policy_checkpoint.restore(checkpoint)

In [7]:
# Print the layer/parameter summary of the agent's actor network.
agent.actor_net.summary()

Model: "ActorDistributionNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncodingNetwork (EncodingNet multiple                  7220300   
_________________________________________________________________
CategoricalProjectionNetwork multiple                  404       
Total params: 7,220,704
Trainable params: 7,220,704
Non-trainable params: 0
_________________________________________________________________


In [11]:
# List every variable of the actor network together with its shape.
for variable in agent.actor_net.variables:
    print(variable.name, variable.shape)

ActorDistributionNetwork/EncodingNetwork/dense/kernel:0 (36000, 200)
ActorDistributionNetwork/EncodingNetwork/dense/bias:0 (200,)
ActorDistributionNetwork/EncodingNetwork/dense_1/kernel:0 (200, 100)
ActorDistributionNetwork/EncodingNetwork/dense_1/bias:0 (100,)
ActorDistributionNetwork/CategoricalProjectionNetwork/logits/kernel:0 (100, 4)
ActorDistributionNetwork/CategoricalProjectionNetwork/logits/bias:0 (4,)


In [12]:
def flatten_model(model_nested):
    """Build a Sequential model from the leaf layers of a nested model.

    The layer tree under ``model_nested.layers`` is walked depth-first. Any
    object exposing a ``layers`` attribute is treated as a container and is
    replaced by its children; every other object is kept as a leaf. The leaf
    layer objects themselves (not copies) are passed to the Sequential model.

    Args:
        model_nested: object with a ``layers`` attribute, possibly nested.

    Returns:
        A ``tf.keras.models.Sequential`` made of the leaf layers in traversal
        order.
    """
    def iter_leaf_layers(layers):
        for candidate in layers:
            try:
                children = candidate.layers
            except AttributeError:
                # No ``layers`` attribute: this is a leaf layer.
                yield candidate
            else:
                yield from iter_leaf_layers(children)

    leaf_layers = list(iter_leaf_layers(model_nested.layers))
    return tf.keras.models.Sequential(leaf_layers)

In [13]:
# Flatten the nested actor network into a single Sequential model, build it
# for one observation per batch, and print its layer summary.
# NOTE(review): the input shape is hard-coded; (60, 100, 6) matches the
# per-step observation shape shown further down in this notebook, but it is
# not derived from the environment spec — keep the two in sync.
flat_actorNet = flatten_model(agent.actor_net)
flat_actorNet.build(input_shape=(1, 60, 100, 6))
flat_actorNet.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  7200200   
_________________________________________________________________
dense_1 (Dense)              multiple                  20100     
_________________________________________________________________
logits (Dense)               multiple                  404       
Total params: 7,220,704
Trainable params: 7,220,704
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Handle on the layer named 'logits' of the flattened model.
# NOTE(review): `logits` is not referenced again in the visible cells.
logits = flat_actorNet.get_layer('logits')

In [14]:
# Write the flattened actor network to disk next to the training data.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory.
flat_actorNet.save('/home/jupyter/train_data/agentV2.9/actorNet.keras')

## Run Environment and Save Observations, Actions, Activations and Preferences

In [36]:
def preference_from_state(state):
    """Derive the preference label from a raw game state.

    Args:
        state: object returned by the game's ``get_state()``, or ``None``.

    Returns:
        ``None`` when there is no state; ``'Dead'`` when no label named
        'Demon' is present; otherwise ``'Attacking Monster'`` if the first
        Demon label's ``object_angle`` is below 90 or above 270, else
        ``'Attacking Agent'``.
    """
    if state is None:
        return None
    demon_labels = [lbl for lbl in state.labels if lbl.object_name == 'Demon']
    if not demon_labels:
        return 'Dead'
    # NOTE(review): presumably the angle tells which way the demon is facing —
    # confirm the 90/270 thresholds against the scenario definition.
    angle = demon_labels[0].object_angle
    if angle < 90 or angle > 270:
        return 'Attacking Monster'
    return 'Attacking Agent'


# The wrapped Python environment; its private `_game` handle is read on every
# use so a replaced game object would still be picked up.
doom_env = tfpy_env.envs[0]

time_step = tfpy_env.reset()
# Game state that belongs to the observation currently held in `time_step`.
state = doom_env._game.get_state()

observations = []
actions = []
preferences = []
for _ in tqdm(range(NO_OF_DATA_POINTS)):
    # NOTE(review): the agent is built with normalize_observations=True, yet
    # the actor network is called directly on the raw observation here —
    # confirm this matches what the policy fed the network during training.
    action_obj = agent.actor_net.call(time_step.observation, time_step.step_type, network_state=())[0]
    action = action_obj.sample().numpy()[0]

    # Record the observation the action was sampled from, together with the
    # label of the matching game state. Storing the post-step observation
    # would pair every action with a frame it was not chosen for.
    observations.append(time_step.observation)
    actions.append(action)
    preferences.append(preference_from_state(state))

    time_step = tfpy_env.step(action=action)
    # Read the state right after the step, before the extra tick below, so it
    # corresponds to the new observation.
    state = doom_env._game.get_state()
    # NOTE(review): this advances the game outside of the environment's own
    # step(); kept as in the original run — confirm it is intentional.
    doom_env._game.advance_action()

100%|██████████| 10000/10000 [02:01<00:00, 82.24it/s]


In [37]:
# The first sub-network of the actor holds the flatten and dense layers whose
# outputs are captured below; they are looked up by layer name.
encoding_net = agent.actor_net.layers[0]
flat, dense_0, dense_1 = (
    encoding_net.get_layer(layer_name)
    for layer_name in ('flatten', 'dense', 'dense_1')
)

In [38]:
# Feed every recorded observation through flatten -> dense -> dense_1 and keep
# the output of each stage (first batch element only).
flat_acts, dense_0_acts, dense_1_acts = [], [], []
layer_sinks = (
    (flat, flat_acts),
    (dense_0, dense_0_acts),
    (dense_1, dense_1_acts),
)
for observation in tqdm(observations):
    activation = observation
    for layer, sink in layer_sinks:
        # Each layer consumes the previous layer's output.
        activation = layer(activation)
        sink.append(activation.numpy()[0])

100%|██████████| 10000/10000 [00:16<00:00, 594.74it/s]


In [39]:
# Stack the per-step records into arrays with one row per recorded step.
flat_acts = np.array(flat_acts)
dense_0_acts = np.array(dense_0_acts)
dense_1_acts = np.array(dense_1_acts)

# `observations` starts out as a list of batched tensors. Convert it only once
# so that re-running this cell does not fail on `.numpy()` after the name has
# already been rebound to an ndarray.
if not isinstance(observations, np.ndarray):
    # Drop the leading batch dimension of each recorded observation.
    observations = np.array([ob.numpy()[0] for ob in observations])

actions = np.array(actions)
preferences = np.array(preferences)

In [40]:
# Sanity check: display the shape of every array that is about to be exported;
# all of them should share the same leading dimension.
flat_acts.shape, dense_0_acts.shape, dense_1_acts.shape, observations.shape, actions.shape, preferences.shape

((10000, 36000),
 (10000, 200),
 (10000, 100),
 (10000, 60, 100, 6),
 (10000,),
 (10000,))

In [41]:
class ExperienceData(
    collections.namedtuple('ExperienceData', [
        'observation',
        'action',
        'activation_lyr0',
        'activation_lyr1',
        'activation_lyr2',
        'preference',
    ])):
    """Immutable record bundling the data exported by this notebook.

    Fields:
        observation: recorded observations.
        action: recorded actions.
        activation_lyr0: outputs of the first captured layer.
        activation_lyr1: outputs of the second captured layer.
        activation_lyr2: outputs of the third captured layer.
        preference: per-step preference labels.

    The underlying namedtuple carries the same name as this class (it was
    previously called 'Trajectory', which did not match the class name).
    """

    # No per-instance __dict__: keeps instances as lightweight as a plain tuple.
    __slots__ = ()

In [42]:
# Bundle the stacked arrays into a single record. The activation fields follow
# the order in which the layers were applied (flatten, dense, dense_1).
experience_fields = {
    'observation': observations,
    'action': actions,
    'activation_lyr0': flat_acts,
    'activation_lyr1': dense_0_acts,
    'activation_lyr2': dense_1_acts,
    'preference': preferences,
}
exps = ExperienceData(**experience_fields)

In [43]:
# Persist the record, wrapped in a one-element list, with joblib.
# NOTE(review): ExperienceData is defined inside this notebook, so loading the
# file presumably requires the same class to be defined in the loading
# session — confirm, or export plain arrays instead.
joblib.dump([exps], '/home/jupyter/train_data/agentV2.9/AgentExperienceData.pkl')

['/home/jupyter/train_data/agentV2.9/AgentExperienceData.pkl']