In [48]:
# Parameters

# --- Inputs / outputs -------------------------------------------------------
# Checkpoint path and AH_MODELS registry key used to load the artificial humans.
artificial_humans = "../../data/training/ah_10/data/model.pt"
artificial_humans_model = "graph"
# Directory under which the evaluation recorder output is placed.
output_path = "../../data/manager/v2/dev/"

# --- Manager model ----------------------------------------------------------
# Forwarded unchanged to ArtificalManager(model_args=...).
model_args = dict(
    hidden_size=5,
    add_rnn=True,
    add_edge_model=True,
    add_global_model=False,
    x_encoding=[
        dict(name="prev_contributions", n_levels=21, encoding="numeric"),
        dict(name="prev_punishments", n_levels=31, encoding="numeric"),
    ],
    u_encoding=[
        dict(name="round_number", n_levels=16, encoding="numeric"),
        # NOTE(review): this entry uses "etype" where its siblings use
        # "encoding" -- confirm this is the key the encoder expects.
        dict(name="prev_common_good", norm=128, etype="float"),
    ],
)

# --- Optimisation / exploration ---------------------------------------------
# Forwarded to ArtificalManager(opt_args=..., gamma=..., target_update_freq=...).
opt_args = dict(lr=0.003)
gamma = 1.0
# Exploration rate passed to manager.eps_greedy during training.
eps = 0.2
target_update_freq = 20

# --- Run length and replay --------------------------------------------------
n_episode_steps = 16
n_episodes = 100
# Number of episodes held by the GraphMemory replay buffer.
memory_size = 50
# Keyword arguments for replay_mem.sample(...).
sample_args = dict(batch_size=10)

# --- Evaluation schedule ----------------------------------------------------
# Every `eval_freq` training episodes, `n_test_episodes` evaluation episodes run.
n_test_episodes = 10
eval_freq = 10


In [49]:
%load_ext autoreload
%autoreload 2

import torch as th
import os
from itertools import count

from aimanager.manager.memory import Memory
from aimanager.manager.environment import ArtificialHumanEnv
from aimanager.artificial_humans import AH_MODELS
from aimanager.manager.manager import ArtificalManager
from aimanager.manager.graph_memory import GraphMemory

from torch_geometric.data import Batch


# Path handed to ManagerEvaluator as `output_file`; it sits inside `output_path`.
output_file = os.path.join(output_path, "eval")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
class ManagerEvaluator():
    """Runs greedy evaluation episodes at a fixed cadence and records their states.

    Every ``eval_freq``-th training episode, ``n_test_episodes`` episodes are
    played with ``eps=0`` and each post-punishment state is written to
    ``self.recorder``.
    """

    def __init__(self, n_episodes, n_test_episodes, eval_freq, n_episode_steps, output_file):
        """Set up the evaluation schedule and the recorder.

        Args:
            n_episodes: Number of training episodes of the surrounding run;
                only used to size the recorder.
            n_test_episodes: Evaluation episodes played per evaluation round.
            eval_freq: An evaluation round runs when ``episode % eval_freq == 0``.
            n_episode_steps: Steps per episode, forwarded to the recorder.
            output_file: Forwarded to the recorder as ``output_file``.
        """
        self.device = th.device('cpu')
        self.n_test_episodes = n_test_episodes
        self.eval_freq = eval_freq
        # Running index of recorded evaluation episodes across all rounds.
        self.t_episode = 0

        # eval() fires for episode 0, eval_freq, 2 * eval_freq, ...; assuming it
        # is called once for each episode in range(n_episodes), that is
        # ceil(n_episodes / eval_freq) rounds. Floor division would leave the
        # recorder one round short whenever n_episodes is not a multiple of
        # eval_freq.
        n_eval_rounds = -(-n_episodes // eval_freq)

        # Use the evaluator's own device rather than a notebook-level global, so
        # the class does not depend on which cells happened to run before it.
        self.recorder = Memory(
            n_episodes=n_eval_rounds * n_test_episodes,
            n_episode_steps=n_episode_steps,
            output_file=output_file,
            device=self.device)

    def eval(self, manager, env, episode):
        """Play ``n_test_episodes`` greedy episodes if ``episode`` is an evaluation episode.

        Args:
            manager: Object providing ``encode``, ``get_q`` and ``eps_greedy``.
            env: Object providing ``init_episode``, ``punish``, ``step`` and
                ``edge_index``.
            episode: Index of the current training episode; stored alongside
                every recorded step.

        Returns:
            None. Nothing happens when ``episode % eval_freq != 0``.
        """
        if episode % self.eval_freq != 0:
            return

        episode_ = th.tensor(episode, device=self.device)

        for _ in range(self.n_test_episodes):
            state = env.init_episode()
            for step in count():
                # Add a leading and a trailing singleton dimension to every state tensor.
                state_ = {k: v.unsqueeze(0).unsqueeze(-1) for k, v in state.items()}
                obs = Batch.from_data_list(manager.encode(state_, edge_index=env.edge_index))

                # Get q values from controller; `first` marks the first step of the episode.
                q_values = manager.get_q(manager_observations=obs, first=step == 0).squeeze(1)

                # eps=0: no exploration during evaluation.
                selected_action = manager.eps_greedy(q_values=q_values, eps=0)

                # Record the state returned by applying the punishments.
                state = env.punish(selected_action)
                self.recorder.add(**state, episode_step=step, episode=episode_)

                # Advance the environment by one step; the reward is not used here.
                state, reward, done = env.step()
                if done:
                    break

            self.recorder.next_episode(self.t_episode)
            self.t_episode += 1

In [51]:
# All tensors and models in this run are placed on the CPU.
device = th.device('cpu')
rec_device = th.device('cpu')  # NOTE(review): not referenced in this cell -- confirm it is still needed
# Load the artificial-human model from the checkpoint path given in the parameters.
artifical_humans = AH_MODELS[artificial_humans_model].load(artificial_humans).to(device)

env = ArtificialHumanEnv(
    artifical_humans=artifical_humans, n_agents=4, n_contributions=21, n_punishments=31, episode_steps=n_episode_steps, device=device)

manager = ArtificalManager(
    n_contributions=21, n_punishments=31, model_args=model_args, opt_args=opt_args, gamma=gamma, 
    target_update_freq=target_update_freq, default_values=artifical_humans.default_values, device=device)

# replay_mem feeds manager.update; recorder keeps the per-step states of the
# training episodes; evaluator runs the periodic eps=0 evaluation episodes.
replay_mem = GraphMemory(n_episodes=memory_size, n_episode_steps=n_episode_steps, n_nodes=4, device=device)
recorder = Memory(n_episodes=n_episodes, n_episode_steps=n_episode_steps, device=device)
evaluator = ManagerEvaluator(n_episodes, n_test_episodes, eval_freq, n_episode_steps, output_file)

# Report progress about 20 times per run. Clamp to at least 1: with fewer than
# 20 episodes the integer division yields 0 and `episode % display_freq` below
# would raise ZeroDivisionError.
display_freq = max(1, n_episodes // 20)

for episode in range(n_episodes):
    if (episode > 1) and (episode % display_freq == 0):
        # Averages over the most recent `display_freq` episodes.
        avg_common_good = recorder.last(display_freq)['common_good'].mean()
        avg_reward = replay_mem.last(display_freq)['reward'].mean()
        print(f'Start episode {episode}. Avg common good {avg_common_good}. Avg reward {avg_reward}')

    state = env.init_episode()

    manager.init_episode(episode)

    for step in count():
        # Add a leading and a trailing singleton dimension to every state tensor.
        state_ = {k: v.unsqueeze(0).unsqueeze(-1) for k, v in state.items()}
        obs = Batch.from_data_list(manager.encode(state_, edge_index=env.edge_index))

        # Get q values from controller; `first` marks the first step of the episode.
        q_values = manager.get_q(manager_observations=obs, first=step == 0).squeeze(1)

        # Sample an action (explores with rate `eps`).
        selected_action = manager.eps_greedy(q_values=q_values, eps=eps)

        state = env.punish(selected_action)
        recorder.add(**state, episode_step=step)

        # pass actions to environment and advance by one step
        state, reward, done = env.step()
        replay_mem.add(
            action=selected_action, reward=reward, 
            obs=obs)
 
        if done:
            replay_mem.next_episode(episode)
            
            # allow manager to update itself
            sample = replay_mem.sample(**sample_args)
            

            # `sample` is checked for None before the manager is updated.
            if sample is not None:
                manager.update(**sample)
            break
    # NOTE(review): this adds the state returned by the final env.step() under
    # the same `episode_step` that was already recorded inside the loop --
    # confirm that this overwrite/merge is what Memory.add is expected to do.
    recorder.add(episode_step=step, **state)
    recorder.next_episode(episode)
    evaluator.eval(manager, env, episode)


Start episode 5. Avg common good 9.103124618530273. Avg reward -1.1431190967559814
Start episode 10. Avg common good 10.698750495910645. Avg reward -0.8390940427780151
Start episode 15. Avg common good 7.6050004959106445. Avg reward -0.9586941003799438
Start episode 20. Avg common good 10.288125991821289. Avg reward -0.8202440142631531
Start episode 25. Avg common good 10.150625228881836. Avg reward -0.7964940071105957
Start episode 30. Avg common good 9.775625228881836. Avg reward -0.630469024181366
Start episode 35. Avg common good 8.559374809265137. Avg reward -0.8459939956665039
Start episode 40. Avg common good 8.257499694824219. Avg reward -0.8263691067695618
Start episode 45. Avg common good 9.595624923706055. Avg reward -0.7702189683914185
Start episode 50. Avg common good 9.863750457763672. Avg reward -0.6136690378189087
Start episode 55. Avg common good 10.235624313354492. Avg reward -0.5958940386772156
Start episode 60. Avg common good 9.095000267028809. Avg reward -0.698768