In [5]:
# Parameters
artificial_humans = "../../data/training/ah_10/data/model.pt"
artificial_humans_model = "graph"
output_file = "../../data/training/dev/data/manager_run.pt"
model_args = {
    "hidden_size": 5,
    "add_rnn": False,
    "add_edge_model": False,
    "add_global_model": False,
    "x_encoding": [
        {"name": "prev_contributions", "n_levels": 21, "encoding": "numeric"},
        {"name": "prev_punishments", "n_levels": 31, "encoding": "numeric"},
    ],
    "u_encoding": [
        {"name": "round_number", "n_levels": 16, "encoding": "numeric"},
        {"name": "prev_common_good", "norm": 128, "etype": "float"},
    ],
}
opt_args = {"lr": 0.001}
gamma = 0.5
eps = 0.2
target_update_freq = 50
n_episode_steps = 16
n_episodes = 1000
memory_size = 100
sample_args = {"batch_size": 10}


In [6]:
%load_ext autoreload
%autoreload 2

import torch as th
from itertools import count

from aimanager.manager.memory import Memory
from aimanager.manager.environment import ArtificialHumanEnv
from aimanager.artificial_humans import AH_MODELS
from aimanager.manager.manager import ArtificalManager
from aimanager.manager.graph_memory import GraphMemory

from torch_geometric.data import Batch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
device = th.device('cpu')
rec_device = th.device('cpu')
artifical_humans = AH_MODELS[artificial_humans_model].load(artificial_humans).to(device)

env = ArtificialHumanEnv(
    artifical_humans=artifical_humans, n_agents=4, n_contributions=21, n_punishments=31, episode_steps=n_episode_steps, device=device)

manager = ArtificalManager(
    n_contributions=21, n_punishments=31, model_args=model_args, opt_args=opt_args, gamma=gamma, 
    target_update_freq=target_update_freq, default_values=artifical_humans.default_values, device=device)

replay_mem = GraphMemory(n_episodes=memory_size, n_episode_steps=n_episode_steps, n_nodes=4, device=device)
recorder = Memory(n_episodes=n_episodes, n_episode_steps=n_episode_steps, output_file=output_file, device=device)

display_freq = 20

for episode in range(n_episodes):
    if (episode > 1) and (episode % display_freq == 0):
        avg_common_good = recorder.last(display_freq)['common_good'].mean()
        avg_reward = replay_mem.last(display_freq)['reward'].mean()
        print(f'Start episode {episode}. Avg common good {avg_common_good}. Avg reward {avg_reward}')

    state = env.init_episode()

    manager.init_episode(episode)

    for step in count():
        state_ = {k: v.unsqueeze(0).unsqueeze(-1) for k, v in state.items()}
        obs = Batch.from_data_list(manager.encode(state_, edge_index=env.edge_index))

        # Get q values from controller
        q_values = manager.get_q(manager_observations=obs, first=step == 0).squeeze(1)

        # Sample a action
        selected_action = manager.eps_greedy(q_values=q_values, eps=eps)

        state = env.punish(selected_action)
        recorder.add(**state, episode_step=step)

        # pass actions to environment and advance by one step
        state, reward, done = env.step()
        replay_mem.add(
            action=selected_action, reward=reward, 
            obs=obs)
 
        if done:
            replay_mem.next_episode(episode)
            
            # allow manager to update itself
            sample = replay_mem.sample(**sample_args)
            

            if sample is not None:
                manager.update(**sample)
            break
    recorder.add(episode_step=step, **state)
    recorder.next_episode(episode)


Start episode 20. Avg common good 11.939531326293945. Avg reward -0.9015752673149109
Start episode 40. Avg common good 9.67984390258789. Avg reward -1.0572751760482788
Start episode 60. Avg common good 8.387187957763672. Avg reward -1.1661503314971924
Start episode 80. Avg common good 9.258281707763672. Avg reward -1.1279315948486328
Start episode 100. Avg common good 8.581250190734863. Avg reward -1.1378252506256104
Start episode 120. Avg common good 8.917187690734863. Avg reward -1.1289315223693848
Start episode 140. Avg common good 8.75718879699707. Avg reward -1.079856514930725
Start episode 160. Avg common good 8.820467948913574. Avg reward -1.0949627161026
Start episode 180. Avg common good 9.9701566696167. Avg reward -1.0554752349853516
Start episode 200. Avg common good 9.670937538146973. Avg reward -1.072250247001648
Start episode 220. Avg common good 9.470781326293945. Avg reward -1.0571690797805786
Start episode 240. Avg common good 9.994531631469727. Avg reward -0.998093962

## Investigate policy

In [8]:
import seaborn as sns

data = {
    'contributions': th.arange(0,21),
    'episode_step': th.tensor(0),
}

# obs = manager.

data['punishments'] = manager.act(**data)

obs = manager.encode_obs(**data)
q = manager.get_q(manager_observations=obs)
sns.heatmap(q)


AttributeError: 'ArtificalManager' object has no attribute 'encode_obs'

## Investigate trajectory

In [None]:
from aimanager.utils.array_to_df import using_multiindex
import pandas as pd

def to_series(k, v):
    dims = ['episode', 'step', 'agent'][:len(v.shape)]
    sr = using_multiindex(v, dims).set_index(dims).rename(columns={"value": k})
    sr.name = k
    return sr
dfs = [to_series(k, v)
    for k, v in recorder.memory.items() if len(v.shape) <= 3
]
dfs = [df.reindex(dfs[0].index) for df in dfs]
rec_df = pd.concat(dfs, axis=1).reset_index(drop=True)

dfs = [to_series(k, v)
    for k, v in replay_mem.memory.items() if len(v.shape) <= 3
]
dfs.append(to_series('current_obs', th.argmax(replay_mem.memory['current_obs'], -1)))
dfs = [df.reindex(dfs[0].index) for df in dfs]
repm_df = pd.concat(dfs, axis=1).reset_index(drop=True)

AttributeError: 'list' object has no attribute 'shape'

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=rec_df, x='punishments')

In [None]:
sns.countplot(data=rec_df, x='prev_punishments')

In [None]:
df_count_pt = rec_df.pivot_table(index=['prev_punishments'], columns='prev_contributions', values='contributions', aggfunc=len)
df_mean_pt = rec_df.pivot_table(index=['prev_punishments'], columns='prev_contributions', values='contributions', aggfunc=np.mean)



f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,6))
ax1.set_title('count')
sns.heatmap(df_count_pt, ax=ax1)
ax2.set_title('mean')
sns.heatmap(df_mean_pt, ax=ax2)

In [None]:
df_count_pt = repm_df.pivot_table(index=['current_obs'], columns='actions', values='rewards', aggfunc=len)
df_mean_pt = repm_df.pivot_table(index=['current_obs'], columns='actions', values='rewards', aggfunc=np.mean)



f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,6))
ax1.set_title('count')
sns.heatmap(df_count_pt, ax=ax1)
ax2.set_title('mean')
sns.heatmap(df_mean_pt, ax=ax2)

## Simulate managment of artificial humans

In [None]:

rec = {
    'value': [],
    'name': [],
    'round_number': [],
    'episode': [],
    'player_id': [],
}

for i in range(45):
    state = env.init_episode()
    done = False
    next_done = False
    rn = 0
    while not next_done:
        next_done = done
        contributions = state['contributions']
        punishments = manager.act(**state)
        if not next_done:
            env.punish(punishments)
            state, reward, done = env.step()

        common_good = contributions * 1.6 - punishments

        rec['value'].extend(contributions.tolist()+punishments.tolist()+common_good.tolist())
        rec['name'].extend(['contributions']*4 + ['punishments']*4 + ['common_good']*4)
        rec['episode'].extend([i]*12)
        rec['round_number'].extend([rn]*12)
        rec['player_id'].extend([0,1,2,3]*3)
        rn += 1


In [None]:
df = pd.DataFrame(rec)

In [None]:

sns.lineplot(data=df, x='round_number', hue='name', y='value')

In [None]:
import os
folder = '../data/dev/data'
df.to_parquet(os.path.join(folder, 'artificial_human_rl_simulation.parquet'))