In [24]:
from copy import deepcopy
import dill
import logging
import os
from pprint import pprint

from human_aware_rl.rllib.rllib import gen_trainer_from_params, load_agent, OvercookedMultiAgent
from human_aware_rl.ppo.ppo_rllib_client import my_config
from overcooked_ai_py.agents.benchmarking import AgentEvaluator
from overcooked_ai_py.mdp.actions import Action
from overcooked_ai_py.mdp.layout_generator import DEFAILT_PARAMS_SCHEDULE_FN, LayoutGenerator, MDPParamsGenerator

# Root directory that checkpoint_path() scans for training-run folders
# ("~" is expanded to the current user's home directory).
RAY_DIRECTORY = os.path.expanduser("~/ray_results")

In [25]:
def checkpoint_path(run_name, seed=0, checkpoint_num=1):
    """Build the path of a checkpoint file inside a training-run directory.

    The run directory is the alphabetically first entry of ``RAY_DIRECTORY``
    whose name contains ``"{run_name}_{seed}"`` not immediately followed by
    another digit, so that seed ``1`` no longer picks up the directory of
    seed ``10``, ``11``, ...

    Args:
        run_name: Name of the training run as it appears in the directory name.
        seed: Seed of the run as it appears in the directory name.
        checkpoint_num: Number of the checkpoint to point at.

    Returns:
        The string
        ``{RAY_DIRECTORY}/{run_dir}/checkpoint_{n}/checkpoint-{n}``.
        The path is only constructed; the checkpoint itself is not checked
        for existence.

    Raises:
        FileNotFoundError: If no entry of ``RAY_DIRECTORY`` matches the run
            name and seed.
    """
    import re  # only needed by this helper

    run_tag = f"{run_name}_{seed}"
    # Negative lookahead: the seed must end here, not continue with more digits.
    tag_pattern = re.compile(re.escape(run_tag) + r"(?!\d)")

    matching_runs = sorted(
        entry for entry in os.listdir(RAY_DIRECTORY) if tag_pattern.search(entry)
    )
    if not matching_runs:
        raise FileNotFoundError(
            f"No run directory matching '{run_tag}' found in {RAY_DIRECTORY}"
        )

    run_dir = matching_runs[0]
    cp_path = f"{RAY_DIRECTORY}/{run_dir}/checkpoint_{checkpoint_num}/checkpoint-{checkpoint_num}"
    return cp_path

def load_params(run_name, seed=0, checkpoint_num=1):
    """Load the pickled configuration stored next to a checkpoint.

    The configuration is read from ``config.pkl`` in the same directory as
    the checkpoint file returned by ``checkpoint_path``.

    Args:
        run_name: Name of the training run.
        seed: Seed of the training run.
        checkpoint_num: Checkpoint whose directory holds ``config.pkl``.
            Defaults to 1, the checkpoint that was always used before this
            parameter existed.

    Returns:
        Whatever object was pickled into ``config.pkl``.
    """
    cp_path = checkpoint_path(run_name, seed=seed, checkpoint_num=checkpoint_num)
    # Drop the trailing "checkpoint-N" component to get the checkpoint directory.
    params_path = cp_path.rsplit("/", 1)[0] + "/config.pkl"

    # NOTE(security): unpickling executes arbitrary code; only load config
    # files produced by training runs you trust.
    with open(params_path, "rb") as params_file:  # closed even if loading fails
        params = dill.load(params_file)
    return params

def load_env(run_name, seed=0):
    """Rebuild the multi-agent environment a training run was configured with.

    Reads the run's saved parameters via ``load_params`` and passes their
    ``"environment_params"`` entry to ``OvercookedMultiAgent.from_config``.

    Args:
        run_name: Name of the training run.
        seed: Seed of the training run.

    Returns:
        The object returned by ``OvercookedMultiAgent.from_config``.
    """
    env_config = load_params(run_name, seed)["environment_params"]
    return OvercookedMultiAgent.from_config(env_config)

def load_agents(run_name, seeds, checkpoint_num):
    """Load the "ppo" policy of one run for several seeds at one checkpoint.

    Args:
        run_name: Name of the training run.
        seeds: Iterable of seeds; one agent is loaded per seed.
        checkpoint_num: Checkpoint number to load for every seed.

    Returns:
        Dict mapping each seed to the agent returned by ``load_agent``.
        Agents are created with ``agent_index=-1``; the index is meant to be
        set to 0 or 1 when an episode is initialised.
    """
    return {
        seed: load_agent(
            checkpoint_path(run_name, seed=seed, checkpoint_num=checkpoint_num),
            policy_id="ppo",
            agent_index=-1,  # placeholder, set to 0 or 1 when initializing episode
        )
        for seed in seeds
    }

In [26]:
def self_play(mdp, agent, num_games=100, rnd_obj_prob_thresh=0.0):
    """Evaluate a single agent on ``mdp`` and print its mean episode return.

    An ``AgentEvaluator`` is built from ``mdp`` with a fixed horizon of 400
    and fixed planner settings, and ``get_agent_pair_trajs`` is called with
    only ``a0`` set (presumably the evaluator then pairs the agent with
    itself -- confirm against AgentEvaluator).

    Args:
        mdp: The MDP handed to ``AgentEvaluator.from_mdp``.
        agent: The agent passed as ``a0``.
        num_games: Number of games to roll out.
        rnd_obj_prob_thresh: Accepted for call compatibility but currently
            unused (the random-start-state code that read it is disabled).

    Returns:
        The trajectories object returned by ``get_agent_pair_trajs``.
    """
    planner_settings = dict(
        start_orientations=False,
        wait_allowed=False,
        counter_goals=[],
        counter_drop=[],
        counter_pickup=[],
        same_motion_goals=True,
    )
    env_settings = dict(horizon=400, mlam_params=planner_settings)

    # TODO change to AgentEvaluator.from_mdp_lst
    evaluator = AgentEvaluator.from_mdp(mdp, env_settings)
    trajs = evaluator.get_agent_pair_trajs(a0=agent, num_games=num_games)

    print(f"agent self-play: {trajs[0]['ep_returns'].mean()}")
    return trajs

def cross_play(mdp, agent_0, agent_1, num_games=100, rnd_obj_prob_thresh=0.0):
    """Evaluate two agents individually and together, printing mean returns.

    Three evaluations are run in order, each on a freshly built
    ``AgentEvaluator`` (horizon 400, fixed planner settings): ``agent_0``
    alone, ``agent_1`` alone, then ``agent_0`` as ``a0`` with ``agent_1`` as
    ``a1``. The mean episode return of each is printed right after it runs.

    Args:
        mdp: The MDP handed to ``AgentEvaluator.from_mdp``.
        agent_0: First agent.
        agent_1: Second agent.
        num_games: Number of games per evaluation.
        rnd_obj_prob_thresh: Accepted for call compatibility but currently
            unused (the random-start-state code that read it is disabled).

    Returns:
        Tuple ``(trajs_0_0, trajs_1_1, trajs_0_1)`` of the three trajectory
        objects, in the order they were produced.
    """
    env_settings = {
        'horizon': 400,
        'mlam_params': dict(
            start_orientations=False,
            wait_allowed=False,
            counter_goals=[],
            counter_drop=[],
            counter_pickup=[],
            same_motion_goals=True,
        ),
    }

    def rollout(**pairing):
        # TODO change to AgentEvaluator.from_mdp_lst
        evaluator = AgentEvaluator.from_mdp(mdp, env_settings)
        return evaluator.get_agent_pair_trajs(**pairing, num_games=num_games)

    trajs_0_0 = rollout(a0=agent_0)
    print(f"agent 0 self-play: {trajs_0_0[0]['ep_returns'].mean()}")

    trajs_1_1 = rollout(a0=agent_1)
    print(f"agent 1 self-play: {trajs_1_1[0]['ep_returns'].mean()}")

    trajs_0_1 = rollout(a0=agent_0, a1=agent_1)
    print(f"cross-play: {trajs_0_1[0]['ep_returns'].mean()}")

    return trajs_0_0, trajs_1_1, trajs_0_1

In [8]:
# Self-play evaluation of one training run at a series of saved checkpoints.
# The layout is taken from the run's own saved environment; a generated one
# could be used instead via
# LayoutGenerator(MDPParamsGenerator(DEFAILT_PARAMS_SCHEDULE_FN)).generate_padded_mdp().
run_name = "room"
seed = 1

mdp = load_env(run_name, seed).base_env.mdp
print(load_params(run_name, seed))

# Evaluation settings shared by every checkpoint.
num_games = 100
rnd_obj_prob_thresh = 0.0

# Checkpoints 501, 1001, ..., 3501.
for checkpoint_num in range(501, 3502, 500):
    path_0 = checkpoint_path(run_name, seed=seed, checkpoint_num=checkpoint_num)
    agent_0 = load_agent(path_0, policy_id="ppo", agent_index=0)

    # `trajs` is overwritten each iteration, so after the loop it holds the
    # trajectories of the last checkpoint only. To compare two checkpoints,
    # load a second agent with agent_index=1 and call cross_play instead.
    trajs = self_play(mdp, agent_0, num_games, rnd_obj_prob_thresh)

{'model_params': {'use_lstm': False, 'NUM_HIDDEN_LAYERS': 3, 'SIZE_HIDDEN_LAYERS': 64, 'NUM_FILTERS': 25, 'NUM_CONV_LAYERS': 3, 'CELL_SIZE': 256, 'D2RL': False}, 'training_params': {'num_workers': 2, 'train_batch_size': 12000, 'sgd_minibatch_size': 2000, 'rollout_fragment_length': 400, 'num_sgd_iter': 8, 'lr': 0.001, 'lr_schedule': None, 'grad_clip': 0.1, 'gamma': 0.99, 'lambda': 0.98, 'vf_share_layers': True, 'vf_loss_coeff': 0.0001, 'kl_coeff': 0.2, 'clip_param': 0.05, 'num_gpus': 0, 'seed': 1, 'evaluation_interval': 50, 'entropy_coeff_schedule': [[0, 0.2], [300000.0, 0.1]], 'eager': False, 'log_level': 'WARN'}, 'environment_params': {'env_params': {'horizon': 400, 'mlam_params': {'start_orientations': False, 'wait_allowed': False, 'counter_goals': [], 'counter_drop': [], 'counter_pickup': [], 'same_motion_goals': True}}, 'multi_agent_params': {'reward_shaping_factor': 1.0, 'reward_shaping_horizon': 2500000.0, 'use_phi': True, 'bc_schedule': [[0, 0], [inf, 0]]}, 'mdp_params': {'layou

2021-07-29 12:43:49,150	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:43:49,265	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:43:49,268	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_501/checkpoint-501
2021-07-29 12:43:49,269	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 501, '_timesteps_total': 6012000, '_time_total': 7583.024436235428, '_episodes_total': 15030}
Avg rew: 66.20 (std: 25.25, se: 2.52); avg len: 400.00; : 100%|██████| 100/100 [01:23<00:00,  1.20it/s]


Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 66.2





 1001


2021-07-29 12:45:16,314	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:45:16,368	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:45:16,369	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_1001/checkpoint-1001
2021-07-29 12:45:16,369	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 1001, '_timesteps_total': 12012000, '_time_total': 15101.43731713295, '_episodes_total': 30030}
Avg rew: 71.40 (std: 21.59, se: 2.16); avg len: 400.00; : 100%|████████████████████████████████| 100/100 [01:30<00:00,  1.10it/s]


Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 71.4





 1501


2021-07-29 12:46:52,142	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:46:52,292	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:46:52,294	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_1501/checkpoint-1501
2021-07-29 12:46:52,294	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 1501, '_timesteps_total': 18012000, '_time_total': 22804.79430413246, '_episodes_total': 45030}
Avg rew: 72.00 (std: 25.46, se: 2.55); avg len: 400.00; : 100%|████████████████████████████████| 100/100 [01:30<00:00,  1.11it/s]


Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 72.0





 2001


2021-07-29 12:48:27,336	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:48:27,408	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:48:27,410	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_2001/checkpoint-2001
2021-07-29 12:48:27,410	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 2001, '_timesteps_total': 24012000, '_time_total': 30395.38819336891, '_episodes_total': 60030}
Avg rew: 73.80 (std: 27.38, se: 2.74); avg len: 400.00; : 100%|████████████████████████████████| 100/100 [01:35<00:00,  1.05it/s]


Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 73.8





 2501


2021-07-29 12:50:07,296	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:50:07,353	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:50:07,355	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_2501/checkpoint-2501
2021-07-29 12:50:07,355	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 2501, '_timesteps_total': 30012000, '_time_total': 37986.8929669857, '_episodes_total': 75030}
Avg rew: 75.60 (std: 22.37, se: 2.24); avg len: 400.00; : 100%|████████████████████████████████| 100/100 [01:40<00:00,  1.00s/it]


Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 75.6





 3001


2021-07-29 12:51:52,326	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:51:52,381	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:51:52,382	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_3001/checkpoint-3001
2021-07-29 12:51:52,383	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 3001, '_timesteps_total': 36012000, '_time_total': 45595.50277996063, '_episodes_total': 90030}
Avg rew: 84.20 (std: 25.03, se: 2.50); avg len: 400.00; : 100%|████████████████████████████████| 100/100 [01:42<00:00,  1.02s/it]


Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 84.2





 3501


2021-07-29 12:53:39,746	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:53:39,825	INFO trainable.py:217 -- Getting current IP.
2021-07-29 12:53:39,826	INFO trainable.py:423 -- Restored on 192.168.1.233 from checkpoint: /home/anchorwatt/ray_results/room_1_2021-07-28_18-10-58_jineuzo/checkpoint_3501/checkpoint-3501
2021-07-29 12:53:39,826	INFO trainable.py:430 -- Current state after restoring: {'_iteration': 3501, '_timesteps_total': 42012000, '_time_total': 53211.98012804985, '_episodes_total': 105030}
Avg rew: 80.60 (std: 21.99, se: 2.20); avg len: 400.00; : 100%|████████████████████████████████| 100/100 [01:51<00:00,  1.12s/it]

Skipping trajectory consistency checking because MDP was recognized as variable. Trajectory consistency checking is not yet supported for variable MDPs.
agent self-play: 80.6





In [23]:
from overcooked_ai_py.visualization.state_visualizer import *

# Show an interactive, per-timestep rendering of the first entry of `trajs`.
# The image directory is placed under the current user's home directory
# instead of a machine-specific absolute path, so the cell runs for any user.
StateVisualizer().display_rendered_trajectory(
    trajs[0], img_directory_path=os.path.expanduser("~/traj_0_0")
)

pygame 1.9.5
Hello from the pygame community. https://www.pygame.org/contribute.html


interactive(children=(IntSlider(value=0, description='timestep', max=399), Output()), _dom_classes=('widget-in…