In [1]:
import numpy as np
from typing import List, Tuple, Dict, TypeVar
from rl_env_marl import MARLDraftEnv, NUM_DRAFT_ROUNDS, NUM_MGRS, ACTION_SPACE_DIM
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from pprint import pprint
import os
from stable_baselines3.common.logger import configure
import torch



In [2]:

def learning_rate_schedule(initial_lr=1e-4, final_lr=5e-5):
    """Build a linear learning-rate schedule.

    Args:
        initial_lr: Learning rate returned when ``progress_remaining`` is 1.
        final_lr: Learning rate returned when ``progress_remaining`` is 0.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to a learning
        rate that is linearly interpolated between ``final_lr`` and
        ``initial_lr``.
    """
    def schedule(progress_remaining):
        # Same arithmetic form as the interpolation between the two endpoints.
        return progress_remaining * (initial_lr - final_lr) + final_lr

    return schedule

# Linear decay of the learning rate from 1e-4 (start) to 5e-5 (end of training).
learning_rate_schedule_fn = learning_rate_schedule(initial_lr=1e-4, final_lr=5e-5)


# Keyword arguments for constructing a PPO model.
# NOTE(review): the visible cells load pre-trained models with PPO.load and never
# pass this dict to PPO(...), so it only documents the original training setup.
ppo_params = {
    "policy": "MlpPolicy",
    "learning_rate": learning_rate_schedule_fn,  # Adaptive learning rate
    "n_steps": 750,
    "batch_size": 750,  # equal to n_steps: one full-buffer minibatch per epoch (single env)
    "n_epochs": 30,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "clip_range_vf": None,
    "normalize_advantage": True,
    "ent_coef": .03,  # Entropy coefficient for the loss calculation
    "vf_coef": 0.5,
    "max_grad_norm": 0.7,
    "use_sde": False,
    "sde_sample_freq": -1,
    "rollout_buffer_class": None,
    "rollout_buffer_kwargs": None,
    # "target_kl": 0.01,
    # "target_kl": None,
    "stats_window_size": 100,
    # Separate actor (pi) and critic (vf) towers. Passed as a plain dict: the
    # list-wrapped form `[dict(pi=..., vf=...)]` is deprecated since SB3 v1.8
    # (shared layers were removed) and only triggers a warning before being unwrapped.
    "policy_kwargs": dict(net_arch=dict(pi=[256, 256, 128], vf=[256, 256, 128])),
    "verbose": 1,
    "seed": 69,
    "device": "auto",
    "_init_setup_model": True
}


In [3]:
# Validate a freshly constructed draft environment against the Gymnasium/SB3 API checks.
check_env(env=MARLDraftEnv())

In [4]:
# Shared environment: every manager's policy acts in the same draft.
env = MARLDraftEnv()

n_agents = NUM_MGRS
# Checkpoints from a previous run, one best model per manager.
model_path = "logs/PPO_20240831-014945/mgr_{mgr}/best_model_mgr_{mgr}.zip"
models = [PPO.load(model_path.format(mgr=mgr), env=env) for mgr in range(n_agents)]

# Constant learning rate used while fine-tuning the loaded models.
FINE_TUNE_LR = 5e-5

for model in models:
    # PPO.train() calls _update_learning_rate(), which overwrites the optimizer's
    # lr with model.lr_schedule(progress_remaining) on every update. Editing only
    # the optimizer param groups is therefore undone at the first train() call,
    # so the schedule stored on the model has to be replaced as well.
    model.learning_rate = FINE_TUNE_LR
    model.lr_schedule = lambda _progress_remaining, lr=FINE_TUNE_LR: lr

    # Keep the optimizer consistent with the new schedule before the first update.
    if model.policy.optimizer is not None:
        for param_group in model.policy.optimizer.param_groups:
            param_group['lr'] = FINE_TUNE_LR


# total_episodes = int(2e6)
total_timesteps = int(NUM_MGRS*2e6)
# n_episodes = 50  # Number of episodes per update

# saving frequency (in episodes); ints so they compare cleanly with the episode counter
n_episodes_info = int(5e4)
n_episodes_model = int(5e4)


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [5]:
import os
import json
from datetime import datetime
from stable_baselines3.common.logger import configure

# Self-play fine-tuning: every manager has its own PPO model, all of them act in
# the same draft environment, and each model is trained on its own picks only.
# The rollout is collected manually (instead of model.learn) because the acting
# model changes from pick to pick.

# Generate a run_id based on the current datetime
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
run_dir = f"logs/PPO_{run_id}"
os.makedirs(run_dir, exist_ok=True)

# One TensorBoard logger shared by all managers; metrics are kept apart by the
# manager index embedded in each key.
# loggers = [configure(run_dir, ["tensorboard"]) for _ in range(NUM_MGRS)]
logger = configure(run_dir, ["tensorboard"])

# Best mean rollout reward seen so far, per manager
best_rewards = [-float('inf')] * NUM_MGRS

# Create subdirectories for each manager
manager_dirs = [os.path.join(run_dir, f"mgr_{i}") for i in range(NUM_MGRS)]
for manager_dir in manager_dirs:
    os.makedirs(manager_dir, exist_ok=True)

best_model_paths = [os.path.join(manager_dirs[i], f"best_model_mgr_{i}.zip") for i in range(NUM_MGRS)]

# Storage for info data
# NOTE(review): nothing in this cell appends to info_history, so the periodic
# JSON dumps below always write an empty list. Populate it or drop the dump.
info_history = []

# Assign the new logger to each model
for model in models:
    model.set_logger(logger)

step_num = 0      # environment steps taken across all managers
n_episodes = 0    # completed drafts

# Each draft contributes NUM_DRAFT_ROUNDS transitions per manager, so the buffer
# (size n_steps) is filled exactly only if n_steps is a multiple of that.
for model in models:
    assert model.n_steps % NUM_DRAFT_ROUNDS == 0, "n_steps must be divisible by the number of draft rounds"

while step_num < total_timesteps:
    print('step_num:', step_num)
    # reset buffers
    rollout_step_num = 0
    for model in models:
        model.rollout_buffer.reset()

    # ---- collect whole drafts until every manager's buffer is full ----
    while rollout_step_num < models[0].n_steps*NUM_MGRS:

        env.reset()
        # Per-manager transitions for this draft:
        # (state, action, reward, next_state, terminated, truncated, info)
        sarstti = {i: [] for i in range(NUM_MGRS)}
        mgr_values = {i: [] for i in range(NUM_MGRS)}
        mgr_log_probs = {i: [] for i in range(NUM_MGRS)}

        # complete 1 draft
        for _, row in env.draft.iterrows():  # each turn of draft
            mgr = row['mgr']
            model = models[mgr]
            # Copy the observation: it is held until the draft ends, so it must
            # not alias a buffer the env might update in place.
            state = np.array(env.state)
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(model.device)
                actions, values, log_probs = model.policy(state_tensor, deterministic=False)
            # Move to host memory before any numpy op (np.clip cannot convert a CUDA tensor).
            raw_action = actions.cpu().numpy()

            # clip actions to avoid out of bounds error
            # as we are sampling from an unbounded gaussian distribution
            # NOTE(review): the upper bound is inclusive; confirm ACTION_SPACE_DIM
            # itself is a valid action for the env.
            clipped_action = np.clip(raw_action, 0, ACTION_SPACE_DIM)
            next_state, _, terminated, truncated, info = env.step(clipped_action.item())
            step_num += 1
            rollout_step_num += 1

            # Store the *unclipped* sample: log_probs was computed for it, and PPO
            # re-evaluates the stored action to form the probability ratio. Only
            # the action sent to the env is clipped (same convention as SB3's
            # own rollout collection).
            sarstti[mgr].append(
                (state, raw_action.reshape(-1, 1), 0, next_state, terminated, truncated, info)
            )
            mgr_values[mgr].append(values)
            mgr_log_probs[mgr].append(log_probs)

        n_episodes += 1

        # Rewards are only known once the draft is complete: attach each manager's
        # draft reward to its final pick, all earlier picks keep reward 0.
        for mgr in range(NUM_MGRS):
            reward = env.calc_reward(mgr)
            last_transition = sarstti[mgr][-1]
            sarstti[mgr][-1] = last_transition[:2] + (reward,) + last_transition[3:]

        # add every step of draft to rollout buffer for each manager
        for mgr, model in enumerate(models):
            for round_idx in range(NUM_DRAFT_ROUNDS):
                state, action, reward = sarstti[mgr][round_idx][:3]
                # Each manager has its own buffer, so each one needs its own
                # episode-start flag on its first pick; otherwise GAE would
                # propagate advantages across draft boundaries.
                episode_start = np.array([round_idx == 0])
                model.rollout_buffer.add(
                    state,
                    action,
                    reward,
                    episode_start,
                    mgr_values[mgr][round_idx],
                    mgr_log_probs[mgr][round_idx]
                )

        # Periodic checkpoints
        if n_episodes % n_episodes_model == 0:
            for mgr, model in enumerate(models):
                model_save_path = os.path.join(manager_dirs[mgr], f"ppo_model_mgr_{mgr}_episode_{n_episodes}.zip")
                model.save(model_save_path)
                print(f"Saved model for manager {mgr} at episode {n_episodes} to {model_save_path}")

        if n_episodes % n_episodes_info == 0:
            for mgr in range(NUM_MGRS):
                info_file_path = os.path.join(manager_dirs[mgr], f"info_history_episode_{n_episodes}.json")
                with open(info_file_path, 'w') as f:
                    json.dump(info_history, f, indent=4)
                print(f"Saved info history to {info_file_path}")
            # Clear info history after saving to avoid redundant data
            info_history.clear()

    # ---- one PPO update per manager on its freshly filled buffer ----
    for mgr, model in enumerate(models):

        # Mean over every stored step; only the last pick of each draft carries a
        # non-zero reward, so this is the per-step average, not the per-draft one.
        rewards = model.rollout_buffer.rewards
        mean_reward = rewards.mean()

        # Log metrics to TensorBoard for this specific manager
        model.logger.record(f"rewards/draft_reward_mgr_{mgr}", mean_reward)
        model.logger.dump(n_episodes)

        # check if best reward and save model if so
        # (saved before train(), i.e. the weights that produced this rollout)
        if mean_reward > best_rewards[mgr]:
            best_rewards[mgr] = mean_reward
            model.save(best_model_paths[mgr])
            print(f"Saved best model for manager {mgr} with draft reward {mean_reward}")

        # The rollout always ends on a completed draft, so there is nothing to
        # bootstrap from: zero last value and done=True. `dones` must be an
        # ndarray (one entry per env), not a Python scalar.
        model.rollout_buffer.compute_returns_and_advantage(
            last_values=torch.zeros_like(mgr_values[mgr][-1]),
            dones=np.ones(1, dtype=bool),
        )
        model.train()

        # Extract and log individual losses recorded by train()
        pg_loss = np.mean(model.logger.name_to_value['train/policy_gradient_loss'])  # Policy gradient loss
        value_loss = np.mean(model.logger.name_to_value['train/value_loss'])  # Value loss
        entropy_loss = np.mean(model.logger.name_to_value['train/entropy_loss'])  # Entropy loss

        # Calculate total loss
        total_loss = pg_loss + model.vf_coef * value_loss + model.ent_coef * entropy_loss

        # Log the individual and total losses
        model.logger.record(f"loss_total/total_loss_mgr_{mgr}", total_loss)
        model.logger.record(f"loss_pg/pg_loss_mgr_{mgr}", pg_loss)
        model.logger.record(f"loss_value/value_loss_mgr_{mgr}", value_loss)
        model.logger.record(f"loss_entropy/entropy_loss_mgr_{mgr}", entropy_loss)

        # Dump logs to TensorBoard
        model.logger.dump(n_episodes)

step_num: 0
Saved best model for manager 0 with draft reward 0.07414621859788895
Saved best model for manager 1 with draft reward 0.037895169109106064
Saved best model for manager 2 with draft reward 0.04009617492556572
Saved best model for manager 3 with draft reward 0.03857256844639778
Saved best model for manager 4 with draft reward 0.03775728866457939
Saved best model for manager 5 with draft reward 0.03534065932035446
Saved best model for manager 6 with draft reward 0.05398833751678467
Saved best model for manager 7 with draft reward 0.0336853452026844
Saved best model for manager 8 with draft reward 0.03560575097799301
Saved best model for manager 9 with draft reward 0.04129059240221977
Saved best model for manager 10 with draft reward 0.03209613636136055
Saved best model for manager 11 with draft reward 0.04082512483000755
step_num: 9000
Saved best model for manager 0 with draft reward 0.08529936522245407
Saved best model for manager 3 with draft reward 0.044147055596113205
Save