In [1]:
import numpy as np
from typing import List, Tuple, Dict, TypeVar
from rl_env_marl import MARLDraftEnv, NUM_DRAFT_ROUNDS, NUM_MGRS, ACTION_SPACE_DIM
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from pprint import pprint
import os
from stable_baselines3.common.logger import configure
import torch



In [2]:

def learning_rate_schedule(initial_lr=1e-4, final_lr=5e-5):
    """Build a linear learning-rate schedule.

    Args:
        initial_lr: Value returned when ``progress_remaining`` is 1.
        final_lr: Value returned when ``progress_remaining`` is 0.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``progress_remaining * (initial_lr - final_lr) + final_lr``.
    """
    def schedule(progress_remaining):
        # Linear interpolation between final_lr (at 0) and initial_lr (at 1).
        return progress_remaining * (initial_lr - final_lr) + final_lr

    return schedule

# Learning rate decays linearly from 1e-4 (progress_remaining == 1)
# to 5e-5 (progress_remaining == 0).
learning_rate_schedule_fn = learning_rate_schedule(initial_lr=1e-4, final_lr=5e-5)


# Keyword arguments for building a fresh PPO model (an `env` must be supplied
# separately by whoever unpacks this dict into the PPO constructor).
ppo_params = {
    "policy": "MlpPolicy",
    "learning_rate": learning_rate_schedule_fn,  # Adaptive learning rate
    "n_steps": 750,
    "batch_size": 750,  # equal to n_steps: one full-batch gradient step per epoch
    "n_epochs": 30,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "clip_range_vf": None,
    "normalize_advantage": True,
    "ent_coef": .03, # Entropy coefficient for the loss calculation
    "vf_coef": 0.5,
    "max_grad_norm": 0.7,
    "use_sde": False,
    "sde_sample_freq": -1,
    "rollout_buffer_class": None,
    "rollout_buffer_kwargs": None,
    # "target_kl": 0.01,
    # "target_kl": None, # TODO: TRY THIS NEXT TEST BECAUSE KEEP GETTING EARLY STOPPING
    "stats_window_size": 100,
    # Separate actor (pi) and critic (vf) towers. Pass the dict directly:
    # wrapping it in a list (`net_arch=[dict(...)]`) is the deprecated
    # pre-1.8 Stable-Baselines3 format and triggers a warning.
    "policy_kwargs": dict(net_arch=dict(pi=[256, 256, 128], vf=[256, 256, 128])),
    "verbose": 1,
    "seed": 69,
    "device": "auto",
    "_init_setup_model": True
}


In [3]:
# Run Stable-Baselines3's environment checker on a throwaway MARLDraftEnv
# instance before any training is attempted.
check_env(MARLDraftEnv())

In [4]:
# Single draft environment shared by every manager's model.
env = MARLDraftEnv()

# One PPO model per manager, each loaded from the same saved checkpoint.
n_agents = NUM_MGRS
pretrained_model_path = "logs/PPO_20240827-143658/best_model.zip"
models = [PPO.load(pretrained_model_path, env=env) for _ in range(n_agents)]

# Training budget, counted in environment steps.
total_timesteps = int(15 * 2e6)

# Saving frequency, in completed drafts (episodes). Stored as ints because
# they are compared against the integer episode counter with `%`.
n_episodes_info = 50_000
n_episodes_model = 50_000


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [5]:
import os
import json
from datetime import datetime
from stable_baselines3.common.logger import configure

# Each run gets its own timestamped directory under logs/.
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
run_dir = f"logs/PPO_{run_id}"
os.makedirs(run_dir, exist_ok=True)

# A single TensorBoard logger, shared by every manager's model.
logger = configure(run_dir, ["tensorboard"])
for model in models:
    model.set_logger(logger)

# Per-manager output directory and best-model checkpoint path.
manager_dirs = []
best_model_paths = []
for i in range(NUM_MGRS):
    mgr_dir = os.path.join(run_dir, f"mgr_{i}")
    os.makedirs(mgr_dir, exist_ok=True)
    manager_dirs.append(mgr_dir)
    best_model_paths.append(os.path.join(mgr_dir, f"best_model_mgr_{i}.zip"))

# Best draft reward seen so far for each manager.
best_rewards = [-float('inf') for _ in range(NUM_MGRS)]

# Storage for info data
info_history = []

# Global counters: environment steps taken and drafts completed.
step_num = 0
n_episodes = 0

# The training loop fills each buffer in whole drafts, so the buffer size
# has to be a multiple of the number of picks a manager makes per draft.
for model in models:
    assert model.n_steps % NUM_DRAFT_ROUNDS == 0, "n_steps must be divisible by the number of draft rounds"
    
# Main training loop.
#
# Each outer iteration (one "update"):
#   1. empties every manager's rollout buffer,
#   2. plays whole drafts until NUM_MGRS * n_steps environment steps have been
#      taken, copying each manager's own transitions into its own buffer,
#   3. computes returns/advantages and runs one PPO training phase per manager.
#
# Rewards are only known once a draft is finished, so transitions are staged
# per manager in `sarstti` with reward 0 and the final transition is patched
# with the draft reward before anything is written to the buffers.
while step_num < total_timesteps:
    print('step_num:', step_num)
    # reset buffers
    rollout_step_num = 0
    for model in models:
        model.rollout_buffer.reset()

    while rollout_step_num < models[0].n_steps*NUM_MGRS:

        env.reset()
        # Per-manager staging areas for the current draft.
        sarstti = {i: [] for i in range(NUM_MGRS)}
        mgr_values = {i: [] for i in range(NUM_MGRS)}
        mgr_log_probs = {i: [] for i in range(NUM_MGRS)}

        # complete 1 draft
        for _, row in env.draft.iterrows(): # each turn of draft
            # iterrows() may upcast the row to a common dtype; force a plain
            # int so it is always a valid list index / dict key.
            mgr = int(row['mgr'])
            model = models[mgr]
            # Copy defensively: the env implementation is not visible here and
            # may update its state array in place between picks.
            state = np.array(env.state, copy=True)
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(model.device)
                actions, values, log_probs = model.policy(state_tensor, deterministic=False)
            # Move the sample to host memory first so the numpy ops below also
            # work when the policy lives on a GPU.
            sampled_action = actions.cpu().numpy()

            # clip actions to avoid out of bounds error
            # as we are sampling from an unbounded gaussian distribution
            # NOTE(review): the upper bound is kept as ACTION_SPACE_DIM; confirm
            # whether the env expects ACTION_SPACE_DIM - 1 as the largest action.
            env_action = np.clip(sampled_action, 0, ACTION_SPACE_DIM).item()
            next_state, _, terminated, truncated, info = env.step(env_action)
            step_num += 1
            rollout_step_num += 1

            # Store the *unclipped* sample: `log_probs` was computed for it, and
            # PPO re-evaluates the stored action when forming the policy ratio
            # (this mirrors what Stable-Baselines3's own rollout collection does).
            buffer_action = sampled_action.reshape(-1, 1)
            sarstti[mgr].append((state, buffer_action, 0, next_state, terminated, truncated, info))
            mgr_values[mgr].append(values)
            mgr_log_probs[mgr].append(log_probs)

        n_episodes += 1
        # compute rewards for the draft
        for mgr in range(NUM_MGRS):
            mgr_model = models[mgr]
            reward = env.calc_reward(mgr)
            # Patch the draft reward onto this manager's final transition.
            last_state, last_action, _, last_next, last_term, last_trunc, last_info = sarstti[mgr][-1]
            sarstti[mgr][-1] = (last_state, last_action, reward, last_next, last_term, last_trunc, last_info)

            # Log metrics to TensorBoard for this specific manager
            mgr_model.logger.record(f"train/draft_reward_mgr_{mgr}", reward)
            mgr_model.logger.dump(n_episodes)

            # check if best reward and save model if so
            # (save this manager's own model, not whichever model picked last)
            if reward > best_rewards[mgr]:
                best_rewards[mgr] = reward
                mgr_model.save(best_model_paths[mgr])
                print(f"Saved best model for manager {mgr} with draft reward {reward}")

        # add every step of draft to rollout buffer for each manager
        for mgr, mgr_model in enumerate(models):
            for pick_idx in range(NUM_DRAFT_ROUNDS):
                pick_state, pick_action, pick_reward = sarstti[mgr][pick_idx][:3]
                # A manager's first pick of a draft starts a new episode in
                # *its* buffer; GAE uses this flag to stop bootstrapping
                # across draft boundaries.
                episode_start = 1 if pick_idx == 0 else 0
                mgr_model.rollout_buffer.add(
                    pick_state,
                    pick_action,
                    pick_reward,
                    episode_start,
                    mgr_values[mgr][pick_idx],
                    mgr_log_probs[mgr][pick_idx]
                )

        if n_episodes % n_episodes_model == 0:
            for mgr, mgr_model in enumerate(models):
                model_save_path = os.path.join(manager_dirs[mgr], f"ppo_model_mgr_{mgr}_episode_{n_episodes}.zip")
                mgr_model.save(model_save_path)
                print(f"Saved model for manager {mgr} at episode {n_episodes} to {model_save_path}")

        if n_episodes % n_episodes_info == 0:
            # NOTE(review): nothing in this loop appends to `info_history`, so
            # these files currently contain an empty list. Decide whether the
            # per-step `info` dicts should be collected (mind memory use and
            # JSON-serialisability) or whether this dump should be removed.
            for mgr in range(NUM_MGRS):
                info_file_path = os.path.join(manager_dirs[mgr], f"info_history_episode_{n_episodes}.json")
                with open(info_file_path, 'w') as f:
                    json.dump(info_history, f, indent=4)
                print(f"Saved info history to {info_file_path}")
            # Clear info history after saving to avoid redundant data
            info_history.clear()

    # The rollout only ever stops after a complete draft, so every manager's
    # last stored transition ends an episode: bootstrap from a zero value and
    # mark it done. `dones` must be an ndarray, not a bare bool.
    final_dones = np.array([True])
    for mgr, mgr_model in enumerate(models):
        mgr_model.rollout_buffer.compute_returns_and_advantage(
            last_values=torch.zeros_like(mgr_values[mgr][-1]),
            dones=final_dones,
        )
        mgr_model.train()

            
            
    
# for episode_num in range(total_episodes): # each draft
    
#     env.reset()
#     sarstti = {i: [] for i in range(NUM_MGRS)}
    
#     for _, row in env.draft.iterrows(): # each turn of draft
#         mgr = row['mgr']
#         model = models[mgr]
#         state = env.state
#         action, _ = model.predict(env.state, deterministic=False)
#         next_state, _, terminated, truncated, info = env.step(action.item())
#         sarstti[mgr].append((state, action, 0, next_state, terminated, truncated, info))
#         step_num += 1
        
#         # Append info to the history for saving
#         info_history.append(info)
    
#     # calculate rewards
#     for mgr in range(NUM_MGRS):
#         state, action, _, next_state, _, truncated, info = sarstti[mgr][-1]
#         reward = env.calc_reward(mgr)
#         terminated = True
#         sarstti[mgr][-1] = (state, action, reward, next_state, terminated, truncated, info)

        
        
#     for mgr, model in enumerate(models):  # for each manager
#         # print(f"Training model for manager {mgr}")
        
#         episode_start = 1
#         for state, action, reward, next_state, terminated, truncated, info in sarstti[mgr]:
#             # Convert the state (observation) from numpy to torch tensor
#             # print(f"state dim: {state.shape}")
#             # print(f"action dim: {action.shape}")
#             with torch.no_grad():
#                 state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(model.device)
#                 action_tensor = torch.tensor(action, dtype=torch.float32).unsqueeze(0).to(model.device)
                
#                 # Calculate the value of the state and log probability of the action
#                 value = model.policy.predict_values(state_tensor)
#                 log_prob = model.policy.evaluate_actions(state_tensor, action_tensor)[1]

           
#             # Add experience to the rollout buffer
#             model.rollout_buffer.add(
#                 state,
#                 action,
#                 reward,
#                 episode_start,
#                 value,
#                 log_prob
#             )
#             episode_start = 0
#         dones = [x[4] for x in sarstti[mgr]]
#         model.rollout_buffer.compute_returns_and_advantage(last_values=torch.zeros_like(value), dones=terminated)
#         # ------
#         # Only train if the buffer is full
#         if model.rollout_buffer.full:
#             model.train()
            
#             # Track draft rewards (or any other performance metric)
#             draft_reward = sum([x[2] for x in sarstti[mgr]])  # Sum of rewards for the manager
            
#             # Save the best model if the performance improves
#             if draft_reward > best_rewards[mgr]:
#                 best_rewards[mgr] = draft_reward
#                 model.save(best_model_paths[mgr])
#                 print(f"Saved best model for manager {mgr} with draft reward {draft_reward}")
            
            # Log metrics to TensorBoard for this specific manager
            # model.logger.record(f"train/draft_reward_mgr_{mgr}", draft_reward)
            # model.logger.dump(episode_num)
        # ----------
        
        # model.train()
        
        # # Track draft rewards (or any other performance metric)
        # draft_reward = sum([x[2] for x in sarstti[mgr]])  # Sum of rewards for the manager
        
        # # Save the best model if the performance improves
        # if draft_reward > best_rewards[mgr]:
        #     best_rewards[mgr] = draft_reward
        #     model.save(best_model_paths[mgr])
        #     print(f"Saved best model for manager {mgr} with draft reward {draft_reward}")
        
        # # Log metrics to TensorBoard for this specific manager
        # model.logger.record(f"train/draft_reward_mgr_{mgr}", draft_reward)
        # model.logger.dump(episode_num)
    
    # Save info to a JSON file every n_episodes_info
    # if episode_num % n_episodes_info == 0:
    #     for mgr in range(NUM_MGRS):
    #         info_file_path = os.path.join(manager_dirs[mgr], f"info_history_episode_{episode_num}.json")
    #         with open(info_file_path, 'w') as f:
    #             json.dump(info_history, f, indent=4)
    #         print(f"Saved info history to {info_file_path}")
    #     # Clear info history after saving to avoid redundant data
    #     info_history.clear()

    # Save the model every n_episodes_model
    # if episode_num % n_episodes_model == 0:
    #     for mgr, model in enumerate(models):
    #         model_save_path = os.path.join(manager_dirs[mgr], f"ppo_model_mgr_{mgr}_episode_{episode_num}.zip")
    #         model.save(model_save_path)
    #         print(f"Saved model for manager {mgr} at episode {episode_num} to {model_save_path}")

hi
Saved best model for manager 0 with draft reward -0.5
Saved best model for manager 1 with draft reward 6.567834659156002
Saved best model for manager 2 with draft reward -0.5
Saved best model for manager 3 with draft reward -0.5
Saved best model for manager 4 with draft reward -0.5
Saved best model for manager 5 with draft reward 5.433010682380735
Saved best model for manager 6 with draft reward -0.5
Saved best model for manager 7 with draft reward -0.5
Saved best model for manager 8 with draft reward -0.5
Saved best model for manager 9 with draft reward -0.5
Saved best model for manager 10 with draft reward -0.5
Saved best model for manager 11 with draft reward -0.5
Saved best model for manager 7 with draft reward 5.438974035832456
Saved best model for manager 10 with draft reward 6.501792532224149
Saved best model for manager 6 with draft reward 6.568311359134838
Saved best model for manager 11 with draft reward 5.718961034382634
Saved best model for manager 1 with draft reward 6.

KeyboardInterrupt: 