In [40]:
import os
import random
import time
from distutils.util import strtobool

#import gym
from ScheduleGym import ScheduleGym
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

from pathlib import Path
import datetime
import tempfile
import json
import shutil
import imageio
from types import SimpleNamespace
import copy

In [2]:
class Agent(nn.Module):
    """Actor-critic network for a multi-discrete action space.

    A shared two-layer ReLU trunk feeds one linear head per action dimension
    (each producing the logits of a categorical distribution) and one linear
    head that outputs the state value.
    """

    def __init__(self, state_size, action_sizes, hidden_dim=256):
        """Build the shared trunk, the per-dimension policy heads and the value head.

        Args:
            state_size: Length of the state vector fed to the network.
            action_sizes: Sequence holding the number of choices for each
                action dimension; one policy head is created per entry.
            hidden_dim: Width of both hidden layers of the shared trunk.
        """
        super().__init__()

        self.shared = nn.Sequential(
            self.layer_init(nn.Linear(state_size, hidden_dim)),
            nn.ReLU(),
            self.layer_init(nn.Linear(hidden_dim, hidden_dim)),
            nn.ReLU()
        )

        # Create separate heads for each action dimension
        self.action_heads = nn.ModuleList([
            nn.Sequential(
                self.layer_init(nn.Linear(hidden_dim, action_size))
            )
            for action_size in action_sizes
        ])

        self.value_net = nn.Sequential(
            self.layer_init(nn.Linear(hidden_dim, 1))
        )

        self.action_sizes = action_sizes

    def layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        """Orthogonally initialise ``layer.weight`` (gain ``std``) and fill its bias.

        Returns:
            The same ``layer`` object, so the call can be nested in a constructor.
        """
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

    def forward(self, x):
        """Return ``(policy_logits, value)`` for state tensor ``x``.

        ``policy_logits`` is a list with one logits tensor per action
        dimension; ``value`` keeps a trailing dimension of size 1.
        """
        x = self.shared(x)
        policy_logits = [head(x) for head in self.action_heads]
        value = self.value_net(x)
        return policy_logits, value

    def get_action_and_value(self, state, action=None):
        """Sample (or score) one action per head and return the value estimate.

        Args:
            state: State tensor accepted by ``forward``.
            action: Optional sequence with one entry per action head. When
                given, these actions are scored under the current policy
                instead of sampling new ones, which is what a PPO update needs
                to form the probability ratio for previously taken actions.

        Returns:
            Tuple ``(actions, logprobs, entropies, value)`` where the first
            three are lists with one tensor per action head.

        Raises:
            ValueError: If ``action`` does not have one entry per head.
        """
        policy_logits, value = self.forward(state)
        # Building the distribution from logits avoids a softmax -> log
        # round trip and is therefore numerically safer than passing probs.
        dists = [Categorical(logits=logits) for logits in policy_logits]

        if action is None:
            actions = [dist.sample() for dist in dists]
        else:
            if len(action) != len(dists):
                raise ValueError(
                    f"expected {len(dists)} action components, got {len(action)}"
                )
            actions = [
                torch.as_tensor(component, device=logits.device)
                for component, logits in zip(action, policy_logits)
            ]

        logprobs = [dist.log_prob(chosen) for dist, chosen in zip(dists, actions)]
        entropies = [dist.entropy() for dist in dists]
        return actions, logprobs, entropies, value

    def get_value(self, state):
        """Return only the value-head output for ``state``."""
        _, value = self.forward(state)
        return value

    def get_policy(self, state):
        """Return a list of per-head action probability tensors for ``state``."""
        policy_logits, _ = self.forward(state)
        return [torch.softmax(logits, dim=-1) for logits in policy_logits]


In [65]:
import copy
import random

class MCTSNode:
    """A node of the Monte-Carlo search tree.

    Each node holds an environment instance representing the situation reached
    at that node, running visit/value statistics, and its children keyed by the
    action tuple that leads to them.
    """

    def __init__(self, env, agent, parent=None, prior=1.0):
        """Create a node.

        Args:
            env: Environment instance for this node; it is deep-copied before
                any action is applied during expansion or rollout.
            agent: Policy network; must expose ``action_sizes`` and, for
                rollouts, ``get_policy`` and ``parameters``.
            parent: Parent node, or ``None`` for the root.
            prior: Prior weight used in the exploration term of ``best_child``.
        """
        self.env = env  # env is the environment instance, which can take actions
        self.agent = agent
        self.parent = parent
        self.children = {}
        self.visit_count = 0
        self.value_sum = 0
        self.prior = prior
        self.explored_actions = set()

    def is_fully_expanded(self):
        """Return whether every joint action has already been tried from this node."""
        return len(self.explored_actions) == np.prod(self.agent.action_sizes)

    def best_child(self, c_param=1.4):
        """Return the child maximising mean value plus the exploration bonus.

        Args:
            c_param: Weight of the exploration term.
        """
        children = list(self.children.values())
        scores = []
        for child in children:
            # An unvisited child has no mean yet; treat it as 0 rather than
            # dividing by zero.
            mean_value = child.value_sum / child.visit_count if child.visit_count else 0.0
            exploration = c_param * child.prior * np.sqrt(self.visit_count) / (1 + child.visit_count)
            scores.append(mean_value + exploration)
        return children[int(np.argmax(scores))]

    def select_child(self, c_param=1.4):
        """Alias for ``best_child``."""
        return self.best_child(c_param)

    def expand(self):
        """Create and return a child for one randomly chosen untried action.

        The untried actions are enumerated explicitly so that a child is always
        produced while any action remains; drawing random actions with
        replacement could exhaust its retries and fail spuriously.

        Raises:
            Exception: If the node is already fully expanded.
        """
        if self.is_fully_expanded():
            raise Exception("Cannot expand a fully expanded node.")

        # np.ndindex yields every index tuple of the multi-discrete action space.
        untried = [
            candidate
            for candidate in np.ndindex(*self.agent.action_sizes)
            if candidate not in self.explored_actions
        ]
        if not untried:
            raise Exception("No unexplored actions found, despite not being fully expanded.")

        action = random.choice(untried)
        self.explored_actions.add(action)

        # Deep copy the environment so the child owns an independent instance,
        # then advance that copy by the chosen action.
        new_env = copy.deepcopy(self.env)
        new_env.step(action)

        # NOTE(review): the child simply inherits this node's prior; the
        # agent's policy is not consulted here.
        child_node = MCTSNode(new_env, self.agent, parent=self, prior=self.prior)
        self.children[action] = child_node
        return child_node

    def rollout(self):
        """Play a copy of this node's environment to the end with the agent's policy.

        Returns:
            Whatever ``fitness()`` of the finished environment copy returns.
        """
        # Start with a deep copy of the current environment (not just the state)
        rollout_env = copy.deepcopy(self.env)
        # Use the device the agent lives on instead of a notebook-level global.
        agent_device = next(self.agent.parameters()).device
        # Rollouts are never differentiated, so skip autograd bookkeeping.
        with torch.no_grad():
            while not rollout_env.is_done():
                state_vector = torch.tensor(rollout_env.state2vector()).to(agent_device).float()
                action_probs = self.agent.get_policy(state_vector)
                action = [torch.multinomial(probs, 1).item() for probs in action_probs]
                rollout_env.step(action)  # Perform the action in the copied environment
        return rollout_env.fitness()  # Use the environment's method to compute fitness

    def backpropagate(self, reward):
        """Add ``reward`` and one visit to this node and to every ancestor."""
        node = self
        while node is not None:
            node.visit_count += 1
            node.value_sum += reward
            node = node.parent

def mcts(env, agent, num_simulations):
    """Run Monte-Carlo tree search from ``env`` and return the most visited root action.

    Args:
        env: Environment instance to search from; the search only ever steps
            deep copies of it.
        agent: Policy network handed to every tree node.
        num_simulations: Number of select / expand / rollout / backpropagate
            iterations to perform.

    Returns:
        The action tuple of the root child with the highest visit count.

    Raises:
        ValueError: If no root child was created (for example when
            ``num_simulations`` is 0 or the root environment is already done).
    """
    root = MCTSNode(env, agent)
    for _ in range(num_simulations):
        node = root
        # Selection: descend while the current node has nothing left to expand.
        while node.is_fully_expanded() and node.children:
            node = node.select_child()
        # Expansion: never step an environment that has already finished; a
        # terminal node is evaluated as-is by the rollout below.
        if not node.is_fully_expanded() and not node.env.is_done():
            node = node.expand()
        reward = node.rollout()
        node.backpropagate(reward)

    if not root.children:
        raise ValueError("MCTS produced no root children; cannot choose an action.")

    root_actions = list(root.children.keys())
    visit_counts = [child.visit_count for child in root.children.values()]
    return root_actions[int(np.argmax(visit_counts))]


In [66]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Device: {device}')

Device: cuda


In [67]:
# Small scheduling problem used for the experiments in this notebook.
env = ScheduleGym(
    num_days=2,
    num_hours=4,
    num_classes=1,
    num_subjects=2,
)

In [68]:
# Network input/output sizes are read straight from the environment.
state_dim, action_dims = env.observation_space.shape[0], env.get_action_sizes()
print(f'State dimension: {state_dim}')
print(f'Action dimensions: {action_dims}')


State dimension: 10
Action dimensions: [2, 1, 2, 4, 2]


In [69]:
def play_game(env, agent, num_simulations, seed=None):
    """Play one episode, choosing every move with MCTS, and return the summed reward.

    Args:
        env: Environment following the Gymnasium ``reset``/``step`` API.
        agent: Policy network used by the tree search.
        num_simulations: MCTS simulations to run before each move.
        seed: Optional seed forwarded to ``env.reset``.

    Returns:
        Sum of the rewards returned by ``env.step`` over the episode.
    """
    env.reset(seed=seed)
    total_reward = 0
    episode_over = False

    while not episode_over:
        action = mcts(env, agent, num_simulations=num_simulations)
        _, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        # A Gymnasium episode ends on either flag; honouring only the first
        # would keep looping after a truncation (e.g. a step limit).
        episode_over = terminated or truncated

    return total_reward




In [70]:
def train_with_ppo(env, agent, optimizer, ppo_epochs, batch_size, seed, clip_coef, gamma, gae_lambda, ent_coef, vf_coef, max_grad_norm):
    obs, _ = env.reset(seed=seed)
    done = False

    states = []
    actions = []
    rewards = []
    values = []
    logprobs = []
    dones = []
    next_obs = obs

    # Rollout the episode
    while not done:
        action, logprob, _, value = agent.get_action_and_value(torch.tensor(next_obs.state2vector()).to(device))
        next_obs, reward, done, _, _ = env.step(action)
        
        states.append(next_obs)
        actions.append(action)
        rewards.append(reward)
        values.append(value.item())
        logprobs.append(logprob)
        dones.append(done)

    # Convert lists to tensors
    states = torch.tensor([s.state2vector() for s in states]).to(device)
    rewards = torch.tensor(rewards).to(device)
    dones = torch.tensor(dones).to(device)
    values = torch.tensor(values).to(device)
    
    # Compute advantages using GAE
    advantages = torch.zeros_like(rewards).to(device)
    lastgaelam = 0
    for t in reversed(range(len(rewards))):
        if t == len(rewards) - 1:
            nextnonterminal = 1.0 - dones[t]
            nextvalues = agent.get_value(torch.tensor(next_obs.state2vector()).to(device))
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
    
    returns = advantages + values
    
    # PPO Update
    for _ in range(ppo_epochs):
        indices = np.arange(len(states))
        np.random.shuffle(indices)

        for start in range(0, len(states), batch_size):
            end = start + batch_size
            mb_inds = indices[start:end]

            b_obs = states[mb_inds]
            b_advantages = advantages[mb_inds]
            b_returns = returns[mb_inds]
            b_values = values[mb_inds]
            b_logprobs = [logprobs[i] for i in mb_inds]

            _, newlogprobs, entropies, newvalue = agent.get_action_and_value(b_obs)
            logratios = [newlogprob - oldlogprob for newlogprob, oldlogprob in zip(newlogprobs, b_logprobs)]
            ratios = [logratio.exp() for logratio in logratios]

            with torch.no_grad():
                approx_kl = ((ratios[0] - 1) - logratios[0]).mean()
                clipfrac = ((ratios[0] - 1.0).abs() > clip_coef).float().mean().item()

            # Policy loss
            pg_loss1 = [-mb_adv * ratio for ratio, mb_adv in zip(ratios, b_advantages)]
            pg_loss2 = [-mb_adv * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef) for ratio, mb_adv in zip(ratios, b_advantages)]
            policy_loss = torch.max(torch.stack(pg_loss1), torch.stack(pg_loss2)).mean()

            # Value loss
            v_loss_unclipped = (newvalue - b_returns) ** 2
            v_clipped = b_values + torch.clamp(
                newvalue - b_values, -clip_coef, clip_coef)
            v_loss_clipped = (v_clipped - b_returns) ** 2
            value_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean()

            # Entropy loss
            entropy_loss = torch.stack(entropies).mean()

            # Total loss
            loss = policy_loss + value_loss * vf_coef - entropy_loss * ent_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()

        # Logging
        print(f"Update {ppo_epochs} - Policy Loss: {policy_loss.item()}, Value Loss: {value_loss.item()}, KL: {approx_kl.item()}, Clip Frac: {clipfrac}")

    print("PPO Training Completed")

In [71]:
def train_agent_with_self_play(env, current_agent, best_agent, optimizer, num_updates, num_simulations=50, ppo_epochs=4, batch_size=64, clip_coef=0.2, gamma=0.99, gae_lambda=0.95, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5):
    """Alternate MCTS evaluation games and PPO updates for ``num_updates`` rounds.

    In every round both agents play one MCTS-guided game from the same seed.
    If the current agent scores strictly higher, its weights are copied into
    ``best_agent``. The current agent is then trained with PPO on that seed.

    Args:
        env: Environment shared by evaluation games and training.
        current_agent: Agent being trained.
        best_agent: Agent holding the best weights found so far; updated in place.
        optimizer: Optimizer over ``current_agent``'s parameters.
        num_updates: Number of evaluate-then-train rounds.
        num_simulations: MCTS simulations per move in evaluation games.
        ppo_epochs, batch_size, clip_coef, gamma, gae_lambda, ent_coef,
        vf_coef, max_grad_norm: Forwarded unchanged to ``train_with_ppo``.
    """
    best_score = -float('inf')

    for update in range(1, num_updates + 1):
        # Same seed for both games so the comparison is like-for-like. It is
        # drawn with an integer bound and converted to a built-in int because
        # Gymnasium's seeding expects a Python integer, not a NumPy scalar.
        seed = int(np.random.randint(0, 1_000_000))

        # Play the game with both the current and best agent
        current_score = play_game(env, current_agent, num_simulations, seed=seed)
        best_score_in_game = play_game(env, best_agent, num_simulations, seed=seed)

        # Promote the current agent only on a strict improvement
        if current_score > best_score_in_game:
            best_agent.load_state_dict(current_agent.state_dict())
            best_score = current_score
            print(f"New best agent at update {update} with score {best_score}")

        # Train the current agent with PPO
        train_with_ppo(env, current_agent, optimizer, ppo_epochs, batch_size, seed, clip_coef, gamma, gae_lambda, ent_coef, vf_coef, max_grad_norm)



In [72]:
# Budget for the self-play run.
num_updates, num_simulations, num_rollouts = 100, 100, 10

# Two independently initialised networks that compete against each other.
# The learner is constructed first so the random-initialisation order is kept.
current_agent = Agent(state_size=state_dim, action_sizes=action_dims).to(device)
best_agent = Agent(state_size=state_dim, action_sizes=action_dims).to(device)

In [73]:
# Only the learner is optimised; the best agent just receives weight copies.
optimizer = optim.Adam(params=current_agent.parameters(), lr=3e-4)

In [74]:
# Launch the self-play loop with the budget configured above.
train_agent_with_self_play(
    env=env,
    current_agent=current_agent,
    best_agent=best_agent,
    optimizer=optimizer,
    num_updates=num_updates,
    num_simulations=num_simulations,
)

KeyboardInterrupt: 