In [1]:
import os
import random
import time
from distutils.util import strtobool

#import gym
from ScheduleGym import ScheduleGym
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

from pathlib import Path
import datetime
import tempfile
import json
import shutil
import imageio
from types import SimpleNamespace

In [2]:

def parse_args():
    """Return the fixed experiment configuration as a ``SimpleNamespace``.

    Nothing is read from the command line; every value is hard-coded here.
    ``batch_size`` and ``minibatch_size`` are derived from ``num_envs``,
    ``num_steps`` and ``num_minibatches``.
    """
    # fmt: off
    num_envs = 16
    num_steps = 256  # rollout length per environment (128 was used previously)
    num_minibatches = 4
    batch_size = int(num_envs * num_steps)

    # Keyword order below fixes the attribute order seen in repr() / vars().
    return SimpleNamespace(
        # --- experiment bookkeeping ---
        exp_name="ScheduleTester_with_add_remove",
        seed=1,
        torch_deterministic=True,
        cuda=True,
        track=False,
        capture_video=False,
        env_id="ScheduleGym-v0",  # earlier runs: "CartPole-v1", "LunarLander-v2"

        # --- PPO rollout / optimisation ---
        total_timesteps=50000 * 2 * 100 * 4,
        learning_rate=2.5e-4,
        num_envs=num_envs,
        num_steps=num_steps,
        anneal_lr=True,
        gae=True,
        gamma=0.99,
        gae_lambda=0.95,
        num_minibatches=num_minibatches,
        update_epochs=4,
        norm_adv=True,

        # --- loss shaping ---
        clip_range=0.2,
        clip_vloss=True,
        clip_coef=0.2,
        ent_coef=0.01,
        vf_coef=0.5,
        max_grad_norm=0.5,
        target_kl=None,
        clip_range_vf=None,

        # --- derived sizes ---
        batch_size=batch_size,
        minibatch_size=int(batch_size // num_minibatches),
    )
    # fmt: on

In [3]:
# Build the run configuration and echo it as the cell output for inspection.
args = parse_args()
args

namespace(exp_name='ScheduleTester_with_add_remove',
          seed=1,
          torch_deterministic=True,
          cuda=True,
          track=False,
          capture_video=False,
          env_id='ScheduleGym-v0',
          total_timesteps=40000000,
          learning_rate=0.00025,
          num_envs=16,
          num_steps=256,
          anneal_lr=True,
          gae=True,
          gamma=0.99,
          gae_lambda=0.95,
          num_minibatches=4,
          update_epochs=4,
          norm_adv=True,
          clip_range=0.2,
          clip_vloss=True,
          clip_coef=0.2,
          ent_coef=0.01,
          vf_coef=0.5,
          max_grad_norm=0.5,
          target_kl=None,
          clip_range_vf=None,
          batch_size=4096,
          minibatch_size=1024)

In [5]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    Args:
        layer: A module exposing a ``weight`` tensor and, optionally, a ``bias``.
        std: Gain passed to ``torch.nn.init.orthogonal_`` for the weight.
        bias_const: Constant value written into every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested inside a
        ``nn.Sequential(...)`` or list-comprehension expression.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with ``bias=False`` expose ``bias`` as None; skip them
    # instead of crashing inside ``constant_``.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class Agent(nn.Module):
    """Actor-critic network with a shared trunk and one categorical head per action component.

    Args:
        state_size: Number of input features per observation.
        action_sizes: Iterable giving the number of logits for each policy head.
        hidden_dim: Width of every hidden layer.
    """

    def __init__(self, state_size, action_sizes, hidden_dim=256):
        super().__init__()

        self.action_sizes = action_sizes
        self.state_size = state_size

        # Layers are created and initialised in trunk -> heads -> critic order;
        # keep that order so seeded runs draw the same random numbers.
        trunk_in = layer_init(nn.Linear(state_size, hidden_dim))
        trunk_hidden = layer_init(nn.Linear(hidden_dim, hidden_dim))
        self.shared = nn.Sequential(trunk_in, nn.ReLU(), trunk_hidden, nn.ReLU())

        # Policy heads use a small gain (std=0.01) for their orthogonal init.
        heads = []
        for n_logits in action_sizes:
            heads.append(layer_init(nn.Linear(hidden_dim, n_logits), std=0.01))
        self.actor_heads = nn.ModuleList(heads)

        value_hidden = layer_init(nn.Linear(hidden_dim, hidden_dim))
        value_out = layer_init(nn.Linear(hidden_dim, 1), std=1.0)
        self.critic = nn.Sequential(value_hidden, nn.ReLU(), value_out)

    def get_value(self, x):
        """Return the critic output for observations ``x`` (one value per row)."""
        return self.critic(self.shared(x))

    def get_action_and_value(self, x, action=None):
        """Sample (or score) a multi-component action and evaluate the critic.

        Args:
            x: Observations, shaped (batch_size, state_size).
            action: Optional actions shaped (batch_size, len(action_sizes)).
                When omitted, one index is sampled from every head.

        Returns:
            Tuple ``(action, log_prob_sum, entropy_sum, value)`` where the
            log-probabilities and entropies are summed over the heads, giving
            one number per batch row.
        """
        features = self.shared(x)  # (batch_size, hidden_dim)
        head_dists = [Categorical(logits=head(features)) for head in self.actor_heads]

        if action is None:
            # Stacking gives (n_heads, batch_size); transpose to (batch_size, n_heads).
            sampled = [dist.sample() for dist in head_dists]
            action = torch.stack(sampled).T

        # action.T iterates per head, pairing each column with its distribution.
        per_head_logp = [dist.log_prob(chosen) for dist, chosen in zip(head_dists, action.T)]
        per_head_entropy = [dist.entropy() for dist in head_dists]

        log_prob_sum = torch.stack(per_head_logp).sum(dim=0)  # (batch_size,)
        entropy_sum = torch.stack(per_head_entropy).sum(dim=0)  # (batch_size,)
        return action, log_prob_sum, entropy_sum, self.critic(features)


In [6]:
args = parse_args()
# The wall-clock suffix keeps every run's log/checkpoint name unique.
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"

# Reproducibility: cuDNN determinism flag plus one seed shared by every RNG in use.
torch.backends.cudnn.deterministic = args.torch_deterministic
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Use the GPU only when it is both requested in the config and actually present.
device = torch.device("cuda" if (args.cuda and torch.cuda.is_available()) else "cpu")
print(f'Device: {device}')

Device: cuda


In [7]:
# env setup
# envs = gym.vector.SyncVectorEnv(
#     [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
# )

In [8]:
# envs = gym.vector.AsyncVectorEnv(
#     [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
# )

In [9]:
# One zero-argument factory per environment; AsyncVectorEnv calls each of them
# to build its own ScheduleGym instance.
envs = gym.vector.AsyncVectorEnv(
    [
        (lambda: ScheduleGym(num_days=2, num_hours=4, num_classes=1, num_subjects=2))
        for _ in range(args.num_envs)
    ]
)

In [10]:
# First dimension of a single environment's observation shape; passed to Agent
# as its input size further down.
state_dim = envs.single_observation_space.shape[0] 
state_dim

10

In [11]:
# Collect the `.n` of every component of the per-environment action space;
# the Agent builds one policy head per entry.
action_dims = [component.n for component in envs.single_action_space]
action_dims

[2, 1, 2, 4, 2]

In [12]:
#override
#device =torch.device('cpu')

In [13]:
# Instantiate the policy/value network on the selected device and attach an
# Adam optimizer over all of its parameters.
agent = Agent(state_size=state_dim, action_sizes=action_dims, hidden_dim=256).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

In [14]:
# Smoke test: grab one batch of initial observations; the reset info is discarded.
x, _ = envs.reset()

In [15]:
# Sanity check on the batched observation shape returned by the vector env.
x.shape

(16, 10)

In [16]:
# Smoke test of the critic path: convert the observations to a float32 tensor
# on `device` and run them through get_value.
ost = agent.get_value(torch.tensor(x, device=device, dtype=torch.float32))

In [17]:
# Display the critic outputs computed in the previous cell.
ost

tensor([[-0.0004],
        [ 0.0025],
        [-0.0021],
        [-0.0028],
        [-0.0008],
        [-0.0021],
        [-0.0028],
        [ 0.0014],
        [ 0.0022],
        [ 0.0003],
        [ 0.0002],
        [ 0.0013],
        [ 0.0016],
        [ 0.0003],
        [-0.0021],
        [ 0.0025]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [18]:
# Smoke test of the full forward path: sample an action per environment and
# get the summed log-prob, summed entropy and critic value.
action, log_prob_sum, entropy_sum, critic_value= agent.get_action_and_value(torch.tensor(x, device=device, dtype=torch.float32))

In [19]:
# Display the critic values returned alongside the sampled actions.
critic_value


tensor([[-0.0004],
        [ 0.0025],
        [-0.0021],
        [-0.0028],
        [-0.0008],
        [-0.0021],
        [-0.0028],
        [ 0.0014],
        [ 0.0022],
        [ 0.0003],
        [ 0.0002],
        [ 0.0013],
        [ 0.0016],
        [ 0.0003],
        [-0.0021],
        [ 0.0025]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [20]:
# ALGO Logic: Storage setup
# Rollout buffers are indexed [step, env]; obs/actions carry the per-env space
# shape as trailing dimensions. All buffers are allocated directly on `device`.
obs = torch.zeros((args.num_steps, args.num_envs, *envs.single_observation_space.shape), device=device)
actions = torch.zeros((args.num_steps, args.num_envs, *envs.single_action_space.shape), device=device)
logprobs = torch.zeros((args.num_steps, args.num_envs), device=device)
# The remaining scalar-per-step buffers share logprobs' shape, dtype and device.
rewards = torch.zeros_like(logprobs)
dones = torch.zeros_like(logprobs)
values = torch.zeros_like(logprobs)

In [22]:

# Number of PPO updates: one update consumes a full batch of env steps.
num_updates = args.total_timesteps // args.batch_size

global_step = 0
start_time = time.time()  # reference point for the steps-per-second metric

# Initial observations and "previous step was terminal" flags for the first rollout.
next_obs, _ = envs.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(args.num_envs, device=device)

print(f'Num updates: {num_updates}')
print(f'Run Name: {run_name}')

Num updates: 9765
Run Name: ScheduleGym-v0__ScheduleTester_with_add_remove__1__1723711213


In [23]:
# TensorBoard logger for this run; the config is recorded once as a Markdown table.
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n" + "\n".join(f"|{key}|{value}|" for key, value in vars(args).items()),
)

In [24]:
# PPO training loop: each update collects `num_steps` transitions from every
# environment, computes advantages/returns, then runs `update_epochs` passes of
# minibatch gradient steps over the collected batch.
for update in range(1, num_updates + 1):
    # Annealing the rate if instructed to do so (linear decay towards zero).
    if args.anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = frac * args.learning_rate
        optimizer.param_groups[0]["lr"] = lrnow

    # ---- Rollout collection ----
    for step in range(0, args.num_steps):
        global_step += 1 * args.num_envs
        obs[step] = next_obs
        dones[step] = next_done

        # ALGO LOGIC: action logic
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # An episode boundary is either a termination or a truncation; both
        # must stop the advantage/return recursion below.
        next_obs, reward, terminated, truncated, info = envs.step(action.cpu().numpy())
        done = np.logical_or(terminated, truncated)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

        # Per-episode statistics are not logged here; only batch-level reward
        # statistics are written at the end of each update.

    # ---- Advantage / return computation ----
    # bootstrap value if not done
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        if args.gae:
            # Generalised Advantage Estimation, computed backwards in time.
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values
        else:
            # Plain discounted returns, bootstrapped from the final value estimate.
            returns = torch.zeros_like(rewards).to(device)
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values

    # flatten the batch: merge the (step, env) axes into one batch axis
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    # ---- Optimizing the policy and value network ----
    b_inds = np.arange(args.batch_size)
    clipfracs = []
    for epoch in range(args.update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, args.batch_size, args.minibatch_size):
            end = start + args.minibatch_size
            mb_inds = b_inds[start:end]

            # Actions were stored in a float buffer; cast back to integer indices.
            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            if args.norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss (clipped surrogate objective)
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            if args.clip_vloss:
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -args.clip_coef,
                    args.clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            entropy_loss = entropy.mean()
            loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()

        # Optional early stop of the epoch loop once the policy has moved too far.
        if args.target_kl is not None:
            if approx_kl > args.target_kl:
                break

    # Explained variance of the value predictions with respect to the returns.
    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    #record rewards for plotting purposes
    # Loss/KL scalars below are the values from the last minibatch of this update.
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
    writer.add_scalar("charts/value_loss", v_loss.item(), global_step)
    writer.add_scalar("charts/policy_loss", pg_loss.item(), global_step)
    writer.add_scalar("charts/entropy", entropy_loss.item(), global_step)
    writer.add_scalar("charts/old_approx_kl", old_approx_kl.item(), global_step)
    writer.add_scalar("charts/approx_kl", approx_kl.item(), global_step)
    writer.add_scalar("charts/clipfrac", np.mean(clipfracs), global_step)
    writer.add_scalar("charts/explained_variance", explained_var, global_step)
    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
    writer.add_scalar("charts/rewards", rewards.mean().item(), global_step)

    if update % 10 == 0:
        print(f"update: {update}/ {num_updates} Max reward: {rewards.max().item()} Min reward: {rewards.min().item()} Mean reward: {rewards.mean().item()}")

update: 10/ 9765 Max reward: 74.9000015258789 Min reward: -2.25 Mean reward: 0.18951421976089478
update: 20/ 9765 Max reward: 77.75 Min reward: -2.25 Mean reward: 0.4688721299171448
update: 30/ 9765 Max reward: 80.05000305175781 Min reward: -2.25 Mean reward: 0.7583374381065369
update: 40/ 9765 Max reward: 74.05000305175781 Min reward: -2.25 Mean reward: 0.6122437119483948
update: 50/ 9765 Max reward: 79.9000015258789 Min reward: -2.25 Mean reward: 0.7146363258361816
update: 60/ 9765 Max reward: 76.75 Min reward: -2.25 Mean reward: 0.7876466512680054
update: 70/ 9765 Max reward: 80.05000305175781 Min reward: -2.25 Mean reward: 0.6830933690071106
update: 80/ 9765 Max reward: 78.05000305175781 Min reward: -2.25 Mean reward: 0.6647461652755737
update: 90/ 9765 Max reward: 80.05000305175781 Min reward: -2.25 Mean reward: 0.6976197361946106
update: 100/ 9765 Max reward: 79.9000015258789 Min reward: -2.25 Mean reward: 0.7762573957443237
update: 110/ 9765 Max reward: 79.05000305175781 Min rew

In [26]:
#Save the model
# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before writing.
Path("models").mkdir(parents=True, exist_ok=True)
torch.save(agent.state_dict(), f"models/{run_name}.pt")

In [12]:
# Load the model
# NOTE(review): `load_name` is hard-coded to an earlier run, not `run_name`, and
# the recorded output of this cell shows four actor heads while `action_dims`
# above has five entries. Confirm the checkpoint matches the current action space.
agent = Agent(state_size=state_dim, action_sizes=action_dims, hidden_dim=256)
load_name = 'ScheduleGym-v0__ScheduleTester__1__1723660943'
# map_location lets a checkpoint saved on one device (e.g. CUDA) be restored on
# whatever `device` is selected for this session.
agent.load_state_dict(torch.load(f"models/{load_name}.pt", map_location=device))
agent.eval()
agent.to(device)

Agent(
  (shared): Sequential(
    (0): Linear(in_features=10, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
  )
  (actor_heads): ModuleList(
    (0): Linear(in_features=256, out_features=1, bias=True)
    (1): Linear(in_features=256, out_features=2, bias=True)
    (2): Linear(in_features=256, out_features=4, bias=True)
    (3): Linear(in_features=256, out_features=2, bias=True)
  )
  (critic): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [27]:
# Single (non-vectorised) environment for stepping the agent by hand; same
# constructor arguments as the vectorised training environments.
env = ScheduleGym(num_days=2, num_hours=4, num_classes=1, num_subjects=2)


In [205]:
# Start a fresh episode; keep the initial observation and discard the info value.
state, _ = env.reset()


In [239]:
# Manually advance the environment by one agent-chosen step (re-run to step again).
env.render()
state = torch.Tensor(state).unsqueeze(0).to(device)  # add a batch dimension of 1
with torch.no_grad():  # inference only; no gradients needed
    action, logprob, _, value = agent.get_action_and_value(state)
# Gymnasium's step() returns (obs, reward, terminated, truncated, info); the
# episode is over when either flag is set.
state, reward, terminated, truncated, info = env.step(action.squeeze().cpu().numpy())
done = terminated or truncated
print(f'Action {action}')
print(f'Reward {reward}')
print(f'Done {done}')

if done:
    print('-'*10)
    print('Episode done')
    env.render()


Class 1:
Day 1: [1 0 1 0]
Day 2: [ 0  1 -1 -1]

Fitness: -1.0, Actions left: 37
Target Hours:
Class 1: [1 0]
Action tensor([[1, 0, 1, 2, 0]], device='cuda:0')
Reward 69.9
Done True
----------
Episode done
Class 1:
Day 1: [1 0 1 0]
Day 2: [ 0  1  0 -1]

Fitness: 0.0, Actions left: 36
Target Hours:
Class 1: [0 0]


In [28]:
# Show the environment's current state.
env.render()

Class 1:
Day 1: [-1 -1 -1  0]
Day 2: [-1 -1 -1 -1]

Fitness: -1.0, Actions left: 16
Target Hours:
Class 1: [0 1]


In [20]:
# Inspect the last sampled action as a NumPy array: size-1 dimensions are
# dropped and the tensor is moved to the CPU first.
action.squeeze().cpu().numpy()

array([0, 0, 3, 0], dtype=int64)

Action tensor([[0, 0, 3, 0]], device='cuda:0')
Reward -0.10000000000000009
Done False


In [49]:
def visualize_agent(agent, env, n_episodes=5, max_t=256):
    """Roll out ``agent`` in ``env`` while rendering, printing each episode's score.

    Args:
        agent: Module exposing ``get_action_and_value(state)`` whose first
            return value is the action tensor, and ``parameters()``.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to play.
        max_t: Upper bound on the number of steps per episode.

    Side effects:
        Renders before every step, prints one line per episode and closes
        ``env`` when finished.
    """
    # Run on whatever device the agent's weights live on rather than relying
    # on a notebook-level global.
    first_param = next(agent.parameters(), None)
    agent_device = first_param.device if first_param is not None else torch.device("cpu")

    for i_episode in range(1, n_episodes + 1):
        state, info = env.reset()
        score = 0
        for _ in range(max_t):
            env.render()
            state_batch = torch.as_tensor(state, dtype=torch.float32, device=agent_device).unsqueeze(0)
            with torch.no_grad():  # inference only
                action, logprob, _entropy, value = agent.get_action_and_value(state_batch)
            # Drop the batch dimension before handing the action to the env.
            action = action.squeeze().cpu().numpy()
            state, reward, terminated, truncated, info = env.step(action)
            score += reward
            # An episode ends on termination OR truncation (e.g. a time limit).
            if terminated or truncated:
                break
        print(f"Episode {i_episode}\tScore: {score}")
    env.close()

In [21]:
# Build the environment by its registered id with human rendering and watch the
# loaded agent play; visualize_agent closes the environment when it finishes.
env = gym.make(args.env_id, render_mode="human")
visualize_agent(agent, env)

Episode 1	Score: -256.0
Episode 2	Score: -256.0
Episode 3	Score: -256.0
Episode 4	Score: -256.0
Episode 5	Score: -256.0
