## Hoceky Agent

In [1]:
# imports
import torch
import torch.nn.functional as F
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import time
from torch.utils.tensorboard import SummaryWriter
import hockey.hockey_env as h_env
import pickle

import memory
import tools

## Network
implemets Dueling DQN

In [23]:
USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: torch.autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else torch.autograd.Variable(*args, **kwargs)
class NoisyLinear(torch.nn.Module):
    def __init__(self, in_features, out_features, std_init=0.4):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init
        
        self.weight_mu = torch.nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.weight_sigma = torch.nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features))

        self.bias_mu = torch.nn.Parameter(torch.FloatTensor(out_features))
        self.bias_sigma = torch.nn.Parameter(torch.FloatTensor(out_features))
        self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))

        self.reset_parameters()
        self.reset_noise()

    def forward(self, x):
        if self.training:
            weight = self.weight_mu + self.weight_sigma.mul(Variable(self.weight_epsilon))
            bias = self.bias_mu + self.bias_sigma.mul(Variable(self.bias_epsilon))
        else:
            weight = self.weight_mu
            bias = self.bias_mu
        return F.linear(x, weight, bias)
    
    def reset_parameters(self):
        mu_range = 1 / np.sqrt(self.weight_mu.size(1))
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / np.sqrt(self.weight_sigma.size(1)))
        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / np.sqrt(self.bias_sigma.size(0)))

    def reset_noise(self):
        epsilon_in = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)
        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        self.bias_epsilon.copy_(self._scale_noise(self.out_features))

    def _scale_noise(self, size):
        x = torch.randn(size)
        x = x.sign().mul(x.abs().sqrt())
        return x

In [24]:
class Feedforward(torch.nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, enable_dueling_dqn=False, use_noise=False, device = 'cpu'):
        super(Feedforward, self).__init__()
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.enable_dueling_dqn = enable_dueling_dqn
        self.device = device

        if self.enable_dueling_dqn:
            # cut the last hidden layer to the advantage and value streams
            self.dueling_size = self.hidden_sizes[-1]
            self.hidden_sizes = self.hidden_sizes[:-1]

        layers = []
        in_size = self.input_size
        first = True
        for h in hidden_sizes:
            if first or not use_noise:
                layers.append(torch.nn.Linear(in_size, h))
                first = False
            else:
                layers.append(NoisyLinear(in_size, h))
            layers.append(torch.nn.ReLU())
            in_size = h
        

        if self.enable_dueling_dqn:
            if use_noise:
                # Value stream
                self.fc_value = NoisyLinear(in_size, self.dueling_size)
                self.value = NoisyLinear(self.dueling_size, 1)

                # Advantages stream
                self.fc_advantages = NoisyLinear(in_size, self.dueling_size)
                self.advantages = NoisyLinear(self.dueling_size, self.output_size)
            else:
                # Value stream
                self.fc_value = torch.nn.Linear(in_size, self.dueling_size)
                self.value = torch.nn.Linear(self.dueling_size, 1)

                # Advantages stream
                self.fc_advantages = torch.nn.Linear(in_size, self.dueling_size)
                self.advantages = torch.nn.Linear(self.dueling_size, self.output_size)
        else:
            if use_noise:
                layers.append(NoisyLinear(in_size, output_size))
            else:
                layers.append(torch.nn.Linear(in_size, output_size))

        self.fully_connected = torch.nn.Sequential(*layers)
        self.to(self.device)

    def forward(self, x):
        '''
        Returns [batch_size, action_space_size]
        '''
        x = self.fully_connected(x)
        if self.enable_dueling_dqn:
            # Value calculation
            v = F.relu(self.fc_value(x))
            V = self.value(v)

            # Advantages calculation
            a = F.relu(self.fc_advantages(x))
            A = self.advantages(a)

            # Calculate Q
            Q = V + A - torch.mean(A, dim=-1, keepdim=True)
        
        else:
            Q = x

        return Q
            
    
    def predict(self, x):
        '''
        Runs without gradients and takes and returns numpy arrays
        '''
        x = torch.from_numpy(x).float().to(self.device)
        self.eval()
        with torch.no_grad():
            out = self.forward(x).cpu().numpy()
        self.train()
        return out
    
    def reset_noise(self):
        for layer in self.fully_connected:
            if isinstance(layer, NoisyLinear):
                layer.reset_noise()

## Q-Function
Uses the network to provide a function that can be used as Q-function

In [30]:
class QFunction(Feedforward):
    def __init__(self, state_dim, action_dim, hidden_sizes, learning_rate, enable_dueling_dqn=False, use_noise=False, device = 'cpu'):
        super().__init__(input_size=state_dim,
                         hidden_sizes=hidden_sizes,
                         output_size=action_dim,
                         enable_dueling_dqn=enable_dueling_dqn,
                         use_noise=use_noise,
                         device=device)
        self.device = device
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.loss = torch.nn.SmoothL1Loss(reduction='none')

    def fit(self, states, actions, targets, weights):
        self.train()
        self.optimizer.zero_grad()

        # Forward pass
        acts = torch.from_numpy(actions).to(self.device)
        pred = self.Q_value(torch.from_numpy(states).float().to(self.device), acts)
        if weights is None:
            weights = torch.ones_like(pred)
        weights = weights.to(self.device)
        loss = (weights * self.loss(pred, torch.from_numpy(targets).float().to(self.device))).mean()

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.parameters(), 100)
        self.optimizer.step()
        td_error = pred.detach().cpu().numpy() - targets
        return loss.item(), td_error
    
    def Q_value(self, states, actions):
        return self.forward(states).gather(1, actions[:, None])
    
    def maxQ(self, states):
        return np.max(self.predict(states), axis=-1, keepdims=True)
    
    def doubleQ(self, state, action):
        x = torch.from_numpy(state).float().to(self.device)
        action = torch.from_numpy(action).to(self.device)
        self.eval()
        with torch.no_grad():
            Q = self.Q_value(x, action)
        return Q.cpu().numpy()

    
    def greedy_action(self, states):
        return np.argmax(self.predict(states), axis=-1)

## Agent
uses target net, epsilon-decay, double DQN

In [35]:
class DQNAgent():
    def __init__(self, observation_space, action_space, config):
        self.observation_space = observation_space
        self.action_space = action_space
        self.config = config
        
        self.eps = self.config['eps']
        self.eps_decay = self.config['eps_decay']
        self.eps_min = self.config['eps_min']
        self.iter_fit = self.config['iter_fit']
        self.use_noise = self.config['enable_noisy_nets']

        if self.config["enable_prioritized_replay"]:
            self.buffer = memory.PrioritizedMemory(max_size=self.config['buffer_size'], 
                                                   alpha=self.config['alpha'],
                                                   beta=self.config['beta'])
        else:
            self.buffer = memory.Memory(self.config['buffer_size'])

        self.Q = QFunction(state_dim=self.observation_space.shape[0],
                           action_dim=self.action_space.n,
                           hidden_sizes=self.config['hidden_sizes'],
                           learning_rate=self.config['learning_rate'],
                           enable_dueling_dqn=self.config['enable_dueling_dqn'],
                           use_noise=self.config['enable_noisy_nets'],
                           device=self.config['device'])
        self.Q_target = QFunction(state_dim=self.observation_space.shape[0], 
                                  action_dim=self.action_space.n,
                                  hidden_sizes=self.config['hidden_sizes'],
                                  learning_rate=0,
                                  enable_dueling_dqn=self.config['enable_dueling_dqn'],
                                  use_noise=self.config['enable_noisy_nets'],
                                  device=self.config['device'])
        
        self.update_target()
        self.train_iter = 0

    def update_target(self):
        self.Q_target.load_state_dict(self.Q.state_dict())
    
    def act(self, state, eps=None):
        if self.use_noise:
            return self.Q.greedy_action(state)
        else: 
            if eps is None:
                eps = self.eps
            if np.random.rand() < eps:
                return self.action_space.sample()
            else:
                return self.Q.greedy_action(state)
        
    # def act_safe(self, state, eps=None):
    #     return __env.discrete_to_continous_action(self.act(state, eps))
        
    def store_transition(self, transition):
        self.buffer.add_transition(transition)

    def train(self):
        losses = []
        self.train_iter += 1
        # Update target network if needed
        if self.config["use_target_net"] and self.train_iter % self.config["update_target_every"] == 0:
            self.update_target()

        # train with given buffer for (iter_fit) mini-batches
        for _ in range(self.iter_fit):
            # smaple from buffer
            if self.config["enable_prioritized_replay"]:
                data, indices, weights = self.buffer.sample(batch=self.config['batch_size'])
            else:
                data = self.buffer.sample(batch=self.config['batch_size'])
                weights = None

            # extract batches of every element -> [32, 1] for rewards
            s = np.stack(data[:, 0])
            a = np.stack(data[:, 1])
            rew = np.stack(data[:, 2])[:, None]
            s_prime = np.stack(data[:, 3])
            done = np.stack(data[:, 4])[:, None]

            if self.config["use_target_net"]:
                if self.config["enable_double_dqn"]:
                    best_action = self.Q.greedy_action(s_prime)
                    v_prime = self.Q_target.doubleQ(s_prime, best_action)
                else:
                    v_prime = self.Q_target.maxQ(s_prime)
            else:
                v_prime = self.Q.maxQ(s_prime)
            gamma = self.config["discount"]
            td_target = rew + gamma * (1 - done) * v_prime

            fit_loss, td_error = self.Q.fit(s, a, td_target, weights)
            if self.config["enable_prioritized_replay"]:
                self.buffer.update_priorities(indices, td_error)
            losses.append(fit_loss)
            
            self.Q.reset_noise()
            self.Q_target.reset_noise()

        # Decay epsilon
        self.eps = max(self.eps_min, self.eps * self.eps_decay)
        
        return losses

## Prepare Environment

In [38]:
# Set up environment
env = h_env.HockeyEnv()

ac_space = env.discrete_action_space
o_space = env.observation_space
print(ac_space)
print(o_space)
print(list(zip(env.observation_space.low, env.observation_space.high)))

Discrete(7)
Box(-inf, inf, (18,), float32)
[(np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf)), (np.float32(-inf), np.float32(inf))]


In [41]:
def discrete_to_continous_action(discrete_action):
    """
    x, y, angle, shoot ()

    Action 0: do nothing
    Action 1: x = 1
    Action 2: x = 1, y = 1
    Action 3: x = 1, y = 1, angle = 1
    Action 4: x = 1, y = 1, angle = -1
    Action 5: x = 1, y = -1
    Action 6: x = 1, y = -1, angle = 1
    Action 7: x = 1, y = -1, angle = -1
    Action 8: x = -1
    Action 9: x = -1, y = 1
    Action 10: x = -1, y = 1, angle = 1
    Action 11: x = -1, y = 1, angle = -1
    Action 12: x = -1, y = -1
    Action 13: x = -1, y = -1, angle = 1
    Action 14: x = -1, y = -1, angle = -1
    Action 15: shoot
    """
    x = (discrete_action < 8 and discrete_action > 1) * 1.0 + (discrete_action > 7 and discrete_action < 15) * -1.0
    y = (any([discrete_action == n for n in [2,3,4,9,10,11]]) * 1.0 + any([discrete_action == n for n in [5,6,7,12,13,14]]) * -1.0)
    angle = (any([discrete_action == n for n in [3,6,10,13]]) * 1.0 + any([discrete_action == n for n in [4,7,11,14]]) * -1.0)

    action_cont = [x, y, angle, discrete_action == 15]
    return action_cont

In [200]:
ac_space = gym.spaces.Discrete(16)
print(ac_space.n)
ac_space = env.discrete_action_space

16


## Training Parameters

In [None]:
config = {
    "hidden_sizes": [1024, 1024, 1024],
    "learning_rate": 1e-4,
    "batch_size": 128,
    "iter_fit": 32,
    "max_episodes": 50000,
    "max_steps": 500,
    "buffer_size": int(1e5),
    "discount": 0.95,
    "use_target_net": True,
    "update_target_every": 20,
    "enable_dueling_dqn": True,
    "enable_double_dqn": True,
    "enable_prioritized_replay": True,
    "alpha": 0.6,
    "beta": 0.4,
    "enable_noisy_nets": False,
    "eps": 0.7,
    "eps_decay": 0.9995,
    "eps_min": 0.05,
    "weak_percent": 0.2,
    "self_percent": 0.2,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "eval_every": 200,
    "eval_episodes": 100,
    "print_every": 20
}
env_name = "Hockey"
run_name = f"{env_name}_Rainbow_run{7}"

opponent_weak = h_env.BasicOpponent(weak=True)
opponent_strong = h_env.BasicOpponent(weak=False)

agent = DQNAgent(o_space, ac_space, config)
writer = SummaryWriter(log_dir=f"logs/{run_name}")

checkpoint_path = "models/checkpoints/" + run_name + "_cp"
checkpoint_idx = 0
checkpoint_count = 3

In [204]:
stats = []
losses = []

last_eval = float("-inf")

for i in range(config["max_episodes"]):
    # First explore the environment for one whole episode
    total_reward = 0
    self_play = False

    state, _info = env.reset()
    state_agent2 = env.obs_agent_two()

    if np.random.rand() < config["weak_percent"]:
        opponent = opponent_weak
    elif np.random.rand() < config["self_percent"]:
        opponent = agent
        self_play = True
    else:
        opponent = opponent_strong

    for t in range(config["max_steps"]):
        done = False
        # Agent chooses action with epsilon-greedy policy
        a_discrete = agent.act(state)
        a = discrete_to_continous_action(a_discrete)

        # if np.random.rand() < opponent_percent:
        #     a_agent2 = opponent_weak.act(state_agent2)
        # elif np.random.rand() < self_percent:
        #     a_agent2 = env.discrete_to_continous_action(agent.act(state_agent2))
        # else:
        #     a_agent2 = opponent_strong.act(state)

        if self_play:
            a_agent2 = discrete_to_continous_action(agent.act(state_agent2))
        else:
            a_agent2 = opponent.act(state_agent2)

        (state_new, reward, done, trunc, _info) = env.step(np.hstack([a, a_agent2]))
        state_agent2 = env.obs_agent_two()

        total_reward += reward
        agent.store_transition((state, a_discrete, reward, state_new, done))
        state = state_new
        if done:
            break

    # Train agent for (iter_fit) iterations
    episode_losses = agent.train()
    losses.extend(episode_losses)
    stats.append(total_reward)

    # Write to tensorboard
    writer.add_scalar("training/loss", np.mean(episode_losses), i)
    writer.add_scalar("training/reward", total_reward, i)
    writer.add_scalar("training/epsilon", agent.eps, i)
    writer.add_scalar("training/steps", t+1, i)

    # Print if necessray
    if i % config["print_every"] == 0:
        print("{}: Done after {} steps. Reward: {}".format(i, t+1, total_reward))

    # Evaluate agent
    if i % config["eval_every"] == 0:
        start_ts = time.time()
        total_reward = 0
        wins = 0
        for _ in range(config["eval_episodes"]):
            state, _info = env.reset()
            for t in range(config["max_steps"]):
                a = agent.act(state, eps=0)
                a = env.discrete_to_continous_action(a)
                a_agent2 = opponent_strong.act(env.obs_agent_two())
                (state, reward, done, trunc, _info) = env.step(np.hstack([a, a_agent2]))
                total_reward += reward
                if done:
                    break
            if _info["winner"] == 1:
                wins += 1
        total_reward /= config["eval_episodes"]

        wins_weak = 0
        for _ in range(config["eval_episodes"]):
            state, _info = env.reset()
            for t in range(config["max_steps"]):
                a = agent.act(state, eps=0)
                a = env.discrete_to_continous_action(a)
                a_agent2 = opponent_weak.act(env.obs_agent_two())
                (state, reward, done, trunc, _info) = env.step(np.hstack([a, a_agent2]))
                if done:
                    break
            if _info["winner"] == 1:
                wins_weak += 1


        writer.add_scalar("training/eval", total_reward, i)
        writer.add_scalar("training/eval_wins", (wins + wins_weak), i)
        writer.add_scalar("training/eval_winrate", wins/(float(config["eval_episodes"])), i)
        writer.add_scalar("training/eval_winrate_weak", wins_weak/(float(config["eval_episodes"])), i)
        print("Evaluation after {} episodes: {} took {}s".format(i, total_reward, (time.time()-start_ts)))

        total_reward = (wins_weak + wins*2.0) / (3.0 * config["eval_episodes"])
        if total_reward > last_eval:
            percent = (total_reward - last_eval) / last_eval * 100
            print(f"New best model with {percent:.2f}% improvement... Saving model as checkpoint")
            _cp_path = checkpoint_path + str(checkpoint_idx) + ".pt"
            torch.save(agent.Q.state_dict(), _cp_path)
            print(f"Saved model to {_cp_path}")
            checkpoint_idx = (checkpoint_idx + 1) % checkpoint_count
            last_eval = total_reward

    # Save model every 1000 episodes
    if i % 1000 == 0:
        torch.save(agent.Q.state_dict(), checkpoint_path + f"{i}_eps" + ".pt")

0: Done after 131 steps. Reward: 7.522185312412419
Evaluation after 0 episodes: -24.518816899995723 took 34.90857124328613s
New best model with nan% improvement... Saving model as checkpoint
Saved model to models/checkpoints/Hockey_Rainbow_run7_cp0.pt
20: Done after 251 steps. Reward: -1.2577178637926212
40: Done after 251 steps. Reward: -40.420782582031066
60: Done after 251 steps. Reward: -20.848712847330248
80: Done after 14 steps. Reward: 10.0
100: Done after 251 steps. Reward: -5.131349473553475
120: Done after 43 steps. Reward: 9.061815536126204
140: Done after 55 steps. Reward: -13.842254447436938
160: Done after 48 steps. Reward: -12.073959571127649
180: Done after 251 steps. Reward: -23.537512398702116
200: Done after 251 steps. Reward: -31.034378010863758
Evaluation after 200 episodes: -17.903289053447658 took 34.92914366722107s
New best model with 270.00% improvement... Saving model as checkpoint
Saved model to models/checkpoints/Hockey_Rainbow_run7_cp1.pt
220: Done after 25

KeyboardInterrupt: 