In [1]:
import torch
import torch.nn as nn
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from gymnasium.spaces import Box
import os
import glob
import shutil
import wandb
import pandas as pd
from gymnasium.wrappers import RecordVideo
import matplotlib.pyplot as plt
import pickle

import hockey.hockey_env as hockey
from memory import Memory
from feedforward import Feedforward
from per_memory import PERMemory

# cuDNN settings: enable benchmark mode and permit TF32 kernels.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
# Permit TF32 for CUDA matrix multiplications as well.
torch.backends.cuda.matmul.allow_tf32 = True

# Restrict PyTorch's CPU intra-op parallelism to a single thread.
torch.set_num_threads(1)

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class UnsupportedSpace(Exception):
    """Signals that an observation or action space cannot be handled."""

    def __init__(self, message="Unsupported Space"):
        # Hand the text to the base class so it shows up in str() and args.
        Exception.__init__(self, message)

class QFunction(nn.Module):
    """State-action value estimator Q(s, a) built on a `Feedforward` network.

    The network takes the concatenation of an observation and an action and
    outputs a single scalar. The module owns its Adam optimizer and an
    element-wise Smooth-L1 loss, and is moved to the module-level `device`
    on construction.

    Args:
        observation_dim: Size of one observation vector.
        action_dim: Size of one action vector.
        hidden_sizes: Hidden layer widths passed to `Feedforward`.
            Defaults to ``[100, 100]`` when omitted.
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__(self, observation_dim, action_dim,
                 hidden_sizes=None,
                 learning_rate=0.0002):
        super().__init__()
        # A list literal as default would be one object shared by every
        # instance; build a fresh list per instance instead.
        if hidden_sizes is None:
            hidden_sizes = [100, 100]
        self.net = Feedforward(
            input_size=observation_dim + action_dim,
            hidden_sizes=list(hidden_sizes),
            output_size=1
        )
        self.optimizer = torch.optim.Adam(
            self.parameters(),
            lr=learning_rate,
            eps=1e-6
        )
        # reduction='none' keeps one loss value per element so that optional
        # per-sample weights can be applied in `fit` before averaging.
        self.loss_fn = nn.SmoothL1Loss(reduction='none')
        self.to(device)

    def forward(self, x):
        """Run the underlying network on an already concatenated input."""
        return self.net(x)

    def fit(self, observations, actions, targets, weights=None):
        """Perform one optimizer step towards `targets`.

        Args:
            observations: Batch of observations (tensor).
            actions: Batch of actions (tensor), concatenated with the
                observations along the last dimension.
            targets: Regression targets for Q(observations, actions).
            weights: Optional per-sample weights multiplied into the
                element-wise loss before the mean is taken.
                NOTE(review): must broadcast against the per-element loss;
                confirm the shape delivered by the replay buffer.

        Returns:
            Tuple ``(loss, td_error)``: the scalar loss as a Python float and
            ``pred - targets`` as a NumPy array (predictions from before the
            optimizer step).
        """
        self.train()
        # Targets are fixed regression labels: cut them from the autograd
        # graph so the step never back-propagates into whatever produced
        # them (target networks, or this critic itself).
        targets = targets.detach()
        self.optimizer.zero_grad()
        pred = self.Q_value(observations, actions)
        loss_unreduced = self.loss_fn(pred, targets)
        if weights is not None:
            loss = (loss_unreduced * weights).mean()
        else:
            loss = loss_unreduced.mean()
        loss.backward()
        self.optimizer.step()
        td_error = (pred - targets).detach()
        return loss.item(), td_error.cpu().numpy()

    def Q_value(self, observations, actions):
        """Return Q for the given observation/action batch."""
        return self.forward(torch.cat([observations, actions], dim=-1))

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck style).

    Each call pulls the previous sample towards zero by ``theta * dt`` and
    adds a standard-normal draw scaled by ``sqrt(dt)``.
    """

    def __init__(self, shape, theta=0.15, dt=1e-2):
        self._shape, self._theta, self._dt = shape, theta, dt
        # Start from an all-zero noise state.
        self.reset()

    def reset(self):
        """Set the internal noise state back to zeros."""
        self.noise_prev = np.zeros(self._shape)

    def __call__(self):
        """Advance the process by one step and return the new sample."""
        previous = self.noise_prev
        pull_back = self._theta * (-previous) * self._dt
        random_kick = np.sqrt(self._dt) * np.random.normal(size=self._shape)
        # The returned array is also kept as the state for the next call.
        self.noise_prev = previous + pull_back + random_kick
        return self.noise_prev

class DDPGAgent:
    """DDPG agent with an actor (policy) network and a critic (Q) network.

    The agent keeps online and target copies of both networks, a replay
    buffer (`PERMemory` when ``use_per`` is true, otherwise `Memory`) and
    Ornstein-Uhlenbeck action noise for exploration.

    Only the first four components of `action_space` are controlled by this
    agent; the policy outputs four values in [-1, 1] (tanh) which `act`
    rescales to the bounds of those four components.

    Args:
        observation_space: A `spaces.Box` describing observations.
        action_space: A `spaces.Box` describing actions.
        **userconfig: Overrides for the defaults in ``self._config``
            (``eps``, ``discount``, ``buffer_size``, ``batch_size``,
            ``learning_rate_actor``, ``learning_rate_critic``,
            ``hidden_sizes_actor``, ``hidden_sizes_critic``,
            ``update_target_every``, ``use_target_net``, ``tau``,
            ``use_per``). ``total_episodes`` and ``seed`` are stored in the
            config but not read inside this class.

    Raises:
        UnsupportedSpace: If either space is not a `spaces.Box`.
    """

    def __init__(self, observation_space, action_space, **userconfig):
        if not isinstance(observation_space, spaces.Box):
            raise UnsupportedSpace(f'Observation space {observation_space} incompatible.')
        if not isinstance(action_space, spaces.Box):
            raise UnsupportedSpace(f'Action space {action_space} incompatible.')

        self.device = device
        self._obs_dim = observation_space.shape[0]
        # The agent drives only the first four action components.
        self._action_dim = 4
        self._action_space = Box(
            low=action_space.low[:4],
            high=action_space.high[:4],
            dtype=np.float32
        )
        self._config = {
            "eps": 0.05,
            "discount": 0.95,
            "buffer_size": int(1e6),
            "batch_size": 512,
            "learning_rate_actor": 0.0003,
            "learning_rate_critic": 0.0003,
            "hidden_sizes_actor": [256, 256],
            "hidden_sizes_critic": [256, 256],
            "update_target_every": 100,
            "use_target_net": True,
            "total_episodes": 50000,
            "seed": 0,
            "tau": 0.005,
            "use_per": True
        }
        self._config.update(userconfig)

        self.eps = self._config["eps"]
        self.discount = self._config["discount"]
        self.batch_size = self._config["batch_size"]
        self.buffer_size = self._config["buffer_size"]
        self.tau = self._config["tau"]
        self.use_target_net = self._config["use_target_net"]
        self.update_target_every = self._config["update_target_every"]
        self.train_iter = 0
        self.use_per = self._config["use_per"]

        # Both buffer types take the same constructor arguments.
        buffer_cls = PERMemory if self.use_per else Memory
        self.buffer = buffer_cls(
            obs_dim=self._obs_dim,
            act_dim=self._action_dim,
            max_size=self.buffer_size,
            device=self.device
        )

        self.Q = QFunction(
            observation_dim=self._obs_dim,
            action_dim=self._action_dim,
            hidden_sizes=self._config["hidden_sizes_critic"],
            learning_rate=self._config["learning_rate_critic"]
        )
        # The target critic is never stepped by its own optimizer (lr=0);
        # it only receives weights through `_copy_nets` / `soft_update`.
        self.Q_target = QFunction(
            observation_dim=self._obs_dim,
            action_dim=self._action_dim,
            hidden_sizes=self._config["hidden_sizes_critic"],
            learning_rate=0
        )

        self.policy = Feedforward(
            input_size=self._obs_dim,
            hidden_sizes=self._config["hidden_sizes_actor"],
            output_size=self._action_dim,
            activation_fun=nn.ReLU(),
            output_activation=nn.Tanh()
        )
        self.policy_target = Feedforward(
            input_size=self._obs_dim,
            hidden_sizes=self._config["hidden_sizes_actor"],
            output_size=self._action_dim,
            activation_fun=nn.ReLU(),
            output_activation=nn.Tanh()
        )
        self.policy.to(self.device)
        self.policy_target.to(self.device)

        self._copy_nets()
        # Actor optimizer. The learning rate comes from the config so that
        # the `learning_rate_actor` option actually takes effect.
        self.optimizer = torch.optim.Adam(
            self.policy.parameters(),
            lr=self._config["learning_rate_actor"],
            eps=1e-3
        )
        self.action_noise = OUNoise((self._action_dim,), theta=0.3, dt=0.02)

    def _copy_nets(self):
        """Hard-copy the online critic and actor weights into the targets."""
        self.Q_target.load_state_dict(self.Q.state_dict())
        self.policy_target.load_state_dict(self.policy.state_dict())

    def soft_update(self):
        """Move the target weights a fraction ``tau`` towards the online weights."""
        for target_param, param in zip(self.Q_target.parameters(), self.Q.parameters()):
            target_param.data.copy_(
                self.tau * param.data + (1.0 - self.tau) * target_param.data
            )
        for target_param, param in zip(self.policy_target.parameters(), self.policy.parameters()):
            target_param.data.copy_(
                self.tau * param.data + (1.0 - self.tau) * target_param.data
            )

    def act(self, observation, eps=None):
        """Return an action for a single observation.

        Args:
            observation: One observation (array-like of length obs_dim).
            eps: Scale of the exploration noise; falls back to ``self.eps``
                when None. With ``eps=0`` the action is deterministic.

        Returns:
            NumPy array with four entries, rescaled from [-1, 1] to the
            bounds of the controlled action components.
        """
        if eps is None:
            eps = self.eps
        obs_t = torch.tensor(observation, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            action_t = self.policy(obs_t)
        action_t = action_t.squeeze(0).cpu().numpy()
        # The noise process is advanced on every call, even when eps is 0.
        # Clip back into the tanh range so that added noise cannot push the
        # rescaled action outside the action-space bounds.
        noisy_action = np.clip(action_t + eps * self.action_noise(), -1.0, 1.0)
        scaled_action = (
            self._action_space.low
            + (noisy_action + 1.0)/2.0
            * (self._action_space.high - self._action_space.low)
        )
        return scaled_action

    def store_transition(self, transition):
        """Append one transition to the replay buffer."""
        self.buffer.add_transition(transition)

    def reset(self):
        """Reset the exploration noise state (e.g. at episode start)."""
        self.action_noise.reset()

    def train(self, iter_fit=32):
        """Run `iter_fit` critic/actor updates on sampled batches.

        Args:
            iter_fit: Number of gradient steps to perform.

        Returns:
            List of ``(critic_loss, actor_loss)`` float tuples, one per step;
            an empty list when the buffer holds fewer than ``batch_size``
            transitions.
        """
        if self.buffer.get_size() < self.batch_size:
            return []
        losses = []
        self.train_iter += 1
        # NOTE(review): targets are hard-copied every `update_target_every`
        # calls AND soft-updated after every call; confirm both are intended.
        if self.use_target_net and self.train_iter % self.update_target_every == 0:
            self._copy_nets()

        for _ in range(iter_fit):
            if self.use_per:
                (s, a, rew, s_prime, done, weights, idx) = self.buffer.sample(batch_size=self.batch_size)
            else:
                (s, a, rew, s_prime, done) = self.buffer.sample(batch_size=self.batch_size)
                weights, idx = None, None

            # The TD target is a constant for the critic regression: build it
            # without autograd so no gradient flows through the bootstrap path.
            with torch.no_grad():
                if self.use_target_net:
                    q_prime = self.Q_target.Q_value(s_prime, self.policy_target(s_prime))
                else:
                    q_prime = self.Q.Q_value(s_prime, self.policy(s_prime))
                td_target = rew + self.discount * (1.0 - done) * q_prime

            loss_val, td_err = self.Q.fit(s, a, td_target, weights=weights)

            # Actor step: maximise the critic's value of the policy's action.
            # Gradients also land on the critic's parameters here; they are
            # cleared by the zero_grad at the start of the next `Q.fit`.
            self.optimizer.zero_grad()
            q_val = self.Q.Q_value(s, self.policy(s))
            actor_loss = -torch.mean(q_val if weights is None else q_val * weights)
            actor_loss.backward()
            self.optimizer.step()

            if self.use_per and idx is not None:
                # Feed the TD errors back to the buffer as new priorities.
                td_err = td_err.squeeze(-1)
                self.buffer.update_priorities(idx, td_err)

            losses.append((loss_val, actor_loss.item()))

        self.soft_update()
        return losses

    def save(self, path):
        """Write network weights and optimizer states to `path`."""
        torch.save({
            "actor": self.policy.state_dict(),
            "critic": self.Q.state_dict(),
            "target_actor": self.policy_target.state_dict(),
            "target_critic": self.Q_target.state_dict(),
            "actor_optimizer": self.optimizer.state_dict(),
            # Stored so that training can resume with the critic's Adam state.
            "critic_optimizer": self.Q.optimizer.state_dict()
        }, path)

    def load(self, path):
        """Restore the agent in place from a checkpoint written by `save`.

        Returns None. Checkpoints without a ``critic_optimizer`` entry
        (written by earlier versions) are still accepted.
        """
        # SECURITY: torch.load unpickles the file; only load checkpoints from
        # a trusted source.
        data = torch.load(path, map_location=self.device)
        self.policy.load_state_dict(data["actor"])
        self.Q.load_state_dict(data["critic"])
        self.policy_target.load_state_dict(data["target_actor"])
        self.Q_target.load_state_dict(data["target_critic"])
        self.optimizer.load_state_dict(data["actor_optimizer"])
        if "critic_optimizer" in data:
            self.Q.optimizer.load_state_dict(data["critic_optimizer"])

In [None]:
from run_client import DDQNAgent

# Number of finished matches the win rate is measured over.
NUM_MATCHES = 10000

# Initialize environment and agent
env = hockey.HockeyEnv(mode="NORMAL")
ddpg = DDPGAgent(env.observation_space, env.action_space, eps=0.0, learning_rate_actor=0.00001, update_target_every=10)
directory = "./"
print(os.listdir(directory))

# Restore the agent's state from the checkpoint. DDPGAgent.load works in
# place and returns None, so there is no result worth binding.
ddpg.load("checkpoint_ep500001.pth")
ddpg.reset()

# Scripted opponent controlling the second player.
player2 = hockey.BasicOpponent(weak=True)

# Build the DDQN agent once instead of on every environment step.
ddqn = DDQNAgent()

# Evaluation loop: play until NUM_MATCHES matches have finished.
ob, _ = env.reset()
done = False
win = 0
matches = 0
while matches < NUM_MATCHES:
    # NOTE(review): the DDQN action is computed but never used below;
    # confirm whether it is still needed.
    a_ddqn = ddqn.get_step(ob)

    # Agent action without exploration noise.
    a = ddpg.act(ob, eps=0.0)
    # Opponent's action from its own view of the game.
    a2 = player2.act(env.obs_agent_two())
    # Step the environment with the combined actions.
    ob, reward, done, trunc, info = env.step(np.hstack([a, a2]))
    if done or trunc:
        matches += 1
        # Count only positive `winner` values as wins.
        # NOTE(review): presumably 1 = win, 0 = draw, -1 = loss; confirm
        # against hockey_env.
        win += max(0, info['winner'])
        # Start the next match from its real initial observation; the
        # observation returned by the final step belongs to the finished match.
        ob, _ = env.reset()
        ddpg.reset()

# Win rate over all finished matches (printed twice, as before: once against
# the running counter and once against the configured match count).
print(win / matches)
print(win / NUM_MATCHES)






['checkpoint_ep500001.pth', 'feedforward.py', 'Hockey_DuelingDQN_train_both_run2_cp3.pt', 'hockey_videos', 'memory.py', 'per_memory.py', 'plots1', 'plotting.ipynb', 'run_client.py', 'Textdokument (neu).txt', 'train_lr_experiment.py', '__pycache__']
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4

KeyboardInterrupt: 