### MODELS

In [51]:
import os
import random
import time
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tyro
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter

from goal_task import GoalTask

@dataclass
class Args:
    exp_name: str = 'MoA_ESN'
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic=False`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    track: bool = False
    """if toggled, this experiment will be tracked with Weights and Biases"""
    wandb_project_name: str = "MoA_ESN_wandb"
    """the wandb's project name"""
    wandb_entity: str = None
    """the entity (team) of wandb's project"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    save_model: bool = False
    """whether to save model into the `runs/{run_name}` folder"""
    upload_model: bool = False
    """whether to upload the saved model to huggingface"""
    hf_entity: str = ""
    """the user or org name of the model repository from the Hugging Face Hub"""

    # Algorithm specific arguments
    env_id: str = "GoalTask"
    """the id of the environment"""
    total_timesteps: int = 40000
    """total timesteps of the experiments"""
    learning_rate: float = 3e-4
    """the learning rate of the optimizer"""
    buffer_size: int = 64
    """the replay memory buffer size"""
    gamma: float = 0.95
    """the discount factor gamma"""
    tau: float = 0.05
    """target smoothing coefficient (default: 0.005)"""
    batch_size: int = 64
    """the batch size of sample from the reply memory"""
    policy_noise: float = 0.2
    """the scale of policy noise"""
    exploration_noise: float = 0.1
    """the scale of exploration noise"""
    learning_starts: int = 0
    """timestep to start learning"""
    policy_frequency: int = 2
    """the frequency of training policy (delayed)"""
    noise_clip: float = 0.5
    """noise clip parameter of the Target Policy Smoothing Regularization"""


def make_env(env_id, seed, idx, capture_video, run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = GoalTask()
        env.action_space.seed(seed)
        return env

    return thunk


# ALGO LOGIC: initialize agent here:
class QNetwork(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.observation_space.shape).prod() + np.prod(env.action_space.shape), 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x, a):
        x = torch.cat([x, a], 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class Actor(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.observation_space.shape).prod(), 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc_mu = nn.Linear(32, np.prod(env.action_space.shape))
        # action rescaling
        self.register_buffer(
            "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32)
        )
        self.register_buffer(
            "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32)
        )

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.tanh(self.fc_mu(x))
        return x * self.action_scale + self.action_bias


class ESNLayer(nn.Module):
    def __init__(self, env, reservoir_size, spectral_radius=0.95, g=2.2, sparsity=0.1, device='cuda'):
        super(ESNLayer, self).__init__()
        
        self.device = device
        self.input_size = np.array(env.observation_space.shape).prod()
        self.reservoir_size = reservoir_size
        self.spectral_radius = spectral_radius
        self.g = g
        
        # Input weights: weights are sampled from a uniform distribution over [-0.5,0.5]
        self.W_in = torch.rand(reservoir_size, self.input_size, device=self.device) - 0.5
        
        # Reservoir weights: sparse random matrix
        self.W = torch.randn(reservoir_size, reservoir_size, device=self.device)
        self.W[torch.rand(reservoir_size, reservoir_size) > sparsity] = 0.0  # Sparsify
        
        # Scale the reservoir weights to have the desired spectral radius
        _, eigenvalues = torch.linalg.eig(self.W)
        max_eigenvalue = torch.max(torch.abs(eigenvalues))
        self.W *= self.spectral_radius / max_eigenvalue
        
        # The output weights, to be learned during training
        self.W_out = nn.Linear((reservoir_size+self.input_size), 1, device=self.device)
        
    
    def forward(self, u, hidden_state):
        # Compute the new reservoir state with leaky integration
        with torch.no_grad():
            print(u.shape, hidden_state.shape)
            next_hidden_state = torch.matmul(self.W_in, u.T) + self.g * torch.matmul(self.W, hidden_state.T)
            next_hidden_state = torch.tanh(next_hidden_state)
        self.next_hidden_state = next_hidden_state
        self.hidden_state = hidden_state
        # Append output prediction (linear transformation of the reservoir state)
        output = torch.tanh(self.W_out(torch.concat([next_hidden_state.T, u.T])))

        # Stack outputs over time and return
        return output
    
    def get_hiddenstate(self):
        return self.hidden_state, self.next_hidden_state
    
    def reset_state(self):
        """Reset the hidden state of the ESN."""
        self.hiddenstate = torch.zeros(self.reservoir_size)



class ESNReplayBuffer:
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []
    
    def add(self, experience):
        """Add experience to the buffer"""
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)  # Remove the oldest experience if full
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        """Sample a batch of experiences from the buffer"""
        batch = random.sample(self.buffer, batch_size)
        return map(np.array, zip(*batch))


### INITIALIZATION

In [53]:
import stable_baselines3 as sb3

if sb3.__version__ < "2.0":
    raise ValueError(
            """Ongoing migration: run the following command to install the new dependencies:
poetry run pip install "stable_baselines3==2.0.0a1"
"""
        )

args = Args()
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
    import wandb

    wandb.init(
        project=args.wandb_project_name,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=vars(args),
        name=run_name,
        monitor_gym=True,
        save_code=True,
    )
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)

# TRY NOT TO MODIFY: seeding
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

In [54]:
# env setup
envs = make_env(args.env_id, args.seed, 0, args.capture_video, run_name)()


actor =  ESNLayer(envs, 256).to(device)
qf1 = QNetwork(envs).to(device)
qf2 = QNetwork(envs).to(device)
qf1_target = QNetwork(envs).to(device)
qf2_target = QNetwork(envs).to(device)
target_actor = ESNLayer(envs, 256).to(device)
target_actor.load_state_dict(actor.state_dict())
qf1_target.load_state_dict(qf1.state_dict())
qf2_target.load_state_dict(qf2.state_dict())
q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate)
actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate)

envs.observation_space.dtype = np.float32



rb = ESNReplayBuffer(args.buffer_size)
start_time = time.time()

In [55]:
obs, _ = envs.reset(seed=args.seed)
actions = actor(torch.Tensor(obs).to(device), torch.Tensor(np.zeros((256))).to(device))

torch.Size([5]) torch.Size([256])


In [47]:
1 - np.linalg.norm(np.array([2,18]) - np.array([10,10]))/np.linalg.norm(np.array([0,0])-np.array([20,20]))

np.float64(0.6)

### TRAIN

In [None]:
# TRY NOT TO MODIFY: start the game
obs, _ = envs.reset(seed=args.seed)
for global_step in range(args.total_timesteps):
    # ALGO LOGIC: put action logic here
    if global_step < args.learning_starts:
        actions = envs.action_space.sample()
    else:
        with torch.no_grad():
            actions = actor(torch.Tensor(obs).to(device))
            actions += torch.normal(0, actor.action_scale * args.exploration_noise)
            actions = actions.cpu().numpy().clip(envs.action_space.low, envs.action_space.high)

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, rewards, terminations, truncations, infos = envs.step(actions)

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    if "final_info" in infos or terminations:
        print(f"global_step={global_step}, episodic_return={infos['episode']['r']}")
        writer.add_scalar("charts/episodic_return", infos["episode"]["r"], global_step)
        writer.add_scalar("charts/episodic_length", infos["episode"]["l"], global_step)     

    # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation`
    real_next_obs = next_obs.copy()
    if truncations:
        real_next_obs = infos["final_observation"]
    rb.add(obs, real_next_obs, actions, rewards, terminations, infos)

    # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
    obs = next_obs

    # ALGO LOGIC: training.
    if global_step > args.learning_starts:
        data = rb.sample(args.batch_size)
        with torch.no_grad():
            next_state_actions = target_actor(data.next_observations).clamp(
                envs.action_space.low[0], envs.action_space.high[0]
            )
            qf1_next_target = qf1_target(data.next_observations, next_state_actions)
            qf2_next_target = qf2_target(data.next_observations, next_state_actions)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
            next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1)

        qf1_a_values = qf1(data.observations, data.actions).view(-1)
        qf2_a_values = qf2(data.observations, data.actions).view(-1)
        qf1_loss = F.mse_loss(qf1_a_values, next_q_value)
        qf2_loss = F.mse_loss(qf2_a_values, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        # optimize the model
        q_optimizer.zero_grad()
        qf_loss.backward()
        q_optimizer.step()

        if global_step % args.policy_frequency == 0:
            actor_loss = -qf1(data.observations, actor(data.observations)).mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # update the target network
            for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
            for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
                target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
            for param, target_param in zip(qf2.parameters(), qf2_target.parameters()):
                target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)

        if global_step % 100 == 0:
            writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
            writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step)
            writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
            writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
            writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
            writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)