In [1]:
# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_continuous_actionpy
import os
import random
import time
from dataclasses import dataclass
from typing import Optional

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tyro
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter

In [2]:
@dataclass
class Args:
    """Configuration for the PPO continuous-action experiment.

    The string literal under each field is its help text. Fields in the last
    group default to 0 and are overwritten from the other fields at runtime.
    """

    exp_name: str = "ppo_continuous_action"
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """the value assigned to `torch.backends.cudnn.deterministic`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    track: bool = False
    """if toggled, this experiment will be tracked with Weights and Biases"""
    wandb_project_name: str = "cleanRL"
    """the wandb's project name"""
    wandb_entity: Optional[str] = None
    """the entity (team) of wandb's project"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    save_model: bool = False
    """whether to save model into the `runs/{run_name}` folder"""
    upload_model: bool = False
    """whether to upload the saved model to huggingface"""
    hf_entity: str = ""
    """the user or org name of the model repository from the Hugging Face Hub"""

    # Algorithm specific arguments
    env_id: str = "HalfCheetah-v4"
    """the id of the environment"""
    total_timesteps: int = 1000000
    """total timesteps of the experiments"""
    learning_rate: float = 3e-4
    """the learning rate of the optimizer"""
    num_envs: int = 1
    """the number of parallel game environments"""
    num_steps: int = 2048
    """the number of steps to run in each environment per policy rollout"""
    anneal_lr: bool = True
    """Toggle learning rate annealing for policy and value networks"""
    gamma: float = 0.99
    """the discount factor gamma"""
    gae_lambda: float = 0.95
    """the lambda for the general advantage estimation"""
    num_minibatches: int = 32
    """the number of mini-batches"""
    update_epochs: int = 10
    """the K epochs to update the policy"""
    norm_adv: bool = True
    """Toggles advantages normalization"""
    clip_coef: float = 0.2
    """the surrogate clipping coefficient"""
    clip_vloss: bool = True
    """Toggles whether or not to use a clipped loss for the value function, as per the paper."""
    ent_coef: float = 0.0
    """coefficient of the entropy"""
    vf_coef: float = 0.5
    """coefficient of the value function"""
    max_grad_norm: float = 0.5
    """the maximum norm for the gradient clipping"""
    target_kl: Optional[float] = None
    """the target KL divergence threshold (`None` disables the early-stop check)"""

    # to be filled in runtime
    batch_size: int = 0
    """the batch size (computed in runtime)"""
    minibatch_size: int = 0
    """the mini-batch size (computed in runtime)"""
    num_iterations: int = 0
    """the number of iterations (computed in runtime)"""

In [3]:
def make_env(env_id, idx, capture_video, run_name, gamma):
    """Return a zero-argument factory that builds one fully wrapped environment.

    Args:
        env_id: environment id passed to `gym.make`.
        idx: index of this sub-environment; only index 0 may record video.
        capture_video: when true (and `idx == 0`), wrap the env in `RecordVideo`
            writing to `videos/{run_name}`.
        run_name: run identifier used for the video folder name.
        gamma: discount factor forwarded to `NormalizeReward`.

    Returns:
        A callable taking no arguments that creates and returns the wrapped env.
    """

    def thunk():
        if not (capture_video and idx == 0):
            env = gym.make(env_id)
        else:
            env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode="rgb_array"),
                f"videos/{run_name}",
            )

        # Wrappers are applied in order; each one wraps the result of the previous.
        wrapper_chain = (
            gym.wrappers.FlattenObservation,  # deal with dm_control's Dict observation space
            gym.wrappers.RecordEpisodeStatistics,
            gym.wrappers.ClipAction,
            gym.wrappers.NormalizeObservation,
            # clip the normalized observations to [-10, 10]
            lambda e: gym.wrappers.TransformObservation(e, lambda obs: np.clip(obs, -10, 10)),
            lambda e: gym.wrappers.NormalizeReward(e, gamma=gamma),
            # clip the normalized rewards to [-10, 10]
            lambda e: gym.wrappers.TransformReward(e, lambda reward: np.clip(reward, -10, 10)),
        )
        for wrap in wrapper_chain:
            env = wrap(env)
        return env

    return thunk


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize `layer` in place and hand it back.

    The weight gets an orthogonal initialization with gain `std`; every bias
    entry is set to `bias_const`. The same layer object is returned so the call
    can be nested inside a module constructor.
    """
    init = torch.nn.init
    init.orthogonal_(layer.weight, gain=std)
    init.constant_(layer.bias, val=bias_const)
    return layer

In [None]:
# Build the environment once just to look at its action and observation spaces;
# their shapes determine the sizes of the network output and input layers.
env = gym.make('HalfCheetah-v4')
print(env.action_space, env.observation_space, sep="\n")


Box(-1.0, 1.0, (6,), float32)
Box(-inf, inf, (17,), float64)


In [4]:
# Draw one random action from the action space to see what a concrete action looks like.
env.action_space.sample()

array([ 0.22610849, -0.26133633, -0.9654431 ,  0.38364863,  0.97026044,
        0.06519541], dtype=float32)

# Continuous actions via normal distributions
## why actor_logstd is state independent
The [doc](https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/#9-details-for-continuous-action-domains-eg-mujoco) says there are no major differences between a state-dependent and a state-independent log standard deviation.
## torch.expand_as

```python
import torch
x = torch.tensor([[1], [2], [3]])
print('x', x)
y = torch.tensor([[2,2], [3,3], [5,5]])
print('y', y)
print('x.expand(3, 4): \n', x.expand(3, 4))
print('x.expand_as(y): \n', x.expand_as(y))
print('x is not changed: \n',  x)
```
```shell
x tensor([[1],
        [2],
        [3]])
y tensor([[2, 2],
        [3, 3],
        [5, 5]])
x.expand(3, 4): 
 tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])
x.expand_as(y): 
 tensor([[1, 1],
        [2, 2],
        [3, 3]])
x is not changed: 
 tensor([[1],
        [2],
        [3]])
```

## np.sum(axis=xxx)
Prompt: can you show me the pseudo code to implement this kind of `sum(axis:int|tuple[int...])`    

Assuming array shape is ($4_0$, $3_1$, $2_2$),     
`sum(axis=0)` collapses axis 0, so the resulting shape is `(3, 2)`. This is a dimensionality reduction: like squashing a 3-D cube with axes `x, y, z` onto a 2-D plane, the summed axis disappears and the other two (`x,y`, `x,z`, or `y,z`) are kept.

```python
array = np.array([[[ 1,  2],
                   [ 3,  4],
                   [ 5,  6]],

                  [[ 7,  8],
                   [ 9, 10],
                   [11, 12]],

                  [[13, 14],
                   [15, 16],
                   [17, 18]],

                  [[19, 20],
                   [21, 22],
                   [23, 24]]])


sum_axis_0 = np.sum(array, axis=0)  # shape (3, 2)
sum_axis_1 = np.sum(array, axis=1)  # shape (4, 2)
```

`np.sum(array, axis=0)`    
[    
    [40, 44],  $$(1_{000}+7_{100}+13_{200}+19_{300}), (2_{001}+8_{101}+14_{201}+20_{301})$$    
    [48, 52],  $$(3_{010}+9_{110}+15_{210}+21_{310}), (4_{011}+10_{111}+16_{211}+22_{311})$$    
    [56, 60],  $$(5_{020}+11_{120}+17_{220}+23_{320}), (6_{021}+12_{121}+18_{221}+24_{321})$$    
]

`np.sum(array, axis=1)`
[    
    [ 9, 12],  $$(1_{000}+3_{010}+5_{020}), (2_{001}+4_{011}+6_{021})$$
    [27, 30],  $$(7_{100}+9_{110}+11_{120}), (8_{101}+10_{111}+12_{121})$$
    [45, 48],  $$(13_{200}+15_{210}+17_{220}), (14_{201}+16_{211}+18_{221})$$
    [63, 66],  $$(19_{300}+21_{310}+23_{320}), (20_{301}+22_{311}+24_{321})$$
]

## np.prod(envs.single_action_space.shape) vs envs.single_action_space.n
Q: If the shape is (3,), `np.prod(envs.single_action_space.shape)` would return 3, then what is the `envs.single_action_space.n` in this case
A: 
* If you have a continuous action space (like (3,) for three continuous values):
np.prod(envs.single_action_space.shape) would return 3, reflecting the number of continuous outputs expected by the network.
`envs.single_action_space.n` is not applicable here: a continuous `Box` space has no `n` attribute, so accessing it raises an `AttributeError`.
* If you have a discrete action space:
envs.single_action_space.n would *represent the total number of discrete actions available*.
envs.single_action_space.shape would typically be () because it represents a single dimension without any additional shape information.

## How does `actor_logstd` get learned?
Q1: If `action_logstd = self.actor_logstd.expand_as(action_mean)` is just a `tensor.expand_as` that changes the shape, how does `actor_logstd` get learned?    
A1:
actor_logstd is initialized as a learnable parameter (using nn.Parameter) with a shape that matches the number of dimensions of the actions in the action space.

Since actor_logstd is an nn.Parameter, it is included in the list of parameters that the optimizer will update during training.
As the agent interacts with the environment and learns from its experiences (using techniques like policy gradients or other RL algorithms), the gradients will be backpropagated through the actor network, which includes the actor_logstd.
This means that actor_logstd is actively learned and optimized based on the reward feedback it receives during training. It adjusts to enable the agent to learn better about how to distribute its actions based on the observations from the environment.


Q2: then what is the  back propagation formula for this actor_logstd    
A2:    
In a reinforcement learning context, particularly in policy gradient methods such as those used in the CleanRL framework you've referenced, the backpropagation updates for parameters like `actor_logstd` involve computing gradients of the loss with respect to that parameter. 

### Overview of the Backpropagation Process

In the provided code, `actor_logstd` is part of the actor network, which is used to model the policy distribution from which actions are sampled. The action distribution is parameterized by both the mean (`action_mean`) and the standard deviation (`action_std`). Here's a breakdown of how the backpropagation for `actor_logstd` occurs:

1. **Loss Function**: 
   - In a policy gradient setup, the loss function for the policy network might typically involve the negative log probability of the actions taken, weighted by the rewards received (often using advantage estimates).
   - A common loss function $L$ might look similar to: 
   $$
      L = -\mathbb{E} \left[ \log(\pi(a_t | s_t)) A_t \right]
   $$ 
   - where $\pi(a_t | s_t)$ is the probability of taking action $a_t$ given state $s_t$, and $A_t$ is the advantage at time $t$.

2. **Gradient Calculation**:
   - The key is that when you compute the gradients of the loss with respect to `actor_logstd`, you will use the chain rule. 

3. **Chain Rule**:
   - Let's denote the log probability of the action given the mean and standard deviation as:
     $$
     \log p(a | \mu, \sigma) = -\frac{(a - \mu)^2}{2\sigma^2} - \log(\sigma \sqrt{2\pi})
     $$
   - When you differentiate the loss $L$ with respect to `actor_logstd`, you are particularly interested in its contribution to the log probability of the actions taken and how it influences the overall loss function.

### Backpropagation with Respect to `actor_logstd`

Using the chain rule, the gradient of the loss with respect to `actor_logstd` can be expressed as:

$$
\frac{\partial L}{\partial \text{actor\_logstd}} = \frac{\partial L}{\partial \log p(a | \mu, \sigma)} \cdot \frac{\partial \log p(a | \mu, \sigma)}{\partial \text{actor\_logstd}}
$$

Here are the steps broken down:

1. **Computing $\frac{\partial L}{\partial \log p(a | \mu, \sigma)}$**:
   - This term represents how the loss changes with respect to the log probabilities of the actions. This would involve the advantages, calculated during the training process.
    
2. **Computing $\frac{\partial \log p(a | \mu, \sigma)}{\partial \text{actor\_logstd}}$**:
   - This can be computed from the log probability expression for the normal distribution:
     $$
     \log p(a | \mu, \sigma) = -\frac{(a - \mu)^2}{2\sigma^2} - \log(\sigma) - \frac{1}{2}\log(2\pi)
     $$
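   - Since $\sigma = e^{\text{actor\_logstd}}$, we have $\log(\sigma) = \text{actor\_logstd}$ and $\frac{1}{\sigma^2} = e^{-2\,\text{actor\_logstd}}$. Thus, the derivative with respect to `actor_logstd` is found as follows:
     $$
     \frac{\partial \log p(a | \mu, \sigma)}{\partial \text{actor\_logstd}} = (a - \mu)^2 e^{-2\,\text{actor\_logstd}} - 1 = \frac{(a - \mu)^2}{\sigma^2} - 1
     $$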

### Resulting Update

The gradients computed from backpropagation are then used to update `actor_logstd` along with other parameters using an optimizer (like Adam or SGD). Given the optimizer's parameters such as the learning rate $\alpha$, the weight update for `actor_logstd` would look something like:

$$
\text{actor\_logstd} \leftarrow \text{actor\_logstd} - \alpha \cdot \frac{\partial L}{\partial \text{actor\_logstd}}
$$

### Summary

In summary, during backpropagation:
- The loss function is primarily influenced by the log probabilities of the actions given the states and the advantages from the rewards.
- The gradients with respect to `actor_logstd` are derived through the chain rule and involve the parameters of the distribution (mean and standard deviation).
- The learning process updates `actor_logstd` to improve the policy based on the actions taken and their rewards, enabling the agent to learn how to explore and exploit effectively. 

This mechanism allows `actor_logstd` to adapt over time, ensuring the action distribution can become more focused or varied as needed based on the agent's performance.
## why probs.log_prob(action).sum(1) not sum(0)
According to below equation
$$
\displaystyle\sum_{c=1}^{6} \log \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c) = \log \displaystyle\prod_{c=1}^{6} \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c) = \log \pi(a_t|s_t)
$$

```python
arr = np.arange(24).reshape(4, 6)
print(arr)
print('======')
print(arr.sum(axis=1))
```

```shell
[[ 0  1  2  3  4  5] # sum(...) => 15
 [ 6  7  8  9 10 11] # sum(...) => 51
 [12 13 14 15 16 17] # sum(...) => 87
 [18 19 20 21 22 23]] # sum(...) => 123
======
[ 15  51  87 123]
```

## why `probs.log_prob(action).sum(axis=1)` not axis=other_axis
### Understanding the Dimensions

1. **Probabilities and Action Shapes**:
   - When you create a distribution for actions using a normal distribution (as in your example), your `probs` object contains the means and standard deviations for each action dimension.
   - The shape of `action_mean` and `action_std` will generally be `(batch_size, num_action_dims)` where:
     - `batch_size` corresponds to the number of environments or the number of samples processed simultaneously.
     - `num_action_dims` corresponds to the number of dimensions or components in the action space (e.g., a multi-dimensional continuous action).

2. **Log Probability Calculation**:
   - When you call `probs.log_prob(action)`, it computes the log probability of the `action` taken. Given the multi-dimensional nature of actions, the result would have the same shape as `action`, i.e., `(batch_size, num_action_dims)`.
   - This gives a 2D tensor output where:
     - Each row corresponds to the log probabilities for a particular `action` across all action dimensions for a sample in the batch.

### Summation along Axis `1`

- **`sum(1)`**:
  - By summing along axis `1`, you're aggregating the log probabilities across all dimensions (or components) of the action for each sample in the batch.
  - This means you will get a single log probability value per action taken for each sample, effectively collapsing the action dimensions into one value.
  - For example, if `probs.log_prob(action)` yields a tensor of shape `(batch_size, num_action_dims)`, then `probs.log_prob(action).sum(1)` will yield a tensor of shape `(batch_size,)`, which represents the total log probability of the selected actions given the distribution.

- **Interpretation**:
  - This operation is essential for computing the total likelihood of the actions taken under the trained policy. In policy gradient methods, you typically want to maximize the likelihood/expected reward, which requires considering the total log probability associated with the chosen actions.

## what does $\log \pi(a_t|s_t)$ mean
Q: we got below equation for above context,
$$
\displaystyle\sum_{c=1}^{6} \log \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c) = \log \displaystyle\prod_{c=1}^{6} \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c) = \log \pi(a_t|s_t)
$$

what does $\log \pi(a_t|s_t)$ mean and  $Nu(\mu_t^c, \sigma_t^{c^2})(a_t^c)$, too?

A:
In the context of reinforcement learning and policy gradient methods, particularly when dealing with continuous action spaces, it's important to understand the notation used in equations related to probabilities and distributions. Let's break down the equation you provided:

### Notation Breakdown

1. **$\log \pi(a_t | s_t)$**:
   - This represents the **log probability** of taking action $a_t$ given state $s_t$ according to the policy $\pi$. In reinforcement learning, a policy defines the behavior of the agent—it specifies how the agent chooses its actions based on the current state.
   - $\pi(a_t | s_t)$ signifies the probability density function (PDF) that gives the likelihood of taking the specific action $a_t$ when in state $s_t$. This PDF could result from a normal distribution if we are modeling continuous actions.
   - The logarithm of this probability is often used in training algorithms because it transforms the product of probabilities (a common occurrence when multiplying probabilities across multiple actions) into a sum, which simplifies calculations and can also help with numerical stability.

2. **$Nu(\mu_t^c, \sigma_t^{c^2})(a_t^c)$**:
   - Here, $Nu$ likely refers to a PDF (Probability Density Function) of a normal distribution, specifically denoted as $\mathcal{N}$, which indicates that the action is being sampled from a normal distribution.
   - The notation can be interpreted as:
     $$
     \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c)
     $$
   - Where:
     - $\mu_t^c$ is the mean of the distribution for the $c$-th action component at time $t$.
     - $\sigma_t^{c^2}$ is the variance (the square of the standard deviation $\sigma_t^c$) of the distribution for the $c$-th action component at time $t$.
     - $a_t^c$ is the specific action taken for the $c$-th action dimension at time $t$.
   - So, this term gives the value of the probability density function for the normal distribution at the point defined by action $a_t^c$ given the mean and variance at that time.

### Putting It All Together

When you combine these ideas in the context of your equation:

- The equation states that the log probability of the action $a_t$ being taken at state $s_t$ can be expressed as the sum of the log probabilities of the individual action components:

$$
\sum_{c=1}^{6} \log \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c) = \log \prod_{c=1}^{6} \mathcal{N}(\mu_t^c, \sigma_t^{c^2})(a_t^c)
$$

- On the left, you are summing the logs of the individual probabilities for each action component `c` (for instance, if you have an action space that is 6-dimensional, with each component modeled as a normal distribution).
- On the right, you are using the property of logarithms that transforms the product of probabilities into a sum, highlighting how these components together contribute to the overall distribution, thereby representing the policy's probability for taking the combined action $a_t$ from state $s_t$.

### Summary

- $\log \pi(a_t | s_t)$ represents the log probability of the action given the current state, allowing the agent to evaluate how good a decision it made.
- $Nu(\mu_t^c, \sigma_t^{c^2})(a_t^c)$ indicates the probability density of selecting the action component $a_t^c$ based on its mean and variance, derived from a normal distribution, for each component of the action space.
  
These components are critical in understanding the dynamics of policy updates in reinforcement learning, specifically in continuous action spaces.

## why Normal distribution
Q: in continuous action context, why we choose normal distribution to sample action, why not other distributions?
A:
Choosing a normal (Gaussian) distribution to sample actions in continuous action settings in reinforcement learning has several advantages, but it is essential to consider the characteristics of the problem at hand. Below are some reasons why the normal distribution is commonly used, along with a discussion of other possible distributions.

### Advantages of Using the Normal Distribution

1. **Smoothness and Continuity**:
   - The normal distribution is a continuous distribution that allows for smooth transitions in action values. This is particularly useful in environments where actions are continuous and need to change smoothly.

2. **Analytic Properties**:
   - The normal distribution has well-known mathematical properties that make it easier to work with mathematically. For example, it is fully characterized by just two parameters: mean ($\mu$) and standard deviation ($\sigma$), which simplifies calculations involving gradients and expectations.

3. **Handling of Uncertainty**:
   - The normal distribution provides a natural way to model uncertainty in actions. The standard deviation controls the exploration vs. exploitation trade-off:
     - A small standard deviation leads to taking actions close to the mean (exploitation), while a larger standard deviation encourages exploration of the action space.
   
4. **Gradient Calculations**:
   - When using log probabilities of Gaussian distributions for policy gradients, the resulting gradients retain friendly mathematical forms. For actions taken from a Gaussian distribution, the gradient of the log probability with respect to the mean or variance can be computed easily, aiding in stable learning.

5. **Sample Efficiency**:
   - The structure of the normal distribution allows for effective exploration of the action space. In many cases, the Gaussian distribution can be tuned to quickly converge to optimal action strategies through mean and variance updates.

6. **Central Limit Theorem**:
   - The Central Limit Theorem suggests that many independent random variables will sum to a normally distributed variable when properly normalized. As a result, if action values are influenced by many factors, they may naturally follow a normal distribution.

### Alternatives to Normal Distribution

While the normal distribution is prevalent, other distributions can be useful, depending on the characteristics of the problem:

1. **Uniform Distribution**:
   - For tasks where any value within a specific range is equally likely (e.g., action spaces bounded within [0,1]), a uniform distribution might be more appropriate. However, uniform distributions do not have variance control like normal distributions.

2. **Cauchy Distribution**:
   - Used in situations where outliers are a significant concern, as the Cauchy distribution has heavier tails than the Gaussian, allowing for more exploration at the extremes.

3. **Log-Normal Distribution**:
   - When actions must always be positive, a log-normal distribution may be preferred, where the logarithm of the values follows a normal distribution. This is suitable for scenarios like position or length, where negative values don’t make sense.

4. **Beta Distribution**:
   - Often used for actions constrained between 0 and 1, particularly in environments with bounded continuous action spaces. The Beta distribution is flexible and can model various shapes based on its parameters.

5. **Custom Distributions**:
   - In complex environments, custom distributions can be constructed based on domain knowledge, potentially combining multiple distributions.

### Conclusion

The choice of distribution for sampling actions in continuous settings largely depends on the problem characteristics and requirements:
- **Normal distributions** are favored for their smoothness, simplicity, and effective way to explore multi-dimensional action spaces.
- However, other distributions are certainly applicable and may be more suited to specific domains or constraints, and reinforcement learning researchers can choose or design a distribution based on the needs of their particular problem. 

Ultimately, normal distributions have become a standard choice largely due to their mathematical properties and the convenience they provide in many continuous action settings in reinforcement learning.





In [1]:

class Agent(nn.Module):
    """Actor-critic model with a diagonal Gaussian policy for continuous actions.

    Submodules:
        critic: MLP mapping an observation to one scalar value.
        actor_mean: MLP mapping an observation to the mean of each action component.
        actor_logstd: learnable log standard deviation, one entry per action
            component, shared across all observations (state-independent).
    """

    def __init__(self, envs):
        super().__init__()
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        act_dim = np.prod(envs.single_action_space.shape)

        # The critic is built before the actor, as in the original layout.
        self.critic = self._mlp(obs_dim, 1, out_std=1.0)
        self.actor_mean = self._mlp(obs_dim, act_dim, out_std=0.01)
        self.actor_logstd = nn.Parameter(torch.zeros(1, act_dim))

    @staticmethod
    def _mlp(in_dim, out_dim, out_std):
        """Build a 64-64 tanh MLP whose final layer is initialized with gain `out_std`."""
        return nn.Sequential(
            layer_init(nn.Linear(in_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, out_dim), std=out_std),
        )

    def get_value(self, x):
        """Return the critic's output for observations `x`."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations `x`.

        When `action` is None an action is sampled from the policy; otherwise the
        given action is scored. Returns `(action, log_prob, entropy, value)`, where
        `log_prob` and `entropy` are summed over dimension 1 (the action components).
        """
        mean = self.actor_mean(x)
        # Broadcast the single log-std row to the batch, then exponentiate to get std.
        std = self.actor_logstd.expand_as(mean).exp()
        policy = Normal(mean, std)
        if action is None:
            action = policy.sample()
        log_prob = policy.log_prob(action).sum(1)
        entropy = policy.entropy().sum(1)
        value = self.critic(x)
        return action, log_prob, entropy, value





'3.10.12 (main, Aug  4 2023, 13:58:21) [Clang 14.0.3 (clang-1403.0.22.14.1)]'

In [None]:

# Parse the experiment configuration. Inside a Jupyter kernel `sys.argv` holds the
# kernel's own launch flags (e.g. `-f <connection file>`), which tyro would reject,
# so an explicit argument list is passed instead of reading `sys.argv`.
# To override a default, put the flags in the list, e.g.
# `args=["--total-timesteps", "100000"]`.
args = tyro.cli(Args, args=[])

# Derived sizes: one rollout collects `num_envs * num_steps` transitions, which is
# split into `num_minibatches` minibatches; the step budget fixes the iteration count.
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)
args.num_iterations = args.total_timesteps // args.batch_size
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
    # Imported lazily so wandb is only required when tracking is enabled.
    import wandb

    wandb.init(
        project=args.wandb_project_name,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=vars(args),
        name=run_name,
        monitor_gym=True,
        save_code=True,
    )

# TensorBoard writer for this run; the hyperparameters are logged once as a markdown table.
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)

# TRY NOT TO MODIFY: seeding
# Seed Python, NumPy and PyTorch (in that order) from the single experiment seed.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

use_cuda = torch.cuda.is_available() and args.cuda
device = torch.device("cuda" if use_cuda else "cpu")

# env setup
env_fns = [make_env(args.env_id, i, args.capture_video, run_name, args.gamma) for i in range(args.num_envs)]
envs = gym.vector.SyncVectorEnv(env_fns)
assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

# ALGO Logic: Storage setup
# Every rollout buffer is indexed as [step, env, ...].
rollout_shape = (args.num_steps, args.num_envs)
obs = torch.zeros(rollout_shape + envs.single_observation_space.shape).to(device)
actions = torch.zeros(rollout_shape + envs.single_action_space.shape).to(device)
logprobs, rewards, dones, values = (torch.zeros(rollout_shape).to(device) for _ in range(4))

# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()
next_obs, _ = envs.reset(seed=args.seed)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(args.num_envs).to(device)

# Main PPO loop: each iteration collects one rollout of `num_steps` steps from every
# environment, computes GAE advantages, then runs several epochs of minibatch updates.
for iteration in range(1, args.num_iterations + 1):
    # Annealing the rate if instructed to do so.
    # The factor decays linearly from 1.0 at the first iteration towards 0.
    if args.anneal_lr:
        frac = 1.0 - (iteration - 1.0) / args.num_iterations
        lrnow = frac * args.learning_rate
        optimizer.param_groups[0]["lr"] = lrnow

    # ---- Rollout phase: fill the [step, env] buffers ----
    for step in range(0, args.num_steps):
        global_step += args.num_envs
        # Store the observation and done flag that the upcoming action is based on.
        obs[step] = next_obs
        dones[step] = next_done

        # ALGO LOGIC: action logic
        # No gradients are needed while collecting data.
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        # Termination and truncation are merged into one "done" flag here.
        next_done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)

        # Log episode statistics when present.
        # NOTE(review): presumably "final_info" is set by the vector env when a
        # sub-environment finishes an episode, and "episode" comes from
        # RecordEpisodeStatistics — confirm against the installed gymnasium version.
        if "final_info" in infos:
            for info in infos["final_info"]:
                if info and "episode" in info:
                    print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                    writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)

    # bootstrap value if not done
    # ---- Advantage phase: GAE computed backwards through the rollout ----
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(rewards).to(device)
        lastgaelam = 0
        for t in reversed(range(args.num_steps)):
            # For the last stored step the successor is the state after the rollout;
            # otherwise it is the next stored step.
            if t == args.num_steps - 1:
                nextnonterminal = 1.0 - next_done
                nextvalues = next_value
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalues = values[t + 1]
            # One-step TD error; the bootstrap term is zeroed when the next state starts a new episode.
            delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
            # Recursive GAE accumulation, also reset across episode boundaries.
            advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
        # Value targets are the advantages added back onto the rollout-time value estimates.
        returns = advantages + values

    # flatten the batch
    # Merges the step and env dimensions into a single batch dimension.
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    # Optimizing the policy and value network
    # ---- Update phase: `update_epochs` passes over freshly shuffled minibatches ----
    b_inds = np.arange(args.batch_size)
    clipfracs = []
    for epoch in range(args.update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, args.batch_size, args.minibatch_size):
            end = start + args.minibatch_size
            mb_inds = b_inds[start:end]

            # Re-evaluate the stored actions under the current parameters.
            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
            # Probability ratio new/old, computed in log space.
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            # Diagnostics only; they do not contribute to the loss.
            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                # Fraction of samples whose ratio falls outside the clipping range.
                clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            # Normalization is done per minibatch, not over the whole batch.
            if args.norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss
            # Clipped surrogate: the max of the two negated terms is the pessimistic choice.
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            if args.clip_vloss:
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                # Limit how far the new value may move from the rollout-time value.
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -args.clip_coef,
                    args.clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            # Despite its name, `entropy_loss` is the mean entropy; it is subtracted
            # below so that higher entropy lowers the total loss.
            entropy_loss = entropy.mean()
            loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm before the optimizer step.
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()

        # Early stop, checked once per epoch using the approx_kl of the last minibatch.
        if args.target_kl is not None and approx_kl > args.target_kl:
            break

    # Explained variance of the returns by the rollout-time value estimates
    # (NaN when the returns have zero variance).
    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    # The loss and KL scalars logged here are those of the last minibatch processed;
    # clipfrac is averaged over all minibatches of this iteration.
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
    writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
    writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
    writer.add_scalar("losses/explained_variance", explained_var, global_step)
    # SPS = environment steps per second since training started.
    print("SPS:", int(global_step / (time.time() - start_time)))
    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

# Optional post-training steps: checkpoint the agent, evaluate it, and upload it.
if args.save_model:
    model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
    torch.save(agent.state_dict(), model_path)
    print(f"model saved to {model_path}")
    # Imported only after the checkpoint has been written, and only when saving is requested.
    from cleanrl_utils.evals.ppo_eval import evaluate

    eval_kwargs = dict(
        eval_episodes=10,
        run_name=f"{run_name}-eval",
        Model=Agent,
        device=device,
        gamma=args.gamma,
    )
    episodic_returns = evaluate(model_path, make_env, args.env_id, **eval_kwargs)
    # Each evaluation return is logged with its episode index as the step.
    for idx, episodic_return in enumerate(episodic_returns):
        writer.add_scalar("eval/episodic_return", episodic_return, idx)

    if args.upload_model:
        from cleanrl_utils.huggingface import push_to_hub

        repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
        # Prefix the repo with the user/org name only when one was given.
        if args.hf_entity:
            repo_id = f"{args.hf_entity}/{repo_name}"
        else:
            repo_id = repo_name
        push_to_hub(args, episodic_returns, repo_id, "PPO", f"runs/{run_name}", f"videos/{run_name}-eval")

# Release the environments and close the TensorBoard writer.
envs.close()
writer.close()