In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
from torch.distributions import Normal 


# --- Training hyperparameters -------------------------------------------
discount_factor = 0.99  # gamma used when accumulating discounted returns
learning_rate = 3e-4  # step size handed to the Adam optimizer
beta = 0.01  # weight of the entropy term in the policy loss

# --- Environment ----------------------------------------------------------
# One seeded reset up front gives a sample observation from which the
# network's input size is read.
env = gym.make("Humanoid-v5", render_mode="human")
obs, info = env.reset(seed=64)
obs = torch.tensor(obs, dtype=torch.float32)

# --- Network dimensions ---------------------------------------------------
output_dim = env.action_space.shape[0]  # one output per action component
input_dim = obs.size(0)  # length of the observation vector

class PolicyNetwork(nn.Module):
    """Diagonal-Gaussian policy network.

    A two-hidden-layer ReLU MLP (``core``) feeds a linear head that outputs
    the mean of each action component. The standard deviation does not
    depend on the input: it is a single learnable vector stored in log space
    (``log_std``) and shared by every observation.

    Args:
        input_dim: Size of the observation vector (last dimension of ``x``).
        output_dim: Number of action components.
        hidden_dim: Width of both hidden layers. Defaults to 512, the value
            that was previously hard-coded.
    """

    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int = 512):
        super().__init__()
        self.core = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.mean_head = nn.Linear(hidden_dim, output_dim)

        # Kept in log space so that std = exp(log_std) is always positive.
        # Every component starts at 0.2, i.e. an initial std of exp(0.2).
        self.log_std = nn.Parameter(torch.ones(output_dim) * 0.2)

    def forward(self, x: torch.Tensor):
        """Return ``(mean, std)`` of the action distribution for ``x``.

        ``x`` may be a single observation of shape ``(input_dim,)`` or a
        batch ``(..., input_dim)``; ``std`` is broadcast to the shape of
        ``mean``.
        """
        features = self.core(x)
        mean = self.mean_head(features)
        std = self.log_std.exp().expand_as(mean)
        return mean, std


# Policy-gradient training: roll out one full episode with the current
# policy, turn the rewards into normalized discounted returns, then take a
# single Adam step on
#     sum_t ( -log pi(a_t | s_t) * G_t  -  beta * entropy_t ).
policy = PolicyNetwork(input_dim, output_dim)
optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)


episodes = 1000
try:
    for episode in range(episodes):
        obs, info = env.reset(seed=episode)
        obs = torch.tensor(obs, dtype=torch.float32)

        done = False
        observations = []
        rewards = []
        actions = []

        # ---- Rollout -----------------------------------------------------
        while not done:
            # Acting needs no gradients; the log-probabilities are recomputed
            # in one batched pass below, so skip building a graph here.
            with torch.no_grad():
                mean, std = policy(obs)
                action = Normal(mean, std).sample()
            observations.append(obs)
            actions.append(action)
            obs, reward, terminated, truncated, info = env.step(action.numpy())
            obs = torch.tensor(obs, dtype=torch.float32)
            done = terminated or truncated
            rewards.append(reward)
            # No explicit env.render(): the env was created with
            # render_mode="human", which renders during step() already.

        # ---- Discounted returns G_t = r_t + gamma * G_{t+1} --------------
        discounted_rewards = []
        total_reward = 0
        for step_reward in reversed(rewards):
            total_reward = step_reward + discount_factor * total_reward
            discounted_rewards.append(total_reward)
        discounted_rewards.reverse()

        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
        Gs = discounted_rewards - discounted_rewards.mean()
        if len(rewards) > 1:
            Gs = Gs / (discounted_rewards.std() + 1e-7)
        # else: std() of a single value is NaN and would poison every
        # parameter; the centred return (zero) is kept as-is instead.

        # ---- Loss: one batched forward pass over the whole episode -------
        obs_batch = torch.stack(observations)  # (T, input_dim)
        action_batch = torch.stack(actions)  # (T, output_dim)
        mean, std = policy(obs_batch)
        dist = Normal(mean, std)
        log_probs = dist.log_prob(action_batch).sum(dim=-1)  # (T,)
        entropy = dist.entropy().sum(dim=-1)  # (T,)
        # Summed over timesteps, exactly like the per-step accumulation it
        # replaces.
        loss = (-log_probs * Gs - beta * entropy).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
finally:
    # Release the simulator/viewer even if training is interrupted
    # (e.g. Ctrl-C / kernel interrupt) or raises.
    env.close()

^C
Note: you may need to restart the kernel to use updated packages.
