In [7]:
import gymnasium as gym
import torch
import torch.nn as nn
from torch.distributions import Normal

env = gym.make("Humanoid-v5")   
obs, info = env.reset(seed=64)
obs = torch.tensor(obs, dtype=torch.float32)

input_dim = obs.shape[0]
output_dim = env.action_space.shape[0]
discount_factor = 0.99
learning_rate = 3e-4
beta = 0.01

class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.core = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU()
        )
        self.mean_head = nn.Linear(512, output_dim)
        self.log_std = nn.Parameter(torch.ones(output_dim) * 0.2)

    def forward(self, x):
        x = self.core(x)
        mean = self.mean_head(x)
        std = self.log_std.exp().expand_as(mean)
        return mean, std

policy = PolicyNetwork(input_dim, output_dim)
optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)


In [10]:

episodes = 10000
total_loss = 0
for episode in range(episodes):
    obs, info = env.reset(seed=episode)
    obs = torch.tensor(obs, dtype=torch.float32)
    done = False
    observations = []
    rewards = []
    actions = []

    while not done:
        with torch.no_grad():
            mean, std = policy(obs)
            dist = Normal(mean, std)
            raw_action = dist.sample()
            clamped_action = torch.clamp(raw_action, -1, 1)
            observations.append(obs)
            actions.append(raw_action)
            obs, reward, terminated, truncated, info = env.step(clamped_action.detach().numpy())
            # print(terminated, truncated)
            obs = torch.tensor(obs, dtype=torch.float32)
            done = terminated or truncated
            rewards.append(reward)
        
    
    discounted_rewards = []
    total_reward = 0
    for r in reversed(rewards):
        total_reward = r + discount_factor * total_reward
        discounted_rewards.append(total_reward)
    discounted_rewards.reverse()
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    Gs = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-7)
    # env.render()
    loss = 0
    
    for obs, reward, action in zip(observations, Gs, actions):
        mean, std = policy(obs)
        dist = Normal(mean, std)
        log_probs = dist.log_prob(action).sum()
        entropy = dist.entropy().sum()
        loss += -log_probs * reward - beta * entropy
    loss = loss / len(observations)

    total_loss += loss
    if (episode + 1) % 100 == 0:
        batch_loss = total_loss / 100
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print(f"Episode {episode + 1}, Loss: {batch_loss.item()}, Return: {sum(rewards)}")

        total_loss = 0

env.close()


Episode 100, Loss: -0.27843886613845825, Return: 201.49035314197278
Episode 200, Loss: -0.2373594343662262, Return: 167.46315598553522
Episode 300, Loss: -0.35641148686408997, Return: 201.01219409110644
Episode 400, Loss: -0.3101004660129547, Return: 178.82881235460925
Episode 500, Loss: -0.282781720161438, Return: 193.19660301935698
Episode 600, Loss: -0.3129720091819763, Return: 199.48630163553608
Episode 700, Loss: -0.2479310929775238, Return: 181.89244715445545
Episode 800, Loss: -0.31906720995903015, Return: 236.2175380359367
Episode 900, Loss: -0.3369368612766266, Return: 191.60266173217747
Episode 1000, Loss: -0.2707272171974182, Return: 107.36358023967387
Episode 1100, Loss: -0.2768295109272003, Return: 85.21585917127881
Episode 1200, Loss: -0.24571041762828827, Return: 175.84734751773934
Episode 1300, Loss: -0.24822227656841278, Return: 102.28215925647363
Episode 1400, Loss: -0.17905400693416595, Return: 105.40008925213691
Episode 1500, Loss: -0.23660974204540253, Return: 145.

KeyboardInterrupt: 

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
env = gym.make("Humanoid-v5", render_mode="human")
with torch.no_grad():
    obs, info = env.reset(seed =64)
    obs = torch.tensor(obs, dtype=torch.float32)

    episodes = 10000
    total_loss = 0
    for episode in range(episodes ):
        
        obs, info = env.reset(seed=episode)
        obs = torch.tensor(obs , dtype=torch.float32)

        done = False 
        observations = []
        rewards = []
        actions = []
        
        while not done:
            mean , std = policy(obs)
            dist = Normal(mean, std) 
            action = dist.sample()
            action = torch.clamp(action , -0.4 , 0.4)
            observations.append(obs)
            actions.append(action)
            obs, reward, terminated , truncated , info = env.step(action.numpy()) 
            obs = torch.tensor(obs , dtype=torch.float32)
            done = terminated or truncated  
            rewards.append(reward)
            env.render()
        discounted_rewards = []
        total_reward = 0 

        # for i in range(len(rewards)-1, -1 , -1):
        #     total_reward = rewards[i] + discount_factor * total_reward 
        #     discounted_rewards.append(total_reward)
        # discounted_rewards.reverse()

        # discounted_rewards = torch.tensor(discounted_rewards , dtype=torch.float32)
        # Gs = (discounted_rewards - discounted_rewards.mean())/(discounted_rewards.std() + 1e-7)
        # loss = 0 

        # for obs, reward, action in zip(observations , Gs,actions):
        #     mean , std  = policy(obs)
        #     dist = Normal(mean , std)
        #     log_probs = dist.log_prob(action).sum()
        #     entropy = dist.entropy().sum()
        #     loss += -log_probs * reward  - beta * entropy 
            
            
        # total_loss +=loss 
        # if (episode+1) % 100 == 0:
            
        #     total_loss /= 100
        #     # optimizer.zero_grad()
        #     # total_loss.backward()
        #     # optimizer.step()
        #     print(f"Episode {episode + 1}, Loss: {total_loss.item()}")
        #     total_loss = 0 