In [1]:

import gymnasium as gym 
import random
import torch.nn as nn
import torch
from torch.distributions.normal import Normal 
from copy import deepcopy

env = gym.make("Humanoid-v5",render_mode='human')
action_shape = env.action_space.shape[0]
observation_shape = env.observation_space.shape[0]

class Policy(nn.Module):
    def __init__(self):
        super().__init__()
        self.mean = nn.Sequential(
            nn.Linear(observation_shape,128),
            nn.ReLU(),
            nn.Linear(128,128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,action_shape)
        )
        self.std = nn.Parameter(torch.zeros(action_shape))

    def forward(self,X,action=None):
        mean = self.mean(X)
        std = torch.exp(self.std)
        dist = Normal(mean,std)
    
        if action == None:
            action = dist.sample()  
        
        log_prob = dist.log_prob(action).sum(dim=-1)

        return action ,log_prob ,dist.entropy().sum(dim=-1)

class Value(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(observation_shape,1)
        )
        
    def forward(self,X):
        
        return self.model(X)



In [2]:



policy = Policy()

old_policy = deepcopy(policy)
value_model = Value()
def update_old_policy():
    global old_policy
    old_policy = deepcopy(policy)
    old_policy.requires_grad_=False



value_optimizer = torch.optim.Adam(value_model.parameters(),lr=1e-3,weight_decay=0.001)
policy_optimizer = torch.optim.Adam(policy.parameters(),lr=1e-3,weight_decay=0.001)

episodes = 1000

gamma = 0.001
eps = 0.7 
lamda = 0.99 
for episode in range(episodes):
    update_old_policy()
    print('starting episode: ',episode)
    done = False
    observation,info = env.reset()
    observation = torch.tensor(observation,dtype=torch.float32)
    actions = []
    observations = []
    confidences = []
    rewards = []
    entropies = []
    with torch.no_grad():
        while not done:
            action,prob,entropy =  policy(observation)
            # probs = torch.softmax(logits,dim=-1)
            
            # action_index = torch.multinomial(probs,num_samples=1)
            # action = torch.zeros(logits.shape[-1])
            # action[action_index]=1
            observation, reward , terminated , truncated , info  = env.step(action)
            entropies.append(entropy)
            observation = torch.tensor(observation,dtype=torch.float32)
            observations.append(observation)
            actions.append(action) 
           
            confidences.append(prob)
            rewards.append(reward)
            done = terminated 
            env.render()


    reward = 0 
    discontinued_rewards = []
    for i in range(-1,-len(observations)-1,-1):
        reward = rewards[i] + lamda*reward 
        discontinued_rewards.append(reward)
    discontinued_rewards.reverse()
    discontinued_rewards = torch.tensor(discontinued_rewards,dtype=torch.float32)

    observations = torch.stack(observations)

    #train the value model 
    reward_preds = value_model(observations)
    loss = torch.nn.functional.mse_loss(reward_preds.squeeze(-1),discontinued_rewards)
    value_optimizer.zero_grad()
    loss.backward()
    
    value_optimizer.step()
    #advantages 
    with torch.no_grad():
        advs = []
        for i  in range(len(observations)):
            reward = discontinued_rewards[i]
            pred = value_model(observations[i])
            adv = reward - pred 
            advs.append(adv)
        advs = torch.tensor(advs)
        advs = (advs - advs.mean())/(advs.std() + 1e-8)
    
    #train the policy model
    

    actions = torch.stack(actions)
    with torch.no_grad():
        _,old_conf,_ = old_policy(observations,actions)
    _,current_log_probs,entropies = policy(observations,actions)
    conf_ratio = torch.exp(current_log_probs - old_conf)
    clipped_conf_ratio = torch.clip(conf_ratio, 1-eps,1+eps)
    objective = torch.min(clipped_conf_ratio*advs , conf_ratio*advs)
    
    # entropies = torch.stack(entropies)
    loss = (-objective) - gamma*entropies
    total_loss = loss.mean()
    policy_optimizer.zero_grad()
    total_loss.backward()
    policy_optimizer.step() 

starting episode:  0
starting episode:  1
starting episode:  2
starting episode:  3
starting episode:  4
starting episode:  5
starting episode:  6


ValueError: Expected parameter loc (Tensor of shape (17,)) of distribution Normal(loc: torch.Size([17]), scale: torch.Size([17])) to satisfy the constraint Real(), but found invalid values:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])