# My implementation of the PPO algorithm

## Import packages and check the environment

In [1]:
import gymnasium as gym
import numpy as np
# these are new packages in this file
import panda_gym
import stable_baselines3
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from collections import deque
import matplotlib.pyplot as plt
import random
import imageio
%matplotlib inline

In [2]:
# Create a throw-away environment only to inspect its observation/action spaces.
env_id = 'PandaReachDense-v3'
env = gym.make(env_id)
state_space = env.observation_space
# NOTE: this is the *shape* of the action space (a tuple), not the space object.
action_space = env.action_space.shape
print(f'state space: {state_space}')
print(f'action space: {action_space}')
# Release the simulation behind this probe environment: the next cell rebinds
# `env` to a freshly built vectorised environment, so without this call the
# first simulation instance would stay alive for the rest of the session.
env.close()

pybullet build time: Oct  5 2023 20:55:18


argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
state space: Dict('achieved_goal': Box(-10.0, 10.0, (3,), float32), 'desired_goal': Box(-10.0, 10.0, (3,), float32), 'observation': Box(-10.0, 10.0, (6,), float32))
action space: (3,)


In [3]:
# Build a one-environment VecEnv, then wrap it so observations and rewards are
# normalised (clip_obs bounds the normalised observation values).
env = make_vec_env(env_id, n_envs=1)
env = VecNormalize(
    env,
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.,
)

obs = env.reset()

# The observation is a dict; pull out its three entries in a fixed order.
achieved_goal, desired_goal, observation = (
    obs[key] for key in ('achieved_goal', 'desired_goal', 'observation')
)

# Join the entries along the last axis into one flat feature vector per env.
merged_array = np.concatenate((achieved_goal, desired_goal, observation), axis=-1)
merged_array.shape

argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886


(1, 12)

In [4]:
# `env` is a vectorised environment, so `step` expects one action per
# sub-environment, i.e. an array with a leading `num_envs` axis. A bare
# `action_space.sample()` only has shape (action_dim,), so draw one sample per
# sub-environment and stack them into shape (num_envs, action_dim).
action = np.stack([env.action_space.sample() for _ in range(env.num_envs)])
env.step(action)

(OrderedDict([('achieved_goal',
               array([[0.61660784, 0.76940197, 0.6517585 ]], dtype=float32)),
              ('desired_goal',
               array([[ 0.00013169, -0.00035006,  0.00164964]], dtype=float32)),
              ('observation',
               array([[0.61660784, 0.76940197, 0.6517585 , 0.9982495 , 0.99903286,
                       0.9996298 ]], dtype=float32))]),
 array([-7.913147], dtype=float32),
 array([False]),
 [{'is_success': False, 'TimeLimit.truncated': False}])

## Build networks of PPO

The networks are the same as the ones used for A2C.

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Categorical is not used here because the actions are continuous;
# a Normal distribution models the continuous action distribution instead
from torch.distributions import Categorical,Normal

In [6]:
# helper function: to concatenate the input and convert them to tensor
def t(x):
    """Flatten a goal-conditioned dict observation into a float32 tensor.

    Args:
        x: mapping with the keys 'achieved_goal', 'desired_goal' and
            'observation', each holding a NumPy array.

    Returns:
        A torch tensor holding the three arrays joined along the last axis,
        in the order listed above, converted to float32.
    """
    ordered_keys = ('achieved_goal', 'desired_goal', 'observation')
    pieces = [x[key] for key in ordered_keys]
    flat = np.concatenate(pieces, axis=-1)
    return torch.from_numpy(flat).float()

In [7]:
class Actor(nn.Module):
    """Policy network for a continuous action space.

    A small tanh MLP whose output has `action_space * 2` features per input
    row. `Agent.act` reads the first half as the mean and the second half as
    the log standard deviation of a Normal distribution.

    Args:
        state_space: number of input features.
        action_space: number of action dimensions.
        lr: learning rate for the Adam optimizer stored on `self.optimizer`.
        device: device the module's parameters are moved to.
    """
    def __init__(self, state_space, action_space,lr,device):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_space, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, action_space * 2), # mean and log-std for each action dimension
        )
        # Move the parameters first and build the optimizer afterwards, as the
        # torch.optim documentation recommends.
        self.to(device)
        self.optimizer = optim.Adam(self.parameters(),lr=lr)

    def forward(self, X):
        """Run the policy network.

        Args:
            X: either a dict observation (flattened with `t`) or an already
                flattened tensor whose last dimension is `state_space`.

        Returns:
            Tensor with `action_space * 2` features in the last dimension.
        """
        features = X if isinstance(X, torch.Tensor) else t(X)
        # `t` always builds a CPU tensor; put the input on the same device and
        # dtype as the weights so a non-CPU `device` does not fail here.
        reference = next(self.parameters())
        features = features.to(device=reference.device, dtype=reference.dtype)
        return self.model(features)

In [8]:
class Critic(nn.Module):
    """Value network: a small ReLU MLP producing one scalar per input row.

    Args:
        state_space: number of input features.
        lr: learning rate for the Adam optimizer stored on `self.optimizer`.
        device: device the module's parameters are moved to.
    """
    def __init__(self, state_space,lr,device):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_space, 64),
            nn.ReLU(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
        # Move the parameters first and build the optimizer afterwards, as the
        # torch.optim documentation recommends.
        self.to(device)
        self.optimizer = optim.Adam(self.parameters(),lr=lr)

    def forward(self, X):
        """Estimate the value of each input state.

        Args:
            X: either a dict observation (flattened with `t`) or an already
                flattened tensor whose last dimension is `state_space`.

        Returns:
            Tensor with a single feature in the last dimension.
        """
        features = X if isinstance(X, torch.Tensor) else t(X)
        # `t` always builds a CPU tensor; put the input on the same device and
        # dtype as the weights so a non-CPU `device` does not fail here.
        reference = next(self.parameters())
        features = features.to(device=reference.device, dtype=reference.dtype)
        return self.model(features)

![clip](Algotithms/clipped.jpg)

In [None]:
class Agent():
    def __init__(
        self,
        env: gym.envs,
        state_space: int,
        action_space: int,
        lr: float,
        device: torch.device,
        gamma: float, # discounted rewards
        n_training_episodes: int,
        n_eval_episodes: int,
        max_t: int
    ):
        """Store the configuration and build the actor and critic networks.

        Args:
            env: environment the agent steps through.
                NOTE(review): `gym.envs` is a module, not a type, and `step`
                unpacks four return values, so a vectorised environment is
                presumably expected here -- confirm against the caller.
            state_space: input feature count handed to both networks.
            action_space: number of action dimensions handed to the actor.
            lr: learning rate handed to both networks.
            device: device handed to both networks.
            gamma: reward discount value (stored only).
            n_training_episodes: stored only; presumably the training length.
            n_eval_episodes: stored only; presumably the evaluation length.
            max_t: stored only; presumably a per-episode step limit.
        """
        # environment and problem dimensions
        self.env, self.device = env, device
        self.state_space, self.action_space = state_space, action_space

        # optimisation and schedule settings
        self.lr, self.gamma = lr, gamma
        self.n_training_episodes = n_training_episodes
        self.n_eval_episodes = n_eval_episodes
        self.max_t = max_t

        # separate policy and value networks (actor is built first)
        self.actor = Actor(state_space,action_space,lr,device)
        self.critic = Critic(state_space,lr,device)
        
    def act(self,state):
        # input: state (Batch,12)
        # output: action,log_prob and entropy
        act_out = self.actor(state)
        mean = act_out[:,:self.action_space]
        log_std = act_out[:,self.action_space:]
        std = torch.exp(log_std)
        normal = Normal(mean,std)
        action = normal.sample()
        log_prob = normal.log_prob(action)
        entropy = normal.entropy()
        return action.numpy(),log_prob,entropy
    
    def step(self,action):
        # the agent take the action and obtain the next observation and immedient reward
        next_obs,reward,done,info = self.env.step(action)
        return next_obs, reward, done
    
    def update(self,advantage,log_prob,entropy):
        