In [1]:
from network import net
import torch
import numpy as np
import matplotlib.pyplot as plt
import gym
from torch.distributions import MultivariateNormal
import torch.nn as nn
import torch.optim as optim
from collections import deque
import sys

  for external in metadata.entry_points().get(self.group, []):


In [2]:
class PPO():
    """Clipped-surrogate PPO agent with a Gaussian policy of fixed covariance.

    The agent owns two networks built by the project-local ``net`` factory:
    an actor called as ``net(obs_dim, act_dim)`` whose output is used as the
    mean of a ``MultivariateNormal``, and a critic called as
    ``net(obs_dim, 1)`` whose output is used as the state value.

    The environment is driven through the classic gym interface:
    ``reset()`` returning an observation and ``step()`` returning a 4-tuple.
    """
    def __init__(self,env):
        """Build networks, hyperparameters, covariance matrix and optimizers.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.shape``, ``reset()`` and ``step(action)``.
        """
        ##environment initialized##
        self.env = env

        ##number of dims in the state and in the action are initialized##
        self.obs_dim = self.env.observation_space.shape[0]
        self.act_dim = self.env.action_space.shape[0]

        ##actor and critic network##
        self.actor = net(self.obs_dim,self.act_dim)
        self.critic = net(self.obs_dim,1)

        ##initialize the hyperparameters (must precede the optimizers, which read self.lr)##
        self._init_hyperparameters()

        ##fixed diagonal covariance matrix (0.5 on every action dimension)##
        cov_var = torch.full((self.act_dim,),0.5)
        self.cov_mat = torch.diag(cov_var)

        ##define optimizers##
        self.optim_actor = optim.Adam(self.actor.parameters(),lr=self.lr)
        self.optim_critic = optim.Adam(self.critic.parameters(),lr=self.lr)

    ##function to optimize the nets##
    def learn(self):
        """Train actor and critic until ``total_timestep`` env steps are collected.

        Each outer iteration collects one batch with ``rollout`` and performs
        ``num_of_iterations_per_batch`` updates on it. After training, the
        loss histories are plotted and the actor weights are saved to the
        file ``actorModel``.
        """
        t = 0
        ##losses are stored as plain floats so no autograd graph is kept alive##
        actor_loss_window = deque([],maxlen=100)
        critic_loss_window = deque([],maxlen=100)
        actor_loss_full = []
        critic_loss_full = []
        while t < self.total_timestep:
            ##collect a batch##
            batch_obs,batch_act,batch_rtgs,batch_logprobs,batch_lens = self.rollout()
            ##increment the timesteps (as a python int, not a tensor)##
            t += int(batch_lens.sum().item())
            for _ in range(self.num_of_iterations_per_batch):
                ##update each iteration from the same batch here##
                V,current_logprobs = self.evaluate(batch_obs,batch_act)
                ratio = torch.exp(current_logprobs - batch_logprobs)
                ##the advantage is a constant for the actor update, so the critic
                ##graph is cut here; this also removes the need for retain_graph##
                Ad = batch_rtgs - V.detach()
                ##actor loss: clipped surrogate, summed over timesteps and
                ##divided by the number of episodes in the batch##
                surr1 = Ad * ratio
                surr2 = torch.clamp(ratio,1-self.clip,1+self.clip) * Ad
                actor_loss = -torch.min(surr1,surr2).sum()/batch_lens.shape[0]
                ##critic loss##
                critic_loss = nn.MSELoss()(V,batch_rtgs)
                ##bookkeeping##
                actor_loss_value = actor_loss.item()
                critic_loss_value = critic_loss.item()
                actor_loss_full.append(actor_loss_value)
                actor_loss_window.append(actor_loss_value)
                critic_loss_full.append(critic_loss_value)
                critic_loss_window.append(critic_loss_value)
                ##update actor##
                self.optim_actor.zero_grad()
                actor_loss.backward()
                self.optim_actor.step()
                ##update critic##
                self.optim_critic.zero_grad()
                critic_loss.backward()
                self.optim_critic.step()
            sys.stdout.flush()
            print("\r{}/{}".format(t,self.total_timestep),"Average Actor Loss:",np.mean(list(actor_loss_window)),"Averge critic Loss:",np.mean(list(critic_loss_window)),end="")
        ##log-scaled update index; starts at 1 so the first point is not log(0) = -inf##
        log_steps = np.log(np.arange(1,len(actor_loss_full)+1))
        plt.subplot(211)
        plt.plot(log_steps,actor_loss_full)
        plt.subplot(212)
        plt.plot(log_steps,critic_loss_full)
        torch.save(self.actor.state_dict(),"actorModel")

    ##function to initialize all hyperparameters##
    def _init_hyperparameters(self):
        """Set the training hyperparameters as instance attributes."""
        self.total_timestep = 500000           # env steps to collect before learn() stops
        self.num_of_iterations_per_batch = 5   # gradient updates per collected batch
        self.clip = 0.2                        # ratio is clamped to [1-clip, 1+clip]
        self.time_step_per_batch = 1000        # minimum env steps per rollout batch
        self.max_timestep_per_eps = 200        # hard cap on the length of one episode
        self.gamma = 0.99                      # discount factor used by ret_rtgs
        self.lr = 0.005                        # Adam learning rate for both networks

    ##function to extract a batch of timesteps without keeping track of gradients##
    def rollout(self):
        """Run the current policy until at least ``time_step_per_batch`` steps exist.

        Episodes are always run to their end (``done`` or
        ``max_timestep_per_eps``), so a batch may hold more than
        ``time_step_per_batch`` steps.

        Returns:
            Tuple ``(batch_obs, batch_act, batch_rtgs, batch_logprobs,
            batch_lens)`` of float tensors. The first four have one entry per
            timestep; ``batch_lens`` has one entry per episode.
        """
        batch_obs = []
        batch_act = []
        batch_logprobs = []
        batch_lens = []
        batch_rews = []
        t_step = 0
        while t_step < self.time_step_per_batch:
            state = self.env.reset()
            eps_rews = []
            for n in range(self.max_timestep_per_eps):
                t_step += 1
                batch_obs.append(state)
                act,logprobs = self.get_action(state)
                state,rew,done,_=self.env.step(act)
                eps_rews.append(rew)
                batch_act.append(act)
                batch_logprobs.append(logprobs)
                if done:
                    break
            batch_lens.append(n + 1)
            batch_rews.append(eps_rews)
        batch_rtgs = self.ret_rtgs(batch_rews)
        ##stack into a single ndarray first: building a tensor from a list of
        ##ndarrays is very slow and triggers a UserWarning##
        batch_obs = torch.tensor(np.array(batch_obs),dtype=torch.float)
        batch_act = torch.tensor(np.array(batch_act),dtype=torch.float)
        batch_logprobs = torch.stack(batch_logprobs).float()
        batch_lens = torch.tensor(batch_lens,dtype=torch.float)
        return batch_obs,batch_act,batch_rtgs,batch_logprobs,batch_lens

    ##function to get the action and its log_prob given a state and current policy##
    def get_action(self,state):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: a single observation (array-like or tensor).

        Returns:
            ``(action, logprob)``: the sampled action as a numpy array and its
            log-probability as a tensor with no gradient history.
        """
        ##no gradients are needed while acting; learn() recomputes the
        ##log-probabilities with gradients through evaluate()##
        with torch.no_grad():
            obs = torch.as_tensor(state,dtype=torch.float)
            mean = self.actor(obs)
            dist = MultivariateNormal(mean,self.cov_mat)
            action = dist.sample()
            logprob = dist.log_prob(action)
        return action.numpy(),logprob

    ##function to return discounted rewards-to-go given per-episode step rewards##
    def ret_rtgs(self,batch_rews):
        """Compute the discounted reward-to-go for every timestep.

        Args:
            batch_rews: list of episodes, each a list of per-step rewards.

        Returns:
            1-D float tensor with one value per timestep, in the same
            (episode, step) order as ``batch_rews``.
        """
        batch_rtgs = []
        ##walk everything backwards, append, then reverse once: same result as
        ##inserting at the front but linear instead of quadratic##
        for ep in reversed(batch_rews):
            discounted_reward = 0
            for rew in reversed(ep):
                discounted_reward = rew + discounted_reward * self.gamma
                batch_rtgs.append(discounted_reward)
        batch_rtgs.reverse()
        return torch.tensor(batch_rtgs,dtype=torch.float)

    ##function to return the log_probs and value given state and action taken##
    def evaluate(self,batch_obs,batch_act):
        """Evaluate a batch under the current actor and critic.

        Args:
            batch_obs: tensor of observations, one row per timestep.
            batch_act: tensor of the actions taken, one row per timestep.

        Returns:
            ``(V, log_prob)``: critic values and action log-probabilities,
            both carrying gradients.
        """
        mean = self.actor(batch_obs)
        dist = MultivariateNormal(mean,self.cov_mat)
        log_prob = dist.log_prob(batch_act)

        ##drop only the trailing value dimension so a batch of one stays 1-D##
        V = self.critic(batch_obs).squeeze(-1)
        return V,log_prob

In [3]:
# Train a PPO agent on BipedalWalker. learn() saves the actor weights to the
# file "actorModel" when it finishes.
env = gym.make("BipedalWalker-v3")
learning_agent = PPO(env)
try:
    learning_agent.learn()
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

NameError: name 'PPO' is not defined

In [3]:
# Replay the trained policy in a rendered environment for 2000 steps.
env = gym.make("BipedalWalker-v3")
new_agent = PPO(env)
# NOTE(review): torch.load unpickles the file; only load checkpoints produced
# by this notebook's own training run.
new_agent.actor.load_state_dict(torch.load("actorModel"))
try:
    state = env.reset()
    for i in range(2000):
        # render() takes a mode, not an observation; passing the state array
        # as the mode is what made the viewer compare an ndarray against "rgb_array".
        env.render()
        # get_action samples from the policy and returns a numpy action,
        # which is what env.step expects.
        action,_ = new_agent.get_action(state)
        state,_,done,_ = env.step(action)
        if done:
            # Start a fresh episode instead of stepping a finished one.
            state = env.reset()
finally:
    env.close()

  return self.viewer.render(return_rgb_array=mode == "rgb_array")
