In [1]:
!pip install swig
!pip install "gymnasium[box2d]"

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379374 sha256=0cef5187bed4ee086b8bebabc5df61be4ac79d71cbef17f61658b942f6442f32
  Stored in directory: /root/.cache/pip/wheels/ab

In [2]:
import gymnasium as gym
env = gym.make("BipedalWalker-v3", hardcore=True, render_mode="rgb_array")


In [3]:
print(env.observation_space.shape)
print(env.action_space.shape)
print(env.action_space.low, env.action_space.high)

(24,)
(4,)
[-1. -1. -1. -1.] [1. 1. 1. 1.]


In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
from torch.distributions import MultivariateNormal

In [5]:
from torch.distributions import Normal

In [6]:
device = torch.device('cpu')

if(torch.cuda.is_available()):
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    print("Device set to : cpu")

Device set to : cpu


In [19]:
class ActorCritic(nn.Module):
  def __init__(self, state_dim, action_dim, action_std):
    super(ActorCritic, self).__init__()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_std = action_std
    self.action_variance = torch.full((action_dim,), action_std*action_std).to(device) # Corrected size to be a tuple (action_dim,)
    self.actor = nn.Sequential(
        nn.Linear(state_dim, 512),
        nn.LeakyReLU(),
        nn.Linear(512,256),
        nn.LeakyReLU(),
        nn.Linear(256, action_dim),
    )
    self.critic = nn.Sequential(
        nn.Linear(state_dim, 512),
        nn.Tanh(),
        nn.Linear(512,256),
        nn.Tanh(),
        nn.Linear(256, 1)
    )
    # Initialize the weights of the last layer of the actor
    self.actor[-1].weight.data.mul_(0.1)
    # Initialize log_std as a parameter
    self.log_std = nn.Parameter(torch.zeros(action_dim))


  def set_action_variance(self, new_action_std):
    self.action_variance = torch.full((self.action_dim,), new_action_std*new_action_std).to(device) # Corrected size to be a tuple (self.action_dim,)

  def forward(self):
    pass
  def act(self, state):
    action_mean = self.actor(state) # Get action mean directly from the actor network
    action_mean = F.tanh(action_mean)
    std = torch.exp(self.log_std)
    covariance_matrix = torch.diag(std ** 2).unsqueeze(0) #covariance diagonal matrix with added dimension of shape action_dim*action_dim and value action_variance
    distribution = MultivariateNormal(action_mean, covariance_matrix) # to allow exploration as it enables sampling actions and computing probabilites in continuous action spaces
    action = distribution.sample() #tensor of shape action_dim
    action_log_probs = distribution.log_prob(action) #log of probability of the action needed for gradient updates while learning
    state_values = self.critic(state)
    return action.detach(), action_log_probs.detach(), state_values.detach()
  def evaluate(self, state, action):
    action_mean = self.actor(state)
    action_var = self.action_variance.expand_as(action_mean)
    cov_mat = torch.diag_embed(action_var).to(device)
    dist = MultivariateNormal(action_mean, cov_mat)
    if self.action_dim == 1:
      action = action.reshape(-1, self.action_dim)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy().mean() #entropy to allow exploration
    state_values = self.critic(state)
    return action_logprobs, state_values, dist_entropy

In [8]:
class RolloutBuffer:
  def __init__(self):
    self.actions = []
    self.states = []
    self.rewards = []
    self.log_probs = []
    self.entropy = []
    self.state_values = []
    self.is_terminals = []
  def clear(self):
    del self.actions[:]
    del self.states[:]
    del self.rewards[:]
    del self.log_probs[:]
    del self.entropy[:]
    del self.state_values[:]
    del self.is_terminals[:]


In [9]:
import os
import glob
import time
from datetime import datetime



In [16]:
class PPO:
  def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma,K_epochs ,eps_clip, action_std = 0.6):
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.gamma = gamma
    self.eps_clip = eps_clip
    self.action_std = action_std
    self.K_epochs = K_epochs
    self.entropy_coef = 0.01
    self.entropy_coef_start = 0.01
    self.entropy_coef_end = 0.001
    self.entropy_decay_steps = 3e6


    self.buffer = RolloutBuffer()  #create buffer instance
    self.policy = ActorCritic(state_dim, action_dim, action_std).to(device) #define policy network
    self.optimizer = torch.optim.Adam([
        {'params': self.policy.actor.parameters(), 'lr': self.lr_actor},
        {'params': self.policy.critic.parameters(), 'lr': self.lr_critic}
    ]) # set optimizers for both actor and critic networks
    self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device) #old policy (frozen) for computin loss
    self.policy_old.load_state_dict(self.policy.state_dict()) # save weights of current policy network to old policy

    self.MseLoss = nn.MSELoss() #define loss function for critic loss
  def set_action_std(self, new_action_std): #set new action standard deviation (exploration noise level)
    self.action_std = new_action_std
    self.policy.set_action_variance(new_action_std) #update action variance inside both policies
    self.policy_old.set_action_variance(new_action_std)

  def decay_action_std(self, action_std_decay_rate, min_action_std): # gradually decay the action standard deviation to reduce exploration over time
    self.action_std = self.action_std*0.9995 #reduce current std dev by decay rate
    self.action_std = round(self.action_std, 4)
    if(self.action_std < min_action_std): #ensure it doesnt go below minimum
      self.action_std = min_action_std
    print("setting action_std to ", self.action_std)
    self.policy.set_action_variance(self.action_std) #apply new std dev to current policy
  def select_action(self, state): # to select action based on action values and probs received
    with torch.no_grad(): #disable gradient compute
      state = torch.FloatTensor(state).to(device) #tensor of current state
      action, action_log_probs, state_values = self.policy_old.act(state) #get action values, probs and state value based on previous state
      self.buffer.states.append(state)
      self.buffer.actions.append(action)
      self.buffer.log_probs.append(action_log_probs)
      self.buffer.state_values.append(state_values) #store values in buffer for training

      return action.detach().cpu().numpy().flatten() #convert action tensor to numpy array, detach from computation graph, flatten to 1D array.

  def update(self):
    rewards = []
    discounted_reward = 0

    for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
        if is_terminal:
            discounted_reward = 0
        discounted_reward = reward + (self.gamma * discounted_reward)
        rewards.insert(0, discounted_reward)

    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)

    old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
    old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
    old_log_probs = torch.squeeze(torch.stack(self.buffer.log_probs, dim=0)).detach().to(device)
    old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)

    # GAE-Lambda Advantage Estimation
    advantages = []
    gae = 0
    next_value = 0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + self.gamma * next_value * (1 - int(self.buffer.is_terminals[t])) - old_state_values[t]
        gae = delta + self.gamma * 0.95 * (1 - int(self.buffer.is_terminals[t])) * gae
        advantages.insert(0, gae)
        next_value = old_state_values[t]
    advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

    # PPO update with mini-batch training
    batch_size = old_states.size(0)
    mini_batch_size = 512

    for _ in range(self.K_epochs):
        # Shuffle the indices
        indices = torch.randperm(batch_size)

        for start in range(0, batch_size, mini_batch_size):
            end = start + mini_batch_size
            mb_idx = indices[start:end]

            states = old_states[mb_idx]
            actions = old_actions[mb_idx]
            old_log_probs_mb = old_log_probs[mb_idx]
            advantages_mb = advantages[mb_idx]
            rewards_mb = rewards[mb_idx]
            old_state_values_mb = old_state_values[mb_idx]

            log_probs, state_values, dist_entropy = self.policy.evaluate(states, actions)
            state_values = torch.squeeze(state_values)

            ratios = torch.exp(log_probs - old_log_probs_mb.detach())
            surrogate1 = ratios * advantages_mb
            surrogate2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages_mb

            actor_loss = -torch.min(surrogate1, surrogate2)
            critic_loss = self.MseLoss(state_values, rewards_mb)

            current_entropy_coef = self.entropy_coef_start - (self.entropy_coef_start - self.entropy_coef_end) * min(time_step / self.entropy_decay_steps, 1.0)
            loss = actor_loss + 0.5 * critic_loss - current_entropy_coef * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=0.5)
            self.optimizer.step()

    self.policy_old.load_state_dict(self.policy.state_dict())
    self.buffer.clear()

    return actor_loss.mean().item(), critic_loss.mean().item(), dist_entropy.mean().item()

  def save(self, checkpoint_path):
    torch.save(self.policy_old.state_dict(), checkpoint_path)

  def laod(self, checkpoint_path):
    self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
    self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))

In [17]:
import os
env_name = "BipedalWalker-v3"
directory = directory = "PPO_preTrained" + '/' + env_name + '/'
checkpoint_path = directory + "PPO_{}_{}.pth".format(env_name, 42)
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

In [20]:
from math import log
env_name = "BipedalWalker-v3"
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

eps_clip = 0.2
K_epochs = 10
gamma = 0.99
lr_actor = 2.5e-4
lr_critic = 5e-4
random_seed = 13

action_std = 0.6 # Initialize action_std

log_dir = 'PPO_Bipelarwalk-v3_logs'
if not os.path.exists(log_dir):
  os.makedirs(log_dir)

log_dir = log_dir + '/' + env_name + '/'

if not os.path.exists(log_dir):
      os.makedirs(log_dir)

run_num = 0
current_num_files = next(os.walk(log_dir))[2]
run_num = len(current_num_files)

log_dir = log_dir + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
os.makedirs(log_dir)

log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

print("current logging run number for " + env_name + " : ", run_num)
print("logging at : " + log_f_name)

print("state space dimension : ", state_dim)
print("action space dimension : ", action_dim)
print("starting std of action distribution", action_std)

print('-' * 50)
print('PPO K epochs', K_epochs)
print('PPO eps clip', eps_clip)
print('PPO gamma', gamma)
print('PPO learning rate actor', lr_actor)
print('PPO learning rate critic', lr_critic)
print('random seed', random_seed)
print('-' * 50)

ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma,K_epochs, eps_clip, action_std)

start_time = datetime.now().strftime("%Y%m%d-%H%M%S")
print("started training at : ", start_time)
print("-" * 50)

log_f = open(log_f_name, "w+")
log_f.write('episode, timestpe, reward \n')

print_running_reward = 0
print_running_episodes = 0

log_running_reward = 0
log_running_episodes = 0

max_training_steps = int(10e6)
batch_size = 8192
time_step = 0
i_episode = 0

max_ep_len = 600
update_timestep = 2049
log_freq = max_ep_len * 2
print_freq = max_ep_len * 4

action_std_decay_rate = 0.0001
min_action_std = 0.1
save_model_freq = int(2e4)

action_std_decay_freq = 100000  # new
last_decay_timestep = 0


directory = directory = "PPO_preTrained" + '/' + env_name + '/'
checkpoint_path = directory + "PPO_{}_{}.pth".format(env_name, random_seed)

episode_rewards = []

while time_step <= max_training_steps:
  state, info = env.reset()
  current_ep_reward = 0

  for t in range(1, max_ep_len + 1):
    action = ppo_agent.select_action(state) #select action
    state, reward, done, truncated, info = env.step(action) #get next state, reward, if terminal from env
    ppo_agent.buffer.rewards.append(reward) #append reward to buffer
    ppo_agent.buffer.is_terminals.append(done) #append is terminal or not to buffer

    time_step += 1
    current_ep_reward += reward
    episode_rewards.append(reward)

    if len(ppo_agent.buffer.rewards) >= batch_size:
      actor_loss, critic_loss, entropy = ppo_agent.update()
      if time_step > 500000 and (time_step - last_decay_timestep) >= action_std_decay_freq:
        ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)
        last_decay_timestep = time_step
      avg_reward = np.mean(episode_rewards[-50:])  # or longer window
      print(f"[{time_step}] Avg reward (last 50 episodes): {avg_reward}")

    if time_step % log_freq == 0:
      log_average_reward = round(log_running_reward / log_running_episodes, 4)
      log_f.write('{},{},{}\n'.format(log_running_episodes, time_step, log_average_reward))
      log_f.flush()

      log_running_reward = 0
      log_running_episodes = 0

    if time_step % print_freq == 0:
      print_average_reward = round(print_running_reward / print_running_episodes, 2)
      print('episode : {}, timestep : {}, average reward : {}'.format(print_running_episodes, time_step, print_average_reward))
      print_running_reward = 0
      print_running_episodes = 0

    if time_step % save_model_freq == 0 and time_step != 0:
      ppo_agent.save(checkpoint_path)

    if done or truncated: # Also check for truncation
      break
  print_running_reward += current_ep_reward
  print_running_episodes += 1

  log_running_reward += current_ep_reward
  log_running_episodes += 1

  i_episode += 1

  # Decay action_std after each episode
  if time_step > 500000 and (time_step - last_decay_timestep) >= action_std_decay_freq:
    ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)
    last_decay_timestep = time_step

log_f.close()
env.close()
print("finished training at : ", datetime.now().strftime("%Y%m%d-%H%M%S"))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
episode : 6, timestep : 890400, average reward : -69.02999877929688
episode : 4, timestep : 892800, average reward : -73.52999877929688
[892928] Avg reward (last 50 episodes): -0.07683190703392029
episode : 6, timestep : 895200, average reward : -68.8499984741211
episode : 5, timestep : 897600, average reward : -67.12999725341797
episode : 6, timestep : 900000, average reward : -63.439998626708984
setting action_std to  0.5985
[901120] Avg reward (last 50 episodes): -0.08855648338794708
episode : 4, timestep : 902400, average reward : -66.44999694824219
episode : 4, timestep : 904800, average reward : -40.36000061035156
episode : 4, timestep : 907200, average reward : -40.79999923706055
[909312] Avg reward (last 50 episodes): -0.03156334534287453
episode : 4, timestep : 909600, average reward : -38.849998474121094
episode : 4, timestep : 912000, average reward : -43.54999923706055
episode : 5, timestep : 914400, average r

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Replace with the actual path to your log file
log_file = '/content/PPO_Bipelarwalk-v3_logs/BipedalWalker-v3/20250629-045736/PPO_BipedalWalker-v3_log_0.csv'

# Load the CSV
df = pd.read_csv(log_file)

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Plotting
plt.figure(figsize=(10, 6))
print(df.columns)
plt.plot(df['timestpe'], df['reward'], label='Average Episode Reward', color='blue')

# Smoothing using rolling average (optional)
df['smoothed'] = df['reward'].rolling(window=10).mean()
plt.plot(df['timestpe'], df['smoothed'], label='Smoothed Reward (window=10)', color='orange', linestyle='--')

plt.title('PPO Training Reward Progress on BipedalWalker-v3')
plt.xlabel('Timestep')
plt.ylabel('Reward')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()