In [1]:
!pip install stable_baselines3



In [2]:
!pip install box2d



In [3]:
!pip install 'shimmy>=2.0'



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import gymnasium as gym
import numpy as np
import torch
import imageio
import os
import multiprocessing
import stable_baselines3
from gymnasium import spaces
from stable_baselines3 import DQN
from IPython.display import HTML
from base64 import b64encode
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize

In [6]:
def evaluate_model(model, env, n_eval_episodes=10):
    """Evaluate the model on a given environment and return mean and std rewards."""
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes)
    return mean_reward, std_reward

# Discretizing the Action Space

In [7]:
class DiscretizedBipedalWalker(gym.Wrapper):
    def __init__(self, env, num_bins=5):
        super(DiscretizedBipedalWalker, self).__init__(env)
        self.num_bins = num_bins

        # Create a discrete action space with `num_bins^4` possible actions (since action space has 4 dimensions)
        self.action_space = spaces.Discrete(num_bins**4)

        # Create bin edges for each action dimension
        self.action_bins = np.linspace(-1, 1, num_bins)  # Binning the continuous range [-1, 1]

    def step(self, action):
        # Convert discrete action into 4D continuous action
        action_indices = np.unravel_index(action, (self.num_bins,) * 4)
        continuous_action = np.array([self.action_bins[i] for i in action_indices], dtype=np.float32)
        return self.env.step(continuous_action)

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)

# Function Def

In [8]:
def train_bipedalwalker(n_env, bs, lr):
    # Create the discretized environment & Wrap in vectorized env
    env_bipedal = make_vec_env(lambda: DiscretizedBipedalWalker(gym.make("BipedalWalker-v3"), num_bins=5), n_envs=n_env)
    log = "/content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv"+str(n_env)+"B"+str(bs)+"LR"+str(lr)+"/"
    model_b = DQN("MlpPolicy", env_bipedal, learning_rate= lr, buffer_size=1000000, learning_starts=10000, batch_size= bs, tensorboard_log= log,
                  tau=0.005, gamma=0.99, train_freq=(4, "step"), gradient_steps=1, target_update_interval=1000, verbose=1,
                  exploration_fraction=0.2, exploration_final_eps=0.02, exploration_initial_eps=1.0, policy_kwargs=dict(net_arch=[256, 256]))

    eval_freq = 10000  # Evaluate every 10,000 steps
    n_eval_episodes = 10  # Number of episodes per evaluation

    for step in range(1, 200001, eval_freq):
        # Train the model
        model_b.learn(total_timesteps=eval_freq, reset_num_timesteps=False)

        # Evaluate the model on BipedalWalker
        mean_reward, std_reward = evaluate_model(model_b, env_bipedal, n_eval_episodes)

        # Print evaluation results for BipedalWalker
        print(f"BipedalWalker - Step: {step} | Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")

    # Save the trained model for BipedalWalker
    model_b.save(log)

# 1. env = 1, batch size= 64, lr = 1e-4

In [9]:
train_bipedalwalker(1, 64, 1e-4)

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv1B64LR0.0001/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 833      |
|    ep_rew_mean      | -104     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3444     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3332     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.03e+03 |
|    ep_rew_mean      | -107     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3451     |
|    time_elapsed     | 2        |
|    total_timesteps  | 8209     |
----------------------------------
BipedalWalker - Step: 1 | Mean Reward: -87.93 ± 1.37
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv1B64LR0.0001/



# 2. env = 1, batch size= 64, lr = 3e-4

In [10]:
train_bipedalwalker(1, 64, 3e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv1B64LR0.0003/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 829      |
|    ep_rew_mean      | -103     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3402     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3316     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 830      |
|    ep_rew_mean      | -105     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3416     |
|    time_elapsed     | 1        |
|    total_timesteps  | 6638     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 830      |
|    ep_rew_mean      | -



# 3. env = 1, batch size= 128, lr = 1e-4

In [11]:
train_bipedalwalker(1, 128, 1e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv1B128LR0.0001/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 834      |
|    ep_rew_mean      | -102     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3402     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3334     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 618      |
|    ep_rew_mean      | -120     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3385     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4942     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 562      |
|    ep_rew_mean      | 



# 4. env = 1, batch size= 128, lr = 3e-4

In [12]:
train_bipedalwalker(1, 128, 3e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv1B128LR0.0003/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 86.2     |
|    ep_rew_mean      | -106     |
|    exploration_rate | 0.831    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2202     |
|    time_elapsed     | 0        |
|    total_timesteps  | 345      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 396      |
|    ep_rew_mean      | -133     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2647     |
|    time_elapsed     | 1        |
|    total_timesteps  | 3165     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 412      |
|    ep_rew_mean      | 



# 5. env = 4, batch size= 64, lr = 1e-4

In [13]:
train_bipedalwalker(4, 64, 1e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv4B64LR0.0001/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 71.2     |
|    ep_rew_mean      | -110     |
|    exploration_rate | 0.757    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3210     |
|    time_elapsed     | 0        |
|    total_timesteps  | 496      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 664      |
|    ep_rew_mean      | -109     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4120     |
|    time_elapsed     | 1        |
|    total_timesteps  | 6896     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 604      |
|    ep_rew_mean      | -



# 6. env = 4, batch size= 64, lr = 3e-4

In [14]:
train_bipedalwalker(4, 64, 3e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv4B64LR0.0003/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 65       |
|    ep_rew_mean      | -109     |
|    exploration_rate | 0.796    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3159     |
|    time_elapsed     | 0        |
|    total_timesteps  | 416      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 69.1     |
|    ep_rew_mean      | -110     |
|    exploration_rate | 0.526    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3539     |
|    time_elapsed     | 0        |
|    total_timesteps  | 968      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 323      |
|    ep_rew_mean      | -



# 7. env = 4, batch size= 128, lr = 1e-4

In [15]:
train_bipedalwalker(4, 128, 1e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv4B128LR0.0001/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.25e+03 |
|    ep_rew_mean      | -107     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4109     |
|    time_elapsed     | 1        |
|    total_timesteps  | 6400     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 855      |
|    ep_rew_mean      | -110     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4118     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7188     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 598      |
|    ep_rew_mean      | 



# 8. env = 4, batch size= 128, lr = 3e-4

In [16]:
train_bipedalwalker(4, 128, 3e-4)

Using cpu device
Logging to /content/drive/MyDrive/RL_models/dqn_bipedalwalkerEnv4B128LR0.0003/DQN_0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 71.5     |
|    ep_rew_mean      | -108     |
|    exploration_rate | 0.769    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3184     |
|    time_elapsed     | 0        |
|    total_timesteps  | 472      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 80.4     |
|    ep_rew_mean      | -109     |
|    exploration_rate | 0.414    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3420     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1196     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 207      |
|    ep_rew_mean      | 

