In [1]:
import gym 
from stable_baselines3 import PPO

In [86]:
# env = gym.make("CartPole-v1", render_mode="human")    # For rendering a video
env = gym.make("CartPole-v1")

model = PPO("MlpPolicy", env, verbose=1, device='cuda')
model.learn(total_timesteps=10000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.5     |
|    ep_rew_mean     | 21.5     |
| time/              |          |
|    fps             | 726      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 25.2       |
|    ep_rew_mean          | 25.2       |
| time/                   |            |
|    fps                  | 626        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00899452 |
|    clip_fraction        | 0.122      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.6

<stable_baselines3.ppo.ppo.PPO at 0x7fe246704280>

In [87]:
rewards = []      
episodes = 30

for episode in range(episodes):
    state = env.reset()[0]
    done = False
    episode_reward = 0

    while not done:
        action, _ = model.predict(state, deterministic=True)
        state, reward, done, t, _ = env.step(action)
        episode_reward += reward
        # env.render()

    rewards.append(episode_reward)

# env.close()

avg_reward = sum(rewards) / episodes
print(f"Average reward : {avg_reward}")


Average reward : 514.5666666666667


In [3]:
import numpy as np
from gym import RewardWrapper

class CustomCartPoleReward(RewardWrapper):
    def __init__(self, env):
        super(CustomCartPoleReward, self).__init__(env)

    def reward(self, reward):
        # Increase reward for keeping the pole upright and penalize for moving away from the center
        x, x_dot, theta, theta_dot = self.env.state
        new_reward = reward - np.abs(theta)  # Penalize for angle from upright
        return new_reward

In [88]:
# Initialize the custom environment
# custom_env = CustomCartPoleReward(gym.make("CartPole-v1", render_mode="human"))   # For rendering a video
custom_env = CustomCartPoleReward(gym.make("CartPole-v1"))

custom_model = PPO("MlpPolicy", custom_env, verbose=1, device='cuda')
custom_model.learn(total_timesteps=10000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.2     |
|    ep_rew_mean     | 21.5     |
| time/              |          |
|    fps             | 715      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 27.6        |
|    ep_rew_mean          | 25.5        |
| time/                   |             |
|    fps                  | 593         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009799795 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.2         |
|    entropy_loss  

<stable_baselines3.ppo.ppo.PPO at 0x7fe246706830>

In [89]:
custom_rewards = []
episodes = 30

for episode in range(episodes):
    state = custom_env.reset()[0]
    done = False
    episode_reward = 0

    while not done:
        action, _ = custom_model.predict(state, deterministic=True)
        state, reward, done, t, _ = custom_env.step(action)
        episode_reward += reward
        # custom_env.render()

    custom_rewards.append(episode_reward)

# custom_env.close()

avg_custom_reward = sum(custom_rewards) / episodes
print(f"Average reward with custom reward function: {avg_custom_reward}")

Average reward with custom reward function: 772.5772681648491


In [25]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch as th
from torch import nn
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed

class CustomMLP(nn.Module):
    def __init__(self):
        super(CustomMLP, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(4, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        return self.network(x)
    
# Modify the policy architecture
model2 = PPO("MlpPolicy", custom_env, policy_kwargs=dict(activation_fn=th.nn.ReLU, net_arch=[128, 128]), verbose=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [33]:
custom_env = gym.make("CartPole-v1")
# custom_env = CustomCartPoleReward(gym.make("CartPole-v1"))
# custom_env = CustomCartPoleReward(gym.make("CartPole-v1", render_mode='human')) # For video

model2.learn(total_timesteps=10000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 491      |
| time/              |          |
|    fps             | 789      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 490          |
| time/                   |              |
|    fps                  | 682          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0044079837 |
|    clip_fraction        | 0.0527       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.45        |
|    explained_variance   | 0.741        |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x7f0bd8b68d30>

In [32]:

rewards = []
episodes = 30

for episode in range(episodes):
    state = custom_env.reset()[0]
    done = False
    episode_reward = 0

    while not done:
        action, _ = model2.predict(state, deterministic=True)
        state, reward, done, t, _ = custom_env.step(action)
        episode_reward += reward
        # custom_env.render()

    rewards.append(episode_reward)

# custom_env.close()

avg_reward = sum(rewards) / episodes
print(f"Average reward : {avg_reward}")

Average reward : 1493.8


A custom, complex neural network architecture does not necessarily improve performance. In this example, the model trained with the custom network didn't learn to keep the cart in the center of the screen, so while it balanced the pole well, it got a lower reward on average by falling into this rut.