# CartPole_RewardShaping

In [1]:
import math
from typing import Optional, Tuple, Union

import numpy as np
import pandas as pd

import gymnasium as gym
from gymnasium import logger, spaces
from gymnasium.envs.classic_control import utils
from gymnasium.error import DependencyNotInstalled
from gymnasium.experimental.vector import VectorEnv
from gymnasium.vector.utils import batch_space
from gymnasium.wrappers import TimeLimit

# CartPole_RewardShaping Environment

In [2]:
class CartPoleEnv_RewardShaping(gym.Env[np.ndarray, Union[int, np.ndarray]]):
    """Cart-pole environment with an extra "no force" action and a shaped reward.

    Observation: ``np.float32`` array ``[x, x_dot, theta, theta_dot]``.

    Actions (``spaces.Discrete(3)``):
        0 -- apply ``-force_mag`` to the cart
        1 -- apply ``+force_mag`` to the cart
        2 -- apply no force

    Reward:
        * non-terminal step: ``1.0``, reduced by ``0.5`` when a force was
          applied (action 0 or 1);
        * the step on which the episode first terminates: ``1.0`` (the
          force penalty is not applied on that step);
        * any further step taken after termination without a reset: ``0.0``.

    The episode terminates when ``x`` leaves ``[-x_threshold, x_threshold]``
    or ``theta`` leaves ``[-theta_threshold_radians, theta_threshold_radians]``.
    ``step`` always reports ``truncated=False``; a step limit has to come
    from an outer wrapper.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 50,
    }

    def __init__(self, render_mode: Optional[str] = None):
        """Set the physical constants, spaces and rendering state.

        Args:
            render_mode: ``"human"``, ``"rgb_array"`` or ``None`` (no rendering).
        """
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self.kinematics_integrator = "euler"

        # Angle at which to fail the episode (12 degrees, expressed in radians)
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        # Angle limit set to 2 * theta_threshold_radians so failing observation
        # is still within bounds.
        high = np.array(
            [
                self.x_threshold * 2,
                np.finfo(np.float32).max,
                self.theta_threshold_radians * 2,
                np.finfo(np.float32).max,
            ],
            dtype=np.float32,
        )

        # Three actions instead of the previous Discrete(2): the extra
        # action (2) applies no force, see step().
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)

        self.render_mode = render_mode

        self.screen_width = 600
        self.screen_height = 400
        self.screen = None
        self.clock = None
        self.isopen = True
        self.state = None

        # None while the episode is running; counts steps taken after
        # termination once the episode has ended.
        self.steps_beyond_terminated = None

    def step(self, action):
        """Advance the simulation by one time step of length ``tau``.

        Args:
            action: 0 (force ``-force_mag``), 1 (force ``+force_mag``) or
                2 (no force).

        Returns:
            A tuple ``(observation, reward, terminated, truncated, info)``
            where ``truncated`` is always ``False`` and ``info`` is an
            empty dict.

        Raises:
            AssertionError: if ``action`` is not in the action space or
                ``reset`` has not been called yet.
        """
        assert self.action_space.contains(
            action
        ), f"{action!r} ({type(action)}) invalid"
        assert self.state is not None, "Call reset before using step method."
        x, x_dot, theta, theta_dot = self.state

        # Map the discrete action to the force applied to the cart.
        if action == 2:
            force = 0
        elif action == 1:
            force = self.force_mag
        else:
            force = -self.force_mag

        costheta = math.cos(theta)
        sintheta = math.sin(theta)

        # For the interested reader:
        # https://coneural.org/florian/papers/05_cart_pole.pdf
        temp = (
            force + self.polemass_length * theta_dot**2 * sintheta
        ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)
        )
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        if self.kinematics_integrator == "euler":
            # Positions are advanced with the velocities from the previous step.
            x = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else:  # semi-implicit euler
            # Velocities are updated first and then used for the positions.
            x_dot = x_dot + self.tau * xacc
            x = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot

        self.state = (x, x_dot, theta, theta_dot)

        terminated = bool(
            x < -self.x_threshold
            or x > self.x_threshold
            or theta < -self.theta_threshold_radians
            or theta > self.theta_threshold_radians
        )

        if not terminated:
            reward = 1.0
            # Reward shaping: applying a force costs half of the step reward.
            if action == 0 or action == 1:
                reward -= 0.5
        elif self.steps_beyond_terminated is None:
            # Pole just fell! The terminating step is rewarded without the
            # force penalty.
            self.steps_beyond_terminated = 0
            reward = 1.0
        else:
            if self.steps_beyond_terminated == 0:
                logger.warn(
                    "You are calling 'step()' even though this "
                    "environment has already returned terminated = True. You "
                    "should always call 'reset()' once you receive 'terminated = "
                    "True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_terminated += 1
            reward = 0.0

        if self.render_mode == "human":
            self.render()
        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ):
        """Start a new episode from a small random state.

        Args:
            seed: forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: passed to ``utils.maybe_parse_reset_bounds`` together
                with the default bounds ``-0.05`` and ``0.05``.

        Returns:
            A tuple ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed)
        # Note that if you use custom reset bounds, it may lead to out-of-bound
        # state/observations.
        low, high = utils.maybe_parse_reset_bounds(
            options, -0.05, 0.05  # default low, default high
        )
        self.state = self.np_random.uniform(low=low, high=high, size=(4,))
        self.steps_beyond_terminated = None

        if self.render_mode == "human":
            self.render()
        return np.array(self.state, dtype=np.float32), {}

    def render(self):
        """Draw the current state with pygame.

        Returns:
            ``None`` when ``render_mode`` is ``None`` or ``"human"``, or when
            no state exists yet.  In ``"rgb_array"`` mode, the screen pixels
            as an array transposed with ``axes=(1, 0, 2)``.

        Raises:
            DependencyNotInstalled: if pygame cannot be imported.
        """
        if self.render_mode is None:
            # An environment instantiated directly (rather than created by
            # gym.make) has no spec; previously this path failed on an
            # assertion instead of emitting the warning.
            if self.spec is not None:
                example = f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
            else:
                example = f'e.g. {type(self).__name__}(render_mode="rgb_array")'
            gym.logger.warn(
                "You are calling render method without specifying any render mode. "
                "You can specify the render_mode at initialization, "
                + example
            )
            return

        try:
            import pygame
            from pygame import gfxdraw
        except ImportError as e:
            raise DependencyNotInstalled(
                "pygame is not installed, run `pip install gymnasium[classic-control]`"
            ) from e

        # Lazily create the drawing target on first use.
        if self.screen is None:
            pygame.init()
            if self.render_mode == "human":
                pygame.display.init()
                self.screen = pygame.display.set_mode(
                    (self.screen_width, self.screen_height)
                )
            else:  # mode == "rgb_array"
                self.screen = pygame.Surface((self.screen_width, self.screen_height))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        world_width = self.x_threshold * 2
        scale = self.screen_width / world_width  # pixels per world unit
        polewidth = 10.0
        polelen = scale * (2 * self.length)
        cartwidth = 50.0
        cartheight = 30.0

        if self.state is None:
            return None

        state = self.state

        self.surf = pygame.Surface((self.screen_width, self.screen_height))
        self.surf.fill((255, 255, 255))

        # Cart: a black rectangle centred on (cartx, carty).
        l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
        axleoffset = cartheight / 4.0
        cartx = state[0] * scale + self.screen_width / 2.0  # MIDDLE OF CART
        carty = 100  # TOP OF CART
        cart_coords = [(l, b), (l, t), (r, t), (r, b)]
        cart_coords = [(c[0] + cartx, c[1] + carty) for c in cart_coords]
        gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0))
        gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0))

        # Pole: a rectangle rotated by -theta around the axle.
        l, r, t, b = (
            -polewidth / 2,
            polewidth / 2,
            polelen - polewidth / 2,
            -polewidth / 2,
        )

        pole_coords = []
        for coord in [(l, b), (l, t), (r, t), (r, b)]:
            coord = pygame.math.Vector2(coord).rotate_rad(-state[2])
            coord = (coord[0] + cartx, coord[1] + carty + axleoffset)
            pole_coords.append(coord)
        gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101))
        gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101))

        # Axle: a small circle where the pole is attached to the cart.
        gfxdraw.aacircle(
            self.surf,
            int(cartx),
            int(carty + axleoffset),
            int(polewidth / 2),
            (129, 132, 203),
        )
        gfxdraw.filled_circle(
            self.surf,
            int(cartx),
            int(carty + axleoffset),
            int(polewidth / 2),
            (129, 132, 203),
        )

        # Track line at the cart's height.
        gfxdraw.hline(self.surf, 0, self.screen_width, carty, (0, 0, 0))

        # The scene is flipped vertically before being copied to the screen.
        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
        if self.render_mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            pygame.display.flip()

        elif self.render_mode == "rgb_array":
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
            )

    def close(self):
        """Shut pygame down if a screen was ever created and mark the env closed."""
        if self.screen is not None:
            import pygame

            pygame.display.quit()
            pygame.quit()
            self.isopen = False

# CartPole_RewardShaping Learning

In [3]:
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.logger import configure
from stable_baselines3 import PPO
import os

In [4]:
# Top-level output directories (relative to the working directory) for
# model checkpoints, training logs and evaluation results.
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

result_dir = "results"
os.makedirs(result_dir, exist_ok=True)

In [5]:
# Per-experiment sub-directories for the RewardShaping run.
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
model_rewardshaping_dir = f"{model_dir}/RewardShaping"
os.makedirs(model_rewardshaping_dir, exist_ok=True)

log_rewardshaping_dir = f"{log_dir}/RewardShaping"
os.makedirs(log_rewardshaping_dir, exist_ok=True)

result_rewardshaping_dir = f"{result_dir}/RewardShaping"
os.makedirs(result_rewardshaping_dir, exist_ok=True)

# Logger writing to stdout, a CSV file and TensorBoard under the log directory.
logger_rewardshaping = configure(log_rewardshaping_dir, ["stdout", "csv", "tensorboard"])

Logging to logs/RewardShaping


In [6]:
# Training environment: the custom cart-pole wrapped in a TimeLimit
# configured with max_episode_steps=500.
env_train_rewardshaping = TimeLimit(
    CartPoleEnv_RewardShaping(),
    max_episode_steps=500,
)

In [7]:
# PPO agent with an MLP policy, reporting through the logger configured above.
model_rewardshaping = PPO(
    policy="MlpPolicy",
    env=env_train_rewardshaping,
    verbose=1,
)
model_rewardshaping.set_logger(logger_rewardshaping)

# Save a checkpoint into the model directory every 2048 callback calls,
# then train for 100000 timesteps.
checkpoint_callback = CheckpointCallback(
    save_freq=2048,
    save_path=model_rewardshaping_dir,
)
model_rewardshaping.learn(total_timesteps=100000, callback=checkpoint_callback)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.3     |
|    ep_rew_mean     | 16.4     |
| time/              |          |
|    fps             | 2133     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 25.4         |
|    ep_rew_mean          | 17.5         |
| time/                   |              |
|    fps                  | 1457         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0067037703 |
|    clip_fraction        | 0.0692       |
|    clip_range           | 0.2          |
|    en

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 159         |
|    ep_rew_mean          | 112         |
| time/                   |             |
|    fps                  | 1141        |
|    iterations           | 11          |
|    time_elapsed         | 19          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.014363999 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.908      |
|    explained_variance   | 0.93        |
|    learning_rate        | 0.0003      |
|    loss                 | 2.16        |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0115     |
|    value_loss           | 6.05        |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 177 

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 326         |
|    ep_rew_mean          | 263         |
| time/                   |             |
|    fps                  | 1127        |
|    iterations           | 21          |
|    time_elapsed         | 38          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011327798 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.539      |
|    explained_variance   | 0.877       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.883       |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0129     |
|    value_loss           | 2.35        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 339   

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 425        |
|    ep_rew_mean          | 384        |
| time/                   |            |
|    fps                  | 1121       |
|    iterations           | 31         |
|    time_elapsed         | 56         |
|    total_timesteps      | 63488      |
| train/                  |            |
|    approx_kl            | 0.01971127 |
|    clip_fraction        | 0.0926     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.203     |
|    explained_variance   | 0.855      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.104      |
|    n_updates            | 300        |
|    policy_gradient_loss | -0.00712   |
|    value_loss           | 0.12       |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 429         |
|    ep_rew_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 489          |
|    ep_rew_mean          | 466          |
| time/                   |              |
|    fps                  | 1119         |
|    iterations           | 41           |
|    time_elapsed         | 75           |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0006864169 |
|    clip_fraction        | 0.00903      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.0775      |
|    explained_variance   | 0.438        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0276       |
|    n_updates            | 400          |
|    policy_gradient_loss | -0.00153     |
|    value_loss           | 0.0292       |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

<stable_baselines3.ppo.ppo.PPO at 0x21277b677c0>

In [8]:
# Training is finished: close the training environment.
env_train_rewardshaping.close()

In [9]:
# Load the training progress written by the CSV logger. The path is built
# from log_rewardshaping_dir so that it matches the directory that was
# actually created ("RewardShaping"); a hard-coded lower-case path fails on
# case-sensitive filesystems.
model_learning_data_rewardshaping = pd.read_csv(f"{log_rewardshaping_dir}/progress.csv")

# Row with the highest mean episode reward during training.
best_row_rewardshaping = model_learning_data_rewardshaping["rollout/ep_rew_mean"].idxmax()

# Checkpoint files are named after the timestep count at which they were
# saved, so use the timestep count logged for the best row instead of
# assuming a fixed number of steps per logged row.
model_id_best_rewardshaping = int(
    model_learning_data_rewardshaping.loc[best_row_rewardshaping, "time/total_timesteps"]
)
model_path_best_rewardshaping = f"{model_rewardshaping_dir}/rl_model_{model_id_best_rewardshaping}_steps.zip"
model_best_rewardshaping = PPO.load(model_path_best_rewardshaping)

# CartPole_RewardShaping Evaluation

In [10]:
# Evaluation environment: a fresh custom cart-pole wrapped in a TimeLimit
# configured with max_episode_steps=500.
env_test_rewardshaping = TimeLimit(
    CartPoleEnv_RewardShaping(),
    max_episode_steps=500,
)

In [11]:
# Flat log of every action taken, plus one total reward and one step count
# per evaluation episode.
actions_rewardshaping, rewards_rewardshaping, steps_rewardshaping = [], [], []

for i in range(100):
    # Episode i is reset with its own fixed seed (100 + i).
    score = step = 0
    terminated = truncated = False
    observation, info = env_test_rewardshaping.reset(seed=100 + i)

    # Run until the environment reports termination or truncation.
    while not (terminated or truncated):
        # NOTE(review): predict() is called with its default arguments;
        # confirm whether deterministic action selection was intended here.
        action = model_best_rewardshaping.predict(observation)[0]
        observation, reward, terminated, truncated, info = env_test_rewardshaping.step(action)

        actions_rewardshaping.append(int(action))
        step += 1
        score += reward

    steps_rewardshaping.append(step)
    rewards_rewardshaping.append(score)

In [12]:
# Evaluation is finished: close the evaluation environment.
env_test_rewardshaping.close() 

In [13]:
# Write each collected evaluation series to its own text file in the
# results directory, in the order rewards, steps, actions.
for series_name, series_values in (
    ("rewards_rewardshaping", rewards_rewardshaping),
    ("steps_rewardshaping", steps_rewardshaping),
    ("actions_rewardshaping", actions_rewardshaping),
):
    np.savetxt(f"{result_rewardshaping_dir}/{series_name}.txt", series_values)