In [None]:
# Requirements
import gymnasium as gym
from stable_baselines3 import PPO

# Environment

In [None]:
# Creating the environment
gym_env = gym.make("CartPole-v1", render_mode="human")
observation, info = gym_env.reset()

# Testing the environment
for ep in range(1, 11):

    print(f"Episode #{ep}")

    # Resetting episode's variables
    observation, info = gym_env.reset()
    episode_over = False
    score = 0

    while not episode_over:
        # Random action
        action = gym_env.action_space.sample()
        # Information after the random action has been applied
        observation, reward, terminated, truncated, _ = gym_env.step(action)
        score += reward
        episode_over = terminated or truncated

    print(f"Episode #{ep} Score: {score}")


Episode #1
Episode #1 Score: 27.0
Episode #2
Episode #2 Score: 35.0
Episode #3
Episode #3 Score: 52.0
Episode #4
Episode #4 Score: 11.0
Episode #5
Episode #5 Score: 27.0
Episode #6
Episode #6 Score: 20.0
Episode #7
Episode #7 Score: 35.0
Episode #8
Episode #8 Score: 22.0
Episode #9
Episode #9 Score: 18.0
Episode #10
Episode #10 Score: 12.0


# Training the Model

In [3]:
model = PPO("MlpPolicy", gym_env, verbose = 1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [4]:
STEPS = 50_000

observation, info = gym_env.reset()

model.learn(total_timesteps=STEPS)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.9     |
|    ep_rew_mean     | 22.9     |
| time/              |          |
|    fps             | 46       |
|    iterations      | 1        |
|    time_elapsed    | 43       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.9        |
|    ep_rew_mean          | 26.9        |
| time/                   |             |
|    fps                  | 46          |
|    iterations           | 2           |
|    time_elapsed         | 88          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009422157 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00223    |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x2344ffc3800>

# Model Evaluation

In [None]:
observation, info = gym_env.reset()

# Testing the model in 10 episodes
for ep in range(1, 11):

    print(f"Episode #{ep}")

    # Resetting episode's variables
    observation, info = gym_env.reset()
    episode_over = False
    score = 0

    while not episode_over:
        # Model's action prediction based on observation
        action, _ = model.predict(observation, deterministic=True)
        # Information after the predicted action has been applied
        observation, reward, terminated, truncated, _ = gym_env.step(action)
        score += reward
        episode_over = terminated or truncated

    print(f"Episode #{ep} Score: {score}")

Episode #1
Episode #1 Score: 500.0
Episode #2
Episode #2 Score: 500.0
Episode #3
Episode #3 Score: 500.0
Episode #4
Episode #4 Score: 500.0
Episode #5
Episode #5 Score: 500.0
Episode #6
Episode #6 Score: 500.0
Episode #7
Episode #7 Score: 500.0
Episode #8
Episode #8 Score: 500.0
Episode #9
Episode #9 Score: 500.0
Episode #10
Episode #10 Score: 500.0


# Saving

In [None]:
# Saving the model's weights
model_name = "PPO_carpole-v1_" + str(STEPS) + "steps"
model.save(model_name)

# Loading

In [None]:
# Loading the model
model = PPO.load(model_name)

# Testing 

In [None]:
observation, info = gym_env.reset()

# Testing the model in 10 episodes
for ep in range(1, 11):

    print(f"Episode #{ep}")

    # Resetting episode's variables
    observation, info = gym_env.reset()
    episode_over = False
    score = 0

    while not episode_over:
        # Model's action prediction based on observation
        action, _ = model.predict(observation, deterministic=True)
        # Information after the predicted action has been applied
        observation, reward, terminated, truncated, _ = gym_env.step(action)
        score += reward
        episode_over = terminated or truncated

    print(f"Episode #{ep} Score: {score}")

Episode #1
Episode #1 Score: 500.0
Episode #2
Episode #2 Score: 500.0
Episode #3
Episode #3 Score: 500.0
Episode #4
Episode #4 Score: 500.0
Episode #5
Episode #5 Score: 500.0
Episode #6
Episode #6 Score: 500.0
Episode #7
Episode #7 Score: 500.0
Episode #8
Episode #8 Score: 500.0
Episode #9
Episode #9 Score: 500.0
Episode #10
Episode #10 Score: 500.0


In [None]:
gym_env.close()