In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
environment_name= 'CartPole-v1'
env=gym.make(environment_name, render_mode="human")

In [3]:
environment_name

'CartPole-v1'

In [4]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info, _= env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()   

Episode:1 Score:18.0
Episode:2 Score:21.0
Episode:3 Score:55.0
Episode:4 Score:11.0
Episode:5 Score:34.0


# Understanding the Environment

In [5]:
env.action_space

Discrete(2)

In [7]:
env.action_space.sample()

1

In [8]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [9]:
env.observation_space.sample()

array([-1.0296053e+00,  2.1191183e+38, -2.6319033e-01, -8.4887147e+37],
      dtype=float32)

# Training

In [11]:
log_path = os.path.join('Training', 'Logs')

In [12]:
log_path

'Training\\Logs'

In [14]:
env=gym.make(environment_name)
env= DummyVecEnv([lambda : env])
model= PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [15]:
PPO??

In [16]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 1689 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1199         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0071994443 |
|    clip_fraction        | 0.0691       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.00542     |
|    learning_rate        | 0.0003       |
|    loss                 | 7.39         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0102      |
|    value_loss           | 51.4         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x21df900b6a0>

In [18]:
# Saving the model
PPO_Path= os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')
model.save(PPO_Path)

In [None]:
#MODEL=ppo.load(PPO_Path, env)

# Evaluation

In [22]:
evaluate_policy(model, env, n_eval_episodes=10,render=True)

(461.4, 62.47751595574203)

# Testing

In [25]:
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info, = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()   

Episode:1 Score:[260.]
Episode:2 Score:[484.]
Episode:3 Score:[500.]
Episode:4 Score:[232.]
Episode:5 Score:[379.]
