In [27]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# 1. Import dependencies

In [28]:
!pip3 install stable-baselines3[extra]

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [29]:
!pip install gymnasium
!pip install gymnasium-robotics
!pip install gymnasium-robotics[mujoco-py]

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip 

In [30]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load Environment

In [31]:
environment_name = "FetchReachDense-v2"

In [32]:
# Creates an environment previously registered with gymnasium.register() or a EnvSpec.
env = gym.make(environment_name)
#gym.make??

In [33]:
# Resets the environment to the initial state, required before calling step. 
# Returns the first agent observation for an episode and information, i.e. metrics, debug info.
env.reset()

({'observation': array([ 1.34185486e+00,  7.49100508e-01,  5.34707205e-01,  2.00232294e-04,
          6.92377335e-05, -3.25336729e-06, -2.19655130e-09,  5.16581247e-06,
          4.76882452e-06, -2.31810359e-06]),
  'achieved_goal': array([1.34185486, 0.74910051, 0.5347072 ]),
  'desired_goal': array([1.37528771, 0.71790258, 0.60663199])},
 {})

In [34]:
# This method defines how many discrete/actions there are. 
# The Space object corresponding to valid actions, all valid actions should be contained within the space.
env.action_space

Box(-1.0, 1.0, (4,), float32)

In [35]:
# This is the method that allows you to view the observation space
env.observation_space

OrderedDict([('achieved_goal', array([-0.80570279,  0.33007362, -0.85485567])),
             ('desired_goal', array([ 0.17337401, -1.25829022,  0.22100899])),
             ('observation',
              array([ 0.41221713, -0.71711916,  1.26236457,  0.65946385,  1.28486468,
                      0.81318092,  1.0118798 ,  0.73813479, -0.97106976,  1.02243229]))])

### Run for loop of environment

In [37]:
# Closes the environment, which is important when external software is used, i.e. pygame for rendering, databases
env.close()

In [39]:
env = gym.make(environment_name,render_mode="human") # `render_mode` allows the environment to ve visualized.

In [40]:
# Create a loop to test our the reach environment
episodes = 3
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info, __ = env.step(action)
        score+=reward
        # print(score)
        if score < -100:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-100.15069724624391
Episode:2 Score:-100.08685220842337
Episode:3 Score:-100.07345328470538


# Understanding The Environment
https://robotics.farama.org/envs/fetch/reach/

In [41]:
# The action space is a Box(-1.0, 1.0, (4,), float32). An action represents the Cartesian displacement dx, dy, and dz 
# of the end effector. In addition to a last action that controls closing and opening of the gripper. This last action 
# is not required since there is no object to be manipulated, thus its value won’t generate any control output.
env.action_space.sample()

array([ 0.82578987, -0.4887287 ,  0.81608254, -0.30251455], dtype=float32)

In [42]:
# The observation is a goal-aware observation space. It consists of a dictionary with information about the robot’s end 
# effector state and goal. 
env.observation_space.sample()

OrderedDict([('achieved_goal', array([ 0.32617484, -0.17370267, -1.25221234])),
             ('desired_goal', array([-1.1180499 ,  0.77907649,  0.83331702])),
             ('observation',
              array([ 0.93202486, -0.51681035, -0.10157951, -1.14686444,  0.32856169,
                     -1.01855906,  0.36187152, -1.14371704,  0.27138729, -0.22074443]))])

# 3. Train an RL Model

In [43]:
# Make your directories first
log_path = os.path.join('Training', 'Logs')

In [44]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MultiInputPolicy', env, verbose = 1, tensorboard_log=log_path)
# PPO??

Using cpu device


In [62]:
# Train your model, defining the total time steps. 
model.learn(total_timesteps=200000)

Logging to Training/Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 398  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010254806 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.48       |
|    explained_variance   | 0.367       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0492      |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.0157     |
|    std                  | 0.951       |
|    value_loss           | 0.122       |
---

<stable_baselines3.ppo.ppo.PPO at 0x7fdb291195b0>

# 4. Save and Reload Model

In [19]:
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_Fetch_Reach_model')

In [20]:
model.save(PPO_path)

In [21]:
del model

In [22]:
model = PPO.load(PPO_path, env=env)

# 4. Evaluation

In [23]:
from stable_baselines3.common.evaluation import evaluate_policy

In [24]:
env = gym.make(environment_name,render_mode="human")
env = DummyVecEnv([lambda: env])
evaluate_policy(model, env, n_eval_episodes=6, render=True)



(-6.876124573250611, 1.488251813926538)

In [25]:
env.close()

# 5. Test Model

In [26]:
env = gym.make(environment_name,render_mode="human")
env = DummyVecEnv([lambda: env])
episodes = 10
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action,  _state = model.predict(obs) #NOW USING MODEL HERE!!!
        obs, reward, done, info = env.step(action)
        score+=reward

    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[-12.344315]


/home/strider/.local/lib/python3.10/site-packages/glfw/__init__.py:916: GLFWError: (65544) b'Wayland: Standard cursor shape unavailable'


Episode:2 Score:[-13.3492985]
Episode:3 Score:[-7.520219]
Episode:4 Score:[-8.934184]
Episode:5 Score:[-14.073304]
Episode:6 Score:[-10.518241]
Episode:7 Score:[-13.509146]
Episode:8 Score:[-6.328489]
Episode:9 Score:[-12.732246]
Episode:10 Score:[-11.644218]


### Reviewing the model methods

In [53]:
# model method api
model.predict??

In [55]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
obs = env.reset()

In [56]:
obs

array([[ 0.03617643, -0.01765439,  0.03399159, -0.02785905]],
      dtype=float32)

In [57]:
action, _states = model.predict(obs)

In [60]:
env.step(action)

(array([[ 0.03582334,  0.17696403,  0.03343441, -0.30962643]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

# 6. Viewing Logs in Tensorboard

In [78]:
training_log_path = os.path.join('Training', 'Logs', 'PPO_1')
# training_log_path

In [79]:
!tensorboard --logdir={training_log_path}

TensorFlow installation not found - running with reduced feature set.
/home/alan/.local/lib/python3.8/site-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /home/alan/.local/lib/python3.8/site-packages/tensorboard_data_server/bin/server)
/home/alan/.local/lib/python3.8/site-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /home/alan/.local/lib/python3.8/site-packages/tensorboard_data_server/bin/server)
/home/alan/.local/lib/python3.8/site-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /home/alan/.local/lib/python3.8/site-packages/tensorboard_data_server/bin/server)
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.14.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
