In [None]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# 1. Import dependencies

In [None]:
!pip3 install stable-baselines3[extra]

In [None]:
!pip install gymnasium
!pip install gymnasium-robotics
!pip install gymnasium-robotics[mujoco-py]

In [None]:
import os

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# 2. Load Environment

In [None]:
# Gymnasium environment id passed to every gym.make() call in this notebook.
environment_name = "FetchReachDense-v2"

In [None]:
# Creates an environment previously registered with gymnasium.register() or an EnvSpec.
# No render_mode is given here, so this instance is only used for inspection below.
env = gym.make(environment_name)
# Tip: run `gym.make??` in a scratch cell to read its source.

In [None]:
# Resets the environment to its initial state; required before calling step().
# Returns the first observation of the episode together with an info value
# (metrics / debug information).
env.reset()

In [None]:
# Attribute (not a method) holding the Space object that describes valid actions;
# every valid action must be contained within this space.
env.action_space

In [None]:
# Attribute holding the Space object that describes the observations the environment emits.
env.observation_space

### Run a for loop over the environment

In [None]:
# Closes the environment, which matters when external resources are in use,
# e.g. a rendering window or a database connection.
env.close()

In [None]:
env = gym.make(environment_name,render_mode="human") # `render_mode` allows the environment to be visualized.

In [None]:
# Roll out a few random-action episodes to sanity-check the reach environment.
episodes = 3
for episode in range(1, episodes+1):
    # reset() returns an (observation, info) pair.
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        # step() returns five values. The episode is over when it is either
        # terminated (task-defined end) or truncated (e.g. a step limit), so
        # both flags must feed `done`.
        n_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score+=reward
        # Extra safety net: abandon an episode whose return gets very low.
        if score < -100:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

# Understanding The Environment
https://robotics.farama.org/envs/fetch/reach/

In [None]:
# The action space is a Box(-1.0, 1.0, (4,), float32). The first three components are the
# Cartesian displacement dx, dy, dz of the end effector; the fourth controls opening and
# closing of the gripper. The gripper component is not needed here because there is no
# object to manipulate, so its value generates no control output.
env.action_space.sample()

In [None]:
# The observation space is goal-aware: a dictionary with information about the robot's
# end effector state and the goal. sample() draws a random element of that space.
env.observation_space.sample()

# 3. Train an RL Model

In [None]:
# Relative path used for training logs. os.path.join only builds the path string;
# it does not create the directory.
log_path = os.path.join('Training', 'Logs')

In [None]:
# Build the training environment through a self-contained factory (instead of a
# lambda that closes over the `env` name being rebound on the same line).
# Monitor records per-episode reward and length; when a VecEnv is handed to PPO
# directly, SB3 does not add this wrapper itself, so without it the episode
# statistics never reach the verbose output or the TensorBoard logs.
env = DummyVecEnv([lambda: Monitor(gym.make(environment_name))])
# 'MultiInputPolicy' is the policy class for dictionary observations.
model = PPO('MultiInputPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
# Train the model; total_timesteps is the number of environment steps to train for.
model.learn(total_timesteps=50000)

# 4. Save and Reload Model

In [None]:
# Relative path (without creating anything on disk) used to save and reload the model.
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_Fetch_Reach_model')

In [None]:
# Persist the trained model at PPO_path.
model.save(PPO_path)

In [None]:
# Drop the in-memory model so the next cell demonstrates restoring it from disk.
del model

In [None]:
# Reload the saved model from PPO_path and attach the current environment to it.
model = PPO.load(PPO_path, env=env)

# 4. Evaluation

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Evaluation environment with on-screen rendering, built through a self-contained
# factory. Monitor is added because evaluate_policy relies on it for accurate
# episode rewards/lengths and warns when the wrapper is missing.
env = DummyVecEnv([lambda: Monitor(gym.make(environment_name, render_mode="human"))])
# Returns (mean_reward, std_reward) over the evaluated episodes; left as the last
# expression so the notebook displays it.
evaluate_policy(model, env, n_eval_episodes=6, render=True)

In [None]:
# Close the evaluation environment before a new one is created for the test loop.
env.close()

# 5. Test Model

In [None]:
# Watch the trained policy act in a rendered, single-environment VecEnv.
env = DummyVecEnv([lambda: gym.make(environment_name, render_mode="human")])
episodes = 50
for episode in range(1, episodes+1):
    # A VecEnv's reset() returns only the batched observation.
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        # Use the policy's deterministic action, matching evaluate_policy's default,
        # so the displayed behaviour is comparable with the evaluation above.
        action, _state = model.predict(obs, deterministic=True)
        # VecEnv.step() returns batched arrays with one entry per sub-environment.
        obs, rewards, dones, info = env.step(action)
        # There is exactly one sub-environment: index it explicitly so `score`
        # is a plain float and `done` a plain bool rather than length-1 arrays.
        score += float(rewards[0])
        done = bool(dones[0])

    print('Episode:{} Score:{}'.format(episode, score))
env.close()

### Reviewing the model methods

In [None]:
# model method api
# IPython `??` introspection: shows the source of model.predict.
# Development aid only; it has no effect on the model.
model.predict??

# 6. Viewing Logs in Tensorboard

In [None]:
# Path of the log folder for one training run, under Training/Logs.
# NOTE(review): 'PPO_1' is hard-coded; presumably this is the folder written by the
# first training run and later runs use a different suffix. Confirm the folder
# name on disk before launching TensorBoard.
training_log_path = os.path.join('Training', 'Logs', 'PPO_1')

In [None]:
# Launch TensorBoard from the shell; IPython substitutes {training_log_path}
# with the value of the Python variable before running the command.
!tensorboard --logdir={training_log_path}