In [None]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# 1. Import dependencies

In [1]:
!pip install stable-baselines3[extra]
!pip install pyglet==1.5.27





In [2]:
import os

import gym 
from stable_baselines3 import PPO #ALGORITHM
from stable_baselines3.common.vec_env import DummyVecEnv #this allows vectorization which allows multiple training at the same time
from stable_baselines3.common.evaluation import evaluate_policy #makes it easier to test how the environment is performence, shows standard deviation and reward

## Simulated vs Real Environments

It is important to consider the agent, policies, and environment.
The environment can be either real or simulated. Simulated environments give you the ability to trial and train a model in a safe AND cost effective manner

## OpenAI Gym 
Gives an easy way to build environments for training RL agents

OpenAI Gym environments are described in terms of Spaces:

Box - n dimensional tensor, range of values. E.g. Box (0,1,shape = (3,3))
(low value, high value, shape of the space)

Discrete - set of items
E.g. Discrete (3) - gives 0, 1, 2. Typically associated with actions

Tuple - tuple of other spaces, e.g. Box or Discrete
E.g. Tuple ((Discrete(2), Box(0,100,shape = (1,))))

Dict - Dictionary of spaces e.g. Box or Discrete
E.g. Dict ({'height': Discrete(2), "speed":Box(0,100, shape =(1,))})

MultiBinary - vector of binary values (similar to a one-hot encoding, but any number of positions may be 1)
E.g. MultiBinary(4) - a list of 4 positions (0,1,2,3), each holding either 0 or 1

MultiDiscrete - multiple discrete values
E.g. MultiDiscrete ([5,2,2])

# 2. Load Environment

In [3]:
#maps to the preinstalled OpenAI Gym Environment
# ID of the pre-installed OpenAI Gym environment used throughout the notebook
environment_name = "CartPole-v0"

In [4]:
# Instantiate the Gym environment selected by environment_name
env = gym.make(environment_name)

In [5]:
# Smoke-test the environment by playing a few episodes with purely random actions

episodes = 5  # number of random-play episodes to run
for episode in range(1, episodes + 1):  # episode numbers run from 1 to `episodes`
    state = env.reset()  # fresh initial observation; later this is what an agent would act on
    score = 0  # cumulative reward for this episode

    while True:
        env.render()  # draw the current frame
        action = env.action_space.sample()  # pick a random action
        n_state, reward, done, info = env.step(action)  # advance the environment by one step
        score += reward
        if done:  # episode finished
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:15.0
Episode:2 Score:11.0
Episode:3 Score:23.0
Episode:4 Score:13.0
Episode:5 Score:63.0


In [None]:
# Apply action 1 and display what step() returns: (observation, reward, done, info)
env.step(1)

In [None]:
# Show the action space; it reports 2 because the agent can take only two actions
env.action_space

# Understanding The Environment
https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [None]:
# Draw a random action: 0 pushes the cart to the left, 1 pushes it to the right
env.action_space.sample()

In [None]:
# Draw a random observation: [cart position, cart velocity, pole angle, pole angular velocity]
env.observation_space.sample()

# 3. Train an RL Model

## Types of Algorithms

Model-Free RL - only uses the current state values to make a prediction

Model-Based RL - makes a prediction about the future state of the model

Check out: https://spinningup.openai.com/en/latest/spinningup/

The algorithm you use must match the action space you need. Certain algorithms only work with certain action spaces.

For this tutorial, we know the action space is discrete so we can only use discrete algorithms

## Training Metric

we need to understand certain metrics

Rollout Metrics:

Episode length is how long the episode went for

Episode reward is the total reward the agent collected during the episode

Time Metrics:

fps, iterations, time_elapsed, total_timesteps

Train:

entropy_loss, explained_variance, etc

In [None]:
# Recreate the environment and wrap it in a DummyVecEnv (a vectorized wrapper around a single env);
# the factory builds the Gym env when DummyVecEnv calls it.
env = DummyVecEnv([lambda: gym.make(environment_name)])
# PPO is the algorithm; 'MlpPolicy' selects a policy made of standard neural-network layers
model = PPO(policy='MlpPolicy', env=env, verbose=1)

In [None]:
# Train the model; total_timesteps controls how long training runs
model.learn(total_timesteps=20000)

# 4. Save and Reload Model

In [None]:
# Target location for the saved model: Training/Saved Models/PPO_model
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')

In [None]:
# Write the trained model to PPO_path
model.save(PPO_path)

In [None]:
# Drop the in-memory model so that the following reload actually comes from disk
del model

In [None]:
# Reload from the same path the model was saved to (PPO_path); the bare name
# 'PPO_model' would be resolved against the current working directory instead.
model = PPO.load(PPO_path, env=env)

# 4. Evaluation

The metric is based on the algorithm itself 

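This environment (CartPole-v0) is considered solved when the agent scores close to the 200-step episode cap (the official threshold is an average reward of 195). Some environments have such a cap or threshold at which they are considered solved.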

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Evaluate the policy: pass the model, the env, the number of episodes, and whether to render
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the environment once the rendered evaluation is finished
env.close()

# 5. Test Model

In [None]:
# Run the trained model for a single episode, rendering every step
obs = env.reset()  # initial observation handed to the policy
done = False
while not done:
    action, _states = model.predict(obs)  # let the model choose the action
    obs, rewards, done, info = env.step(action)
    env.render()
# The episode has ended: show the info returned by the final step
print('info', info)

In [None]:
# Close the environment after the test episode
env.close()

# 6. Viewing Logs in Tensorboard

In [None]:
# log_path is only assigned later in the notebook, so build the path from its literal
# parts ('Training', 'Logs') to avoid a NameError on Restart & Run All.
# NOTE(review): 'PPO_3' names one specific run folder; it only exists once enough
# tensorboard-logged training runs have been executed — confirm it matches your run.
training_log_path = os.path.join('Training', 'Logs', 'PPO_3')

In [None]:
# Launch TensorBoard on the chosen run directory ({training_log_path} is interpolated by IPython).
# NOTE(review): this presumably runs in the foreground until the kernel is interrupted — confirm.
!tensorboard --logdir={training_log_path}

# 7. Adding a callback to the training Stage

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [None]:
# Build the two paths under the shared 'Training' root.
# os.path.join only assembles the path strings; it does not create any directory.
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')

In [None]:
# Recreate the environment, wrapped in a DummyVecEnv; the factory builds the Gym env when called
env = DummyVecEnv([lambda: gym.make(environment_name)])

In [None]:
# Callback that stops training once the reward threshold (190) is reached
stop_callback = StopTrainingOnRewardThreshold(verbose=1, reward_threshold=190)

# Periodic evaluation on `env` every 10000 steps; the best model is written under save_path,
# and stop_callback is passed as the callback to run on a new best result.
eval_callback = EvalCallback(
    env,
    eval_freq=10000,
    best_model_save_path=save_path,
    callback_on_new_best=stop_callback,
    verbose=1,
)

In [None]:
# New PPO model; tensorboard_log points the training logs at log_path
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
# Train with the evaluation callback attached
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
# 'best_model' sits in the save_path folder ('Training/Saved Models') given to the EvalCallback
model_path = os.path.join(save_path, 'best_model')
# Swap the in-memory model for the saved best one
model = PPO.load(model_path, env=env)

In [None]:
# Evaluate the reloaded best model over 10 rendered episodes
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the environment after the rendered evaluation
env.close()

# 8. Changing Policies

In [None]:
# Custom architecture: four layers of 128 units under each of the 'pi' and 'vf' keys.
# NOTE(review): newer stable-baselines3 releases prefer a bare dict over a list-wrapped
# dict here — confirm against the installed version before changing the shape.
net_arch = [{'pi': [128] * 4, 'vf': [128] * 4}]

In [None]:
# PPO model whose policy is built with the custom net_arch passed via policy_kwargs
model = PPO('MlpPolicy', env, verbose = 1, policy_kwargs={'net_arch': net_arch})

In [None]:
# Train the custom-architecture model, reusing the same evaluation callback
model.learn(total_timesteps=20000, callback=eval_callback)

# 9. Using an Alternate Algorithm

In [None]:
from stable_baselines3 import DQN

In [None]:
# Same setup as before but with DQN as the algorithm; logs go to log_path
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
# Train the DQN model with the evaluation callback attached
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
# Target location for the saved DQN model: Training/Saved Models/DQN_model
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')

In [None]:
# Write the trained DQN model to dqn_path
model.save(dqn_path)

In [None]:
# Reload the DQN model from the path it was just saved to
model = DQN.load(dqn_path, env=env)

In [None]:
# Evaluate the DQN model over 10 rendered episodes
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the environment at the end of the notebook
env.close()