# Dependencies
`pip install 'stable-baselines3[extra]'`

In [None]:
import os
import gym # simulation environment
from stable_baselines3 import PPO # training algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # if env.step() is fast, use this
# from stable_baselines3.common.vec_env import SubprocVecEnv # if env.step() is slow, use this
from stable_baselines3.common.evaluation import evaluate_policy 
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Environment setup

In [None]:
# Gym environment ID; later cells pass this name to gym.make().
env_name = "CartPole-v0" # specify the environment
env_name

In [None]:
# Build a single (non-vectorized) environment instance from the ID chosen above.
env = gym.make(env_name) # initialize the entire environment

In [None]:
# Roll out a purely random policy for a few episodes to see how the
# environment behaves. Nothing is trained in this cell.
episodes = 5 # number of random episodes to play
try:
    for episode in range(episodes):
        state = env.reset() # start a new episode and get its initial state
        done = False # is the environment done or not (e.g. clear the stage or the character is dead)
        score = 0 # total reward collected in this episode

        while not done:
            env.render() # visualize the environment
            action = env.action_space.sample() # random action
            state, reward, done, info = env.step(action) # take the action, and obtain the current state
            score += reward
        print(f'episode: {episode} score: {score}')
finally:
    # Close once, after all episodes (or on interrupt). Closing inside the
    # loop tore the environment down while it was still being reused.
    env.close()

# What is an Environment?
https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
## Types of space
- Box: an n-dimensional tensor whose values lie within a given range
    - Box(0, 1, shape=(2,))
    - [0.1, 0.2], [0.25, 0.75], ...
- Discrete: set of items
    - Discrete(2)
    - 0, 1
- Tuple: tuple of other spaces
    - Tuple((Discrete(2), Box(0, 1, shape=(2,))))
    - (0, [0.1, 0.2]), (1, [0.25, 0.75]), ...
- ...

## Two spaces in an environment
- action_space
- observation_space

In [None]:
# Show the action space definition together with one randomly sampled action.
print(f'action space: {env.action_space}\nsample: {env.action_space.sample()}')

In [None]:
# Show the observation space definition together with one randomly sampled observation.
print(f'observation space: {env.observation_space}\nsample: {env.observation_space.sample()}')

In [None]:
# Print what env.reset() returns; the rollout loop above uses this value as the initial state.
print(f'return of env.reset: {env.reset()}')

In [None]:
# Apply action 1 once and print the raw result; the rollout loop above
# unpacks it as (state, reward, done, info).
print(f'return of env.step: {env.step(1)}')

# Training RL model

In [None]:
# Wrap one environment in a DummyVecEnv. The factory builds the gym env
# itself, so it does not close over a name that is rebound right afterwards.
env = DummyVecEnv([lambda: gym.make(env_name)])

# PPO agent with a multilayer-perceptron policy; verbose=1 prints training info.
model = PPO('MlpPolicy', env, verbose=1)

In [None]:
# Train the agent for a budget of 10000 timesteps.
model.learn(total_timesteps=10000)

# Save and reload the trained model

In [None]:
# Location used for saving and reloading the trained PPO model.
model_save_path = os.path.join(
    'training',
    'saved_model',
    'PPO',
)

In [None]:
# Persist the trained model at model_save_path.
model.save(model_save_path)

In [None]:
# Remove the in-memory model so the following cell has to reload it from disk.
del model

In [None]:
# Reload the saved model from model_save_path and attach the current environment to it.
model = PPO.load(model_save_path, env=env)

# Evaluation

In [None]:
# Run the reloaded model for 5 rendered evaluation episodes; the result of
# the call is displayed as the cell output.
evaluate_policy(
    model,
    env,
    n_eval_episodes=5,
    render=True,
)

In [None]:
# Close the environment used for evaluation (this also disposes of the render window, if one was opened).
env.close()

# Testing

In [None]:
# Play a few episodes with the trained model choosing the actions.
# NOTE(review): if `env` is still the DummyVecEnv from the training section,
# `reward` and `done` are presumably length-1 arrays, so `score` would print
# as an array. Confirm before relying on the printed format.
episodes = 5 # number of test episodes to play
try:
    for episode in range(episodes):
        state = env.reset() # initialize the observation (including state) of environment
        done = False
        score = 0 # total reward collected in this episode

        while not done:
            env.render()
            action, _ = model.predict(state) # based on current state, predict the action
            observation = env.step(action) # take the action
            state, reward, done, info = observation # observation includes state, reward, done, info
            score += reward
        print(f'episode: {episode} score: {score}')
finally:
    # Close once, after all episodes (or on interrupt). Closing inside the
    # loop tore the environment down while it was still being reused.
    env.close()

# Tensorboard

In [None]:
# Directory handed to PPO as its TensorBoard log location.
log_save_path = os.path.join(
    'training',
    'log',
)
log_save_path  # echo the path as the cell output

In [None]:
# Fresh environment and model, this time writing training logs for TensorBoard.
env_name = "CartPole-v0"

# The factory builds the gym env itself, so it does not close over a name
# that is rebound right afterwards.
env = DummyVecEnv([lambda: gym.make(env_name)])

model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_save_path,  # training logs go under this directory
)

In [None]:
# Train for a budget of 10000 timesteps; this model was created with tensorboard_log set.
model.learn(total_timesteps=10000)

In [None]:
# IPython shell escape: runs the `tensorboard` CLI with --logdir set to the value of log_save_path.
!tensorboard --logdir={log_save_path}

# Callback

In [None]:
# Stops training once the evaluated reward reaches the threshold of 190.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)

# Periodic evaluation during training. `env` here is whichever environment
# exists at the moment this cell runs.
eval_callback = EvalCallback(
    env,
    eval_freq=10000,  # evaluate every 10000 callback calls (i.e. training steps), not episodes
    callback_on_new_best=stop_callback,  # when there is a better model, call stop_callback
    best_model_save_path=model_save_path,  # directory the best model is saved into
    verbose=1,
)

In [None]:
# Fresh environment and untrained model for the callback-driven training run.
env_name = "CartPole-v0"

# The factory builds the gym env itself, so it does not close over a name
# that is rebound right afterwards.
env = DummyVecEnv([lambda: gym.make(env_name)])

model = PPO('MlpPolicy', env, verbose=1)

In [None]:
# Train for up to 20000 timesteps; eval_callback runs during training and may stop it early.
model.learn(
    total_timesteps=20000,
    callback=eval_callback,
)

In [None]:
# The evaluation callback was configured with best_model_save_path=model_save_path,
# so the best checkpoint is looked up as "best_model" inside that directory.
best_model_save_path = os.path.join(model_save_path, 'best_model')

# Replace the current model with the best checkpoint, bound to the current env.
model = PPO.load(best_model_save_path, env=env)

In [None]:
# Run the best checkpoint for 10 rendered evaluation episodes; the result of
# the call is displayed as the cell output.
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)

In [None]:
# Close the environment used for the final evaluation (this also disposes of the render window, if one was opened).
env.close()