# Mountain Car Reinforcement Learning

## 01. Import Dependencies

In [2]:
import os
import gym
from stable_baselines3 import PPO # Model free algorithm (policy optimization)
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 02. Load Environment

In [6]:
# Gym environment id; reused further down when the training environment is built.
env_name = 'MountainCar-v0'
# Plain (un-vectorized) environment used by the random rollout and the
# action/observation space inspection cells that follow.
env = gym.make(env_name)

In [7]:
# Baseline: play a few episodes with uniformly random actions to see how an
# untrained agent scores before any learning happens.
eps = 10
try:
    for episode in range(1, eps + 1):
        env.reset()  # start a new episode; a random policy ignores the observation
        done = False
        score = 0

        while not done:
            env.render()  # visualize environment
            action = env.action_space.sample()  # random action
            # Only the reward and the termination flag are needed here.
            _next_state, reward, done, _info = env.step(action)
            score += reward
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    # Release the render window even if the loop is interrupted
    # (e.g. Kernel -> Interrupt) or an environment call raises.
    env.close()

Episode: 1 Score: -200.0
Episode: 2 Score: -200.0
Episode: 3 Score: -200.0
Episode: 4 Score: -200.0
Episode: 5 Score: -200.0
Episode: 6 Score: -200.0
Episode: 7 Score: -200.0
Episode: 8 Score: -200.0
Episode: 9 Score: -200.0
Episode: 10 Score: -200.0


## Understanding Environment

In [8]:
env.action_space # 3 discrete actions: accelerate left (0), no acceleration (1), accelerate right (2)

Discrete(3)

In [9]:
env.observation_space.sample() # [car position, car velocity]

array([-0.6019822, -0.0430167], dtype=float32)

## 03. Train Reinforcement Learning Model

In [10]:
# Directory handed to the models as `tensorboard_log`; each training run logs beneath it.
log_path = os.path.join('Training', 'Logs')

In [11]:
# Create the environment inside the factory itself. A factory written as
# `lambda: env` closes over a name that is rebound to the DummyVecEnv on the
# very same line, and only works because DummyVecEnv calls it eagerly.
env = DummyVecEnv([lambda: gym.make(env_name)])
# PPO with the default MLP policy; metrics are written under `log_path` for TensorBoard.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [12]:
# Train the PPO agent for 50k environment steps.
model.learn(total_timesteps=50_000)

Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 221  |
|    iterations      | 1    |
|    time_elapsed    | 9    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 151         |
|    iterations           | 2           |
|    time_elapsed         | 27          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008980989 |
|    clip_fraction        | 0.0216      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.00495     |
|    learning_rate        | 0.0003      |
|    loss                 | 15.4        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00321    |
|    value_loss           | 138         |
-----------------------------------------
---

-----------------------------------------
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 13          |
|    time_elapsed         | 243         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.006320159 |
|    clip_fraction        | 0.0307      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.694      |
|    explained_variance   | -0.000147   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.259       |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.000302   |
|    value_loss           | 3.56        |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 108          |
|    iterations           | 14           |
|    time_elapsed         | 263          |
|    total_timesteps      | 2

------------------------------------------
| time/                   |              |
|    fps                  | 107          |
|    iterations           | 24           |
|    time_elapsed         | 455          |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0023233236 |
|    clip_fraction        | 0.0155       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.353       |
|    explained_variance   | 1.93e-05     |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0117       |
|    n_updates            | 230          |
|    policy_gradient_loss | -0.000679    |
|    value_loss           | 0.0277       |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 25          |
|    time_elapsed         | 473         |
|    total_times

<stable_baselines3.ppo.ppo.PPO at 0x2047fa85780>

## 04. Save and Reload Model

In [13]:
# Save model: write the trained PPO agent to disk so it can be restored later.
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_MountainCar')
model.save(PPO_Path)

In [15]:
# Load Model: discard the in-memory agent first so the next line demonstrably
# restores it from the saved file instead of reusing the trained object.
del model
model = PPO.load(PPO_Path, env=env)

## 05. Evaluate Model

In [16]:
# Mean episode reward and its standard deviation over 10 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=False,
)
mean_reward, std_reward

(-200.0, 0.0)

## 06. Test Model

In [17]:
# Watch the trained agent play. `env` is a vectorized environment with a single
# sub-environment, so observations, rewards and done flags come back as
# length-1 arrays; index 0 is read so the score prints as a plain number,
# matching the random-action rollout earlier in the notebook.
eps = 10
try:
    for episode in range(1, eps + 1):
        obs = env.reset()
        done = False
        score = 0.0

        while not done:
            env.render()  # visualize environment
            action, _state = model.predict(obs)  # take action using model
            obs, rewards, dones, _infos = env.step(action)  # apply the action
            score += float(rewards[0])
            done = bool(dones[0])
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    # Release the render window even if the loop is interrupted or raises.
    env.close()

Episode: 1 Score: [-200.]
Episode: 2 Score: [-200.]
Episode: 3 Score: [-200.]
Episode: 4 Score: [-200.]
Episode: 5 Score: [-200.]
Episode: 6 Score: [-200.]
Episode: 7 Score: [-200.]
Episode: 8 Score: [-200.]
Episode: 9 Score: [-200.]
Episode: 10 Score: [-200.]


## 07. Viewing Logs in Tensorboard

In [23]:
# Log directory of one specific training run.
# NOTE(review): 'PPO_2' is hard-coded, while the runs recorded in this notebook
# logged to other PPO_<n> folders - confirm this is the run you want to inspect.
training_log_path = os.path.join(log_path, 'PPO_2')

In [24]:
# Visualize performance in Tensorboard (Run in command)
!tensorboard --logdir=training_log_path

^C


## 08. Add Callbacks

In [25]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [30]:
# Directory given to EvalCallback as `best_model_save_path`.
save_path = os.path.join('Training', 'Saved Models')

In [31]:
# MountainCar-v0 gives -1 per step and caps episodes at 200 steps, so episode
# returns are always negative and a positive threshold could never stop training.
# -110 is the reward threshold Gym registers for MountainCar-v0.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-110, verbose=1)
# Evaluate on a dedicated environment: running evaluation episodes on the
# training env would reset it in the middle of rollout collection.
eval_env = DummyVecEnv([lambda: gym.make(env_name)])
# Every 10000 steps: evaluate, save the best model so far under `save_path`,
# and run `stop_callback` whenever a new best mean reward is reached.
eval_callback = EvalCallback(eval_env, callback_on_new_best=stop_callback,
                             eval_freq=10000, best_model_save_path=save_path, verbose=1)

In [32]:
# Fresh PPO agent with the default MLP policy, replacing the previously loaded model.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [33]:
# Train for up to 30k steps; the evaluation callback may stop the run earlier.
model.learn(
    total_timesteps=30_000,
    callback=eval_callback,
)

Logging to Training\Logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 278  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 165         |
|    iterations           | 2           |
|    time_elapsed         | 24          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008816953 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.0034      |
|    learning_rate        | 0.0003      |
|    loss                 | 8.48        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0169     |
|    value_loss           | 56.9        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x162c5c15f70>

## 09. Changing Policies

In [34]:
# Custom architecture: the policy (pi) and value (vf) entries each get
# four hidden layers of 128 units.
net_arch = [dict(pi=[128] * 4, vf=[128] * 4)]
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device


In [35]:
# Train the larger network with the same step budget and evaluation callback.
model.learn(
    total_timesteps=30_000,
    callback=eval_callback,
)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 205  |
|    iterations      | 1    |
|    time_elapsed    | 9    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 2           |
|    time_elapsed         | 37          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015287836 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00485     |
|    learning_rate        | 0.0003      |
|    loss                 | 2.95        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0266     |
|    value_loss           | 20.5        |
-----------------------------------------
---

-----------------------------------------
| time/                   |             |
|    fps                  | 66          |
|    iterations           | 12          |
|    time_elapsed         | 368         |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.007117984 |
|    clip_fraction        | 0.0978      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.487      |
|    explained_variance   | 0.216       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.266       |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00289    |
|    value_loss           | 1.31        |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 65           |
|    iterations           | 13           |
|    time_elapsed         | 404          |
|    total_timesteps      | 2

<stable_baselines3.ppo.ppo.PPO at 0x162c9336a30>

## 10. Train Different Algorithm

In [36]:
from stable_baselines3 import DQN

# Train a DQN agent on the same environment, logging to the same TensorBoard directory.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)
model.learn(total_timesteps=20_000)

Using cpu device
Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.96     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 724      |
|    time_elapsed     | 0        |
|    total_timesteps  | 84       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.926    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 731      |
|    time_elapsed     | 0        |
|    total_timesteps  | 155      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.897    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 722      |
|    time_elapsed     | 0        |
|    total_timesteps  | 216      |
----------------------------------
-------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 883      |
|    time_elapsed     | 2        |
|    total_timesteps  | 2516     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 882      |
|    time_elapsed     | 2        |
|    total_timesteps  | 2580     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 875      |
|    time_elapsed     | 3        |
|    total_timesteps  | 2664     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 860      |
|    time_elapsed     | 5        |
|    total_timesteps  | 4999     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 865      |
|    time_elapsed     | 5        |
|    total_timesteps  | 5114     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 865      |
|    time_elapsed     | 5        |
|    total_timesteps  | 5165     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 832      |
|    time_elapsed     | 8        |
|    total_timesteps  | 7347     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 835      |
|    time_elapsed     | 8        |
|    total_timesteps  | 7461     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 838      |
|    time_elapsed     | 9        |
|    total_timesteps  | 7561     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 850      |
|    time_elapsed     | 11       |
|    total_timesteps  | 9789     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 851      |
|    time_elapsed     | 11       |
|    total_timesteps  | 9885     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 853      |
|    time_elapsed     | 11       |
|    total_timesteps  | 10029    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 872      |
|    time_elapsed     | 13       |
|    total_timesteps  | 12059    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 873      |
|    time_elapsed     | 13       |
|    total_timesteps  | 12144    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 875      |
|    time_elapsed     | 13       |
|    total_timesteps  | 12242    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 897      |
|    time_elapsed     | 16       |
|    total_timesteps  | 14423    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 897      |
|    time_elapsed     | 16       |
|    total_timesteps  | 14526    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 894      |
|    time_elapsed     | 16       |
|    total_timesteps  | 14596    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 885      |
|    time_elapsed     | 19       |
|    total_timesteps  | 16870    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 886      |
|    time_elapsed     | 19       |
|    total_timesteps  | 16966    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 883      |
|    time_elapsed     | 19       |
|    total_timesteps  | 17094    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 895      |
|    time_elapsed     | 21       |
|    total_timesteps  | 19419    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 892      |
|    time_elapsed     | 21       |
|    total_timesteps  | 19493    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 892      |
|    time_elapsed     | 21       |
|    total_timesteps  | 19548    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x162ca1b68b0>

In [39]:
# Mean episode reward and its standard deviation over 1000 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=1_000,
    render=False,
)
mean_reward, std_reward

(9.36, 0.7565712127751095)

In [40]:
# Save model: the file is named after the environment the agent is trained on
# (`env_name` is MountainCar-v0), matching the naming of the saved PPO model.
DQN_Path = os.path.join('Training', 'Saved Models', 'DQN_Model_MountainCar')
model.save(DQN_Path)

In [None]:
# Load Model: discard the in-memory agent, then restore it from the saved file
# and attach it to the current environment.
del model
model = DQN.load(DQN_Path, env=env)