![CartPole Enviroment](Machine%20Learning/Deep%20Reinforcement%20Learning/Cartpole_intro_to_RL/Media/Cartpole-v1_enviroment.PNG)

# Dependencies

In [1]:
# System: Track Python version
import sys
# Enviroment
import gym 
# Reinforcement Learning Agent and Policy
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Enviroment

In [2]:
# Naming (Selecting) the Enviroment
environment_name = 'CartPole-v1'
# Creating the Enviroment
env = gym.make(environment_name)

## Testing the Enviroment with Random Actions

In [3]:
# Number of Episodes to test
episodes = 10

for episode in range(1, episodes+1):
    # Reset the Enviroment
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        # Selecting a Random Action
        action = env.action_space.sample()
        # Applying the Action to the Enviroment
        n_state, reward, done, info = env.step(action)
        score+=reward
    # Printing the Score per Episode
    print('Episode:{} Score:{}'.format(episode, score))
    
env.close()

Episode:1 Score:15.0
Episode:2 Score:12.0
Episode:3 Score:21.0
Episode:4 Score:21.0
Episode:5 Score:13.0
Episode:6 Score:16.0
Episode:7 Score:37.0
Episode:8 Score:18.0
Episode:9 Score:20.0
Episode:10 Score:14.0


# Model

In [4]:
# Selecting the Algorithm and Policy
model = A2C('MlpPolicy', env, verbose = 2)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Training

In [25]:
# Training the Algorithm while performing 20000 Steps (Actions)
steps = 20000
model.learn(total_timesteps=steps)

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 212      |
|    ep_rew_mean        | 212      |
| time/                 |          |
|    fps                | 959      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.571   |
|    explained_variance | 0.000193 |
|    learning_rate      | 0.0007   |
|    n_updates          | 1099     |
|    policy_loss        | 1.02     |
|    value_loss         | 2.25     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 142      |
|    ep_rew_mean        | 142      |
| time/                 |          |
|    fps                | 966      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x2717b8670a0>

# Testing

In [48]:
# Testing the Model with Method #1
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Closing the Prompt (Render)
env.close()

In [46]:
# Testing the Model with Method #2

# Selecting number of Episodes
episodes = 30

for episode in range(1, episodes+1):
    # Reset the Enviroment
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        # Predicting an Action
        action, _states = model.predict(state)
        # Applying the Action to the Enviroment
        n_state, reward, done, info = env.step(action)
        score+=reward
    # Printing the Score per Episode
    print('Episode:{} Score:{}'.format(episode, score))
    
env.close()

Episode:1 Score:11.0
Episode:2 Score:9.0
Episode:3 Score:9.0
Episode:4 Score:10.0
Episode:5 Score:8.0
Episode:6 Score:10.0
Episode:7 Score:10.0
Episode:8 Score:10.0
Episode:9 Score:10.0
Episode:10 Score:10.0
Episode:11 Score:8.0
Episode:12 Score:10.0
Episode:13 Score:10.0
Episode:14 Score:10.0
Episode:15 Score:10.0
Episode:16 Score:11.0
Episode:17 Score:10.0
Episode:18 Score:8.0
Episode:19 Score:8.0
Episode:20 Score:10.0
Episode:21 Score:9.0
Episode:22 Score:9.0
Episode:23 Score:10.0
Episode:24 Score:8.0
Episode:25 Score:11.0
Episode:26 Score:10.0
Episode:27 Score:10.0
Episode:28 Score:10.0
Episode:29 Score:9.0
Episode:30 Score:10.0


# Saving 

In [49]:
# Saving the Model Weights
model_name = "A2C_carpole-v1_" + str(steps) + "steps"
model.save(model_name)

In [50]:
# Remove to demonstrate saving and loading
del model 

In [51]:
# Checking if deleted 
try:
    model
except NameError as e:
    print (e)

name 'model' is not defined


# Loading

In [52]:
# Code to use if importing a model built from a different Python version
newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

custom_objects = {}

if newer_python_version:
    custom_objects = {
        "learning_rate": 0.0,
        "lr_schedule": lambda _: 0.0,
        "clip_range": lambda _: 0.0,
    }

In [53]:
# Loading the Model
model = A2C.load(model_name, custom_objects=custom_objects)

In [54]:
# Testing the Model after Loading

# Selecting number of Episodes
episodes = 30

for episode in range(1, episodes+1):
    # Reset the Enviroment
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        # Predicting an Action
        action, _states = model.predict(state)
        # Applying the Action to the Enviroment
        n_state, reward, done, info = env.step(action)
        score+=reward
    # Printing the Score per Episode
    print('Episode:{} Score:{}'.format(episode, score))
    
env.close()

Episode:1 Score:10.0
Episode:2 Score:10.0
Episode:3 Score:9.0
Episode:4 Score:9.0
Episode:5 Score:10.0
Episode:6 Score:10.0
Episode:7 Score:8.0
Episode:8 Score:10.0
Episode:9 Score:9.0
Episode:10 Score:9.0
Episode:11 Score:8.0
Episode:12 Score:11.0
Episode:13 Score:10.0
Episode:14 Score:9.0
Episode:15 Score:10.0
Episode:16 Score:8.0
Episode:17 Score:9.0
Episode:18 Score:9.0
Episode:19 Score:10.0
Episode:20 Score:9.0
Episode:21 Score:10.0
Episode:22 Score:10.0
Episode:23 Score:10.0
Episode:24 Score:11.0
Episode:25 Score:9.0
Episode:26 Score:8.0
Episode:27 Score:10.0
Episode:28 Score:12.0
Episode:29 Score:10.0
Episode:30 Score:10.0
