# CartPole-v0

> reference: 

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. 

A `reward` of +1 is provided for every timestep that the pole remains upright. The episode ends when `the pole is more than 15 degrees from vertical`, or `the cart moves more than 2.4 units from the center`.

In [1]:
import gym, threading, random, itertools, time
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import History
from collections import deque
import numpy as np


random.seed(42)

Using TensorFlow backend.


In [2]:
class Brain():
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size     
        self.gamma = 0.95    # discount rate
        self._learning_rate = 0.001
        
        self.value_size = 1
        self.actor, self.critic = self._build_model()
        
    def _build_model(self, hidden_size=24):
        state = Input(batch_shape=(None,  self._state_size))
        shared = Dense(hidden_size, input_dim=self._state_size, activation='relu', kernel_initializer='glorot_uniform')(state)

        actor_hidden = Dense(hidden_size, activation='relu', kernel_initializer='glorot_uniform')(shared)
        action_prob = Dense(self._action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)
        actor = Model(inputs=state, outputs=action_prob)
        
        value_hidden = Dense(hidden_size, activation='relu', kernel_initializer='he_uniform')(shared)
        state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)
        critic = Model(inputs=state, outputs=state_value)

        opt = Adam(lr=self._learning_rate)
        actor.compile(loss='categorical_crossentropy', optimizer=opt)
        critic.compile(loss='mean_squared_error', optimizer=opt)
        
        actor.summary()
        critic.summary()

        return actor, critic


In [3]:
class A2CAgent():
    def __init__(self):      
        self.memory = deque(maxlen=2000)
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
      
    # update policy network every episode
    def train_model(self, batch_size=20):
        def batch(iterable, n=1):
            l = len(iterable)
            for ndx in range(0, l, n):
                yield list(itertools.islice(iterable, ndx, min(ndx + n, l)))
        
        history = History()
        batch_data = list(batch(self.memory, batch_size))
        selected_batch = random.sample(batch_data, 1)
        
        for state, action, reward, next_state, done in selected_batch[0]:
            target = np.zeros((1, brain.value_size))
            advantages = np.zeros((1, brain._action_size))

            value = brain.critic.predict(state)[0]
            next_value = brain.critic.predict(next_state)[0]

            if done:
                advantages[0][action] = reward - value
                target[0][0] = reward
            else:
                advantages[0][action] = reward + brain.gamma * (next_value) - value
                target[0][0] = reward + brain.gamma * next_value

            brain.actor.fit(state, advantages, epochs=1, verbose=0)
            brain.critic.fit(state, target, epochs=1, verbose=0) 

    # using the output of policy network, pick action stochastically
    def takeAction(self, state):
        policy = brain.actor.predict(state, batch_size=1).flatten()
        return np.random.choice(brain._action_size, 1, p=policy)[0]

In [4]:
class Environment(threading.Thread):
    stop_signal = False
    ENV = 'CartPole-v0'
    THREAD_DELAY = 0.001
    
    def __init__(self, render=False):
        threading.Thread.__init__(self)

        self.render = render
        self.env = gym.make(self.ENV)
        self.agent = A2CAgent()
        
        self.episodes = 0
        self.shows = 20

    def runEpisode(self):

        state = self.env.reset()
        state = np.reshape(state, [1, 4])
        Rewards = 0

        while True: 
            time.sleep(self.THREAD_DELAY) # yield

            if self.render: self.env.render()

            action = self.agent.takeAction(state)
            next_state, reward, done, _ = self.env.step(action)

            next_state = np.reshape(next_state, [1, 4])
            self.agent.remember(state, action, reward, next_state, done)
            state = next_state
            Rewards += reward

            if done or self.stop_signal: break

        self.agent.train_model(32)
        
        if Rewards >= 200:
            self.stop()
            
        print("Episode {}, Reward: {}".format(self.episodes, Rewards))
        self.episodes += 1
        
            
    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True

In [5]:
env_test = Environment(render=True)
NUM_STATE = env_test.env.observation_space.shape[0]
NUM_ACTIONS = env_test.env.action_space.n
NONE_STATE = np.zeros(NUM_STATE)
RUN_TIME = 30
THREADS = 4

brain = Brain(NUM_STATE, NUM_ACTIONS)    # brain is global in A3C

envs = [Environment() for i in range(THREADS)]

for e in envs:
    e.start()

time.sleep(RUN_TIME)

for e in envs:
    e.stop()
for e in envs:
    e.join()

print("Training finished")
env_test.run()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/Users/yuting/miniconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-4-2e12f1855a74>", line 48, in run
    self.runEpisode()
  File "<ipython-input-4-2e12f1855a74>", line 27, in runEpisode
    action = self.agent.takeAction(state)
  File "<ipython-input-3-e7c4d70f02c9>", line 38, in takeAction
    policy = brain.actor.predict(state, batch_size=1).flatten()
  File "/Users/yuting/yuting_data/github/openAI/.venv/lib/python3.6/site-packages/keras/engine/training.py", line 1162, in predict
    self._make_predict_function()
  File "/Users/yuting/yuting_data/github/openAI/.venv/lib/python3.6/site-packages/keras/engine/training.py", line 543, in _make_predict_function
    **kwargs)
  File "/Users/yuting/yuting_data/github/openAI/.venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2695, in function
    return Function(inputs, outputs, updates=upd

Training finished
Episode 0, Reward: 8.0
Episode 1, Reward: 13.0
Episode 2, Reward: 12.0
Episode 3, Reward: 12.0
Episode 4, Reward: 12.0
Episode 5, Reward: 10.0
Episode 6, Reward: 9.0
Episode 7, Reward: 12.0
Episode 8, Reward: 10.0
Episode 9, Reward: 11.0
Episode 10, Reward: 11.0
Episode 11, Reward: 10.0
Episode 12, Reward: 11.0
Episode 13, Reward: 10.0
Episode 14, Reward: 10.0
Episode 15, Reward: 9.0
Episode 16, Reward: 10.0
Episode 17, Reward: 11.0
Episode 18, Reward: 9.0
Episode 19, Reward: 8.0
Episode 20, Reward: 10.0
Episode 21, Reward: 11.0
Episode 22, Reward: 9.0
Episode 23, Reward: 9.0
Episode 24, Reward: 10.0
Episode 25, Reward: 11.0
Episode 26, Reward: 8.0
Episode 27, Reward: 12.0
Episode 28, Reward: 12.0
Episode 29, Reward: 9.0
Episode 30, Reward: 10.0
Episode 31, Reward: 10.0
Episode 32, Reward: 9.0
Episode 33, Reward: 10.0
Episode 34, Reward: 11.0
Episode 35, Reward: 31.0
Episode 36, Reward: 10.0
Episode 37, Reward: 14.0
Episode 38, Reward: 16.0
Episode 39, Reward: 19.0
Ep