# CartPole-v0

> reference: https://github.com/rlcode/reinforcement-learning/blob/master/2-cartpole/3-reinforce/cartpole_reinforce.py

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. 

A `reward` of +1 is provided for every timestep that the pole remains upright. The episode ends when `the pole is more than 15 degrees from vertical`, or `the cart moves more than 2.4 units from the center`.

In [1]:
import gym

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import History
import numpy as np

Using TensorFlow backend.


In [2]:
class policyModel():
    """REINFORCE (Monte-Carlo policy gradient) agent backed by a small Keras MLP.

    The agent stores one episode of ``(state, action, reward)`` transitions
    via :meth:`remember`, then :meth:`train_model` performs a single policy
    update from that episode and clears the buffer.
    """

    def __init__(self, state_size, action_size):
        """Build the policy network.

        Args:
            state_size: Length of the flat observation vector fed to the net.
            action_size: Number of discrete actions (size of the softmax).
        """
        self._state_size = state_size
        self._action_size = action_size
        self._learning_rate = 0.001
        self.discount_factor = 0.99
        self.model = self._build_model()
        # Transitions of the episode currently being collected.
        self.memory = list()

    def _build_model(self, hidden_size=24):
        """Create and compile the policy network.

        Two ReLU hidden layers of ``hidden_size`` units followed by a softmax
        over the actions. Prints the model summary as a side effect.

        Args:
            hidden_size: Width of each hidden layer.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(hidden_size, activation='relu', kernel_initializer='glorot_uniform', input_shape=(self._state_size,)))
        model.add(Dense(hidden_size, activation='relu', kernel_initializer='glorot_uniform', input_dim=hidden_size))
        model.add(Dense(self._action_size, activation='softmax', kernel_initializer='glorot_uniform', input_dim=hidden_size))
        opt = Adam(lr=self._learning_rate)

        # Using categorical crossentropy as a loss is a trick to easily
        # implement the policy gradient. Categorical cross entropy is defined
        # as H(p, q) = -sum(p_i * log(q_i)). For the action taken, a, we set
        # p_a = advantage; q_a is the output of the policy network, i.e. the
        # probability policy(s, a) of taking that action. All other p_i are
        # zero, so H(p, q) = -A * log(policy(s, a)), and minimizing the loss
        # maximizes A * log(policy(s, a)).
        model.compile(loss='categorical_crossentropy', optimizer=opt)
        model.summary()
        return model

    def remember(self, state, action, reward):
        """Append one ``(state, action, reward)`` transition to the episode buffer."""
        self.memory.append((state, action, reward))

    def discount_rewards(self, rewards):
        """Return the discounted return G_t for every step of an episode.

        ``G_t = r_t + discount_factor * G_{t+1}``, accumulated from the last
        step backwards.

        Args:
            rewards: Sequence of per-step rewards (ints or floats).

        Returns:
            A float64 array with one discounted return per reward.
        """
        # Always allocate floats: np.zeros_like would inherit an integer dtype
        # from integer rewards and silently truncate the discounted values.
        discounted_rewards = np.zeros(len(rewards), dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def train_model(self):
        """Run one policy-gradient update from the stored episode, then clear it.

        The discounted returns are centred on their mean (baseline) and scaled
        by their standard deviation. Each training target row is zero except
        at the index of the action taken, which holds that step's normalized
        return. Does nothing when no transitions are stored.
        """
        if not self.memory:
            # Nothing collected: zip(*[]) below could not be unpacked.
            return

        states, actions, rewards = zip(*self.memory)
        states = np.array(states)
        # Use this instance's buffer, not a module-level ``agent`` global.
        episode_length = len(self.memory)

        discounted_rewards = self.discount_rewards(rewards)
        discounted_rewards -= np.mean(discounted_rewards)
        spread = np.std(discounted_rewards)
        # A one-step (or otherwise constant-return) episode has zero spread;
        # dividing would yield NaN targets and corrupt the network weights.
        if spread > np.finfo(np.float64).eps:
            discounted_rewards /= spread

        advantages = np.zeros((episode_length, self._action_size))
        advantages[np.arange(episode_length), np.asarray(actions)] = discounted_rewards

        self.model.fit(states, advantages, epochs=1, verbose=0)
        self.memory = list()

    def takeAction(self, state):
        """Sample an action index from the policy network's output for ``state``.

        Args:
            state: Observation array with ``state_size`` elements.

        Returns:
            The sampled action index.
        """
        state = state.reshape(1, self._state_size)

        # using the output of policy network, pick action stochastically
        policy = self.model.predict(state).flatten()
        return np.random.choice(self._action_size, 1, p=policy)[0]

In [3]:
# Create the environment and size the agent from one initial observation.
env = gym.make('CartPole-v0')
state = env.reset()
agent = policyModel(state.shape[0], env.action_space.n)

total_reward = []   # score of every finished episode
render = False      # set to True to draw each step
shows = 20          # report the running mean every `shows` episodes

for i_episode in range(500):
    state = env.reset()
    score = 0
    done = False

    # Roll out one full episode, storing every transition.
    while not done:
        if render:
            env.render()

        action = agent.takeAction(state)
        next_state, reward, done, info = env.step(action)

        agent.remember(state, action, reward)
        score += reward
        state = next_state

    # The loop only exits once the episode is done: update the policy once
    # from the collected episode and record its score.
    agent.train_model()
    total_reward.append(score)

    if i_episode % shows == 0:
        recent_mean = np.mean(total_reward[-shows:])
        print("Episode: {}, Total Reward: {}".format(i_episode, recent_mean))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
Episode: 0, Total Reward: 15.0
Episode: 20, Total Reward: 16.7
Episode: 40, Total Reward: 21.0
Episode: 60, Total Reward: 18.9
Episode: 80, Total Reward: 21.45
Episode: 100, Total Reward: 17.95
Episode: 120, Total Reward: 25.3
Episode: 140, Total Reward: 24.55
Episode: 160, Total Reward: 26.8
Episod