# CartPole-v0

> reference: https://keon.io/deep-q-learning/

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. 

A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or when the cart moves more than 2.4 units from the center.

In [4]:
import gym
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import History
from collections import deque
import itertools
import numpy as np
import random
import os

# Seed both generators this notebook draws from: the stdlib `random` module
# (replay sampling and random action choice) and NumPy's global RNG
# (the epsilon-greedy draw in DQNAgent.takeAction uses np.random.rand()).
random.seed(42)
np.random.seed(42)

In [5]:
class DQNAgent():
    """Double-DQN style agent for a discrete-action environment.

    Two identically shaped networks are kept: ``model`` (trained on every
    replayed transition) and ``target_model`` (only refreshed by
    ``update_target_model``). When building a training target, the greedy
    next action is chosen with ``model`` and its value is read from
    ``target_model``.
    """

    def __init__(self, state_size, action_size):
        """Create the replay buffer, hyper-parameters and both networks.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions.
        """
        self._state_size = state_size
        self._action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries drop out
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self._learning_rate = 0.001
        self.model_name = 'CartPole-DQN.h5'
        self.model = self._build_model()
        # Built separately so the target network is a distinct object whose
        # weights only change when update_target_model() is called.
        self.target_model = self._build_model()

    def _build_model(self, hidden_size=24):
        """Build and compile the Q-network, loading saved weights if present.

        Args:
            hidden_size: Width of each of the two hidden ReLU layers.

        Returns:
            A compiled Keras ``Sequential`` model mapping a
            ``(batch, state_size)`` input to ``action_size`` linear outputs.
        """
        model = Sequential()
        model.add(Dense(hidden_size, activation='relu', kernel_initializer='glorot_uniform', input_shape=(self._state_size,)))
        # Only the first layer needs an input shape; Sequential infers the rest.
        model.add(Dense(hidden_size, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(self._action_size, activation='linear', kernel_initializer='glorot_uniform'))
        opt = Adam(lr=self._learning_rate)
        model.compile(loss='mean_squared_error', optimizer=opt)
        model.summary()
        # Resume from a previous run's checkpoint when one exists on disk.
        if os.path.exists(self.model_name):
            model.load_weights(self.model_name)
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def update_target_model(self):
        """Copy the online network's weights into the target network.

        The weights are copied rather than rebinding ``target_model`` to
        ``model``: aliasing the two would make the target track every
        gradient step and remove the point of having a target network.
        """
        self.target_model.set_weights(self.model.get_weights())

    def train_model(self, batch_size=20):
        """Replay a random minibatch and decay the exploration rate.

        Up to ``batch_size`` transitions are drawn uniformly at random
        (without replacement) from the replay buffer, and the online model is
        fitted for one epoch on each of them. If the buffer is empty the call
        is a no-op and ``epsilon`` is left unchanged.

        Args:
            batch_size: Maximum number of transitions to replay.
        """
        if not self.memory:
            return

        # Uniform sampling over the whole buffer avoids replaying a run of
        # consecutive, highly correlated transitions.
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(list(self.memory), sample_size)

        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                # Terminal transition: no bootstrapped future value.
                target[0][action] = reward
            else:
                # Select the next action with the online network, evaluate it
                # with the target network.
                best_action = np.argmax(self.model.predict(next_state))
                next_q = self.target_model.predict(next_state)
                target[0][action] = reward + self.gamma * next_q[0][best_action]

            self.model.fit(state, target, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def takeAction(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation array that can be reshaped to
                ``(1, state_size)``.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest Q-value predicted by the online model.
        """
        state = state.reshape(1, self._state_size)
        # Decide first so the forward pass is skipped when exploring.
        if np.random.rand() < self.epsilon:
            return random.randrange(self._action_size)
        return np.argmax(self.model.predict(state))


In [6]:
# Training driver: play `episodes` episodes, store every transition in the
# agent's replay buffer, and train once after each episode.
episodes = 500
render = False
shows = 20          # report / sync target network / checkpoint every `shows` episodes
max_steps = 500     # upper bound on steps per episode
total_reward = list()

env = gym.make('CartPole-v0')
state = env.reset()
# Take the observation size from the environment instead of hard-coding it,
# so the reshapes below always agree with the network's input layer.
state_size = state.shape[0]
agent = DQNAgent(state_size, env.action_space.n)

for e in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    episode_reward = 0.0

    for time_t in range(max_steps):

        if render:
            env.render()

        action = agent.takeAction(state)
        next_state, reward, done, _ = env.step(action)

        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            break

    # Record the summed reward (not the last step index), and record it even
    # when the episode hit the step cap without the environment ending it.
    total_reward.append(episode_reward)

    agent.train_model(32)

    if e % shows == 0:
        print("Episode: {}, Total Reward: {}".format(e, np.mean(total_reward[-shows:])))
        agent.update_target_model()
        agent.model.save_weights(agent.model_name)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_5 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_8 (Dense)              (None, 