# Reinforcement Learning (DQN) - Cartpole Task

* The goal is for the agent to learn to keep the pole attached to a cart upright for as long as possible. The agent does so by choosing between two actions: move the cart left or move it right.

* In reinforcement learning the agent learns how to act by trial and error; in other words, by trying out an action and receiving feedback (a reward). It uses that reward to learn whether taking an action in a particular scenario (state) was good or bad. The model aims to maximise the total accumulated reward.

* Adaptation of https://github.com/keon/deep-q-learning/blob/master/ddqn.py

* Current version just uses CPU.

### imports

In [1]:
import random
import gym
import numpy as np
from time import time
from keras.callbacks import TensorBoard
from collections import deque, namedtuple
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Multiply
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Hyperparameters

In [2]:
# Number of transitions sampled from replay memory for each training step;
# training is skipped until the memory holds at least this many.
BATCH_SIZE = 128
# Discount factor applied to the target network's max next-state value.
GAMMA = 0.99
# Initial exploration rate (epsilon) of the epsilon-greedy policy.
EPS_START = 1.0
# Epsilon stops being decayed once it is no longer above this value.
EPS_END = 0.01
# Multiplicative epsilon decay, applied on every select_action call.
EPS_DECAY = 0.99
# The target network is synced whenever episode_no % TARGET_UPDATE == 0.
TARGET_UPDATE = 100
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 0.001
# NOTE(review): not referenced by the visible code; the training loop
# iterates over a literal range(500) instead - confirm which is intended.
EPISODES = 100

### MDP transition represented as a named tuple

In [3]:
# One step of experience: the state the action was taken in, the action,
# the resulting state, the reward received and the episode-finished flag.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'done'],
)

### Replay memory (a database to store input data and to sample from to create a training batch)

In [4]:
class ReplayMemory(object):
    """Bounded FIFO store of Transition records used to draw training batches."""

    def __init__(self, capacity=10000):
        """Create an empty memory that keeps at most `capacity` entries."""
        # A deque with maxlen silently drops the oldest entry when full.
        self.memory = deque(maxlen=capacity)

    def add(self, *args):
        """Wrap `args` in a Transition and append it to the memory."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size=10):
        """Return `batch_size` distinct stored entries chosen at random.

        random.sample raises ValueError when fewer than `batch_size`
        entries are stored.
        """
        stored = self.memory
        return random.sample(stored, batch_size)

    def get_memory(self):
        """Return the underlying deque itself (not a copy)."""
        return self.memory

    def __len__(self):
        """Return the number of entries currently stored."""
        stored = self.memory
        return len(stored)


### Define Huber Loss Function.

In [5]:
def _huber_loss(y_true, y_pred, clip_delta=1.0):
    """Return the mean Huber loss between `y_true` and `y_pred`.

    Elements whose absolute error is at most `clip_delta` contribute
    0.5 * error**2; the rest contribute
    0.5 * clip_delta**2 + clip_delta * (|error| - clip_delta).
    """
    residual = y_true - y_pred
    abs_residual = K.abs(residual)
    within_delta = abs_residual <= clip_delta

    # Branch used for small errors.
    small_error_loss = 0.5 * K.square(residual)
    # Branch used for large errors (grows linearly in |error|).
    large_error_loss = (
        0.5 * K.square(clip_delta) + clip_delta * (abs_residual - clip_delta)
    )

    per_element = tf.where(within_delta, small_error_loss, large_error_loss)
    return K.mean(per_element)


### Q-network

In [6]:
class Network(object):
    """Q-network whose per-action outputs are multiplied by an action mask."""

    def __init__(self, state_size, action_size):
        """Build and compile the Keras model.

        The model takes two inputs, 'states' of width `state_size` and
        'mask' of width `action_size`, and outputs the element-wise product
        of the mask with one linear value per action.
        """
        state_in = Input((state_size,), name='states')
        mask_in = Input((action_size,), name='mask')

        # Two tanh hidden layers of 24 units each, stacked on the state input.
        hidden = state_in
        for _ in range(2):
            hidden = Dense(24, activation='tanh')(hidden)

        q_values = Dense(action_size, activation='linear')(hidden)
        # Multiplying by the mask zeroes the outputs of unselected actions.
        masked_q_values = Multiply()([q_values, mask_in])

        network = Model(inputs=[state_in, mask_in], outputs=masked_q_values)
        network.compile(
            loss=_huber_loss,
            optimizer=Adam(lr=LEARNING_RATE, clipnorm=1.),
        )
        self.model = network

    def get_model(self):
        """Return the compiled Keras model."""
        return self.model
        


# Instantiate a network for a 4-dimensional state and 2 actions and show
# its layer summary.
net = Network(4, 2)
model = net.get_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
states (InputLayer)             (None, 4)            0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 24)           120         states[0][0]                     
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 24)           600         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 2)            50          dense_2[0][0]                    
__________________________________________________________________________________________________
mask (Inpu

In [None]:
class DQN(object):
    """Deep Q-learning agent for the gym 'CartPole-v1' environment.

    The agent owns an online Q-network (used to pick actions and trained
    from replay memory), a target Q-network (used to compute bootstrap
    targets), a replay memory and the epsilon-greedy exploration state.
    """

    def __init__(self):
        """Create the environment, both networks and an empty replay memory."""
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        #self.tensorboard = TensorBoard(log_dir='logs/{}'.format(time()))
        self.action_size = self.env.action_space.n
        self.network = Network(self.state_size, self.action_size)
        self.target_net = Network(self.state_size, self.action_size)
        self.memory = ReplayMemory()
        # Step counter of the current episode; reset at the start of run().
        self.steps = 0
        self.epsilon = EPS_START
        # Ring buffer with the step count of the last 100 training episodes.
        self.duration = [0] * 100

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        online_weights = self.network.get_model().get_weights()
        self.target_net.get_model().set_weights(online_weights)

    def select_action(self, state):
        """Pick an action epsilon-greedily and advance the exploration state.

        Each call decays epsilon by EPS_DECAY (while it is above EPS_END)
        and increments the per-episode step counter.

        Args:
            state: observation reshaped to (1, state_size).

        Returns:
            The index of the chosen action.
        """
        if self.epsilon > EPS_END:
            self.epsilon *= EPS_DECAY
        self.steps += 1
        # Epsilon-greedy exploration/exploitation.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        return self.exploit_action(state)

    def exploit_action(self, state):
        """Return the action with the highest predicted Q-value for `state`.

        Args:
            state: observation reshaped to (1, state_size).
        """
        # An all-ones mask leaves every action's Q-value visible.
        all_actions_mask = np.ones((1, self.action_size))
        act_values = self.network.get_model().predict([state, all_actions_mask])
        return np.argmax(act_values[0])

    def train(self):
        """Run one fit step of the online network on a sampled minibatch.

        Does nothing until the replay memory holds at least BATCH_SIZE
        transitions.
        """
        # Check if we have generated enough data to train.
        if len(self.memory) < BATCH_SIZE:
            return

        # Sample the minibatch to train the network on.
        transitions = self.memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        state_batch = np.asarray(batch.state).reshape(BATCH_SIZE, self.state_size)
        action_batch = np.asarray(batch.action).reshape(BATCH_SIZE, -1)
        reward_batch = np.asarray(batch.reward).reshape(BATCH_SIZE, -1)
        next_state_batch = np.asarray(batch.next_state).reshape(BATCH_SIZE, self.state_size)
        done_batch = np.asarray(batch.done).reshape(BATCH_SIZE, -1)

        # One-hot encoding of the taken actions; doubles as the output mask.
        one_hot_targets = (np.eye(self.action_size)[action_batch]).reshape(BATCH_SIZE, -1)

        # Q-values of every action in the next states, from the target network.
        next_state_values = self.target_net.get_model().predict(
            [next_state_batch, np.ones(one_hot_targets.shape)])

        # Expected Q-values: reward + GAMMA * max_a Q_target(s', a), with the
        # bootstrap term zeroed for transitions that ended the episode.
        end_multiplier = -(done_batch - 1)
        max_next_values = np.max(next_state_values, axis=1, keepdims=True)
        expected_state_action_values = ((max_next_values * GAMMA) * end_multiplier) + reward_batch

        # Fit the agent's own online network - the one that select_action,
        # exploit_action and update_target_model read from - rather than any
        # module-level model object.
        self.network.get_model().fit(
            [state_batch, one_hot_targets],
            one_hot_targets * expected_state_action_values,
            epochs=1, batch_size=BATCH_SIZE, verbose=1)
        #callbacks=[self.tensorboard]

    def run(self, episode_no):
        """Play one training episode, then train once and maybe sync the target.

        Args:
            episode_no: index of the episode; used for logging, for the
                duration ring buffer and for the target-update schedule.
        """
        # Reset the environment to start a fresh trial.
        state = self.env.reset()
        state = np.reshape(state, [1, self.state_size])
        self.steps = 0
        total_reward = 0
        while True:
            action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action)
            next_state = np.reshape(next_state, [1, self.state_size])
            # Penalise episodes that end early.
            # NOTE(review): 195 looks like the CartPole-v0 "solved" threshold,
            # while this agent uses CartPole-v1 - confirm the intended cutoff.
            if done and self.steps < 195:
                reward = -1
            total_reward += reward
            # Add the transition to replay memory.
            self.memory.add(state, action, next_state, reward, done)

            # Move to the next state.
            state = next_state

            if done:
                print("{2} Episode {0} finished after {1} steps  and with total reward {3}"
                      .format(episode_no, self.steps, '\033[92m' if self.steps >= 195 else '\033[99m',
                              total_reward))
                break
        self.duration[(episode_no % 100)] = self.steps
        # Train on one minibatch per episode.
        self.train()

        if episode_no % TARGET_UPDATE == 0:
            self.update_target_model()

    def test(self, number_of_episodes):
        """Play `number_of_episodes` rendered episodes using the greedy policy.

        Prints the number of steps and the total reward of each episode.
        Nothing is added to replay memory and no training takes place.
        """
        for e in range(number_of_episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            t = 0
            total_reward = 0
            while True:
                t += 1
                self.env.render()
                action = self.exploit_action(state)
                next_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                state = np.reshape(next_state, [1, self.state_size])
                if done:
                    break

            print("number of steps : " + str(t))
            print("total_reward : " + str(total_reward))
            
            
        
            
            
# Create the agent and play 500 training episodes; each run() call plays one
# episode and then calls the agent's train() once.
agent = DQN()

for i in range(500):
    agent.run(i)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[99m Episode 0 finished after 16 steps  and with total reward 14.0
[99m Episode 1 finished after 20 steps  and with total reward 18.0
[99m Episode 2 finished after 21 steps  and with total reward 19.0
[99m Episode 3 finished after 15 steps  and with total reward 13.0
[99m Episode 4 finished after 17 steps  and with total reward 15.0
[99m Episode 5 finished after 38 steps  and with total reward 36.0
[99m Episode 6 finished after 63 steps  and with total reward 61.0
Epoch 1/1
[99m Episode 7 finished after 30 steps  and with total reward 28.0
Epoch 1/1
[99m Episode 8 finished after 34 steps  and with total reward 32.0
Epoch 1/1
[99m Episode 9 finished after 78 steps  and with total reward 76.0
Epoch 1/1
[99m Episode 10 finished after 50 steps  and with total reward 48.0
Epoch 1/1
[99m Episode 11 finished after 27 steps  and with total reward 25.0
Epoch 1/1
[99m Episode 

[99m Episode 58 finished after 58 steps  and with total reward 56.0
Epoch 1/1
[99m Episode 59 finished after 39 steps  and with total reward 37.0
Epoch 1/1
[99m Episode 60 finished after 31 steps  and with total reward 29.0
Epoch 1/1
[99m Episode 61 finished after 40 steps  and with total reward 38.0
Epoch 1/1
[99m Episode 62 finished after 44 steps  and with total reward 42.0
Epoch 1/1
[99m Episode 63 finished after 49 steps  and with total reward 47.0
Epoch 1/1
[99m Episode 64 finished after 32 steps  and with total reward 30.0
Epoch 1/1
[99m Episode 65 finished after 43 steps  and with total reward 41.0
Epoch 1/1


In [None]:
# Play 10 rendered evaluation episodes using the greedy policy.
agent.test(10)
print('Complete')


In [None]:
# Save the weights of the agent's online network to "cart.h5".
agent.network.get_model().save_weights("cart.h5")

In [None]:
# Close the gym environment held by the agent.
agent.env.close()