In [159]:
%matplotlib inline

# Deep reinforcement learning #

Arthur ALCARAZ

In [160]:
from time import sleep, time
import numpy as np
import random
import gym
import sys

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam

import matplotlib.pyplot as plt

from collections import namedtuple
from IPython.display import clear_output

## Deep Q-Network ##

In [161]:
class DQN():
    """Keras feed-forward network wrapped for Q-value estimation.

    The wrapped model takes ``observation_space_dim`` inputs and produces
    ``action_space_dim`` linear outputs.
    """

    def __init__(self, observation_space_dim, action_space_dim):
        """Remember the layer sizes and build the compiled model.

        Args:
            observation_space_dim: Size of the model's input layer.
            action_space_dim: Size of the model's output layer.
        """
        self.observation_space_dim = observation_space_dim
        self.action_space_dim = action_space_dim
        self._build_model()

    def _build_model(self):
        """Create ``self.model``: two 24-unit ReLU layers and a linear output layer."""
        hidden_units = 24
        self.model = Sequential()
        stack = [
            Dense(hidden_units, input_dim=self.observation_space_dim, activation='relu'),
            Dense(hidden_units, activation='relu'),
            Dense(self.action_space_dim, activation='linear'),
        ]
        for layer in stack:
            self.model.add(layer)
        # Mean squared error loss, Adam with its default settings, and mean
        # absolute error tracked as an extra metric.
        self.model.compile(loss='mse', optimizer=Adam(), metrics=['mae'])

    def predict(self, next_state):
        """Return the model's output for the given batch of states.

        Args:
            next_state: Input passed unchanged to ``self.model.predict``.
        """
        outputs = self.model.predict(next_state)
        return outputs

    def update(self, states_batch, targets_batch):
        """Run one training step and return whatever ``train_on_batch`` returns.

        Args:
            states_batch: Model inputs for the step.
            targets_batch: Regression targets for the step.
        """
        step_result = self.model.train_on_batch(states_batch, targets_batch)
        return step_result

## Experience replay ##

* Transition
* Replay Memory

Based on code by Adam Paszke <https://github.com/apaszke>

In [162]:
# One stored step of experience: the state acted in, the action taken, the
# resulting state, the reward received and the episode-finished flag.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))


class ReplayMemory(object):
    """Fixed-capacity buffer of `Transition`s that overwrites the oldest entry when full.

    Attributes:
        capacity: Maximum number of transitions kept.
        memory: List holding the stored transitions.
        position: Index at which the next transition will be written.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions to keep; must be positive.

        Raises:
            ValueError: If ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError("capacity must be positive, got {}".format(capacity))
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: The five `Transition` fields, in order:
                state, action, next_state, reward, done.

        Raises:
            TypeError: If the wrong number of fields is given. The buffer is
                left untouched in that case.
        """
        # Build the record before touching the buffer, so a malformed call
        # cannot leave a None placeholder behind for sample() to hand out.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = transition
        # Wrap around so that, once full, the oldest slot is overwritten next.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

## Policy ##

In [163]:
def egreedy_action(state, epsilon, valid_actions, dqn):
    """Choose an action with an epsilon-greedy policy.

    A uniform random number is drawn; if it is below ``epsilon`` a random
    element of ``valid_actions`` is returned, otherwise the index of the
    largest value in ``dqn.model.predict(state)``.

    Args:
        state: Input passed to ``dqn.model.predict``.
        epsilon: Exploration threshold compared against a draw from [0, 1).
        valid_actions: Actions to choose from when exploring.
        dqn: Object exposing a ``model`` with a ``predict`` method.

    Returns:
        The selected action.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.choice(valid_actions)

    q_values = dqn.model.predict(state)
    return np.argmax(q_values)

## Training ##

In [164]:
# Environment: use the inner `.env` of the object returned by gym.make
env = gym.make('CartPole-v1').env

# Layer sizes for the Q-networks: one output per action, one input per observation component
action_space_dim = env.action_space.n
observation_space_dim = env.observation_space.shape[0]

In [165]:
# Hyperparameters
# NOTE(review): replay_memory_size and replay_memory_init_size are not read
# anywhere in this cell; the buffer below keeps its original capacity of 50000.
# Confirm which value was intended before wiring them in.
replay_memory_size = 500000
replay_memory_init_size = 50000
update_target_network_every = 1000
discount_factor = 0.99
epsilon_start = 1.0
epsilon_end = 0.1
epsilon_decay_steps = 500000
batch_size = 32
max_steps_per_episode = 500

policy_network = DQN(observation_space_dim, action_space_dim)
target_network = DQN(observation_space_dim, action_space_dim)

memory = ReplayMemory(50000)

episodes = 1000

# One entry per discrete action of the environment
valid_actions = list(range(action_space_dim))

# Number of environment steps taken so far, across all episodes
total_t = 0

# The epsilon decay schedule: linear from epsilon_start to epsilon_end
epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

stats = []

# An episode can end before the first optimisation step has run; give the
# progress line something to print instead of failing on an unbound name.
loss = None

# Row indices used to write one target per sampled transition
batch_rows = np.arange(batch_size)

# Training
for e in range(episodes):
    state = env.reset()
    state = np.array([state])

    for t in range(max_steps_per_episode):

        # Epsilon for this time step (held at its final value once the schedule ends)
        epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

        action = egreedy_action(state, epsilon, valid_actions, policy_network)

        next_state, reward, done, _ = env.step(action)
        next_state = np.array([next_state])

        # Store the transition in memory
        memory.push(state, action, next_state, reward, done)

        state = next_state

        # Optimize once the buffer holds more than one batch of transitions
        if len(memory) > batch_size:

            # Sample a minibatch from the replay memory
            samples = memory.sample(batch_size)

            states_batch, action_batch, next_states_batch, reward_batch, done_batch = map(np.array, zip(*samples))
            states_batch = states_batch.reshape((batch_size, observation_space_dim))
            next_states_batch = next_states_batch.reshape((batch_size, observation_space_dim))

            # Bootstrapped targets: reward plus the discounted best next-state
            # value from the target network, with the bootstrap term zeroed
            # for transitions that ended an episode.
            q_values_next = target_network.predict(next_states_batch)
            not_done = np.logical_not(done_batch).astype(np.float32)
            targets_batch = reward_batch + not_done * discount_factor * np.amax(q_values_next, axis=1)

            # Start from the policy network's own predictions so that only the
            # output of the action actually taken contributes to the error.
            targets = policy_network.predict(states_batch)
            targets[batch_rows, action_batch] = targets_batch

            # Perform gradient descent update
            loss = policy_network.update(states_batch, targets)

        # Maybe update the target estimator.
        # Copy the weights instead of rebinding the name: assigning
        # `target_network = policy_network` would make both names refer to the
        # same object, so the targets would come from the network being trained.
        if total_t % update_target_network_every == 0:
            target_network.model.set_weights(policy_network.model.get_weights())
            print("\nCopied model parameters to target network.")

        # Count every environment step, including the one that ends the
        # episode, so each value of total_t is used for exactly one step.
        total_t += 1

        if done:
            clear_output(wait=True)
            print("episode: {}/{}, loss: {}, score: {}, total_t: {}".format(e, episodes, loss, t, total_t))
            break


env.close()

episode: 999/1000, loss: [407259.44, 262.05908], score: 34, total_t: 22266


In [168]:
# Play game
print("\nPlaying Game...")
sleep(1)

# `env` was created as `gym.make('CartPole-v1').env`, i.e. with the outermost
# wrapper removed. NOTE(review): that wrapper is presumably the time limit, in
# which case nothing ends an episode where the pole never falls -- so cap the
# rollout explicitly. 500 matches the per-episode step cap used in training.
max_play_steps = 500

s = env.reset()
done = False
for play_step in range(max_play_steps):
    env.render()
    # Always act greedily with respect to the trained policy network
    a = np.argmax(policy_network.predict(np.array([s])))
    newS, r, done, _ = env.step(a)
    s = newS
    if done:
        break


Playing Game...


In [169]:
# Close the environment now that the rendered play-through has finished.
env.close()