# <center>-- Performance of a random policy --</center>

In [11]:
# Baseline: run a uniformly random policy on CartPole and record the
# undiscounted return of every episode in "stat_randomagent.dat"
# (one "<episode> <total_reward>" line per episode).
import gym
env = gym.make('CartPole-v0')

# `with` guarantees the file is flushed and closed even if an episode raises.
with open("stat_randomagent.dat","w") as statfile:

    for ep in range(100): #episodes

        state = env.reset()

        total_reward = 0

        for t in range(100): #episode length

            env.render() # comment this line if you don't want to see the cartpole
            #print(state)

            action = env.action_space.sample() #sample a random action

            # The agent action is implemented using env.step
            # env.step returns:
            # state= [x,x_dot,theta,theta_dot]
            # reward
            # done={True,False} True= the agent reached one of the terminal states
            state, reward, done, info = env.step(action)
            total_reward += reward

            if done:
                break

        # Log every episode, including one that survives the whole step
        # budget without reaching a terminal state; writing only inside the
        # `if done` branch would silently drop those episodes.
        statfile.write('{} {}\n'.format(ep,total_reward))

env.close()



# <center>-- Setting up the deep neural network --</center>

In [12]:
import tensorflow as tf
import numpy as np


class QNetwork:
    """Small fully connected Q-value network built as a TensorFlow 1.x graph.

    Everything is created inside ``tf.variable_scope(name)``.

    Placeholders
    ------------
    inputs_   : float32, [None, state_size]  -- batch of states
    actions_  : int32,   [None]              -- index of the action taken
    targetQs_ : float32, [None]              -- regression target for Q(s, a)

    Tensors / ops
    -------------
    fc1, fc2 : two ReLU hidden layers with ``hidden_size`` units each
    output   : linear layer with one value per action
    Q        : the entry of ``output`` selected by ``actions_``
    loss     : mean over the batch of (targetQs_ - Q)^2
    opt      : Adam step (with ``learning_rate``) minimising ``loss``
    """

    def __init__(self, learning_rate=0.01, state_size=4, 
                 action_size=2, hidden_size=10, 
                 name='QNetwork'):

        dense = tf.contrib.layers.fully_connected

        with tf.variable_scope(name):
            # Batch of observed states.
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')

            # Actions taken; the one-hot mask is used further down to pick
            # the matching column of the network output.
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            action_mask = tf.one_hot(self.actions_, action_size)

            # Targets the selected Q-values are regressed onto.
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

            # Two ReLU hidden layers, ReLU(x) = max(0, x).
            self.fc1 = dense(self.inputs_, hidden_size, activation_fn=tf.nn.relu)
            self.fc2 = dense(self.fc1, hidden_size, activation_fn=tf.nn.relu)

            # Linear read-out: one Q-value per action.
            self.output = dense(self.fc2, action_size, activation_fn=None)

            # Masking with the one-hot actions and summing over the action
            # axis leaves only the Q-value of the action that was taken.
            self.Q = tf.reduce_sum(tf.multiply(self.output, action_mask), axis=1)

            # Mean squared error between target and predicted Q.
            td_error = self.targetQs_ - self.Q
            self.loss = tf.reduce_mean(tf.square(td_error))

            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

# <center> -- Experience replay -- </center>

## - inspired by biology (e.g. rats replay past experiences while sleeping or while awake at rest)
## - removes the correlations between consecutive experiences


In [13]:
from collections import deque

class Memory():
    """Bounded replay buffer of experiences.

    Backed by a ``deque(maxlen=max_size)``: once the buffer is full, adding
    a new experience drops the oldest one.
    """

    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` distinct stored experiences.

        Positions are drawn uniformly without replacement, so NumPy raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored
        experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)

        batch = []
        for pos in chosen:
            batch.append(self.buffer[pos])
        return batch

# <center> -- The algorithm in words -- </center>

## -- Training --

- initialize the action-value network $Q$ with random weights
- initialize the memory M

> for episode in $1, \cdots, N$  
> > &nbsp;&nbsp; for each step in the episode (until any terminal state has been reached)   
> > > &nbsp;&nbsp;&nbsp;&nbsp; - select an action $a_t$ using the $\epsilon$-greedy rule _(exploration-exploitation)_  
> > > &nbsp;&nbsp;&nbsp;&nbsp; - execute the action $a_t$ and observe reward $r_{t+1}$ and $s_{t+1}$  
> > > &nbsp;&nbsp;&nbsp;&nbsp; - save the experience $e_t = \{s_t,a_t,r_{t+1},s_{t+1}\}$ in the memory M  
> > > &nbsp;&nbsp;&nbsp;&nbsp; - sample a mini-batch of experiences uniformly from the memory  
> > > &nbsp;&nbsp;&nbsp;&nbsp; - SGD step to adjust the $\textbf{w}$'s with  $L(\textbf{w}) = \frac{1}{2}(\hat{Q} - Q)^2$ &nbsp;&nbsp; ($\hat{Q} = r + \gamma \max_{a'} Q(s',a';\textbf{w})$ is the _target_; when $s'$ is terminal the target is simply $\hat{Q} = r$)


RETURN: the right $\textbf{w*}$'s such that $ a^*(s) = \text{argmax}_a Q(s,a; \textbf{w*})$

## -- Testing --

Use $Q(s,a; \textbf{w*})$ to play the game

# <center> -- Training --</center>

# <center> Initialize parameters </center>

In [38]:
# Hyperparameters read by the memory pre-fill and training cells.
train_episodes = 200         # max number of episodes to learn from
max_steps = 100              # cap on the number of agent steps in one training episode
gamma = 0.99                   # discount factor applied to the bootstrapped next-state value

# Exploration parameters (epsilon-greedy action selection)
explore_start = 0.1         # exploration probability at start
explore_stop = 0.001            # minimum exploration probability; in the visible training cell it only appears in a commented-out decay formula

# Network parameters
hidden_size = 32            # number of units in each Q-network hidden layer
learning_rate = 0.0008        # Q-network learning rate (passed to the Adam optimizer)

# Memory parameters
memory_size = 10000 # replay memory capacity (oldest experiences are dropped once full)
batch_size = 32              # experience mini-batch size
pretrain_length = batch_size   # number of random-policy experiences stored before training starts

# <center> Before starting, we fill the memory with experiences from a random policy </center>

In [39]:
# Seed the replay memory with `pretrain_length` transitions collected by a
# uniformly random policy, so the first training step already has a full
# mini-batch to sample from.

# Fresh episode, then one random "kick" to get the pole and cart moving
env.reset()
state, reward, done, _ = env.step(env.action_space.sample())

memory = Memory(max_size=memory_size)

for ii in range(pretrain_length):

    # Act randomly and observe the outcome
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    # A finished episode has no successor state: it is stored as all zeros
    if done:
        next_state = np.zeros(state.shape)

    # Exactly one experience is stored per iteration
    memory.add((state, action, reward, next_state))

    if done:
        # Begin a new episode, again with one random kick
        env.reset()
        state, reward, done, _ = env.step(env.action_space.sample())
    else:
        # Keep rolling out the current episode
        state = next_state

# <center> -- Agent/Environment interaction -- </center>

In [40]:
# Train the Q-network with epsilon-greedy exploration and experience replay.
#
# Side effects:
#   * "stat_training.dat": one line per episode,
#     "<episode> <total_reward> <last_loss> <explore_p>"
#   * "./dnn_cartpole.ckpt": checkpoint of the trained weights
#   * `rewards_list`: list of (episode, total_reward) tuples
explore_p = explore_start
rewards_list = []

tf.reset_default_graph()
mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)

saver = tf.train.Saver()

# `with` closes the stats file even if training raises part-way through.
with tf.Session() as sess, open("stat_training.dat","w") as statfile:

    # Initialize variables
    sess.run(tf.global_variables_initializer())

    # Bound up front so the per-episode log can never hit an undefined name
    # (e.g. if max_steps were set to 0).
    loss = float('nan')

    # train_episodes + 1 so that exactly `train_episodes` episodes are run.
    for ep in range(1, train_episodes + 1):

        # Start every episode from a freshly reset environment and take one
        # random step to get the pole and cart moving. Taking `state` from
        # this step keeps the agent's state in sync with the environment
        # (a bare env.reset() would leave a stale `state` behind).
        env.reset()
        state, _, _, _ = env.step(env.action_space.sample())

        total_reward = 0
        t = 0
        done = False

        while t < max_steps and not done:

            # Uncomment this next line to watch the training. WARNING: It slows down the code.
            # env.render()

            if explore_p > np.random.rand():

                # Explore: random action
                action = env.action_space.sample()

            else:

                # Exploit: greedy action from the Q-network (feed-forward pass).
                # Qs[0, a] is the estimated value of action a; in CartPole
                # action 0 pushes the cart left and action 1 pushes it right.
                feed = {mainQN.inputs_: state.reshape(1, *state.shape)}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                action = np.argmax(Qs)

            # Take action, get new state and reward
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            t += 1

            if done:
                # The episode ended, so there is no next state; an all-zero
                # vector marks the transition as terminal for the target
                # computation below.
                next_state = np.zeros(state.shape)

            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state

            ####### ----------- TRAINING THE NETWORK ----------- #######

            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            actions = np.array([each[1] for each in batch])
            rewards = np.array([each[2] for each in batch])
            next_states = np.array([each[3] for each in batch])

            # Bootstrapped value of the next states
            target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})

            # Terminal transitions (all-zero next state) must not bootstrap.
            # Assigning the scalar 0 works for any number of actions.
            episode_ends = (next_states == 0).all(axis=1)
            target_Qs[episode_ends] = 0

            targets = rewards + gamma * np.max(target_Qs, axis=1)

            loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                feed_dict={mainQN.inputs_: states,
                                           mainQN.targetQs_: targets,
                                           mainQN.actions_: actions})

        # Log every episode, whether it ended in a terminal state or by
        # exhausting the step budget. Logging only on `done` would drop
        # exactly the episodes in which the pole stayed up for all
        # `max_steps` steps.
        statfile.write('{} {} {} {}\n'.format(ep,total_reward,loss,explore_p))
        statfile.flush()
        if ep % 100 == 0: print('Episode {}'.format(ep))
        rewards_list.append((ep, total_reward))

        # Exploration decay is intentionally left disabled (explore_p stays
        # at explore_start). To enable it:
        ##explore_p = explore_stop + (explore_start - explore_stop)/ep

    saver.save(sess, "./dnn_cartpole.ckpt")


Episode 100


# <center> TESTING </center>

In [None]:
# Play the game greedily with the trained Q-network and print
# "<episode> <total_reward>" for every test episode.
test_episodes = 10
test_max_steps = 200

try:
    with tf.Session() as sess:

        #saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
        saver.restore(sess, './dnn_cartpole.ckpt')

        # test_episodes + 1 so that exactly `test_episodes` episodes are played.
        for ep in range(1, test_episodes + 1):

            # Fresh episode; one random step gets the pole and cart moving.
            # The reward of this random step is not counted in `totalr`.
            env.reset()
            state, reward, done, _ = env.step(env.action_space.sample())

            totalr = 0
            t = 0
            done = False

            while t < test_max_steps and not done:
                env.render()

                # Get the greedy action from the Q-network
                feed = {mainQN.inputs_: state.reshape(1, *state.shape)}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                action = np.argmax(Qs)

                # Take action, get new state and reward
                state, reward, done, _ = env.step(action)
                totalr += reward
                t += 1

            # Report every episode, including one that reaches the step cap
            # without a terminal state.
            print('{} {}'.format(ep,totalr))
finally:
    # Release the environment (and its render window) even if restoring the
    # checkpoint or the rollout fails.
    env.close()