# Hands-on Lesson 10 - Deep reinforcement learning #

In this hands-on lesson, we will use a neural network to control an autonomous car.   

In [1]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model

from IPython.display import HTML
plt.rcParams["animation.html"] = "jshtml"

## Autonomous car environement ##
The road is between $r_{int}$ and $r_{ext}$ defined in polar coordinates as
$$ r_{int} = R_{int}\left(1 + \epsilon \cos n_1\theta + \epsilon \sin n_2\theta\right)$$
$$ r_{ext} = R_{ext}\left(1 + \epsilon \cos n_1\theta + \epsilon \sin n_2\theta\right)$$

In [15]:
# PARAMETERS
N1 = 5
N2 = 3
RINT = 1
REXT = 1.5
REPS = 0.05
NSENSORS = 3

class autonomous_car:  
    def __init__(self):
        self.reset()
        self._fill_archive()
        
    def reset(self):
        self.rcar = (self._rint(0) + self._rext(0)) / 2 
        self.thetacar = 0
        self.phicar = np.pi/2
        self.rcararchive = []
        self.thetacararchive = []
        self.statearchive = []
        self.rarchive = []
        self.tarchive = []
        self._xy()
        state, _, _ = self._state()
        return state 

    def step(self, action):
        # Possible actions
        if action == 0: # STRAIGHT
            pas = 1
        if action == 1: # RIGHT
            self.phicar -= np.pi/10
            pas = 0.5
        if action == 2: # LEFT
            self.phicar += np.pi/10
            pas = 0.5
        if action == 3: # RIGHT
            self.phicar -= np.pi/20
            pas = 0.75
        if action == 4: # LEFT
            self.phicar += np.pi/20
            pas = 0.75
            
        self._xy()
        xnew = self.xcar + 0.1 * pas * np.cos(self.phicar)
        ynew = self.ycar + 0.1 * pas * np.sin(self.phicar)
        rcarnew = np.sqrt(xnew**2 + ynew**2)
        thetacarnew = np.arctan2(ynew,xnew)
        thetacarnew = np.remainder(thetacarnew + 0.3, 2*np.pi) - 0.3

        # Reward
        done = False
        reward = thetacarnew - self.thetacar - 0.01 
        if thetacarnew > 1.65*np.pi:
            done = True
            reward = 10
        if rcarnew < self._rint(thetacarnew):
            done = True
            reward = -1
        if rcarnew > self._rext(thetacarnew):
            done = True
            reward = -1
            
        self.rcar = rcarnew
        self.thetacar = thetacarnew   
        state = self._fill_archive()
        return state, reward, done
    
    def render(self):
        self._init_render()
        self.anim = matplotlib.animation.FuncAnimation(
                        self.fig, self._animate,
                        frames=len(self.rarchive))
        
    def _rint(self,theta):
        return RINT * (1 + REPS*(np.cos(N1*theta)+np.sin(N2*theta)))

    def _rext(self,theta):
        return REXT * (1 + REPS*(np.cos(N1*theta)+np.sin(N2*theta)))
    
    def _fill_archive(self):
        self.rcararchive.append(self.rcar)
        self.thetacararchive.append(self.thetacar)
        state, r, t = self._state()
        self.statearchive.append(state)
        self.rarchive.append(r)
        self.tarchive.append(t)
        return state

    def _init_render(self):
        theta = 2 * np.pi * np.arange(0, 1.01, 0.01)
        self.fig, self.ax = plt.subplots(1, 1, subplot_kw=dict(polar=True))
        self.ax.plot(theta, self._rint(theta),'r')
        self.ax.plot(theta, self._rext(theta),'r')
        self.ax.plot(1.65 * np.pi * np.ones(2), [self._rint(1.65 * np.pi), self._rext(1.65 * np.pi)], 'k:')
        self.fig_car, = self.ax.plot(self.thetacararchive[0], self.rcararchive[0],'ok')
        self.fig_sense = []
        for j in range(NSENSORS):
            fig, = self.ax.plot(
                        [self.thetacararchive[0], self.tarchive[0][j]], 
                        [self.rcararchive[0], self.rarchive[0][j]], 'b')
            self.fig_sense.append(fig)
       
        self.ax.grid(False)
        plt.axis('off')
        plt.ioff()
    
    def _animate(self,i):
        self.fig_car.set_data(self.thetacararchive[i], self.rcararchive[i])
        self.ax.set_title(self.statearchive[i])
        for j in range(NSENSORS):
            self.fig_sense[j].set_data([self.thetacararchive[i], self.tarchive[i][j]], 
                         [self.rcararchive[i], self.rarchive[i][j]])
        
    def _xy(self):
        self.xcar = self.rcar * np.cos(self.thetacar)
        self.ycar = self.rcar * np.sin(self.thetacar)

    def _state(self):
        theta = self.phicar + np.pi/2 + np.pi/6 * np.array((1,0,-1)).reshape(-1,1)  # SENSOR ANGLES  
        theta_line =  self.thetacar + (theta - np.pi/2 - self.thetacar) * np.arange(0, 0.95, 0.01)
        
        rint = self.rcar * np.cos(self.thetacar - theta) / np.cos(theta_line - theta) - self._rint(theta_line)
        rext =-self.rcar * np.cos(self.thetacar - theta) / np.cos(theta_line - theta) + self._rext(theta_line)
        rint[:,-1] = -1
        rext[:,-1] = -1
        index = np.minimum(np.argmax(rint<0,axis=1), np.argmax(rext<0,axis=1))

        thetaborder = np.zeros(NSENSORS)
        for j in range(NSENSORS):
            thetaborder[j] = theta_line[j,index[j]]
        rborder = self.rcar * np.cos(self.thetacar - theta[:,0]) / np.cos(thetaborder - theta[:,0])
        
        x = rborder * np.cos(thetaborder)
        y = rborder * np.sin(thetaborder)
        d = np.sqrt((x - self.xcar)**2 + (y - self.ycar)**2)
        return d, rborder, thetaborder

Let's play with this environment with a random walk

In [18]:
myenv = autonomous_car()
myenv.reset()
for _ in range(20):
    action = np.random.randint(5)
    state, reward, done = myenv.step(action)
    print("state, action, reward, done", state, action, reward, done)
    if done: break

state, action, reward, done [1.87406577 0.97322116 0.55240942] 4 0.04688695791598064 False
state, action, reward, done [0.65644002 1.18687292 0.604155  ] 4 0.04643792855936136 False
state, action, reward, done [0.45502516 1.44544217 0.65418466] 4 0.045713304992772825 False
state, action, reward, done [0.36039765 1.59409615 0.80497829] 4 0.04467903651381972 False
state, action, reward, done [0.27412847 1.64452739 1.01934596] 4 0.04328409927337707 False
state, action, reward, done [0.41162199 1.23322542 0.62768051] 1 0.03147461929581231 False
state, action, reward, done [0.26008741 1.51907762 0.93710911] 2 0.028446789490911735 False
state, action, reward, done [0.22524645 0.40943882 1.1528067 ] 4 0.045744253856289226 False
state, action, reward, done [0.18400592 0.34313703 1.12840961] 0 0.07037128652043358 False
state, action, reward, done [0.13219593 0.11169002 1.28274124] 2 0.024706977005869386 False
state, action, reward, done [0.075 0.075 0.075] 4 -1 True


In [19]:
myenv.render()
myenv.anim

  if s != self._text:


## Deep Q-learning algorithm

In [24]:
class Qnetwork(Model):
    def __init__(self, Nstates, Nhidden, Nactions):
        super(Qnetwork, self).__init__()
        self.input_layer = layers.InputLayer(input_shape = (Nstates,))
        self.dense1 = layers.Dense(Nhidden, activation='tanh')
        self.dense2 = layers.Dense(Nactions, activation='linear')

    @tf.function
    def call(self, inputs, **kwargs):
        x = self.input_layer(inputs)
        x = self.dense1(x)
        x = self.dense2(x)
        return x

    
class Qlearning():
    def __init__(self, Nstates, Nhidden, Nactions, LearningRate):
        self.model = Qnetwork(Nstates, Nhidden, Nactions)
        self.optimizer = tf.keras.optimizers.Adam(lr=LearningRate, )
        self.loss = tf.keras.losses.MeanSquaredError()
        
    def Qvalues(self, state):        
        return self.model(state)
    
    def action(self, state):        
        q_values = self.model(state)
        return np.argmax(q_values[0])  
    
    @tf.function
    def train(self, state, Qtarget):
        train_variables = self.model.trainable_variables
        with tf.GradientTape() as tape:
            Qoutput = self.model(state)
            loss = self.loss(Qtarget, Qoutput)  # 0.5 * (Qtarget - Qoutput)**2
        
        gradients = tape.gradient(loss, train_variables)
        self.optimizer.apply_gradients(zip(gradients, train_variables))

Now, let's try to train this network to control the car. 

In [25]:
N_STATES = NSENSORS
N_ACTIONS = 3  # 5
N_HIDDEN = 4
LEARNING_RATE = 0.003
DISCOUNT_FACTOR = 0.99
N_EPISODES = 1001
EPSILON = 0.9

myenv = autonomous_car()
myqlearning = Qlearning(
    Nstates=N_STATES, 
    Nhidden=N_HIDDEN, 
    Nactions=N_ACTIONS, 
    LearningRate=LEARNING_RATE)
step = 0
list_fullrewards = []

for episode in range(N_EPISODES):
    # initial state
    s = myenv.reset()
    total_reward = 0
    EPSILON *= 0.9975
    
    while True:
        # epsilon-greedy policy
        a = myqlearning.action(s[np.newaxis,:])
        if np.random.rand(1) < EPSILON:
            a = np.random.randint(N_ACTIONS)
        s_, r, done = myenv.step(a)
        
        Qtarget = myqlearning.Qvalues(s[np.newaxis,:])  # Qtarget is initiliazed with Qoutput
        Q_ = myqlearning.Qvalues(s_[np.newaxis,:])
        maxQ_ = np.max(Q_[0])
        Qtarget = np.array(Qtarget)
        Qtarget[0, a] = r + DISCOUNT_FACTOR*maxQ_
        myqlearning.train(s[np.newaxis,:], Qtarget)
        # For experience replay, we need to store [s, a, r, s_, maxQ_]
        s = s_
        total_reward += r
        if done:
            break
    
    list_fullrewards.append(total_reward)

    if episode % 100 == 0:
        print("Episode: ", episode, "  // epsilon: ", EPSILON, "  // Total reward: ", total_reward)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Episode:  0   // epsilon:  0.89775   // Total reward:  -0.6797660192883922
Episode:  100   // epsilon:  0.6989495822916731   // Total reward:  -0.09427878150782354
Episode:  200   // epsilon:  0.5441721176114787   // Total reward:  0.22406522704413634
Episode:  300   // epsilon:  0.42366903291486324   // Total reward:  14.339001755704633
Episode:  400   // epsilon:  0.3298505080320368   // Total reward:  14.294035577353354
Episode:  500   // epsilon:  0.2568074350405888   // Total reward:  14.23791530706079
Episode:  600   // epsilon:  0.1999392363698308   // Total reward:  14.293215077210698
Episode:  700   // epsilon:  0.15566409996592523   // Total reward:  14.31901021532165
Episode:  80

## Some questions now
- Plot an animation of the greedy policy
- trace a = pi(s)
- Plot the evolution of the loss/total reward as learning is implemented
- Play with the parameters (learning rate, hidden layers, number of neurons, etc.)
- Try to see if learning is better/faster with more sensors
- Try to play with the rewards to see if learning is better/faster
- Try to see if the car can learn different tracks
- Try to see if the car can learn random tracks
- Could you code learning with "experience replay" (i.e. mini-batch descent)?

In [12]:
s = np.array([1, .1])
print(s, myqlearning.action(s[np.newaxis,:]))

[1.  0.1] 1


In [22]:
myenv = autonomous_car()
state = myenv.reset()
for _ in range(200):
    action = myqlearning.action(state[np.newaxis,:])
    state, reward, done = myenv.step(action)
    print("state, action, reward, done", state, action, reward, done)
    if done: break

state, action, reward, done [1.66241831 0.75929657 0.49532638] 0 0.0660435592791171 False
state, action, reward, done [1.52530533 0.65202451 0.42842852] 0 0.06517413086732367 False
state, action, reward, done [1.36149765 0.55296931 0.38727392] 0 0.06349347826820197 False
state, action, reward, done [1.17641357 0.46077957 0.32967596] 0 0.06110860012435471 False
state, action, reward, done [0.97417243 0.35299602 0.25771295] 0 0.05815918797064668 False
state, action, reward, done [1.4514053  0.44638344 0.22226421] 2 0.02547754881048992 False
state, action, reward, done [1.32981511 0.39793981 0.23046304] 0 0.060228758041157134 False
state, action, reward, done [1.17388748 0.30630085 0.19476847] 0 0.058717842214984724 False
state, action, reward, done [1.54707946 0.68096379 0.17452902] 2 0.02470534434086203 False
state, action, reward, done [1.45223027 0.63905809 0.21871306] 0 0.059727378105489455 False
state, action, reward, done [1.2883272 0.5293584 0.1985832] 0 0.05958671719165443 False


In [23]:
myenv.render()
myenv.anim

  if s != self._text:
