# Hands-on Lesson 10 - Deep reinforcement learning #

In this hands-on lesson, we will use a neural network to control an autonomous car.   

In [8]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model

from IPython.display import HTML
plt.rcParams["animation.html"] = "jshtml"

## Autonomous car environement ##

In this environement, an autonomous car drives on a loop. Its goal is to finish the loop without touching the road borders.
The road is between $r_{int}$ and $r_{ext}$ defined in polar coordinates as
$$ r_{int} = R_{int}\left(1 + \epsilon \cos n_1\theta + \epsilon \sin n_2\theta\right)$$
$$ r_{ext} = R_{ext}\left(1 + \epsilon \cos n_1\theta + \epsilon \sin n_2\theta\right)$$

In [15]:
# PARAMETERS
N1 = 5
N2 = 3
RINT = 1
REXT = 1.5
REPS = 0.05
NSENSORS = 2

class autonomous_car:  
    def __init__(self):
        self.reset()
        self._fill_archive()
        
    def reset(self):
        self.rcar = (self._rint(0) + self._rext(0)) / 2 
        self.thetacar = 0
        self.phicar = np.pi/2
        self.rcararchive = []
        self.thetacararchive = []
        self.statearchive = []
        self.rarchive = []
        self.tarchive = []
        self._xy()
        state, _, _ = self._state()
        return state 

    def step(self, action):
        # Possible actions
        if action == 0: # STRAIGHT
            pas = 1
        if action == 1: # RIGHT
            self.phicar -= np.pi/10
            pas = 0.5
        if action == 2: # LEFT
            self.phicar += np.pi/10
            pas = 0.5
        if action == 3: # RIGHT
            self.phicar -= np.pi/20
            pas = 0.75
        if action == 4: # LEFT
            self.phicar += np.pi/20
            pas = 0.75
            
        self._xy()
        xnew = self.xcar + 0.1 * pas * np.cos(self.phicar)
        ynew = self.ycar + 0.1 * pas * np.sin(self.phicar)
        rcarnew = np.sqrt(xnew**2 + ynew**2)
        thetacarnew = np.arctan2(ynew,xnew)
        thetacarnew = np.remainder(thetacarnew + 0.3, 2*np.pi) - 0.3

        # Reward
        done = False
        reward = thetacarnew - self.thetacar 
        if thetacarnew > 1.65*np.pi:
            done = True
            reward = 10
        if rcarnew < self._rint(thetacarnew):
            done = True
            reward = -1
        if rcarnew > self._rext(thetacarnew):
            done = True
            reward = -1
            
        self.rcar = rcarnew
        self.thetacar = thetacarnew   
        state = self._fill_archive()
        return state, reward, done
    
    def render(self):
        self._init_render()
        self.anim = matplotlib.animation.FuncAnimation(
                        self.fig, self._animate,
                        frames=len(self.rarchive))
        
    def _rint(self,theta):
        return RINT * (1 + REPS*(np.cos(N1*theta)+np.sin(N2*theta)))

    def _rext(self,theta):
        return REXT * (1 + REPS*(np.cos(N1*theta)+np.sin(N2*theta)))
    
    def _fill_archive(self):
        self.rcararchive.append(self.rcar)
        self.thetacararchive.append(self.thetacar)
        state, r, t = self._state()
        self.statearchive.append(state)
        self.rarchive.append(r)
        self.tarchive.append(t)
        return state

    def _init_render(self):
        theta = 2 * np.pi * np.arange(0, 1.01, 0.01)
        self.fig, self.ax = plt.subplots(1, 1, subplot_kw=dict(polar=True))
        self.ax.plot(theta, self._rint(theta),'r')
        self.ax.plot(theta, self._rext(theta),'r')
        self.ax.plot(1.65 * np.pi * np.ones(2), [self._rint(1.65 * np.pi), self._rext(1.65 * np.pi)], 'k:')
        self.fig_car, = self.ax.plot(self.thetacararchive[0], self.rcararchive[0],'ok')
        self.fig_sense = []
        for j in range(NSENSORS):
            fig, = self.ax.plot(
                        [self.thetacararchive[0], self.tarchive[0][j]], 
                        [self.rcararchive[0], self.rarchive[0][j]], 'b')
            self.fig_sense.append(fig)
       
        self.ax.grid(False)
        plt.axis('off')
        plt.ioff()
    
    def _animate(self,i):
        self.fig_car.set_data(self.thetacararchive[i], self.rcararchive[i])
        self.ax.set_title(self.statearchive[i])
        for j in range(NSENSORS):
            self.fig_sense[j].set_data([self.thetacararchive[i], self.tarchive[i][j]], 
                         [self.rcararchive[i], self.rarchive[i][j]])
        
    def _xy(self):
        self.xcar = self.rcar * np.cos(self.thetacar)
        self.ycar = self.rcar * np.sin(self.thetacar)

    def _state(self):
        # theta = self.phicar + np.pi/2 + np.pi/6 * np.array((-1,1)).reshape(-1,1)  # SENSOR ANGLES  
        theta = self.phicar + np.pi/2 + np.pi/6 * np.linspace(-1,1, num=NSENSORS).reshape(-1,1)  # SENSOR ANGLES  
        theta_line =  self.thetacar + (theta - np.pi/2 - self.thetacar) * np.arange(0, 0.95, 0.01)
        
        rint = self.rcar * np.cos(self.thetacar - theta) / np.cos(theta_line - theta) - self._rint(theta_line)
        rext =-self.rcar * np.cos(self.thetacar - theta) / np.cos(theta_line - theta) + self._rext(theta_line)
        rint[:,-1] = -1
        rext[:,-1] = -1
        index = np.minimum(np.argmax(rint<0,axis=1), np.argmax(rext<0,axis=1))

        thetaborder = np.zeros(NSENSORS)
        for j in range(NSENSORS):
            thetaborder[j] = theta_line[j,index[j]]
        rborder = self.rcar * np.cos(self.thetacar - theta[:,0]) / np.cos(thetaborder - theta[:,0])
        
        x = rborder * np.cos(thetaborder)
        y = rborder * np.sin(thetaborder)
        d = np.sqrt((x - self.xcar)**2 + (y - self.ycar)**2)
        return d, rborder, thetaborder

Let's play with this environment with a random walk

In [16]:
myenv = autonomous_car()
myenv.reset()
for _ in range(20):
    action = np.random.randint(5)
    state, reward, done = myenv.step(action)
    print("state, action, reward, done", state, action, reward, done)
    if done: break

state, action, reward, done [0.37208304 0.99961317] 1 0.03579388185326887 False
state, action, reward, done [0.36235297 0.85647248] 0 0.06880746079030797 False
state, action, reward, done [0.30840442 1.00857961] 4 0.05223325246215027 False
state, action, reward, done [0.22343385 0.4603078 ] 1 0.028544247480435103 False
state, action, reward, done [0.19961123 0.48574049] 4 0.04526340875345197 False
state, action, reward, done [0.1842519  0.95848704] 2 0.033181584409562315 False
state, action, reward, done [0.17894201 0.66611762] 0 0.06455871924688084 False
state, action, reward, done [0.08938336 0.14205932] 3 0.043288641832980645 False
state, action, reward, done [0.05 0.05] 2 -1 True


In [18]:
myenv.render()
myenv.anim

  if s != self._text:


## Deep Q-learning algorithm

In [19]:
class Qnetwork(Model):
    def __init__(self, Nstates, Nhidden, Nactions):
        super(Qnetwork, self).__init__()
        self.input_layer = layers.InputLayer(input_shape = (Nstates,))
        self.dense1 = layers.Dense(Nhidden, activation='tanh')
        self.dense2 = layers.Dense(Nactions, activation='linear')

    @tf.function
    def call(self, inputs, **kwargs):
        x = self.input_layer(inputs)
        x = self.dense1(x)
        x = self.dense2(x)
        return x

    
class Qlearning():
    def __init__(self, Nstates, Nhidden, Nactions, LearningRate):
        self.model = Qnetwork(Nstates, Nhidden, Nactions)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LearningRate, )
        self.loss = tf.keras.losses.MeanSquaredError()
        
    def Qvalues(self, state):        
        return self.model(state)
    
    def action(self, state):        
        q_values = self.model(state)
        return np.argmax(q_values[0])  
    
    @tf.function
    def train(self, state, Qtarget):
        train_variables = self.model.trainable_variables
        with tf.GradientTape() as tape:
            Qoutput = self.model(state)
            loss = self.loss(Qtarget, Qoutput)
        
        gradients = tape.gradient(loss, train_variables)
        self.optimizer.apply_gradients(zip(gradients, train_variables))

Now, let's try to train this network to control the car. 

In [20]:
N_STATES = NSENSORS
N_ACTIONS = 3
N_HIDDEN = 4
LEARNING_RATE = 0.003
DISCOUNT_FACTOR = 0.99
N_EPISODES = 1001
EPSILON = 0.9

myenv = autonomous_car()
myqlearning = Qlearning(
    Nstates=N_STATES, 
    Nhidden=N_HIDDEN, 
    Nactions=N_ACTIONS, 
    LearningRate=LEARNING_RATE)
step = 0
list_fullrewards = []

for episode in range(N_EPISODES):
    # initial state
    s = myenv.reset()
    total_reward = 0
    EPSILON *= 0.9975
    
    while True:
        # epsilon-greedy policy
        a = myqlearning.action(s[np.newaxis,:])
        if np.random.rand(1) < EPSILON:
            a = np.random.randint(N_ACTIONS)
        s_, r, done = myenv.step(a)
        
        Qtarget = myqlearning.Qvalues(s[np.newaxis,:])
        Q_ = myqlearning.Qvalues(s_[np.newaxis,:])
        maxQ_ = np.max(Q_[0])
        Qtarget = np.array(Qtarget)
        Qtarget[0,a] = r + DISCOUNT_FACTOR*maxQ_
        myqlearning.train(s[np.newaxis,:],Qtarget)
        s = s_
        total_reward += r
        if done:
            break
    
    list_fullrewards.append(total_reward)

    if episode % 100 == 0:
        print("Episode: ", episode, "  // epsilon: ", EPSILON, "  // Total reward: ", total_reward)



Episode:  0   // epsilon:  0.89775   // Total reward:  -0.2935594465152338
Episode:  100   // epsilon:  0.6989495822916731   // Total reward:  0.19216956928627527
Episode:  200   // epsilon:  0.5441721176114787   // Total reward:  2.5456062058658198
Episode:  300   // epsilon:  0.42366903291486324   // Total reward:  1.508860738036772
Episode:  400   // epsilon:  0.3298505080320368   // Total reward:  15.149236929582937
Episode:  500   // epsilon:  0.2568074350405888   // Total reward:  15.153162449539014
Episode:  600   // epsilon:  0.1999392363698308   // Total reward:  15.177400887467668
Episode:  700   // epsilon:  0.15566409996592523   // Total reward:  0.5794781123234158
Episode:  800   // epsilon:  0.12119338083986932   // Total reward:  15.16587520531937
Episode:  900   // epsilon:  0.09435595980455848   // Total reward:  15.173578823698854
Episode:  1000   // epsilon:  0.07346149673308394   // Total reward:  15.17046399027124


## Some questions now
- Plot an animation of the greedy policy
- Plot the function $a = \pi^*(s)$
- Plot the evolution of the loss/total reward as learning is implemented
- Play with the parameters (learning rate, hidden layers, number of neurons, etc.)
- Try to see if learning is better/faster with more sensors
- Try to play with the rewards to see if learning is better/faster
- Try to see if the car can learn different tracks
- Try to see if the car can learn random tracks
- Could you code learning with "experience replay" (i.e. mini-batch descent)?