# Training self driving car on TD3 deep reinforcement learning algorithm 

## Summary of what is done 
- Added CNN to calculate state from image
    - replaced the sensor input with cropped & rotated Image input
- Inputting the orientation of the car to neural network
    - __NOTE__ : Please refer to Actor and Critic Images to understand better
    - Without the orientation input the car wandered here and there while staying on the road. 
    - So the orientation was added to tell the agent which way the destination lies
- Shifted the entire update operation to car.py from brain.update()
- Removed the tanh activation on the actor output __to stop__ the __ghumr effect__ (the car spinning in place)
- Reduced the learning rate (LR) of the optimiser
    
    


## Observations
- After training with this code the car was able to learn how to keep itself on the roads, but it was unable to learn how to reach the destination
- Tweaking the reward, the environment and the done conditions should give better results\
__NOTE__ : I don't have a GPU, so I was unable to do much hyperparameter tuning



## Improvements (that can be done)
- Improvements have been made using a two-phase learning approach; please refer to the two-phase-learning branch for detailed documentation and code




__Refer__ to [this](https://youtu.be/A6wUZMdBIzE) link to see some videos of the car training.
- These are some short recordings made while the model was training.
- It can be clearly observed that the model is trying to move between the destinations but is facing a little difficulty in staying on the road



 

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from collections import deque

In [2]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions used for off-policy training.

  Every transition is the 7-tuple
  ``(state, orientation, next_state, next_orientation, action, reward, done)``.
  Once ``max_size`` transitions are held, each new one overwrites the oldest
  slot (ring-buffer behaviour driven by ``ptr``).
  """

  def __init__(self, max_size=2e3):
    """Create an empty buffer.

    Args:
      max_size: capacity; floats such as ``2e3`` are accepted and truncated
        to an integer so that index arithmetic never produces floats.
    """
    self.storage = []
    self.max_size = int(max_size)
    self.ptr = 0

  def add(self, transition):
    """Append ``transition``, overwriting the oldest entry when full."""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      Seven numpy arrays in the order states, orientations, next states,
      next orientations, actions, rewards, dones. Rewards and dones are
      reshaped to column vectors of shape ``(batch_size, 1)``.
    """
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch_states, batch_orientation, batch_next_states, batch_next_orientation = [], [], [], []
    batch_actions, batch_rewards, batch_dones = [], [], []
    for i in ind:
      state, orientation, next_state, next_orientation, action, reward, done = self.storage[i]
      # np.asarray avoids a copy where it can; np.array(..., copy=False)
      # raises on NumPy >= 2.0 whenever a copy is unavoidable (Python
      # scalars and lists).
      batch_states.append(np.asarray(state))
      batch_orientation.append(np.asarray(orientation))
      batch_next_states.append(np.asarray(next_state))
      batch_next_orientation.append(np.asarray(next_orientation))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (
        np.array(batch_states),
        np.array(batch_orientation),
        np.array(batch_next_states),
        np.array(batch_next_orientation),
        np.array(batch_actions),
        np.array(batch_rewards).reshape(-1, 1),
        np.array(batch_dones).reshape(-1, 1),
    )



## actor architecture
![actor](image_pres/final_actor.jpg)

- Observe that in the __forward()__ function I am not using tanh. 
- Because of the extremities (saturation) of tanh, I believe the agent tends to rotate in a single position (__ghumr__).
- So I decided to remove the tanh part. __*Results were better after removing tanh*__

In [3]:
class Actor(nn.Module):
  """Policy network: single-channel image plus orientation -> action.

  The image passes through a small convolutional stack, is flattened, joined
  with the orientation vector ``o`` and fed to a three-layer MLP. ``layer_1``
  expects 18 flattened image features plus 2 orientation values. The output
  is the raw value of the last linear layer: no tanh and no scaling by
  ``max_action`` is applied.

  ``state_dim`` is accepted but not used. ``convblock3`` is constructed but
  left out of the forward pass, so its weights still appear in the
  ``state_dict``.
  """

  @staticmethod
  def _conv_relu_bn(c_in, c_out, stride=1):
    """Bias-free 3x3 convolution followed by ReLU and BatchNorm."""
    return nn.Sequential(
        nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=(3, 3), padding=0, bias=False, stride=stride),
        nn.ReLU(),
        nn.BatchNorm2d(c_out),
    )

  def __init__(self, state_dim, action_dim, max_action):
    super(Actor, self).__init__()
    # Modules are created in a fixed order so that seeded weight
    # initialisation and state_dict keys stay stable.
    self.convblock1 = self._conv_relu_bn(1, 8, stride=2)
    self.convblock2 = self._conv_relu_bn(8, 16)
    self.pool1 = nn.MaxPool2d(2, 2)
    self.convblock3 = self._conv_relu_bn(16, 16)
    self.convblock4 = self._conv_relu_bn(16, 8)
    self.pool3 = nn.MaxPool2d(2, 2)
    self.convblock5 = self._conv_relu_bn(8, 8)
    # Final 1x1 projection down to 2 channels; deliberately no ReLU here.
    self.convblock6 = nn.Sequential(
        nn.Conv2d(in_channels=8, out_channels=2, kernel_size=(1, 1), padding=0),
        nn.BatchNorm2d(2),
    )
    self.layer_1 = nn.Linear(18 + 2, 16)
    self.layer_2 = nn.Linear(16, 8)
    self.layer_3 = nn.Linear(8, action_dim)
    self.max_action = max_action

  def forward(self, x, o):
    """Return the unbounded action for image batch ``x`` and orientation ``o``."""
    pipeline = (
        self.convblock1,
        self.convblock2,
        self.pool1,
        self.convblock4,
        self.pool3,
        self.convblock5,
        self.convblock6,
    )
    for stage in pipeline:
      x = stage(x)
    features = torch.cat([x.view(x.size(0), -1), o], 1)
    hidden = F.relu(self.layer_1(features))
    hidden = F.relu(self.layer_2(hidden))
    return self.layer_3(hidden)


## critic architecture
![critic](image_pres/final_critic.jpg)

In [4]:
class Critic(nn.Module):
  """Twin Q-networks, each with its own convolutional encoder.

  Both critics take the single-channel image ``x``, the orientation ``o`` and
  the action ``u``. Each encodes the image with its own convolutional stack,
  flattens the result, appends ``o`` and ``u``, and maps that vector to one
  Q-value through a three-layer MLP. The first linear layer of each head
  expects ``18 + 2 + action_dim`` inputs.

  ``state_dim`` is accepted but not used. ``convblock3``/``convblock9`` and
  ``pool2``/``pool5`` are constructed but left out of the forward pass; the
  conv blocks still appear in the ``state_dict``.
  """

  @staticmethod
  def _conv_relu_bn(c_in, c_out, stride=1):
    """Bias-free 3x3 convolution followed by ReLU and BatchNorm."""
    return nn.Sequential(
        nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=(3, 3), padding=0, bias=False, stride=stride),
        nn.ReLU(),
        nn.BatchNorm2d(c_out),
    )

  @staticmethod
  def _projection():
    """1x1 convolution down to 2 channels plus BatchNorm (deliberately no ReLU)."""
    return nn.Sequential(
        nn.Conv2d(in_channels=8, out_channels=2, kernel_size=(1, 1), padding=0),
        nn.BatchNorm2d(2),
    )

  def __init__(self, state_dim, action_dim):
    super(Critic, self).__init__()
    # Creation order is kept fixed so that seeded initialisation and
    # state_dict keys stay stable.
    # Encoder of the first Critic neural network
    self.convblock1 = self._conv_relu_bn(1, 8, stride=2)
    self.convblock2 = self._conv_relu_bn(8, 16)
    self.pool1 = nn.MaxPool2d(2, 2)
    self.convblock3 = self._conv_relu_bn(16, 16)
    self.pool2 = nn.MaxPool2d(2, 2)
    self.convblock4 = self._conv_relu_bn(16, 8)
    self.pool3 = nn.MaxPool2d(2, 2)
    self.convblock5 = self._conv_relu_bn(8, 8)
    self.convblock6 = self._projection()

    # Encoder of the second Critic neural network
    self.convblock7 = self._conv_relu_bn(1, 8, stride=2)
    self.convblock8 = self._conv_relu_bn(8, 16)
    self.pool4 = nn.MaxPool2d(2, 2)
    self.convblock9 = self._conv_relu_bn(16, 16)
    self.pool5 = nn.MaxPool2d(2, 2)
    self.convblock10 = self._conv_relu_bn(16, 8)
    self.pool6 = nn.MaxPool2d(2, 2)
    self.convblock11 = self._conv_relu_bn(8, 8)
    self.convblock12 = self._projection()

    # Head of the first Critic neural network
    self.layer_1 = nn.Linear(18+2+action_dim, 16)
    self.layer_2 = nn.Linear(16, 8)
    self.layer_3 = nn.Linear(8, 1)
    # Head of the second Critic neural network
    self.layer_4 = nn.Linear(18+2+action_dim, 16)
    self.layer_5 = nn.Linear(16, 8)
    self.layer_6 = nn.Linear(8, 1)

  @staticmethod
  def _evaluate(x, o, u, encoder, head):
    """Run one critic: encode ``x``, append ``o`` and ``u``, apply the MLP head."""
    for stage in encoder:
      x = stage(x)
    x = x.view(x.size(0), -1)
    x = torch.cat([torch.cat([x, o], 1), u], 1)
    first, second, last = head
    x = F.relu(first(x))
    x = F.relu(second(x))
    return last(x)

  def _first_critic(self, x, o, u):
    """Q-value of the first critic; shared by ``forward`` and ``Q1``."""
    encoder = (self.convblock1, self.convblock2, self.pool1,
               self.convblock4, self.pool3, self.convblock5, self.convblock6)
    return self._evaluate(x, o, u, encoder, (self.layer_1, self.layer_2, self.layer_3))

  def _second_critic(self, x, o, u):
    """Q-value of the second critic."""
    encoder = (self.convblock7, self.convblock8, self.pool4,
               self.convblock10, self.pool6, self.convblock11, self.convblock12)
    return self._evaluate(x, o, u, encoder, (self.layer_4, self.layer_5, self.layer_6))

  def forward(self, x, o, u):
    """Return ``(Q1, Q2)`` for image ``x``, orientation ``o`` and action ``u``."""
    x1 = self._first_critic(x, o, u)
    x2 = self._second_critic(x, o, u)
    return x1, x2

  def Q1(self, x, o, u):
    """Return only the first critic's Q-value (used for the actor loss)."""
    return self._first_critic(x, o, u)


In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  """Twin Delayed DDPG agent operating on image + orientation observations.

  Holds an actor, a twin critic and a target copy of each, all placed on the
  module-level ``device``. Both optimizers are Adam with lr = 0.0003.
  """

  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr = 0.0003)
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr = 0.0003)
    self.max_action = max_action

  def select_action(self, state, orientation):
    """Return the actor's action for one observation as a flat numpy array.

    ``state`` is a single 2-D image: two leading axes (batch, channel) are
    added before it is given to the actor. ``orientation`` gets a batch axis.
    """
    state = torch.Tensor(state).unsqueeze(0).unsqueeze(0).to(device)
    orientation = torch.Tensor(orientation).unsqueeze(0).to(device)
    return self.actor(state, orientation).cpu().data.numpy().flatten()

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` TD3 updates on batches sampled from ``replay_buffer``.

    Args:
      replay_buffer: object whose ``sample(batch_size)`` returns states,
        orientations, next states, next orientations, actions, rewards, dones.
      iterations: number of critic updates to perform.
      batch_size: transitions per update.
      discount: discount factor gamma.
      tau: polyak coefficient for the target-network updates.
      policy_noise: std of the Gaussian noise added to the target action.
      noise_clip: clamp range applied to that noise.
      policy_freq: the actor and both targets are updated every
        ``policy_freq`` iterations.
    """
    for it in range(iterations):

      # Step 4: sample a batch of transitions from the memory
      batch_states, batch_orientation, batch_next_states, batch_next_orientation, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      # Images arrive as (batch, H, W); add the channel axis expected by the CNN
      state = torch.Tensor(batch_states).unsqueeze(1).to(device)
      next_state = torch.Tensor(batch_next_states).unsqueeze(1).to(device)
      orientation = torch.Tensor(batch_orientation).to(device)
      next_orientation = torch.Tensor(batch_next_orientation).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # The target is a constant with respect to the critic loss, so it is
      # computed without recording an autograd graph.
      with torch.no_grad():
        # Step 5: from the next state s', the Actor target plays the next action a'
        next_action = self.actor_target(next_state, next_orientation)

        # Step 6: add clipped Gaussian noise to a' and clamp it to the supported range
        noise = torch.Tensor(batch_actions).normal_(0, policy_noise).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

        # Step 7: the two Critic targets return Qt1(s',a') and Qt2(s',a')
        target_Q1, target_Q2 = self.critic_target(next_state, next_orientation, next_action)

        # Step 8: keep the minimum of these two Q-values
        target_Q = torch.min(target_Q1, target_Q2)

        # Step 9: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term zeroed on done
        target_Q = reward + (1 - done) * discount * target_Q

      # Step 10: the two Critic models return Q1(s,a) and Q2(s,a)
      current_Q1, current_Q2 = self.critic(state, orientation, action)

      # Step 11: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Step 12: backpropagate the Critic loss and update both Critics (Adam)
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Step 13: every policy_freq iterations, update the Actor by gradient
      # ascent on the output of the first Critic
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, orientation, self.actor(state, orientation)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Step 14: polyak-average the Actor weights into the Actor target
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # Step 15: polyak-average the Critic weights into the Critic target
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  # Making a save method to save a trained model
  def save(self, filename = "temp", directory = "models"):
    """Write actor and critic weights to ``<directory>/<filename>_{actor,critic}.pth``."""
    # torch.save does not create missing directories.
    os.makedirs(directory, exist_ok=True)
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

  # Making a load method to load a pre-trained model
  def load(self, filename = "temp", directory = "models"):
    """Load actor and critic weights and re-synchronise the target networks."""
    # map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
    actor_state = torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device)
    critic_state = torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device)
    self.actor.load_state_dict(actor_state)
    self.critic.load_state_dict(critic_state)
    # Without this the targets would keep their previous weights and training
    # after a load would bootstrap from networks unrelated to the loaded ones.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())



## Starting the Kivy environment

In [6]:
# Self Driving Car

# Importing the libraries
import numpy as np
from random import random, randint
import matplotlib.pyplot as plt
import time

# Importing the Kivy packages
from kivy.app import App
from kivy.uix.widget import Widget
from kivy.uix.image import Image
from kivy.uix.button import Button
from kivy.graphics import Color, Ellipse, Line
from kivy.config import Config
from kivy.properties import NumericProperty, ReferenceListProperty, ObjectProperty
from kivy.vector import Vector
from kivy.clock import Clock
from kivy.core.image import Image as CoreImage
from PIL import Image as PILImage
from kivy.graphics.texture import Texture

# Importing the Dqn object from our AI in ai.py
#from aiT3D import TD3, ReplayBuffer
import random
import cv2
from scipy import ndimage
#from PIL import Image
import scipy

[INFO   ] [Logger      ] Record log in C:\Users\abhi\.kivy\logs\kivy_20-05-08_1.txt
[INFO   ] [Kivy        ] v1.11.1
[INFO   ] [Kivy        ] Installed at "C:\Users\abhi\Anaconda3\lib\site-packages\kivy\__init__.py"
[INFO   ] [Python      ] v3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 14:00:49) [MSC v.1915 64 bit (AMD64)]
[INFO   ] [Python      ] Interpreter at "C:\Users\abhi\Anaconda3\python.exe"
[INFO   ] [Factory     ] 184 symbols loaded
[INFO   ] [Image       ] Providers: img_tex, img_dds, img_sdl2, img_pil, img_gif (img_ffpyplayer ignored)
[INFO   ] [Text        ] Provider: sdl2
[INFO   ] [Window      ] Provider: sdl2
[INFO   ] [GL          ] Using the "OpenGL" graphics system
[INFO   ] [GL          ] GLEW initialization succeeded
[INFO   ] [GL          ] Backend used <glew>
[INFO   ] [GL          ] OpenGL version <b'4.4.0 - Build 21.20.16.4664'>
[INFO   ] [GL          ] OpenGL vendor <b'Intel'>
[INFO   ] [GL          ] OpenGL renderer <b'Intel(R) HD Graphics 620'>
[INFO   ] [GL

In [7]:
# Kivy configuration. The first entry is there so that a right click does not
# put a red point on the screen; the rest fix the window at 1429x660,
# non-resizable.
for _section, _option, _value in (
    ('input', 'mouse', 'mouse,multitouch_on_demand'),
    ('graphics', 'resizable', False),
    ('graphics', 'width', '1429'),
    ('graphics', 'height', '660'),
):
    Config.set(_section, _option, _value)

In [8]:
# model parameters START
# Everything in this cell is module-level state that Game.update() reads and,
# for the episode bookkeeping values, rewrites through `global`.
seed = 0 # Random seed number
start_timesteps = 9e2 # Number of timesteps during which actions are drawn at random; afterwards the policy network is used
eval_freq = 5e2 # How often the evaluation step is performed (after how many timesteps)
#max_timesteps = 5e5 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
expl_noise = 1.0 # Exploration noise - STD of the Gaussian noise added to policy actions in Game.update()
batch_size = 30 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of the Gaussian noise added to the target action inside TD3.train()
noise_clip = 0.5 # Clamp range applied to that target-action noise
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated
done = True # Start "done" so that the first update() call initialises an episode
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
episode_reward = 0
episode_timesteps = 0
reached_dest = 0 # Set to 1 by update() when the car gets within 25 units of the goal

action_len = 1 # Passed to TD3 as action_dim
state_len = 5 # Passed to TD3 as state_dim (the Actor/Critic constructors do not use it)
last_time_steps = 1 # total_timesteps value at the end of the previous episode
image_size = 60 # Side length of the cropped observation image
orientation = -0.9 # Placeholder; update() replaces it with a [value, -value] pair
#obs = [0.23,1,1,0.5, -0.5]
# model parameters END

# model global params
replay_buffer = ReplayBuffer() # Default capacity (max_size=2e3)
# model global params


# Introducing last_x and last_y, used to keep the last point in memory when we draw the sand on the map
last_x = 0
last_y = 0
n_points = 0
length = 0


In [9]:
# Getting our AI, which we call "brain": the TD3 agent (actor + twin critic)
max_action_agent = 40 # Actions are sampled from / clipped to [-40, 40] in Game.update()
brain = TD3(state_len,action_len,max_action_agent)
action2rotation = [0,5,-5] # NOTE(review): not referenced by the code visible here
reward = 0
scores = [] # Plotted by CarApp.save()
reward_window = [] # Recent rewards; Game.update() uses their sum as an episode-termination test
im = CoreImage("./images/MASK1.png")
main_img = cv2.imread('./images/mask.png',0) # Flag 0 loads the mask as a single-channel grayscale image

In [10]:
def save_cropped_image(img, x, y, name = ""):
    """Debug hook for dumping a cropped observation to disk; currently a no-op.

    The dumping logic (which would write the image under ``./check/`` with
    ``name``, ``x`` and ``y`` in the file name) is disabled, so all arguments
    are ignored and ``None`` is returned.
    """
    return None

## Function to fetch cropped & rotated image

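- Let "AxA" be the required output shape
- First crop a larger square around the car, with a side of at least 1.414 x A (the code uses 1.6 x A)
    - because the diagonal of a square is the square root of 2 (about 1.414) times its side, so an AxA window still fits inside the crop after any rotation
- Rotate the cropped image
- Crop the centre "AxA" region of the rotated image to get the required image

__Note__ : If the above explanation is confusing: I am simply __rotating the image__ by the angle of the car in __get_target_image()__ 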

In [11]:
def get_target_image(img, angle, center, size, fill_with = 100.0):
    """Crop a ``size`` x ``size`` view of ``img`` around ``center``, rotated by ``angle``.

    The image is padded by ``size`` on every side (with ``fill_with``) so that
    crops near the border stay in range. A window of side ``1.6 * size`` is
    cut around the centre, rotated by ``angle + 90`` degrees, and the central
    ``size`` x ``size`` region of the rotated window is extracted.

    Args:
        img: 2-D array to crop from.
        angle: rotation in degrees; 90 is added before rotating.
        center: two-element sequence indexing ``img`` as ``[axis0, axis1]``.
            It is read only, never modified.
        size: integer side length of the final crop.
        fill_with: constant used for the padding. Pixels exposed by the
            rotation itself are filled with 255.0.

    Returns:
        Tuple ``(cropped, rotated, final)``.
    """
    angle = angle + 90
    padded = np.pad(img, size, 'constant', constant_values = fill_with)
    init_size = 1.6*size
    # Shift into padded coordinates using locals so the caller's `center`
    # sequence is left untouched.
    row = center[0] + size
    col = center[1] + size
    cropped = padded[int(row-(init_size/2)) : int(row+(init_size/2)), int(col-(init_size/2)) : int(col+(init_size/2))]
    rotated = ndimage.rotate(cropped, angle, reshape = False, cval = 255.0)
    y,x = rotated.shape
    final = rotated[int(y/2-(size/2)):int(y/2+(size/2)),int(x/2-(size/2)):int(x/2+(size/2))]
    return cropped, rotated, final

In [12]:
first_update = True
def init():
    """One-time environment setup, run on the first Game.update() call.

    Loads ``./images/mask.png`` as a grayscale image scaled to [0, 1] into the
    global ``sand`` array, sets the first goal to (1420, 622), resets the
    goal-swap flag and clears ``first_update`` so the setup is not repeated.
    """
    global sand
    global goal_x
    global goal_y
    global first_update
    global swap
    # `sand` comes straight from the mask image; no placeholder array is
    # allocated first because it would be overwritten immediately.
    img = PILImage.open("./images/mask.png").convert('L')
    sand = np.asarray(img)/255
    goal_x = 1420
    goal_y = 622
    first_update = False
    swap = 0


# Initializing the last distance
last_distance = 0


In [13]:
# Creating the car class

class Car(Widget):
    # Kivy widget for the car. All attributes below are Kivy properties.
    # NOTE(review): presumably bound to a .kv layout that is not visible here.

    # Heading of the car and the rotation applied on the last move
    angle = NumericProperty(0)
    rotation = NumericProperty(0)
    # Velocity components, grouped as `velocity`
    velocity_x = NumericProperty(0)
    velocity_y = NumericProperty(0)
    velocity = ReferenceListProperty(velocity_x, velocity_y)
    # Sensor positions and signals; not used by the code visible in this file
    sensor1_x = NumericProperty(0)
    sensor1_y = NumericProperty(0)
    sensor1 = ReferenceListProperty(sensor1_x, sensor1_y)
    sensor2_x = NumericProperty(0)
    sensor2_y = NumericProperty(0)
    sensor2 = ReferenceListProperty(sensor2_x, sensor2_y)
    sensor3_x = NumericProperty(0)
    sensor3_y = NumericProperty(0)
    sensor3 = ReferenceListProperty(sensor3_x, sensor3_y)
    signal1 = NumericProperty(0)
    signal2 = NumericProperty(0)
    signal3 = NumericProperty(0)

    def move(self, rotation):
        """Advance the car by its current velocity, then turn it by `rotation`."""
        # The position is updated with the velocity as it was before this call
        self.pos = Vector(*self.velocity) + self.pos
        self.rotation = rotation
        self.angle = self.angle + self.rotation
        

## This section of code is long, so the explanation is given as comments within the code

In [14]:
class Game(Widget):
    """Kivy widget that runs the environment loop and drives TD3 training.

    ``update`` is the per-frame callback: it starts or resets episodes, picks
    an action, moves the car, computes the reward, stores the transition in
    the replay buffer and, at the end of every episode, trains the agent.
    All episode state lives in module-level globals.
    """

    car = ObjectProperty(None)
    ball1 = ObjectProperty(None)
    ball2 = ObjectProperty(None)
    ball3 = ObjectProperty(None)

    def serve_car(self):
        """Place the car at a start pose with an initial velocity of (4, 0)."""
        # randomly choose points to initialise agent
        #my_rand_points = [((715, 360),0),((348,414),90),((127,350),95),((581,432),270),((882,71),20),((970,278),0)]
        # for simplicity have used a single point
        my_rand_points = [((715, 360),0)]
        (x,y),angle = random.choice(my_rand_points)
        self.car.center = (x,y)
        self.car.angle = angle
        self.car.velocity = Vector(4, 0)

    def _observe(self):
        """Return ``(image, orientation)`` for the car's current pose.

        ``image`` is the rotated crop of ``main_img`` around the car;
        ``orientation`` is ``[a, -a]`` where ``a`` is the angle between the
        car's velocity and the direction to the current goal, divided by 180.
        """
        _,_,image = get_target_image(main_img, self.car.angle, [self.car.x, self.car.y], image_size)
        xx = goal_x - self.car.x
        yy = goal_y - self.car.y
        heading = Vector(*self.car.velocity).angle((xx,yy))/180.
        return image, [heading, -heading]

    def _apply_wall_penalty(self, reward):
        """Clamp the car inside the window and return the wall-adjusted reward.

        Within 5 units of an edge the car is pushed back to the 5-unit line
        and an extra -3 is appended to ``reward_window``; within 10 units the
        reward is already overridden with a penalty. The four edges are
        checked in a fixed order, so a later edge overrides an earlier one.
        """
        if self.car.x < 5:
            reward_window.append(-3)
            self.car.x = 5
            reward = -1
        elif self.car.x < 10:
            reward = -1

        if self.car.x > self.width - 5:
            reward_window.append(-3)
            self.car.x = self.width - 5
            reward = -0.7
        elif self.car.x > self.width - 10:
            reward = -0.1 * (self.car.x- self.width +10)

        if self.car.y < 5:
            reward_window.append(-3)
            self.car.y = 5
            reward = -1
        elif self.car.y < 10:
            reward = -1

        if self.car.y > self.height - 5:
            reward_window.append(-3)
            self.car.y = self.height - 5
            reward = -1
        elif self.car.y > self.height - 10:
            reward = -1
        return reward

    def update(self, dt):
        """Advance the simulation by one step (scheduled through the Kivy Clock)."""
        # Only names that are assigned here need a `global` declaration;
        # read-only globals (brain, replay_buffer, hyper-parameters, sand, ...)
        # are resolved normally.
        global reward, last_distance, goal_x, goal_y, longueur, largeur, swap
        global orientation, obs
        global done, total_timesteps, timesteps_since_eval
        global episode_num, episode_reward, episode_timesteps
        global reward_window, reached_dest, last_time_steps

        longueur = self.width
        largeur = self.height
        if first_update:
            init()

        # If the episode is done
        if done:
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                steps = total_timesteps - last_time_steps
                print("Total Timesteps: {} Episode Num: {} Timesteps diff: {} Reward: {} score: {}".format(total_timesteps, episode_num, steps, episode_reward, episode_reward/steps))
                # Record the per-step score of the episode so that the plot
                # produced by CarApp.save() has data to show.
                scores.append(episode_reward/steps)
                brain.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
                last_time_steps = total_timesteps

            # If the agent did not reach the destination, re-initialise it at
            # the start point and take a fresh observation; otherwise it
            # continues from where it is towards the swapped goal.
            if not reached_dest:
                self.serve_car()
                obs, orientation = self._observe()
                save_cropped_image(obs, self.car.x, self.car.y, name = "initial")

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            reached_dest = 0

        # Before start_timesteps, we play random actions
        if total_timesteps < start_timesteps:
            action = [random.uniform(-max_action_agent * 1.0, max_action_agent * 1.0)]
        else: # After start_timesteps timesteps, we switch to the model
            action = brain.select_action(np.array(obs), np.array(orientation))
            # If the explore_noise parameter is not 0, we add noise to the action and we clip it
            if expl_noise != 0:
                action = (action + np.random.normal(0, expl_noise, size=action_len)).clip(-1*max_action_agent,max_action_agent)

        # ENV STEP PERFORM START
        # Random actions are plain lists, policy actions are numpy arrays
        if isinstance(action, list):
            self.car.move(action[0])
        else:
            self.car.move(action.tolist()[0])
        distance = np.sqrt((self.car.x - goal_x)**2 + (self.car.y - goal_y)**2)

        if sand[int(self.car.x),int(self.car.y)] > 0:
            # On sand: slow down, penalise (less so when getting closer)
            self.car.velocity = Vector(1, 0).rotate(self.car.angle)
            reward = -0.5
            if distance < last_distance:
                reward = -0.2
        else: # otherwise
            self.car.velocity = Vector(2, 0).rotate(self.car.angle)
            reward = -0.22
            if distance < last_distance:
                reward = 0.1

        reward = self._apply_wall_penalty(reward)

        # when the car reaches near the destination
        if distance < 25:
            done = True
            reached_dest = 1
            if swap == 1:
                goal_x = 1420
                goal_y = 622
                swap = 0
            else:
                goal_x = 9
                goal_y = 85
                swap = 1
            reward = 2
        last_distance = distance

        # cnn state calculation (uses the goal as updated above)
        new_obs, new_orientation = self._observe()
        save_cropped_image(new_obs, self.car.x, self.car.y, name = "")

        reward_window.append(reward)

        ## set the done condition if the rewards of the previous 100 entries sum to a negative threshold
        ## set the done condition after every 1200 steps
        too_many_penalties = sum(reward_window[-100:]) <= -188
        episode_timed_out = episode_timesteps % 1200 == 0 and episode_timesteps != 0
        if too_many_penalties or episode_timed_out:
            done = True
            reward_window = []
        # ENV STEP PERFORM END

        # We increase the total reward
        episode_reward += reward

        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        replay_buffer.add((obs, orientation, new_obs, new_orientation, action, reward, done))

        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        orientation = new_orientation
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

In [15]:
class MyPaintWidget(Widget):
    """Widget that lets the user paint sand onto the map with the mouse."""

    def on_touch_down(self, touch):
        """Start a new stroke, mark the touched cell as sand and save the map."""
        global length, n_points, last_x, last_y
        # Only the graphics instructions need to be created inside the canvas context
        with self.canvas:
            Color(0.8,0.7,0)
            touch.ud['line'] = Line(points = (touch.x, touch.y), width = 10)
        last_x = int(touch.x)
        last_y = int(touch.y)
        n_points = 0
        length = 0
        sand[int(touch.x),int(touch.y)] = 1
        img = PILImage.fromarray(sand.astype("uint8")*255)
        img.save("./images/sand.jpg")

    def on_touch_move(self, touch):
        """Extend the current stroke and paint a 20x20 patch of sand around it."""
        global length, n_points, last_x, last_y
        if touch.button == 'left':
            touch.ud['line'].points += [touch.x, touch.y]
            x = int(touch.x)
            y = int(touch.y)
            length += np.sqrt(max((x - last_x)**2 + (y - last_y)**2, 2))
            n_points += 1.
            density = n_points/(length)
            touch.ud['line'].width = int(20 * density + 1)
            # Clamp the lower bounds at 0: a negative slice start would be
            # read as "from the end" and the patch would come out empty.
            sand[max(x - 10, 0) : x + 10, max(y - 10, 0) : y + 10] = 1

            last_x = x
            last_y = y



In [16]:
class CarApp(App):
    """Kivy application: builds the game widget, the painter and the buttons."""

    def build(self):
        """Create the widget tree and schedule Game.update at 60 Hz."""
        parent = Game()
        parent.serve_car()
        Clock.schedule_interval(parent.update, 1.0/60.0)
        self.painter = MyPaintWidget()
        clearbtn = Button(text = 'clear')
        savebtn = Button(text = 'save', pos = (parent.width, 0))
        loadbtn = Button(text = 'load', pos = (2 * parent.width, 0))
        clearbtn.bind(on_release = self.clear_canvas)
        savebtn.bind(on_release = self.save)
        loadbtn.bind(on_release = self.load)
        marker_home = Image(source = "./images/home.jpg", pos = (1340,582))
        marker_offce = Image(source = "./images/office.jpg", pos = (9,85))
        parent.add_widget(self.painter)
        parent.add_widget(clearbtn)
        parent.add_widget(savebtn)
        parent.add_widget(loadbtn)
        parent.add_widget(marker_home)
        parent.add_widget(marker_offce)
        return parent

    def clear_canvas(self, obj):
        """Erase the painted strokes and reset the sand map to all zeros."""
        global sand
        self.painter.canvas.clear()
        # Widget dimensions may be floats, which np.zeros rejects as a shape.
        sand = np.zeros((int(longueur), int(largeur)))

    def save(self, obj):
        """Save the agent's weights and plot the recorded scores."""
        print("saving brain...")
        brain.save()
        plt.plot(scores)
        plt.show()

    def load(self, obj):
        """Load the last saved agent weights."""
        print("loading last saved brain...")
        brain.load()


In [None]:
# Running the whole thing: start the Kivy application when executed as a script.
# CarApp.build() schedules Game.update, which performs the training loop.
if __name__ == '__main__':
    CarApp().run()

Total Timesteps: 1201 Episode Num: 1 Timesteps diff: 1200 Reward: -977.100000000004 score: -0.8142500000000034
