<a href="https://colab.research.google.com/github/avadhutc/P2S10/blob/master/Self_Driving_Car_TD3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math



# Compute device shared by the networks and batches below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Flatten(torch.nn.Module):
    """Collapse every dimension after the first one into a single axis."""

    def forward(self, x):
        """Return a view of ``x`` with shape ``(x.size(0), -1)``."""
        leading = x.size(0)
        return x.view(leading, -1)

class Actor(nn.Module):
    """Policy network: a convolutional encoder followed by a small MLP head.

    The encoder consumes a single-channel image and ends in a global average
    pool plus flatten, so it produces 16 features per sample. Those features
    are concatenated with a 2-element orientation vector before the linear
    head, which is why the first linear layer takes ``latent_dim + 2`` inputs
    (``latent_dim`` therefore has to match the 16 encoder channels).

    Args:
        state_dim: accepted for interface compatibility; not used here.
        action_dim: number of outputs of the final linear layer.
        max_action: scale applied to the tanh-squashed output.
        latent_dim: number of features produced by the encoder.
    """

    def __init__(self, state_dim, action_dim, max_action, latent_dim):
        super().__init__()

        # Layers are created in the same order as before so that parameter
        # names and random initialisation stay unchanged.
        conv_stack = [
            torch.nn.Conv2d(1, 8, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(8),

            torch.nn.Conv2d(8, 8, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(8),

            torch.nn.Conv2d(8, 16, 3, stride=2),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(16),

            torch.nn.Conv2d(16, 16, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(16),

            torch.nn.Conv2d(16, 16, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(16),

            torch.nn.AdaptiveAvgPool2d((1, 1)),
            Flatten(),
        ]
        self.encoder = torch.nn.ModuleList(conv_stack)

        mlp_head = [
            torch.nn.Linear(latent_dim + 2, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 8),
            torch.nn.ReLU(),
            torch.nn.Linear(8, action_dim),
        ]
        self.linear = torch.nn.ModuleList(mlp_head)

        self.max_action = max_action

    def forward(self, x, o):
        """Map an image batch ``x`` and orientation batch ``o`` to actions.

        The result is ``max_action * tanh(head_output)``, i.e. bounded to
        ``[-max_action, max_action]``.
        """
        features = x
        for stage in self.encoder:
            features = stage(features)

        # concat orientation to the encoded image before the first linear layer
        hidden = torch.cat([features, o], 1)
        for fc in self.linear:
            hidden = fc(hidden)

        return self.max_action * torch.tanh(hidden)
		
class Critic(nn.Module):
    """Twin Q-networks, each with its own convolutional encoder and MLP head.

    Each branch encodes the single-channel image to 16 features, appends the
    2-element orientation and the action, and regresses one Q-value. The first
    linear layer of each head therefore takes
    ``latent_dim + 2 + action_dim`` inputs.

    Args:
        state_dim: accepted for interface compatibility; not used here.
        action_dim: width of the action vector appended to the features.
        latent_dim: number of features produced by each encoder.
    """

    def __init__(self, state_dim, action_dim, latent_dim):
        super().__init__()

        head_inputs = latent_dim + 2 + action_dim

        # Build order (encoder_1, linear_1, encoder_2, linear_2) is preserved
        # so parameter names and random initialisation stay unchanged.
        self.encoder_1 = self._build_encoder()
        self.linear_1 = self._build_head(head_inputs)
        self.encoder_2 = self._build_encoder()
        self.linear_2 = self._build_head(head_inputs)

    @staticmethod
    def _build_encoder():
        """Create one convolutional encoder ending in a flattened 16-vector."""
        return torch.nn.ModuleList([
            torch.nn.Conv2d(1, 8, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(8),

            torch.nn.Conv2d(8, 8, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(8),

            torch.nn.Conv2d(8, 16, 3, stride=2),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(16),

            torch.nn.Conv2d(16, 16, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(16),

            torch.nn.Conv2d(16, 16, 3),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(16),

            torch.nn.AdaptiveAvgPool2d((1, 1)),
            Flatten(),
        ])

    @staticmethod
    def _build_head(in_features):
        """Create one MLP head mapping ``in_features`` inputs to one Q-value."""
        return torch.nn.ModuleList([
            torch.nn.Linear(in_features, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 8),
            torch.nn.ReLU(),
            torch.nn.Linear(8, 1),
        ])

    @staticmethod
    def _q_value(encoder, head, x, o, u):
        """Run one encoder/head pair on image ``x``, orientation ``o``, action ``u``."""
        features = x
        for stage in encoder:
            features = stage(features)

        # concat orientation and action to the encoded image
        hidden = torch.cat([features, o, u], 1)
        for fc in head:
            hidden = fc(hidden)
        return hidden

    def forward(self, x, o, u):
        """Return the Q-values of both branches as a ``(q1, q2)`` tuple."""
        q1 = self._q_value(self.encoder_1, self.linear_1, x, o, u)
        q2 = self._q_value(self.encoder_2, self.linear_2, x, o, u)
        return q1, q2

    def Q1(self, x, o, u):
        """Return only the first branch's Q-value (used for the actor loss)."""
        return self._q_value(self.encoder_1, self.linear_1, x, o, u)
        
class TD3(object):
    """Twin Delayed DDPG agent operating on (image, orientation) observations.

    Holds an actor, a twin critic, their target copies and one Adam optimizer
    for each of the two trained networks.
    """

    def __init__(self, state_dim, action_dim, max_action, latent_dim):
        self.actor = Actor(state_dim, action_dim, max_action, latent_dim).to(device)
        print(self.actor)
        self.actor_target = Actor(state_dim, action_dim, max_action, latent_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim, latent_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim, latent_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state, orientation):
        """Return the actor's action for one observation as a flat numpy array.

        Args:
            state: image tensor without a batch dimension.
            orientation: array-like orientation vector without a batch dimension.
        """
        state = state.unsqueeze(0).to(device)  # add batch info
        orientation = torch.Tensor(orientation).unsqueeze(0).to(device)  # add batch info
        # Inference only: no autograd graph is needed for action selection.
        # NOTE(review): the actor stays in training mode here, so its BatchNorm
        # layers normalise with the statistics of this single sample.
        with torch.no_grad():
            action = self.actor(state, orientation)
        return action.cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, \
        discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` TD3 updates on batches drawn from ``replay_buffer``.

        Args:
            replay_buffer: object whose ``sample(batch_size)`` returns states,
                next states, orientations, next orientations, actions, rewards
                and done flags (in that order).
            iterations: number of gradient steps to perform.
            batch_size: number of transitions per step.
            discount: discount factor applied to the target Q-value.
            tau: polyak averaging rate for the target networks.
            policy_noise: std of the Gaussian noise added to the target action.
            noise_clip: absolute bound applied to that noise.
            policy_freq: the actor and targets are updated every this many steps.
        """
        for it in range(iterations):

            # Step 4: We sample a batch of transitions (s, s', a, r) from the memory
            batch_states, batch_next_states, batch_orientation, batch_next_orientation, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            orientation = torch.Tensor(batch_orientation).to(device)
            next_orientation = torch.Tensor(batch_next_orientation).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # The target is a constant for the critic loss, so it is computed
            # without building an autograd graph.
            with torch.no_grad():
                # Step 5: From the next state s', the Actor target plays the next action a'
                next_action = self.actor_target(next_state, next_orientation)

                # Step 6: We add clipped Gaussian noise to a' and clamp it to the supported action range
                noise = torch.Tensor(batch_actions).normal_(0, policy_noise).to(device)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                # Step 7: The two Critic targets return Qt1(s',a') and Qt2(s',a')
                target_Q1, target_Q2 = self.critic_target(next_state, next_orientation, next_action)

                # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
                target_Q = torch.min(target_Q1, target_Q2)

                # Step 9: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term zeroed when done
                target_Q = reward + (1 - done) * discount * target_Q

            # Step 10: The two Critic models return Q1(s,a) and Q2(s,a)
            current_Q1, current_Q2 = self.critic(state, orientation, action)

            # Step 11: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: We backpropagate this Critic loss and update both Critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: Every policy_freq iterations, we update the Actor by gradient ascent on the first Critic's output
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, orientation, self.actor(state, orientation)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: On the same schedule, we update the Actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                # Step 15: On the same schedule, we update the Critic target by polyak averaging
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        """Write the actor and critic weights to ``directory`` using ``filename`` as prefix."""
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        """Restore the actor and critic weights saved by ``save``.

        Tensors are mapped onto the current ``device`` so a checkpoint written
        on a GPU machine can be loaded on a CPU-only one. The target networks
        are re-synchronised with the loaded weights; otherwise they would keep
        whatever weights they had before the load.
        """
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())


In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

#define replay buffer
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions with uniform random sampling.

    Each transition is a 7-tuple
    ``(state, next_state, orientation, next_orientation, action, reward, done)``.
    """

    def __init__(self, max_size=1e6):
        self.storage = []
        # Stored as an int so the write pointer stays a valid list index.
        self.max_size = int(max_size)
        self.ptr = 0

    def add(self, transition):
        """Append ``transition``; once full, overwrite the oldest entry."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random (with replacement).

        Returns:
            Seven numpy arrays (states, next states, orientations, next
            orientations, actions, rewards, dones). Rewards and dones are
            reshaped to ``(batch_size, 1)``.
        """
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch_states, batch_next_states, batch_orientation, batch_next_orientation, batch_actions, batch_rewards, batch_dones = [], [], [], [], [], [], []
        for i in ind:
            state, next_state, orientation, next_orientation, action, reward, done = self.storage[i]
            # np.asarray avoids a copy when possible; np.array(..., copy=False)
            # raises ValueError on NumPy >= 2 whenever a copy is required
            # (e.g. for Python lists and scalars).
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_orientation.append(np.asarray(orientation))
            batch_next_orientation.append(np.asarray(next_orientation))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_orientation),
            np.array(batch_next_orientation),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )


In [0]:
#:kivy 1.0.9
# ref: https://kivy.org/docs/tutorials/pong.html

# Rule applied to every Car widget: a 20x10 sprite drawn rotated by the widget's `angle`.
<Car>:
    size: 20, 10
    # NOTE(review): `origin` is not declared as a property on the Python Car class — confirm it is still needed.
    origin: 10, 5
    canvas:
        # Save the current transform, rotate around the widget centre, draw the sprite, then restore.
        PushMatrix
        Rotate:
            angle: self.angle
            origin: self.center
        Rectangle:
            pos: self.pos
            size: self.size
            source: "./images/car.png"
        PopMatrix



# Rule for the Game widget: draws the city map and owns a single Car child.
<Game>:
    # Points the Game.car property at the Car child declared below (by id).
    car: game_car
    
    canvas:
        # Background map drawn at the widget position with a fixed 1429x660 size.
        Rectangle:
            pos: self.pos
            size: 1429, 660
            source: "./images/citymap.png"

    Car:
        id: game_car
        # Start centred in the parent Game widget.
        center: self.parent.center
    

In [0]:
# Self Driving Car
# Importing the libraries
import numpy as np
from random import random, randint
import matplotlib.pyplot as plt
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from collections import deque
import time
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import torch.nn.functional as F
#from torchvision import transforms
import math


# Importing the Kivy packages
from kivy.app import App
from kivy.uix.widget import Widget
from kivy.uix.button import Button
from kivy.graphics import Color, Ellipse, Line
from kivy.config import Config
from kivy.properties import NumericProperty, ReferenceListProperty, ObjectProperty
from kivy.vector import Vector
from kivy.clock import Clock
from kivy.core.image import Image as CoreImage
from PIL import Image as PILImage
from kivy.graphics.texture import Texture
from PIL import ImageDraw
# Importing the TD3 agent and its replay buffer (the Dqn import below is the old, disabled brain)
#rom ai import Dqn
from td3_cnn1 import TD3
from td3_utilities import ReplayBuffer
import cv2
from scipy import ndimage
from PIL import Image
import scipy

import os 


# Kivy window configuration.
# Adding the mouse line if we don't want the right click to put a red point;
# the window is fixed (non-resizable) at 1429x660.
Config.set('input', 'mouse', 'mouse,multitouch_on_demand')
Config.set('graphics', 'resizable', False)
Config.set('graphics', 'width', '1429')
Config.set('graphics', 'height', '660')


# Introducing last_x and last_y, used to keep the last point in memory when we draw the sand on the map
last_x = 0
last_y = 0
# n_points and length track the current stroke drawn by MyPaintWidget.
n_points = 0
length = 0
# Bound on the action magnitude passed to TD3 and used to clip/sample actions.
max_action = 40#15 #reduced to prevent steep turns
save_models = True

# Prefix used for the saved model files, printed once at start-up.
env_name = "car_racing"
file_name = "%s_%s" % ("TD3", env_name)
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

# Make sure the checkpoint directory exists before anything is saved.
if save_models and not os.path.exists("./pytorch_models"):
  os.makedirs("./pytorch_models")
  
# Directory the periodic checkpoints are written to.
directory = "pytorch_models"


#function to extract car image
def extract_car(x, y, width, height, angle):
    """Return the closed outline of a ``width`` x ``height`` rectangle.

    The rectangle's corners (with the first corner repeated at the end) are
    multiplied by a rotation matrix built from ``angle`` (degrees) and then
    shifted by ``(x, y)``. The result is a ``(5, 2)`` array.
    """
    radians = (np.pi / 180.0) * angle
    cos_t = np.cos(radians)
    sin_t = np.sin(radians)
    rotation = np.array([[cos_t, -sin_t],
                         [sin_t, cos_t]])
    outline = np.array([(0, 0), (width, 0), (width, height), (0, height), (0, 0)])
    return outline @ rotation + np.array([x, y])
#function to extract image and rotate
def get_roi(img, angle, center, size, fill_with = 255):
    """Crop a rotated patch around ``center`` and return it as a tensor.

    The image is padded by ``size`` on every side, a filled polygon marking
    the car is drawn onto it, a square of side ``1.6 * size`` is cropped
    around the centre, rotated by ``angle + 90`` degrees, centre-cropped to
    ``size`` and finally resized to 32x32.

    Args:
        img: 2-D image array (the rotated crop is unpacked as ``y, x``).
        angle: car angle in degrees.
        center: two-element sequence; ``center[0]`` indexes the first image
            axis and ``center[1]`` the second. It is not modified.
        size: padding width and side length of the patch before resizing.
        fill_with: value used for the padding.

    Returns:
        A float tensor of shape ``(1, 32, 32)`` with values divided by 255.
    """
    angle = angle + 90
    # Read the centre into locals instead of updating the caller's list in place.
    row = center[0]
    col = center[1]

    img = np.pad(img, size, 'constant', constant_values = fill_with)

    # Draw the car footprint onto the padded image.
    # NOTE(review): the +80 offset is hard-coded; it equals the padding only
    # when size == 80 — confirm before calling with another size.
    img_tmp = PILImage.fromarray(img)
    draw = ImageDraw.Draw(img_tmp)
    extract_car_area = extract_car(x=int(col+80), y=int(row+80), width=10, height=20, angle = angle-90)
    draw.polygon([tuple(p) for p in extract_car_area], fill=128)

    # Crop a larger window first so the rotation does not cut off the corners.
    init_size = 1.6*size

    # Shift the centre into padded-image coordinates.
    row += size
    col += size

    img = np.asarray(img_tmp)

    cropped = img[int(row-(init_size/2)) : int(row+(init_size/2)), int(col-(init_size/2)) : int(col+(init_size/2))]

    rotated = ndimage.rotate(cropped, angle, reshape = False, cval = 255.0)
    y,x = rotated.shape
    final = rotated[int(y/2-(size/2)):int(y/2+(size/2)),int(x/2-(size/2)):int(x/2+(size/2))]

    final = torch.from_numpy(np.array(final)).float().div(255)
    # Add batch and channel axes for F.interpolate, then drop the batch axis again.
    final = final.unsqueeze(0).unsqueeze(0)
    final = F.interpolate(final,size=(32,32))

    return final.squeeze(0)


# Initialise the agent, the replay memory and the module-level state that
# Game.update reads and writes through `global` declarations.
crop_dim = 80  # `size` argument passed to get_roi
state_dim = 5 
action_dim = 1
latent_dim = 16  # must match the 16 features produced by the conv encoders
brain = TD3(state_dim,action_dim,max_action,latent_dim)
replay_buffer = ReplayBuffer()
last_reward = 0
scores = []
im = CoreImage("./images/MASK1.png")
# Grayscale mask (flag 0) used as the source image for get_roi.
mask = cv2.imread('./images/mask.png',0)
#initialising variables for training:
seed = 0 # Random seed number
eval_freq = 5e3 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 500000 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
start_timesteps = 10000 # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
batch_size = 30 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated
expl_noise = 0.4 # STD of the Gaussian noise added to the selected action in Game.update
total_timesteps = 0
episode_num = 0
done = True # Starts True so the first update call runs the episode reset branch
t0 = time.time()
#max_timesteps = 100000
# Placeholder state; it is replaced by the get_roi output when the first episode starts.
state = torch.zeros([1,state_dim,state_dim])
episode_reward = 0
episode_timesteps = 0
sand_counter = 0 # Consecutive timesteps spent on sand
p_sand = 0 # Accumulated sand share used for the per-episode penalty report
p_living = 0 # Accumulated living share used for the per-episode penalty report
lp_counter = 0
# Schedule of exploration-noise STD values, one per 4000 timesteps.
# NOTE(review): int(max_action/1000) is 0 for max_action = 40, and this array
# is not read by Game.update — confirm whether the decay is meant to be applied.
expl_noise_vals = np.linspace(0.1, int(max_action/1000), num=int(max_timesteps/4000), endpoint=True, retstep=False, dtype=None, axis=0) # Exploration noise - STD value of exploration Gaussian noise
reward_window = []
log_interval = 10  # print avg reward after interval

# Initializing the environment
# True until init() has run once.
first_update = True
def init():
    """Load the sand mask and reset the goal; clears ``first_update``.

    Sets the module globals ``sand`` (mask image scaled by 1/255), ``goal_x``,
    ``goal_y``, ``swap`` and ``first_update``.
    """
    global sand
    global goal_x
    global goal_y
    global first_update
    global swap
    # The mask is read as 8-bit grayscale and scaled by 1/255; the context
    # manager closes the image file once the pixels have been copied out.
    with PILImage.open("./images/mask.png") as mask_file:
        sand = np.asarray(mask_file.convert('L'))/255
    goal_x = 1420
    goal_y = 622
    first_update = False
    swap = 0


# Initializing the last distance (distance to the goal at the previous timestep,
# compared against the current one when computing the reward)
last_distance = 0


# Creating the car class
class Car(Widget):
    """Car sprite carrying a heading angle, last rotation and a 2-D velocity."""

    angle = NumericProperty(0)
    rotation = NumericProperty(0)
    velocity_x = NumericProperty(0)
    velocity_y = NumericProperty(0)
    velocity = ReferenceListProperty(velocity_x, velocity_y)

    def move(self, rotation):
        """Advance by the current velocity, then turn by ``rotation``."""
        # signals have been removed from any computation for TD3
        displacement = Vector(*self.velocity)
        self.pos = displacement + self.pos
        self.rotation = rotation
        self.angle += self.rotation
        


# Creating the game class

class Game(Widget):
    """Widget that steps the car environment and trains the TD3 agent.

    All training state lives in module-level globals; ``update`` is meant to
    be called once per frame by the Kivy clock.
    """

    car = ObjectProperty(None)

    def serve_car(self):
        """Place the car at the widget centre with its initial velocity."""
        self.car.center = self.center
        self.car.velocity = Vector(6, 0)

    def update(self, dt):
        """Advance the environment by one timestep.

        When the previous episode is done, the episode summary is printed and
        appended to ``training_log.txt``, the agent is trained (after the
        random warm-up phase) and the car is respawned. Then one action is
        taken, the reward and done flag are computed, the transition is stored
        in the replay buffer and the model is saved every 100 timesteps.

        Args:
            dt: elapsed time supplied by the clock; not used.
        """
        global brain
        global reward
        global scores
        global last_distance
        global goal_x
        global goal_y
        global longueur
        global largeur
        global swap
        global orientation
        global last_action
        global last_distance_travelled
        global start_timesteps
        global batch_size
        global discount
        global tau
        global policy_noise
        global noise_clip
        global policy_freq
        global expl_noise
        global reward_window
        global total_timesteps
        global episode_num
        global done
        global t0
        global max_timesteps
        global state
        global episode_reward
        global episode_timesteps
        global sand_counter
        global p_sand
        global p_living
        global lp_counter
        #decay expl noise every 4000 timestep
        global expl_noise_vals
        global crop_dim

        longueur = self.width
        largeur = self.height

        if first_update:
            init()

        # We start the main loop over 500,000 timesteps
        if total_timesteps < max_timesteps:
            # If the episode is done
            if done:
                # If we are not at the very beginning, report the finished episode
                if total_timesteps != 0:
                    distance_travelled = np.sqrt((self.car.x - 715)**2 + (self.car.y - 360)**2)
                    distance = np.sqrt((self.car.x - goal_x)**2 + (self.car.y - goal_y)**2)
                    # Split the episode reward by the accumulated sand/living
                    # shares; the shares can cancel out, so guard the division.
                    share_total = p_sand + p_living
                    if share_total != 0:
                        s_reward = round(float(episode_reward * p_sand/share_total),2)
                        l_reward = round(float(episode_reward * p_living/share_total),2)
                    else:
                        s_reward = 0.0
                        l_reward = 0.0

                    print("Time-Steps: ", total_timesteps, "Episode-Num: ",episode_num, "Episode-Reward: ", episode_reward)
                    print("Episode-Steps: ", episode_timesteps,"Traveled-Distance: ", round(float(distance_travelled),2))
                    print("Remaining-Distance: ", round(float(distance),2), "Sand-Penalty: ", s_reward, "Living-Penalty: ", l_reward)

                    # Open the log only when there is something to write and
                    # close it right away, instead of leaking a handle per frame.
                    with open("training_log.txt", "a+") as log_f:
                        log_f.write("Time-Steps: {}\t Episode-Num: {}\t Episode-Reward: {}\n".format(total_timesteps, episode_num, episode_reward))
                        log_f.write("Episode-Steps: {}\t Traveled-Distance: {}\t Remaining-Distance: {}\n".format(episode_timesteps, round(float(distance_travelled),2), round(float(distance),2)))
                        log_f.write("Sand-Penalty: {}\t Living-Penalty: {}\n".format(s_reward, l_reward))

                # After the warm-up phase, train for as many iterations as the episode lasted
                if total_timesteps > start_timesteps:
                    brain.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
                    print("Training-Mode..")

                # Respawn the car at one of three fixed positions, heading angle 0
                spawn_points = [(715, 360), (137, 280), (715, 540)]
                pos_index = np.random.randint(0,3)
                self.car.x, self.car.y = spawn_points[pos_index]
                self.car.angle = 0

                self.car.velocity = Vector(6, 0)
                xx = goal_x - self.car.x
                yy = goal_y - self.car.y
                orientation = Vector(*self.car.velocity).angle((xx,yy))/180.
                orientation = [orientation, -orientation]

                # initialise 1st state after done
                state = get_roi(mask, self.car.angle, [self.car.x, self.car.y], crop_dim)

                done = False
                last_action = [0]
                last_distance_travelled = 0
                # Set rewards and episode timesteps to zero
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                sand_counter = 0
                lp_counter = 0
                p_living = 0
                p_sand = 0

            # Before start_timesteps, we play random actions drawn from a uniform distribution
            if total_timesteps < start_timesteps:
                action = [random.uniform(-max_action * 1.0, max_action * 1.0)]
            else:
                action = brain.select_action(state, np.array(orientation))
                print("Orginal-action:", action)
                action = (action + np.random.normal(0, expl_noise)).clip(-max_action, max_action)
                print("Noise-action:", action)

            # The agent performs the action in the environment, then reaches the next state and receives the reward
            # (random actions are plain lists, policy actions are numpy arrays)
            if isinstance(action, list):
                self.car.move(action[0])
            else:
                self.car.move(action.tolist()[0])
            distance = np.sqrt((self.car.x - goal_x)**2 + (self.car.y - goal_y)**2)

            # evaluating reward and done
            if sand[int(self.car.x),int(self.car.y)] > 0:
                # on sand: slow down and penalise
                self.car.velocity = Vector(0.5, 0).rotate(self.car.angle)
                sand_counter +=1
                reward = -1
                done = False
                p_sand += 0.1

            else: # otherwise
                self.car.velocity = Vector(2, 0).rotate(self.car.angle)
                sand_counter = 0
                reward = -0.2 #living penalty
                p_living += 0.1

                if distance < last_distance:
                    reward = 0.1
                    p_living -= 1

            if (self.car.x < 5) or (self.car.x > self.width - 5) or (self.car.y < 5) or (self.car.y > self.height - 5): #crude way to handle model failing near boundaries
                done = True
                reward = -1
                p_living += 1

            # Goal reached: reward and swap to the other goal
            if distance < 100:
                reward = 1
                if swap == 1:
                    goal_x = 1420
                    goal_y = 622
                    swap = 0

                else:
                    goal_x = 9
                    goal_y = 85
                    swap = 1

            last_distance = distance
            new_state = get_roi(mask, self.car.angle, [self.car.x, self.car.y], crop_dim)
            xx = goal_x - self.car.x
            yy = goal_y - self.car.y
            new_orientation = Vector(*self.car.velocity).angle((xx,yy))/180.
            new_orientation = [new_orientation, -new_orientation]

            distance_travelled = np.sqrt((self.car.x - 715)**2 + (self.car.y - 360)**2)

            reward_window.append(reward)

            # End the episode when the recent rewards sum to -99 or less, or
            # every 2500 episode timesteps (parentheses spell out the
            # precedence of the original condition).
            if sum(reward_window[len(reward_window)-100:]) <= -99 or (episode_timesteps % 2500 == 0 and episode_timesteps != 0):
                done = True
                reward = -2
                reward_window = []

            #end episode if more time on sand
            if sand_counter == 20:
                done = True

            # We increase the total reward
            episode_reward += reward

            # We store the new transition into the Experience Replay memory (ReplayBuffer)
            replay_buffer.add((state, new_state, orientation, new_orientation, action, reward, done))
            state = new_state
            orientation = new_orientation
            episode_timesteps += 1
            total_timesteps += 1
            last_action = action
            last_distance_travelled = distance_travelled

            # Checkpoint the model every 100 timesteps
            if total_timesteps % 100 == 0:
                if not os.path.exists(directory):
                    os.mkdir(directory)
                brain.save(file_name, directory)
            



class MyPaintWidget(Widget):
    """Widget that lets the user paint sand onto the map with the mouse."""

    def on_touch_down(self, touch):
        """Start a new stroke, mark one sand cell and save the sand image."""
        global length, n_points, last_x, last_y
        with self.canvas:
            Color(0.8,0.7,0)
            touch.ud['line'] = Line(points = (touch.x, touch.y), width = 10)
            last_x = int(touch.x)
            last_y = int(touch.y)
            n_points = 0
            length = 0
            sand[int(touch.x),int(touch.y)] = 1
            img = PILImage.fromarray(sand.astype("uint8")*255)
            img.save("./images/sand.jpg")

    def on_touch_move(self, touch):
        """Extend the stroke and mark a 20x20 sand square around the pointer."""
        global length, n_points, last_x, last_y
        if touch.button == 'left':
            touch.ud['line'].points += [touch.x, touch.y]
            x = int(touch.x)
            y = int(touch.y)
            # max(..., 2) keeps the accumulated length strictly positive
            length += np.sqrt(max((x - last_x)**2 + (y - last_y)**2, 2))
            n_points += 1.
            density = n_points/(length)
            touch.ud['line'].width = int(20 * density + 1)
            # Clamp the square at 0: a negative slice bound would wrap around
            # to the far end of the array instead of stopping at the edge.
            x_lo, x_hi = max(x - 10, 0), max(x + 10, 0)
            y_lo, y_hi = max(y - 10, 0), max(y + 10, 0)
            sand[x_lo : x_hi, y_lo : y_hi] = 1

            last_x = x
            last_y = y

# Adding the API Buttons (clear, save and load)

class CarApp(App):
    """Kivy application wiring the game, the paint layer and three buttons."""

    def build(self):
        """Create the Game root, schedule its update at 60 Hz and add the UI."""
        game = Game()
        game.serve_car()
        Clock.schedule_interval(game.update, 1.0/60.0)

        self.painter = MyPaintWidget()
        clearbtn = Button(text = 'clear')
        savebtn = Button(text = 'save', pos = (game.width, 0))
        loadbtn = Button(text = 'load', pos = (2 * game.width, 0))

        # Hook each button to its handler.
        handlers = (
            (clearbtn, self.clear_canvas),
            (savebtn, self.save),
            (loadbtn, self.load),
        )
        for button, callback in handlers:
            button.bind(on_release = callback)

        # Painter first, then the buttons, matching the original add order.
        for child in (self.painter, clearbtn, savebtn, loadbtn):
            game.add_widget(child)
        return game

    def clear_canvas(self, obj):
        """Erase the painted strokes and reset the sand array to zeros."""
        global sand
        self.painter.canvas.clear()
        sand = np.zeros((longueur,largeur))

    def save(self, obj):
        """Save the agent's weights into the ``pytorch_models`` directory."""
        print("saving brain...")
        brain.save(file_name, "pytorch_models")

    def load(self, obj):
        """Load the agent's weights from the ``pytorch_models`` directory."""
        print("loading last saved brain...")
        brain.load(file_name, "pytorch_models")

# Running the whole thing
# Start the Kivy application only when this file is executed as a script.
if __name__ == '__main__':
    CarApp().run()
    #f.close()



