<a href="https://colab.research.google.com/github/YovliDuvshani/RideHailling/blob/main/Commun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import 

#!pip install pyhailing
#!pip install --upgrade Pillow # Restart runtime

import torch.nn as nn
import torch
import numpy
import numpy as np #To improve
import pyhailing
from pyhailing import RidehailEnv
import tqdm
import random
from collections import OrderedDict
import time as t
import torch.nn.functional as F
import pandas as pd

random.seed(0)
torch.manual_seed(0)
numpy.random.seed(0)
device = torch.device("cpu") #torch.device('cuda')

## Distance functions

In [3]:
# Shared simulator instance: the helper functions below read it as a global
# (zone lookup via env.xy_to_zone, env.lots, env.num_lots, env.num_vehicles).
env = RidehailEnv()
# Speed table with its index moved back into ordinary columns.
# NOTE(review): presumably done so that 'puzone', 'dozone' and 'min' can be filtered as columns — confirm.
speeds_data = env.speeds_data.reset_index()

In [4]:
def dist_manhattan(list_coord_depart, list_coord_arrivee):
  """Return the Manhattan (L1) distance between two points of the plane.

  Args:
    list_coord_depart: coordinates of the start point, ``[x_depart, y_depart]``.
    list_coord_arrivee: coordinates of the end point, ``[x_arrivee, y_arrivee]``.

  Returns:
    The 1-norm of the coordinate difference, as computed by ``numpy.linalg.norm``.
  """
  ecart = numpy.array(list_coord_depart) - numpy.array(list_coord_arrivee)
  return numpy.linalg.norm(ecart, 1)

In [5]:
def vitesse_normalisee(vitesse_moyenne, sigma):
  """Draw one speed from a normal law truncated below its 10% quantile.

  1000 samples of N(vitesse_moyenne, sigma**2) are drawn with ``numpy.random``,
  the samples below their empirical 10% quantile are discarded, and one of the
  remaining samples is picked uniformly with ``random.choices``.

  Args:
    vitesse_moyenne: mean speed; a scalar or a one-element array / Series.
    sigma: standard deviation; a scalar or a one-element array / Series.

  Returns:
    A single sampled speed (numpy float).
  """
  # Accept plain scalars as well as the one-element Series produced by a
  # DataFrame filter: flatten and take the first (only) value.
  moyenne = numpy.asarray(vitesse_moyenne, dtype=float).reshape(-1)[0]
  ecart_type = numpy.asarray(sigma, dtype=float).reshape(-1)[0]

  # Vectorized draw; consumes the numpy RNG exactly like the former per-element loop.
  tirages = numpy.random.randn(1000)*ecart_type + moyenne
  limite = numpy.quantile(tirages, .10)
  normale_tronquee = list(tirages[tirages >= limite])
  return random.choices(normale_tronquee)[0]

In [6]:
def duree_deplacement(list_coord_depart, list_coord_arrivee, time_):
  """Return the travel duration between two points at a given time of day.

  The Manhattan distance between the two points is divided by the mean speed
  stored in the module-level ``speeds_data`` table for the (pickup zone,
  drop-off zone, 15-minute slot) of the trip.

  Args:
    list_coord_depart: start coordinates ``[x_depart, y_depart]``.
    list_coord_arrivee: end coordinates ``[x_arrivee, y_arrivee]``.
    time_: time of day in seconds; it is floored to a 15-minute slot expressed
      in minutes to match the 'min' column.

  Returns:
    A numpy array holding one duration per matching row of ``speeds_data``
    (callers read element 0). The array is empty when no row matches.
    NOTE(review): the original docstring states the result is in seconds; the
    units of distance and speed are not visible here — confirm.
  """
  depart = numpy.array(list_coord_depart)
  arrivee = numpy.array(list_coord_arrivee)
  distance = dist_manhattan(depart, arrivee)
  zone_depart = env.xy_to_zone(depart)
  zone_arrivee = env.xy_to_zone(arrivee)
  tranche_horaire = int(time_/15/60)*15

  # Build the row filter once. The former second lookup of 'speed_stddev' was
  # never used (its only consumer, vitesse_normalisee, is disabled because it
  # was too slow), so it is not computed any more.
  selection = (
      (speeds_data['puzone'] == zone_depart)
      & (speeds_data['dozone'] == zone_arrivee)
      & (speeds_data['min'] == tranche_horaire)
  )
  vitesse_moyenne = speeds_data.loc[selection, 'speed_mean']
  return numpy.array(distance/vitesse_moyenne)

In [7]:
def distance_to_request(car_coord, req_coord, car_job, time, first_job_coord, second_job_coord, third_job_coord): # TODO: review, take time slots into account, and check on an example.
  """
  Return the travel time from a car to a request, going through the car's queued jobs.

  Despite its name the function sums durations returned by ``duree_deplacement``.
  The three job codes of the car are concatenated into a string that selects the route:
    - '044', '444', '104': straight from the car to the request;
    - '344': via the end point of the first job;
    - '234': via the first job's end point and both points of the second job;
    - anything else: via the first job's end point and both points of the second and third jobs.
  NOTE(review): the meaning of the individual job codes is not visible here — confirm against pyhailing.

  Each leg after the first is evaluated at ``(time + elapsed) % 86400``.
  """
  statut = "".join(str(car_job[k]) for k in range(3))

  # Ordered list of way-points for the route implied by the job triplet.
  if statut in ('044', '444', '104'):
    etapes = [car_coord, req_coord]
  elif statut == '344':
    etapes = [car_coord, first_job_coord[1], req_coord]
  elif statut == '234':
    etapes = [car_coord, first_job_coord[1],
              second_job_coord[0], second_job_coord[1],
              req_coord]
  else:
    etapes = [car_coord, first_job_coord[1],
              second_job_coord[0], second_job_coord[1],
              third_job_coord[0], third_job_coord[1],
              req_coord]

  # First leg starts at `time`; the following legs start once the previous ones are done.
  duree = duree_deplacement(etapes[0], etapes[1], time)
  for origine, destination in zip(etapes[1:], etapes[2:]):
    duree += duree_deplacement(origine, destination, (time+duree)%86400)
  return duree

## Heuristic functions

In [8]:
def plus_proche_lot(coords_voiture, time):
  """Return the index of the lot that is quickest to reach from a vehicle.

  Args:
    coords_voiture: coordinates ``[x, y]`` of the vehicle of interest.
    time: current time, forwarded to ``duree_deplacement``.

  Returns:
    The position in ``env.lots`` of the lot with the smallest travel duration.
  """
  durees = [duree_deplacement(coords_voiture, lot, time)[0] for lot in numpy.array(env.lots)]
  return np.argmin(durees)

In [9]:
def heuristic(state):
  """
  Return the reposition vector chosen by the heuristic.

  Every vehicle gets the value ``env.num_lots`` by default; vehicles whose job
  triplet is '444' are sent to their nearest lot (see ``plus_proche_lot``).
  NOTE(review): ``env.num_lots`` presumably encodes "do not reposition" — confirm against pyhailing.
  """
  vehicules_444 = triplets_jobs(state)['444']
  reposition = [env.num_lots for _ in range(env.num_vehicles)]
  for vehicule in vehicules_444:
    reposition[vehicule] = plus_proche_lot(state['v_locs'][vehicule], state['time'])
  return np.array(reposition) # TODO: build the array directly

In [None]:
# Keep a handle on the environment's trip table for exploration
# (not referenced again in the visible part of the notebook).
trips_data = env.trips_data

In [None]:
# Zone id of every lot, used to count how many distinct zones contain at least one lot
# and to compare that with the total number of zones.
tab = np.zeros(len(env.lots['x']))
for i,(x,y) in enumerate(zip(env.lots['x'],env.lots['y'])):
  tab[i] = env.xy_to_zone(np.array([x,y]))
print(f'Nb_zones with iddling spots: {len(np.unique(tab))}')
print(f'Nb_zones: {env.num_zones}')

Nb_zones with iddling spots: 50
Nb_zones: 61


In [None]:
# Sampling used to define the middle of each zone:
# evaluate the zone of every point of a 1000 x 1000 grid covering the environment's x / y ranges.

res = []
x_sampling = np.linspace(env.x_range[0],env.x_range[1],1000)
y_sampling = np.linspace(env.y_range[0],env.y_range[1],1000)

# One [x, y, zone] record per grid point (1,000,000 calls to env.xy_to_zone).
for x in x_sampling:
  for y in y_sampling:
    res += [[x,y,env.xy_to_zone(np.array([x,y]))]]

In [None]:
# Turn the sampled [x, y, zone] records into a DataFrame for per-zone aggregation.
res_ = np.array(res)
res_ord = pd.DataFrame(data = res_,columns=['x','y','zone'])

In [None]:
# Example: mean x coordinate of the grid points that fall in zone 13.
res_ord[res_ord['zone']==13]['x'].mean()

586.4925185091538

In [None]:
# Overall process (worth computing: median and mean travel duration of a trip + number of requests as a function of the time slot of the day)

# Define a centre for each zone (by sampling)
# Pre-processing: for each zone and each timestep -> define the reachable zones (bonus: several thresholds, e.g. 3, 4, 5 minutes) + handle the zones that have no nearby lots (e.g. remove them).
# Search for a global optimum or a heuristic (potentially several optima) -> maximise the number of requests that are not missed (take into account the vehicles that will soon be available)
# Take into account the distances from the vehicles to the locations, so that this distance stays small but also so that the zones can be reached by these vehicles
# before we expect to receive requests (the number of vehicles already relocating, etc.)


## Environment

In [10]:
def triplets_jobs(state): # Status
  """Group the vehicle indices by job triplet.

  Args:
    state: environment state; ``state['v_jobs'][i]`` holds the three job codes of vehicle ``i``.

  Returns:
    A dict keyed by the triplet string ('044', '104', '234', '323', '344', '444')
    whose values are the lists of vehicle indices currently in that triplet.

  Raises:
    KeyError: if a vehicle has a triplet that is not one of the six keys above.
  """
  dic = {cle: [] for cle in ('044', '104', '234', '323', '344', '444')}
  for indice, job in enumerate(state['v_jobs']):
    cle = "".join(str(job[k]) for k in range(3))
    dic[cle].append(indice)
  return dic

In [11]:
# Coordinate normalization
# Hard-coded estimates of the mean and standard deviation of the x and y coordinates.
# NOTE(review): presumably estimated from the environment's coordinates — confirm how they were obtained.
estimated_mean_x = 587.89
estimated_sigma_x = 0.94
estimated_mean_y = 4512.1
estimated_sigma_y = 3.21
def normalize_x(x):
  """Standardize an x coordinate: ``(x - estimated_mean_x) / estimated_sigma_x``."""
  return (x-estimated_mean_x)/estimated_sigma_x
def normalize_y(y):
  """Standardize a y coordinate: ``(y - estimated_mean_y) / estimated_sigma_y``."""
  return (y-estimated_mean_y)/estimated_sigma_y

## Q learning algorithm

In [12]:
# Memory

class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions, kept as nine parallel Python lists.

    Entry ``k`` of every list belongs to the same transition. Once more than
    ``max_size`` transitions have been pushed, the oldest one is dropped.
    """

    def __init__(self, max_size, device):
        """
        max_size : maximum number of transitions kept
        device : torch device on which sampled rewards / timelapses / terminal flags are placed
        """
        self.max_size = max_size
        self.device = device
        self.mem_cntr = 0  # total number of pushes so far (never decremented)

        # State before the action (per-request car features and global features).
        self.state_cars_memory = []
        self.state_global_memory = []
        # State after the action, plus the job triplets needed to rebuild its mask.
        self.new_state_cars_memory = []
        self.new_state_global_memory = []
        self.new_state_jobs_memory = []
        # One action per request of the state, so entries vary in length.
        self.action_memory = []
        self.timelapse_memory = []

        self.terminal_memory = []
        self.reward_memory = []

    def _columns(self):
        """Return the nine storage lists, in the order used by ``sample``'s output."""
        return (
            self.state_cars_memory,
            self.state_global_memory,
            self.action_memory,
            self.reward_memory,
            self.new_state_cars_memory,
            self.new_state_global_memory,
            self.new_state_jobs_memory,
            self.timelapse_memory,
            self.terminal_memory,
        )

    def push(self, state_cars, state_global, action, reward, new_state_cars, new_state_global, new_state_jobs, timelapse, done):
        """
        Add a new sample and replace oldest one if full
        """
        incoming = (state_cars, state_global, action, reward,
                    new_state_cars, new_state_global, new_state_jobs,
                    timelapse, done)
        for column, value in zip(self._columns(), incoming):
            column.append(value)

        self.mem_cntr += 1

        # Over capacity: drop the oldest transition from every list.
        if self.mem_cntr > self.max_size:
            for column in self._columns():
                column.pop(0)

    def sample(self, batch_size):
        """
        Sample from the memory
        return : a 9-tuple (states_cars, states_global, actions, rewards, new_states_cars,
                 new_states_global, new_state_jobs, timelapse, terminal), each holding
                 'batch_size' entries drawn without replacement; rewards, timelapse and
                 terminal are torch tensors, the others are lists.
        """
        upper = min(self.mem_cntr, self.max_size)
        picked = np.random.choice(upper, batch_size, replace=False)

        gathered = [[column[idx] for idx in picked] for column in self._columns()]
        (states_cars, states_global, actions, rewards,
         new_states_cars, new_states_global, new_state_jobs,
         timelapse, terminal) = gathered

        return (states_cars, states_global, actions, self.to_torch(rewards),
                new_states_cars, new_states_global, new_state_jobs,
                self.to_torch(timelapse), self.to_torch(terminal))

    def to_torch(self, x):
        """Convert ``x`` to a torch tensor on the buffer's device."""
        as_tensor = torch.tensor(x)
        return as_tensor.to(self.device)

    def to_numpy(self, x):
        """Convert a torch tensor to a numpy array (detached, on CPU)."""
        return x.detach().cpu().numpy()

    def __len__(self):
        """Number of transitions currently stored."""
        return min(self.mem_cntr, self.max_size)

In [13]:
# Network

class ReqN(nn.Module):
    """Q-network scoring, for one request, every car plus one extra action.

    Each car's feature vector goes through the same two small linear layers
    (``nn.Linear`` acts on the last dimension), the per-car embeddings are
    flattened and concatenated with the global features, and three further
    linear layers produce ``nb_car + 1`` values.
    """

    def __init__(self, cars_input_size, global_input_size, nb_car, hidden_size_1=10,hidden_size_2=500):
        """
        cars_input_size : number of features per car
        global_input_size : number of global features per request
        nb_car : number of cars; the output has nb_car + 1 entries
        hidden_size_1 : width of the per-car embedding layers
        hidden_size_2 : width of the layers applied after concatenation
        """
        super().__init__()
        self.cars_input = cars_input_size
        self.global_input = global_input_size
        self.nb_car = nb_car
        self.linear_1 = nn.Linear(cars_input_size, hidden_size_1)
        self.linear_2 = nn.Linear(hidden_size_1, hidden_size_1)
        self.linear_3 = nn.Linear(hidden_size_1*nb_car+global_input_size, hidden_size_2)
        self.linear_4 = nn.Linear(hidden_size_2, hidden_size_2)
        self.linear_5 = nn.Linear(hidden_size_2, nb_car+1)
        self.relu = nn.ReLU()

    def forward(self, x, y):
        """
        x : batch_size * nb_car * cars_input_size
        y : batch_size * global_input_size
        return : batch_size * (nb_car + 1); the batch dimension is always kept,
                 including for a batch of one.
        """
        # Per-car embedding (weights shared across cars).
        x = self.relu(self.linear_1(x))
        x = self.relu(self.linear_2(x))

        # Merge all car embeddings with the global features.
        x = torch.flatten(x, start_dim=1)
        x = torch.cat((x, y), 1)

        x = self.relu(self.linear_3(x))
        x = self.relu(self.linear_4(x))

        # The former `if output.shape[0]==1: x = x.squeeze(0)` squeezed a dead
        # local and never touched the output; it is removed and the 2-D output
        # that the callers index with dim=1 is preserved.
        return self.linear_5(x)

In [31]:
# Agent

class Agent(object):
    """Epsilon-greedy deep-Q agent that assigns every pending request to a car.

    For each request the network outputs ``nb_car + 1`` values: one per car and
    a last one (index ``nb_car``) that is never masked and may be chosen for
    several requests at once.
    """

    # Column layout shared by dict_to_network (writer) and create_mask (reader).
    GLOBAL_TIME_IDX = 5           # current time in the global features
    GLOBAL_REQUEST_TIME_IDX = 6   # time of the request in the global features
    CAR_TRAVEL_TIME_IDX = 2       # travel time to the request in the car features

    def __init__(self,
                 n_actions,
                 memory,
                 eps, eps_decay,
                 discount_rate,
                 update_delay,
                 device
                 ):
        """
        n_actions : number of cars + 1
        memory : replay buffer exposing push / sample / __len__
        eps, eps_decay : exploration rate and its multiplicative decay per training step
        discount_rate : discount applied per 60 time units elapsed between two states
        update_delay : number of training steps between two target-network updates
        device : torch device used for every tensor created by the agent
        """
        self.action_space = [i for i in range(n_actions)]
        self.memory = memory
        self.eps, self.eps_decay = eps, eps_decay
        self.discount_rate = discount_rate
        self.update_delay = update_delay
        self.counter = 0
        self.device = device

        # Times / days of the recently seen requests (see requests_repository_update).
        self.requests_repository = pd.DataFrame(columns=['time','day'])

    def init_nets(self, Q_net, target_net, optimizer, batch_size):
        """
        initialize online and targets
        """
        self.Q_net = Q_net
        self.target_net = target_net
        self.optimizer = optimizer
        self.batch_size = batch_size

        self.cars_input = self.Q_net.cars_input
        self.global_input = self.Q_net.global_input
        self.nb_car = self.Q_net.nb_car

        self.copy_weights()
        self.counter += 1

    def to_torch(self, x):
        """Convert ``x`` to a float tensor on the agent's device."""
        return torch.tensor(x).to(self.device).float()

    def to_numpy(self, x):
        """Convert a torch tensor to a numpy array (detached, on CPU)."""
        return x.detach().cpu().numpy()

    def create_mask(self, jobs, state_tensor_cars, state_tensor_global, time_max=250):
        """
        Create a mask of the cars that cannot be assigned to each request of a state.

        A car is masked for a request when its travel time to the request plus the
        time already elapsed since the request was issued exceeds ``time_max``, or
        when the car is in job triplet '323' or '234'.

        jobs : dict returned by triplets_jobs
        state_tensor_cars : nb_request * nb_car * cars_input
        state_tensor_global : nb_request * global_input
        return : float tensor of 0 and 1 where 1 means that the outcome is masked,
                 size nb_request * (nb_car + 1); the last column is always 0.
        """
        nb_request = state_tensor_cars.shape[0]
        mask = torch.zeros(nb_request, self.nb_car+1).to(self.device)
        if nb_request == 0:
            return mask

        # Read time and request time from the columns where dict_to_network
        # writes them (columns 1 and 2, used previously, are one-hot day slots).
        time = state_tensor_global[0][self.GLOBAL_TIME_IDX]
        request_times = state_tensor_global[:, self.GLOBAL_REQUEST_TIME_IDX]
        travel_times = state_tensor_cars[:, :self.nb_car, self.CAR_TRAVEL_TIME_IDX]

        # Cars for which the distance is too high.
        slack = time_max - travel_times + request_times.unsqueeze(1) - time
        mask[:, :self.nb_car] = (slack < 0).to(mask.dtype)

        # Cars which already have too many jobs.
        for car in (jobs['323']+jobs['234']):
            mask[:, car] = 1
        return mask

    def train(self, states_cars, states_global, targets, mask, list_association):
        """
        Train online net for 1 step

        mask : one-hot encoding of the action taken for every request of the batch
        list_association : [transition index, request index] pairs from regroup_tensor
        """
        self.optimizer.zero_grad()
        # Forward pass
        self.Q_net.train()
        Q_values = self.Q_net(states_cars, states_global).to(self.device)  # nb_reqs (whole batch) * nb_actions

        # Keep only the Q value of the action actually taken.
        Q_values = (Q_values*mask).sum(-1)

        # Average the per-request values of each transition -> batch_size values.
        aggreg_link = self.aggregation_link(Q_values, list_association)
        Q_values = torch.transpose(aggreg_link,0,1) @ Q_values

        # Computing loss
        loss = (targets.detach() - Q_values).pow(2).mean()
        loss.backward()

        # Apply gradients
        self.optimizer.step()
        self.Q_net.eval()

    def copy_weights(self):
        """
        Copy weights from online to target net
        """
        self.target_net.load_state_dict(self.Q_net.state_dict())

    def select_action(self, states_cars, states_global, mask):
        """
        Select one action per request with eps greedy, while making sure that a car
        is not given to two requests (first arrived, first served).

        states_cars : nb_request * nb_car * cars_input
        states_global : nb_request * global_input
        mask : output of create_mask for the same state
        return : numpy array with one action index per request
        """
        list_action = []
        for i,(state_cars,state_global) in enumerate(zip(states_cars,states_global)):
            rand = np.random.random()
            if rand < self.eps:
                # Random exploration: redraw until the action is feasible.
                action = int(np.random.choice(self.action_space))
                while (action in list_action or mask[i][action]==1) and action != self.nb_car:
                    action = int(np.random.choice(self.action_space))
            else:
                # Greedy: rank the actions by Q value, masked ones last.
                with torch.no_grad():
                    tensor_action = self.Q_net(state_cars.unsqueeze(0),state_global.unsqueeze(0)).to(self.device)
                # masked_fill needs a boolean mask; the mask is stored as floats.
                q_row = tensor_action.squeeze(0).masked_fill(mask[i].bool(), float('-inf'))
                ranked = torch.argsort(q_row, descending=True).tolist()

                action = self.nb_car  # fallback, always allowed
                for candidate in ranked:
                    if candidate == self.nb_car or (candidate not in list_action and mask[i][candidate] != 1):
                        action = candidate
                        break
            list_action += [action]

        return np.array(list_action)

    def remember(self, *args):
        """
        Update memory
        args: state_cars, state_global, action, reward, new_state_cars, new_state_global, new_state_jobs, timelapses, done
        """
        self.memory.push(*args)

    def regroup_tensor(self, states_cars, states_global):
        """
        Regroup tensors stored into a list into a unique tensor
        return : the stacked car tensor, the stacked global tensor and a list of
                 [transition index, request index] pairs, one per stacked row
        """
        list_association = [[i, j]
                            for i, state_cars in enumerate(states_cars)
                            for j in range(len(state_cars))]
        n_element = len(list_association)

        states_cars_ = torch.zeros(n_element,self.nb_car,self.cars_input).to(self.device)
        states_global_ = torch.zeros(n_element,self.global_input).to(self.device)
        for row, (i, j) in enumerate(list_association):
            states_cars_[row] = states_cars[i][j]
            states_global_[row] = states_global[i][j]

        return states_cars_,states_global_,list_association

    def requests_repository_update(self,state):
        """
        Update the requests_repository with the current timestep: add one row per
        request issued exactly at the current time, then drop the outdated rows.
        """
        time = state['time']
        day = state['dow']
        nb_new = sum(1 for req_time in state['request_times'] if req_time == time)
        if nb_new > 0:
            # DataFrame.append no longer exists in pandas 2; build the rows once.
            new_rows = pd.DataFrame({'time': [time]*nb_new, 'day': [day]*nb_new})
            if len(self.requests_repository) == 0:
                self.requests_repository = new_rows
            else:
                self.requests_repository = pd.concat([self.requests_repository, new_rows], ignore_index=True)
        self.requests_repository_filter(time,day)

    def requests_repository_filter(self,time,day,lim = 900):
        """Keep only the requests of day ``day`` issued less than ``lim`` time units before ``time``."""
        recent = self.requests_repository['time'] > time-lim
        same_day = self.requests_repository['day'] == day
        self.requests_repository = self.requests_repository[recent & same_day]

    def dict_to_network(self,state):
        """
        Takes (in input) the state dict and transforms it into tensors while selecting the right features.

        return : (cars_input, global_input) of sizes
                 nb_request * nb_car * cars_input and nb_request * global_input.
        Car features: normalized x, normalized y, travel time to the request.
        Global features: 0-4 one-hot day, 5 time, 6 request time, 7 number of cars in
        triplets '044'/'104'/'444', 8 number of requests, 9 size of the requests
        repository, 10-11 normalized pick-up coordinates of the request.
        """
        n = len(state['request_times'])
        cars_input = torch.zeros(n,self.nb_car,self.cars_input).to(self.device)
        global_input = torch.zeros(n,self.global_input).to(self.device)

        # Values that do not depend on the request are computed once.
        jobs = triplets_jobs(state)
        nb_available = len(jobs['044']) + len(jobs['104']) + len(jobs['444'])
        nb_recent_requests = len(self.requests_repository['time'])
        day_one_hot = torch.tensor([float(d == state['dow']) for d in range(5)]).to(self.device)

        for i in range(n):
            pickup = state['request_locs'][i][0]
            for car in range(self.nb_car):
                cars_input[i][car][0] = normalize_x(state['v_locs'][car][0])
                cars_input[i][car][1] = normalize_y(state['v_locs'][car][1])
                # Job locations are retrieved even when the car has fewer jobs.
                first_job_coord = state['v_job_locs'][car][0]
                second_job_coord = state['v_job_locs'][car][1]
                third_job_coord = state['v_job_locs'][car][2]
                cars_input[i][car][self.CAR_TRAVEL_TIME_IDX] = self.to_torch(
                    distance_to_request(state['v_locs'][car], pickup, state['v_jobs'][car], state['time'],
                                        first_job_coord, second_job_coord, third_job_coord))
            global_input[i][0:5] = day_one_hot
            global_input[i][self.GLOBAL_TIME_IDX] = state['time']
            global_input[i][self.GLOBAL_REQUEST_TIME_IDX] = state['request_times'][i]
            global_input[i][7] = nb_available
            global_input[i][8] = n
            global_input[i][9] = nb_recent_requests
            global_input[i][10] = normalize_x(pickup[0])
            global_input[i][11] = normalize_y(pickup[1])
        return cars_input,global_input

    def action_to_tensor(self, actions):
        """
        Transforms the list of actions of a batch into a single flat float tensor.
        """
        list_res = [action for list_action in actions for action in list_action]
        return self.to_torch(np.array(list_res))

    def aggregation_link(self, Q_values, list_association):
        """
        Create a matrix of size nb_requests (in the batch) * batch_size whose column j
        holds 1/k on the rows of the k requests of transition j and 0 elsewhere, so
        that its transpose averages per-request values into per-transition values.
        """
        link = torch.zeros(Q_values.shape[0],self.batch_size).to(self.device)
        members = [[] for _ in range(self.batch_size)]
        for row, pair in enumerate(list_association):
            members[pair[0]].append(row)
        for j, rows in enumerate(members):
            for row in rows:
                link[row][j] = 1/len(rows)
        return link

    def step(self):
        """
        Apply deep Q-learning algorithm step (does nothing until the memory holds a full batch)
        """
        if len(self.memory) < self.batch_size:
            return

        n_actions = self.nb_car + 1

        # Sample from memory
        (states_cars, states_global, actions, reward,
         new_states_cars, new_states_global, new_states_jobs,
         timelapses, done) = self.memory.sample(self.batch_size)

        # Choose the next actions with the current policy (SARSA-like target).
        actions_new_state = []
        for ele_cars,ele_global,ele_jobs in zip(new_states_cars,new_states_global,new_states_jobs):
            next_mask = self.create_mask(ele_jobs,ele_cars,ele_global)
            actions_new_state += [self.select_action(ele_cars,ele_global,next_mask)]
        actions_new_state = self.action_to_tensor(actions_new_state)
        next_one_hot = F.one_hot(actions_new_state.long(),num_classes=n_actions)

        # Regroup the states and compute the association lists
        states_cars, states_global, list_association = self.regroup_tensor(states_cars, states_global)
        new_states_cars, new_states_global, new_list_association = self.regroup_tensor(new_states_cars, new_states_global)

        # Target Q value of the chosen next actions, averaged per transition.
        with torch.no_grad():
            target_Q_value = self.target_net(new_states_cars,new_states_global)
        target_Q_value = (target_Q_value*next_one_hot).sum(-1)
        aggreg_link = self.aggregation_link(target_Q_value,new_list_association)
        agglomerate_Q_value = torch.transpose(aggreg_link,0,1) @ target_Q_value

        # Discount according to the time elapsed in each sampled transition
        # (the batch's own `timelapses`, not a variable of the training cell).
        lapse = timelapses.to(self.device).float()
        discount = torch.pow(torch.full_like(lapse, float(self.discount_rate)), lapse/60)
        # `done` holds 1 for non-terminal transitions, so it switches the bootstrap term off at the end.
        target_Q_value = reward.to(self.device).float() + discount*agglomerate_Q_value*done.to(self.device)

        # One-hot encoding of the actions that were actually taken.
        actions = self.action_to_tensor(actions)
        taken_one_hot = F.one_hot(actions.long(),num_classes=n_actions)

        # Train network
        self.train(states_cars, states_global, target_Q_value, taken_one_hot, list_association)

        # Copy weights
        if self.counter % self.update_delay == 0:
            self.copy_weights()

        # Update epsilon
        self.eps *= self.eps_decay

        self.counter += 1
        
    

In [32]:
# Training Loop
# Builds the networks, the replay memory and the agent, then runs N_SIMULATION
# episodes. Repositioning always comes from `heuristic`; request assignments
# come from the agent. One training step is applied after every stored transition.

# Fresh environment; this rebinds the module-level `env` read by the helper functions.
env = RidehailEnv()

MEMORY_SIZE = 10000
CARS_INPUT_SIZE = 3
GLOBAL_INPUT_SIZE = 12
N_CAR = env.num_vehicles

LR = 0.001
BATCH_SIZE = 16

N_SIMULATION = 10
EPS = 0.9
EPS_DECAY = 0.9995
DISCOUNT_RATE = 0.99 # should depend of t
UPDATE_DELAY = 50 # delay between target_net parameters updates
DEVICE = "cpu" # "cuda" or "cpu"

# Model and target model 
Q_net = ReqN(CARS_INPUT_SIZE, GLOBAL_INPUT_SIZE, N_CAR).to(DEVICE)
target_net = ReqN(CARS_INPUT_SIZE, GLOBAL_INPUT_SIZE, N_CAR).to(DEVICE)

# Optimizer (only on Q_net)
optimizer = torch.optim.Adam(Q_net.parameters(), lr=LR)

# Memory
memory = ReplayBuffer(MEMORY_SIZE, DEVICE)

# Agent and initialization
agent = Agent(n_actions=N_CAR+1, 
              memory=memory, 
              eps=EPS, 
              eps_decay=EPS_DECAY,
              discount_rate=DISCOUNT_RATE,
              update_delay=UPDATE_DELAY, 
              device=DEVICE
              )

agent.init_nets(Q_net, target_net, optimizer, BATCH_SIZE)

all_scores = []
# Progress bar
with tqdm.tqdm(total=N_SIMULATION, position=0, leave=True) as pbar:
    for i in range(N_SIMULATION):
        done = False
        score = 0
        # Reset env
        state = env.reset()

        # Make sure that the first state is a state with request
        # (until then only the heuristic repositioning is applied).
        while len(state['request_times']) == 0:
          action_rep = heuristic(state)
          action = {'reposition': action_rep, 'req_assgts': np.array([]), 'req_rejections': np.array([])}
          state, reward, _, _ = env.step(action)
          score += reward

        #Store the states in a list for the analysis.
        list_state = []
        list_state += [state]
        state_tensor_cars,state_tensor_global = agent.dict_to_network(state)

        # Update request repositories
        agent.requests_repository_update(state)

        while not done:
            print('Running...')
            # Retrieve lists of triplets from state
            jobs = triplets_jobs(state)

            # Create the mask for every request (taking into account the distance)
            mask = agent.create_mask(jobs,state_tensor_cars,state_tensor_global)

            # Apply Heuristic
            action_rep = heuristic(state)

            # Select action
            action_req = agent.select_action(state_tensor_cars,state_tensor_global, mask)

            # Construct action # A not so probable error to correct : if request assign at the same time that a reposition is requested : gotta change the status!!!
            action = OrderedDict({'reposition': action_rep, 'req_assgts': action_req, 'req_rejections': np.zeros_like(action_req)}) #Need to deal with the rejections probably #May need to be an ordered dict

            # Print the action realised during this round
            print('Action:', action)
            day = state['dow']
            time = state['time']
            print(f'day : {day}/ time : {time}')

            # Execute action
            new_state, reward, done, _ = env.step(action) #Should compute the mean reward.
              
            # While no request for new_state : Apply heuristic and create a new action / There may be a problem with done.
            # The rewards collected meanwhile are added to the reward of the transition.
            while len(new_state['request_times']) == 0 and not done:
              action_rep = heuristic(new_state)
              action = {'reposition': action_rep, 'req_assgts': np.array([]), 'req_rejections': np.array([])}
              new_state, reward_add, done, _ = env.step(action)
              reward += reward_add
              
            score += reward
            if not done: #to deal with the end of the environnment in the case that it happens during the while loop.
              # Update request repositories
              agent.requests_repository_update(new_state)

              # Transforms state into a usable tensor for the : request network
              new_state_tensor_cars,new_state_tensor_global = agent.dict_to_network(new_state)

              # Update memory
              # `done` is always False in this branch, so the stored flag 1-int(done) is always 1
              # and the terminal transition of an episode is never stored.
              new_jobs = triplets_jobs(new_state)
              timelapse = new_state['time'] - state['time']
              agent.remember(state_tensor_cars,state_tensor_global, action_req, reward, new_state_tensor_cars, new_state_tensor_global, new_jobs, timelapse,1-int(done))

              # Apply algorithm
              # NOTE(review): this inner loop reuses the name `i` of the simulation loop above.
              for i in range(1):
                agent.step() #We should maybe iterate to apply the step more often.

              # Update state
              state = new_state
              list_state += [state]
              state_tensor_cars,state_tensor_global = new_state_tensor_cars,new_state_tensor_global
        
        all_scores.append(score)

        pbar.set_description('score=' + str(score))
        pbar.update()

# NOTE(review): no import binding `plt` is visible in this notebook; presumably
# `import matplotlib.pyplot as plt` is required before this cell — confirm.
plt.plot(all_scores)
plt.show()

  0%|          | 0/10 [00:00<?, ?it/s]

Running...
Action: OrderedDict([('reposition', array([302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302,
       302, 302, 302, 302, 302, 302, 302])), ('req_assgts', array([15])), ('req_rejections', array([0]))])
day : 4/ time : 224.29111309871996
Running...
Action: OrderedDict([('reposition', array([302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302,
       302, 302, 302, 302, 302, 302, 302])), ('req_assgts', array([20])), ('req_rejections', array([0]))])
day : 4/ time : 484.9911660731291
Running...
Action: OrderedDict([('reposition', array([302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302,
       302, 302, 302, 302, 302, 302, 302])), ('req_assgts', array([19, 20])), ('req_rejections', array([0, 0]))])
day : 4/ time : 488.7343229001261
Running...
Action: OrderedDict([('reposition', array([302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302,
       302, 302, 302, 302, 302, 302, 302])), ('req_assgts', array([17])), ('req_rejections', 

  0%|          | 0/10 [00:39<?, ?it/s]


KeyboardInterrupt: ignored

### Test

In [None]:
score

2880.5767908354665

In [None]:
# Start a new episode and display its 'dow' field (the value printed as `day` by the training loop).
state = env.reset()
state['dow']

In [None]:
# Spot check of duree_deplacement on a recorded state:
# from vehicle 1 to the first location of request 0, at the time of state 30.
# Relies on `list_state` left in the kernel by the training cell.
v = list_state[30]['v_locs'][1]
v1 = list_state[30]['request_locs'][0][0]
time = list_state[30]['time']
duree_deplacement(v,v1,time)

array([130.49006675])

In [None]:
list_state[29]['request_times']

In [None]:
list_state[29]['v_jobs']

In [None]:
list_state[6]['v_jobs']

In [None]:
# Inputs for a manual check of the distance between vehicle 0 and one lot.
# NOTE(review): `i` is not defined in this cell; it is whatever value an earlier
# cell left in the kernel, so the selected lot depends on execution history — confirm the intended index.
v = new_state['v_locs'][0]
time = new_state['time']
v1 = np.array(env.lots)[i]

In [None]:
v1

In [None]:
heuristic(new_state)

In [None]:
# Ask the environment for a random action and display it, to inspect the action format.
action = env.get_random_action()
action 

In [None]:
# Pasted representation of an action dict, kept as a reference of the expected format.
# NOTE(review): the bare name `array` is not imported in the visible cells, so this
# cell presumably raises NameError as written — confirm.
OrderedDict([('reposition', array([302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302, 302,
       302])), ('req_assgts', array([20])), ('req_rejections', array([0]))])