Import necessary libraries

In [28]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

Load data from CSV

In [29]:
# Load small order data
order_data = pd.read_csv('data/order_small.csv')

# Load distance data
distance_data = pd.read_csv('data/distance.csv')

# Load truck data
truck_data = pd.read_csv('data/truck.csv')

# Display the first few rows of each dataframe to verify
print(order_data.head())
print(distance_data.head())
print(truck_data.head())

  Order_ID Material_ID                                   Item_ID   Source  \
0  A140109      B-6128  P01-79c46a02-e12f-41c4-9ec9-25e48597ebfe  City_61   
1  A140112      B-6128  P01-84ac394c-9f34-48e7-bd15-76f92120b624  City_61   
2  A140112      B-6128  P01-b70c94db-630a-497b-bb63-b0ad86a7dce6  City_61   
3  A140112      B-6128  P01-4534a7e8-6d73-4a2e-8363-a6645d9bc345  City_61   
4  A140112      B-6128  P01-7208eb61-2cc1-4e7c-b698-e1ab2327b658  City_61   

  Destination       Available_Time             Deadline Danger_Type   Area  \
0     City_54  2022-04-05 23:59:59  2022-04-11 23:59:59      type_1  38880   
1     City_54  2022-04-07 23:59:59  2022-04-13 23:59:59      type_1  38880   
2     City_54  2022-04-07 23:59:59  2022-04-13 23:59:59      type_1  38880   
3     City_54  2022-04-07 23:59:59  2022-04-13 23:59:59      type_1  38880   
4     City_54  2022-04-07 23:59:59  2022-04-13 23:59:59      type_1  38880   

     Weight  
0  30920000  
1  30920000  
2  30920000  
3  30920000 

Preprocessing

In [30]:
# Merge order_data with distance_data to add the distance column
order_data = order_data.merge(distance_data, how='left', left_on=['Source', 'Destination'], right_on=['Source', 'Destination'])

# Rename the Distance(M) column to Distance for clarity
order_data.rename(columns={'Distance(M)': 'Distance'}, inplace=True)

# Convert area to m2, weight to kg and distance to km
order_data['Area'] = order_data['Area'] / 10000
order_data['Weight'] = order_data['Weight'] / 10000
order_data['Distance'] = order_data['Distance']  / 1000

# Convert some Time attributes from str to datetime
order_data['Available_Time'] = pd.to_datetime(order_data['Available_Time'])
order_data['Deadline'] = pd.to_datetime(order_data['Deadline'])
# Display the updated order_data dataframe
print(order_data.head())

  Order_ID Material_ID                                   Item_ID   Source  \
0  A140109      B-6128  P01-79c46a02-e12f-41c4-9ec9-25e48597ebfe  City_61   
1  A140112      B-6128  P01-84ac394c-9f34-48e7-bd15-76f92120b624  City_61   
2  A140112      B-6128  P01-b70c94db-630a-497b-bb63-b0ad86a7dce6  City_61   
3  A140112      B-6128  P01-4534a7e8-6d73-4a2e-8363-a6645d9bc345  City_61   
4  A140112      B-6128  P01-7208eb61-2cc1-4e7c-b698-e1ab2327b658  City_61   

  Destination      Available_Time            Deadline Danger_Type   Area  \
0     City_54 2022-04-05 23:59:59 2022-04-11 23:59:59      type_1  3.888   
1     City_54 2022-04-07 23:59:59 2022-04-13 23:59:59      type_1  3.888   
2     City_54 2022-04-07 23:59:59 2022-04-13 23:59:59      type_1  3.888   
3     City_54 2022-04-07 23:59:59 2022-04-13 23:59:59      type_1  3.888   
4     City_54 2022-04-07 23:59:59 2022-04-13 23:59:59      type_1  3.888   

   Weight  Distance  
0  3092.0  2444.326  
1  3092.0  2444.326  
2  3092.0  244

In [31]:
def extract_city(s):
    '''Given a string of the form "City_X", return X'''
    return int(s.split("_")[1])

# Convert distance from m to km
distance_data['Distance(M)'] = distance_data['Distance(M)'] / 1000
print(distance_data.head())

distance_dict = dict()
for i in range(len(distance_data)):
    row = distance_data.iloc[i]
    source_city = extract_city(row['Source'])
    dest_city = extract_city(row['Destination'])
    distance = float(row['Distance(M)'])
    distance_dict[(source_city, dest_city)] = distance

    Source Destination  Distance(M)
0  City_24     City_47     1114.251
1  City_24     City_31       97.187
2  City_24     City_54     1716.028
3  City_24     City_53     1729.925
4  City_24     City_19     1594.107


In [32]:
# Split the Inner Size (m^2) column into length and width
truck_data[['Length', 'Width']] = truck_data['Inner Size (m^2)'].str.split('x', expand=True)

# Convert the Length and Width columns to float
truck_data['Length'] = truck_data['Length'].astype(float)
truck_data['Width'] = truck_data['Width'].astype(float)

# Calculate the area and store it in a new column called Inner Area (m^2)
truck_data['Inner Area (m^2)'] = truck_data['Length'] * truck_data['Width']

# Drop the original Inner Size (m^2) column as it's no longer needed
truck_data.drop(columns=['Inner Size (m^2)'], inplace=True)
truck_data.drop(columns=['Length', 'Width'], inplace=True)

# Display the updated truck_data dataframe
print(truck_data)

   Truck Type (length in m)  Weight Capacity (kg)  Cost Per KM  Speed (km/h)  \
0                      16.5                 10000            3            40   
1                      12.5                  5000            2            40   
2                       9.6                  2000            1            40   

   Inner Area (m^2)  
0             40.25  
1             30.25  
2             20.93  


In [22]:
num_orders = len(order_data)
num_truck_types = len(truck_data)
print("Number of orders:", num_orders)
print("Number of truck types:", num_truck_types)

Number of orders: 10
Number of truck types: 3


Problem constraints
- There are different types of trucks we can choose from. A truck has capacity limit on both area and weight. (We assume that there is no limit on the number of trucks for each type)
- An item is only available by a specific time. A truck can start only when all items assigned to it are available.
- The available time difference between the earliest and last available items in the same truck should be less than a user defined limit (e.g., 4 hours).
- All items need to be delivered to their destinations before their deadlines.
- Depending on the properties of products, some items can put in the same truck, but some cannot.
- A truck can have at most N stops, where N is a user defined number.
- A truck need to stay at each stop for M hours to unload the items, where M is a user defined number. Each stop will incur a fixed amount of cost in addition to the delivery cost.

State class implementation

In [23]:
from copy import deepcopy

class State:
    def __init__(self):
        self.num_trucks = 0 # so luong xe tai
        self.num_truck_types = num_truck_types
        self.cur_truck = [-1 for _ in range(num_orders)] # xe tai hien tai cua moi don hang
        self.truck_type = [] # loai xe cua moi xe
        self.cur_weight = [] # weight hien tai cua moi xe
        self.cur_area = [] # dien tich hien tai cua moi xe
        self.orders_in_truck = [] # danh sach cac order cua moi xe

    def assign_order_to_existing_truck(self, order_index, truck_index):
        '''Change the truck of an order by another existing truck'''
        order = order_data.iloc[order_index]
        old_truck_index = self.cur_truck[order_index]
        self.cur_weight[old_truck_index] -= order['Weight']
        self.cur_area[old_truck_index] -= order['Area']
        self.orders_in_truck[old_truck_index].remove(order_index)
        self.cur_weight[truck_index] += order['Weight']
        self.cur_area[truck_index] += order['Area']
        self.orders_in_truck[truck_index].add(order_index)
        self.cur_truck[order_index] = truck_index
        return self
    
    def assign_order_to_new_truck(self, order_index, truck_type_index):
        '''Change the truck of an order by a new truck'''
        order = order_data.iloc[order_index]
        old_truck_index = self.cur_truck[order_index]
        if old_truck_index != -1:
            self.cur_weight[old_truck_index] -= order['Weight']
            self.cur_area[old_truck_index] -= order['Area']
            self.orders_in_truck[old_truck_index].remove(order_index)
        self.cur_truck[order_index] = len(self.truck_type)
        self.num_trucks += 1
        self.truck_type.append(truck_type_index)
        self.cur_weight.append(order['Weight'])
        self.cur_area.append(order['Area'])
        new_truck_order = set()
        new_truck_order.add(order_index)
        self.orders_in_truck.append(new_truck_order)
        return self

    def assign_order_to_existing_truck_new_state(self, order_index, truck_index):
        '''Change the truck of an order by another existing truck. Return a new state.'''
        new_state = self.deepcopy()
        new_state = new_state.assign_order_to_existing_truck(order_index, truck_index)
        return new_state
    
    def assign_order_to_new_truck_new_state(self, order_index, truck_type_index):
        '''Change the truck of an order by a new truck. Return a new state'''
        new_state = self.deepcopy()
        new_state = new_state.assign_order_to_new_truck(order_index, truck_type_index)
        return new_state
    
    def calculate_cost_truck(self, truck_index):
        '''Each truck departs from City 61, travels to some assigned cities and then come back to City 61.
        The problem of minimizing total distance travelled of a truck is a variant of TSP problem (TSP with time constraints).
        This function implements a greedy algorithm for solving TSP.'''

        # starting_time = max([order_data.iloc[order]['Available Time'] for order in self.orders_in_truck[truck_index]])
        route = [61]
        cost = 0
        cities_list = set()
        city_deadline = dict() # for the earliest deadline of the orders to this city
        for order in self.orders_in_truck[truck_index]:
            city = extract_city(order_data.iloc[order]['Destination'])
            deadline = order_data.iloc[order]['Deadline']
            cities_list.add(city)
            if city not in city_deadline.keys():
                city_deadline[city] = deadline
            else:
                city_deadline[city] = min(city_deadline[city], deadline)

        while len(cities_list) > 0:
            next_city = None
            best_time_distance = None
            for city in cities_list:
                cur_time_distance = (city_deadline[city], distance_dict[(route[-1], city)])
                if best_time_distance is None or cur_time_distance < best_time_distance:
                    next_city = city
                    best_time_distance = cur_time_distance
            route.append(next_city)

            cost += best_time_distance[1]
            cities_list.remove(next_city)
        
        return cost, route

    def calculate_total_cost(self):
        total_cost = 0
        routes = []
        for truck_index in range(len(self.truck_type)):
            if len(self.orders_in_truck[truck_index]) > 0:
                cost, route = self.calculate_cost_truck(truck_index)
                total_cost += cost
                routes.append(route)
        return total_cost, routes

    def initialize(self):
        for order_index in range(num_orders):
            order = order_data.iloc[order_index]
            for truck_type_index in range(self.num_truck_types):
                truck_type = truck_data.iloc[truck_type_index]
                if order['Weight'] <= truck_type['Weight Capacity (kg)'] and order['Area'] <= truck_type['Inner Area (m^2)']:
                    self.assign_order_to_new_truck(order_index, truck_type_index)
                    break
        return self

    def print(self):
        print('Number of trucks used:', self.num_trucks)
        print('Order assignment to truck', self.cur_truck)
        print('Truck type:', self.truck_type)
        print('Current weight of each truck:', self.cur_weight)
        print('Current area used in each truck:', self.cur_area)
        print('Orders in each truck', self.orders_in_truck)
    
    def is_valid_state(self, max_time_diff, max_stops, unload_time):
        for truck_index in range(len(self.orders_in_truck)):
            orders = self.orders_in_truck[truck_index]
            if not orders:
                continue

            # Check capacity constraints
            truck_type_index = self.truck_type[truck_index]
            if self.cur_weight[truck_index] > truck_data.iloc[truck_type_index]['Weight Capacity (kg)']:
                return False
            if self.cur_area[truck_index] > truck_data.iloc[truck_type_index]['Inner Area (m^2)']:
                return False

            # Check time constraints
            # available_times = [order_data.iloc[order]['Available_Time'] for order in orders]
            # if max(available_times) - min(available_times) > max_time_diff:
            #     return False

            # Check delivery deadlines
            # for order in orders:
            #     if order_data.iloc[order]['Deadline'] < self.starting_time[truck_index] + unload_time * len(orders):
            #         return False

            # Check product compatibility
            for order in orders:
                if order_data.iloc[order]['Danger_Type'] == "type_2" and len(orders) > 1:
                    return False
                
            # Check number of stops
            if len(orders) > max_stops:
                return False

        return True
    
    # def is_valid_action(self, action):
    #     order_index, truck_index, truck_type_index = action
    #     if truck_index is None:
    #         return True
    #     elif truck_type_index is None:
    #         if order_index in self.orders_in_truck
    
    def deepcopy(self):
        new_state = State()
        new_state.num_truck_types = num_truck_types
        new_state.cur_truck = deepcopy(self.cur_truck)
        new_state.truck_type = deepcopy(self.truck_type[:])
        new_state.cur_weight = deepcopy(self.cur_weight)
        new_state.cur_area = deepcopy(self.cur_area)
        new_state.orders_in_truck = deepcopy(self.orders_in_truck)
        new_state.num_trucks = len(new_state.cur_truck)
        return new_state


    def state_to_tensor(self, max_trucks=num_orders):
        """
        Converts the state to a fixed-dimension tensor representation.
        Args:
            max_trucks (int): Maximum number of trucks to pad or truncate the truck-related features.
        Returns:
            torch.Tensor: A fixed-dimension tensor representation of the state.
        """
        # Normalize truck-related features
        max_weight_capacity = truck_data['Weight Capacity (kg)'].max()
        max_area_capacity = truck_data['Inner Area (m^2)'].max()

        # Normalize order-related features
        max_order_weight = order_data['Weight'].max()
        max_order_area = order_data['Area'].max()

        # Prepare truck features
        truck_weights = [
            self.cur_weight[i] / max_weight_capacity if max_weight_capacity > 0 else 0
            for i in range(len(self.cur_weight))
        ]
        truck_areas = [
            self.cur_area[i] / max_area_capacity if max_area_capacity > 0 else 0
            for i in range(len(self.cur_area))
        ]

        # Pad or truncate truck features to fixed size
        padded_weights = truck_weights[:max_trucks] + [0] * max(0, max_trucks - len(truck_weights))
        padded_areas = truck_areas[:max_trucks] + [0] * max(0, max_trucks - len(truck_areas))

        # Prepare order assignments
        order_assignments = [
            self.cur_truck[i] / max_trucks if self.num_trucks > 0 else -1
            for i in range(len(self.cur_truck))
        ]

        # Combine all features into a single tensor
        combined_features = padded_weights + padded_areas + order_assignments
        tensor_representation = torch.tensor(combined_features, dtype=torch.float32)

        return tensor_representation


In [24]:
# Some code for testing if the State class is implemented properly
test_state = State()
test_state.initialize()
test_state.print()
cost, route = test_state.calculate_total_cost()
print(cost)
print(route)
new_state = test_state.assign_order_to_existing_truck_new_state(7, 5)
new_state.print()

Number of trucks used: 10
Order assignment to truck [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Truck type: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Current weight of each truck: [3092.0, 3092.0, 3092.0, 3092.0, 3092.0, 764.0, 764.0, 764.0, 764.0, 764.0]
Current area used in each truck: [3.888, 3.888, 3.888, 3.888, 3.888, 0.984, 0.984, 0.984, 0.984, 0.984]
Orders in each truck [{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}]
24512.744999999995
[[61, 54], [61, 54], [61, 54], [61, 54], [61, 54], [61, 53], [61, 53], [61, 53], [61, 53], [61, 53]]
Number of trucks used: 10
Order assignment to truck [0, 1, 2, 3, 4, 5, 6, 5, 8, 9]
Truck type: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Current weight of each truck: [3092.0, 3092.0, 3092.0, 3092.0, 3092.0, 1528.0, 764.0, 0.0, 764.0, 764.0]
Current area used in each truck: [3.888, 3.888, 3.888, 3.888, 3.888, 1.968, 0.984, 0.0, 0.984, 0.984]
Orders in each truck [{0}, {1}, {2}, {3}, {4}, {5, 7}, {6}, set(), {8}, {9}]


In [25]:
max_trucks = num_orders

DQN, Agent, and Environment implementation

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

# Deep Q-Network (DQN)
class DQN(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc3 = nn.Linear(2*hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        return self.fc4(x)

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = []
        self.capacity = capacity

    def add(self, transition):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

class DeepQLearningAgent:
    def __init__(self, state_size, action_size, hidden_size=64, gamma=0.99, lr=1e-4, buffer_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999

        self.policy_net = DQN(state_size, action_size, hidden_size).cuda()
        self.target_net = DQN(state_size, action_size, hidden_size).cuda()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(buffer_size)

    def select_action(self, state, available_actions, train=True):
        if np.random.rand() < self.epsilon and train:
            return random.choice(available_actions)
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).cuda()
                q_values = self.policy_net(state_tensor)
                action_values = [(a, q_values[0, a[3]].item()) for a in available_actions]
                return max(action_values, key=lambda x: x[1])[0]

    def update_replay_buffer(self, transition):
        self.replay_buffer.add(transition)

    def train(self, batch_size):
        if len(self.replay_buffer) < batch_size:
            return

        batch = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.stack(states).cuda()
        actions = torch.tensor(actions, dtype=torch.long)[:, 3].cuda()
        rewards = torch.tensor(rewards, dtype=torch.float32).cuda()
        next_states = torch.stack(next_states).cuda()
        dones = torch.tensor(dones, dtype=torch.float32).cuda()

        q_values = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_net(next_states).max(1)[0]
        target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        loss = self.criterion(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

class VRPEnvironment:
    def __init__(self, initial_state, max_time_diff, max_stops, unload_time):
        self.state = initial_state
        self.max_time_diff = max_time_diff
        self.max_stops = max_stops
        self.unload_time = unload_time

    def reset(self):
        self.state.initialize()
        return self.state.state_to_tensor()

    def get_available_actions(self):
        actions = []
        index = 0
        for order_index in range(num_orders):
            for truck_index in range(max_trucks):
                actions.append((order_index, truck_index, 0, index))
                index += 1
            for truck_type_index in range(num_truck_types):
                actions.append((order_index, truck_type_index, 1, index))
                index += 1
        return actions

    def step(self, action):
        order_index, truck_index, type, index = action
        false_action = False

        if type:
            if len(self.state.truck_type) >= max_trucks:
                new_state = self.state
                false_action = True
            else:
                new_state = self.state.assign_order_to_new_truck_new_state(order_index, truck_index)
        else:
            if self.state.cur_truck[order_index] == -1 or truck_index >= len(self.state.truck_type) or truck_index == self.state.cur_truck[order_index]:
                new_state = self.state
                false_action = True
            else:
                new_state = self.state.assign_order_to_existing_truck_new_state(order_index, truck_index)

        if not new_state.is_valid_state(self.max_time_diff, self.max_stops, self.unload_time):
            false_action = True

        if false_action:
            reward = -1e6
            done = True
        else:
            # done = all(order != -1 for order in new_state.cur_truck)
            # if not done:
            #     reward = 1e2
            # else:
            done = False
            benchmark_state = State().initialize()
            old_cost, _ = benchmark_state.calculate_total_cost()
            new_cost, _ = new_state.calculate_total_cost()
            reward = old_cost - new_cost

        self.state = new_state
        return new_state.state_to_tensor(), reward, done

def train_dql_agent(agent, environment, num_episodes=1000, max_steps=100, C=20):
    for episode in range(num_episodes):
        state = environment.reset()
        total_reward = 0
        rewards = []

        for step in range(max_steps):
            available_actions = environment.get_available_actions()
            action = agent.select_action(state, available_actions)

            next_state, reward, done = environment.step(action)
            agent.update_replay_buffer((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward
            rewards.append(reward)

            agent.train(batch_size=64)
            if done or step == max_steps - 1:
                cost, _ = environment.state.calculate_total_cost()
                print(f"Total cost: {cost}")
                break
            if (step + 1) % C == 0:
                agent.update_target_network()

        agent.update_epsilon()
        agent.update_target_network()
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}, Rewards: {rewards}")

def infer(agent, environment, max_steps=100):
    best_state = None
    best_cost = float('inf')
    best_plan = None
    state = environment.reset()
    for step in range(max_steps):
        available_actions = environment.get_available_actions()
        action = agent.select_action(state, available_actions, train=False)
        cost, plan = environment.state.calculate_total_cost()
        if 0 < cost <= best_cost:
            best_state = environment.state
            best_cost = cost
            best_plan = plan
        next_state, reward, done = environment.step(action)
        if done or step == max_steps - 1:
            break
        state = next_state
    return best_state, best_cost, best_plan
        

Training

In [33]:
from time import time
initial_state = State()
max_time_diff, max_stops, unload_time = 10, 5, 0.5
environment = VRPEnvironment(initial_state, max_time_diff, max_stops, unload_time)

state_tensor = environment.reset()
state_size = state_tensor.size(0)
action_size = len(environment.get_available_actions())

agent = DeepQLearningAgent(state_size, action_size)
start_time = time()
train_dql_agent(agent, environment)
print("Time elapsed:", time() - start_time)



Total cost: 22072.807
Episode 1/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 2/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 3/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 4/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 5/1000, Total Reward: -1000000.0, Rewards: [0.0, -1000000.0]
Total cost: 22086.525
Episode 6/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, -1000000.0]
Total cost: 22086.525
Episode 7/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, 0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 8/1000, Total Reward: -1000000.0, Rewards: [-1000000.0]
Total cost: 22072.807
Episode 9/1000, Total Reward: -1000000.0, Rewards: [0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 10/1000, Total Reward: -1000000.0, Rewards: [0.

  state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).cuda()


Total cost: 22072.807
Episode 51/1000, Total Reward: -997555.674, Rewards: [0.0, 0.0, 0.0, 0.0, 2444.326000000001, -1000000.0]
Total cost: 22086.524999999994
Episode 52/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 53/1000, Total Reward: -1000000.0, Rewards: [0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 54/1000, Total Reward: -1000000.0, Rewards: [-1000000.0]
Total cost: 24512.744999999995
Episode 55/1000, Total Reward: -1000000.0, Rewards: [0.0, -1000000.0]
Total cost: 22072.807
Episode 56/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, -1000000.0]
Total cost: 22054.521999999997
Episode 57/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 58/1000, Total Reward: -1000000.0, Rewards: [-1000000.0]
Total cost: 24512.744999999995
Episode 59/1000, Total Reward: -1000000.0, Rewards: [0.0, 0.0, 0.0, -1000000.0]
Total cost: 24512.744999999995
Episode 60/1000, Total Rewar

Inference

In [40]:
state, cost, plan = infer(agent, environment)
print(f'Total cost estimated: {cost}')
for i in range(len(state.orders_in_truck)):
    if len(state.orders_in_truck[i]) > 0:
        print(f"Truck type: {state.truck_type[i] + 1}, Orders of the truck: {state.orders_in_truck[i]}")
for i in range(len(plan)):
    print(f"Route of the {i+1}-th truck: {plan[i]}")

  state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).cuda()


Total cost estimated: 24512.744999999995
Truck type: 1, Orders of the truck: {3}
Truck type: 1, Orders of the truck: {0}
Truck type: 1, Orders of the truck: {1}
Truck type: 1, Orders of the truck: {2}
Truck type: 1, Orders of the truck: {4}
Truck type: 1, Orders of the truck: {5}
Truck type: 1, Orders of the truck: {6}
Truck type: 1, Orders of the truck: {7}
Truck type: 1, Orders of the truck: {8}
Truck type: 1, Orders of the truck: {9}
Route of the 1-th truck: [61, 54]
Route of the 2-th truck: [61, 54]
Route of the 3-th truck: [61, 54]
Route of the 4-th truck: [61, 54]
Route of the 5-th truck: [61, 54]
Route of the 6-th truck: [61, 53]
Route of the 7-th truck: [61, 53]
Route of the 8-th truck: [61, 53]
Route of the 9-th truck: [61, 53]
Route of the 10-th truck: [61, 53]
