In [1]:
import os 
import gymnasium as gym
import numpy as np 
import torch 
import torch.nn as nn 

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3 import PPO

2023-07-21 03:22:11.707287: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class PSExtractor(BaseFeaturesExtractor):
    """Feature extractor for dict observations with 'solution' and 'problem' entries.

    The 'solution' entry is embedded per position with a 1x1 convolution,
    passed through one self-attention layer and a residual 1x1-conv block,
    then summed over the sequence axis. The pooled vector is concatenated
    with the 'problem' entry.

    Expected input layout (implied by the transposes and layer sizes below):
      * observations['solution']: (batch, seq_len, sol_input_dim)
      * observations['problem']:  2-D, (batch, problem_dim)

    The returned tensor has width ``sol_input_dim + problem_dim``. No layer in
    this class is sized from ``features_dim``; it is only forwarded to the base
    class, so the caller must pass the value that matches the real output width.

    Args:
        observation_space: Dict observation space of the environment.
        features_dim: Width of the tensor returned by ``forward``.
        sol_input_dim: Size of the last axis of the 'solution' observation.
        num_heads: Number of attention heads. ``sol_input_dim`` must be
            divisible by it (enforced by ``nn.MultiheadAttention``).
    """

    def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 16,
                 sol_input_dim: int = 12, num_heads: int = 4):
        super().__init__(observation_space, features_dim=features_dim)

        # Submodule names and creation order are kept stable so that
        # state_dict keys and parameter-initialisation order do not change.
        self.sol_embed_head = nn.Sequential(
            nn.Conv1d(sol_input_dim, sol_input_dim, 1),
            nn.BatchNorm1d(sol_input_dim),
        )
        self.sol_self_attn = nn.MultiheadAttention(sol_input_dim, num_heads, batch_first=True)
        self.sol_residual = nn.Sequential(
            nn.Conv1d(sol_input_dim, sol_input_dim, 1),
            nn.ReLU(),
            nn.Conv1d(sol_input_dim, sol_input_dim, 1),
        )
        self.sol_norm_after = nn.BatchNorm1d(sol_input_dim)

    def forward(self, observations):
        """Return the pooled solution embedding concatenated with the problem features."""
        solution_features = observations['solution']
        problem_features = observations['problem']

        # Conv1d / BatchNorm1d want channels on axis 1: (batch, dim, seq_len).
        sol_embed = self.sol_embed_head(torch.transpose(solution_features, 1, 2))

        # The attention layer is batch_first, so go back to (batch, seq_len, dim).
        sol_embed = torch.transpose(sol_embed, 1, 2)
        sol_embed, _ = self.sol_self_attn(sol_embed, sol_embed, sol_embed)

        # Channels-first again for the residual conv block.
        sol_embed = torch.transpose(sol_embed, 1, 2)
        # Out-of-place add: avoids mutating a tensor that autograd is tracking.
        sol_out = self.sol_residual(sol_embed) + sol_embed
        sol_out = self.sol_norm_after(sol_out)

        # Sum over the sequence axis -> (batch, dim).
        sol_out = torch.sum(sol_out, dim=2)
        return torch.cat([sol_out, problem_features], dim=1)

In [3]:
class SaveBestSolCallback(BaseCallback):
    """Track the lowest-cost solution reported by the training envs and save it.

    On every step the callback reads the ``best_cost``, ``best_solution`` and
    ``best_sol_at_step`` attributes of each training environment (via
    ``get_attr``) and keeps two records:

      * the best cost/solution over the whole training run, and
      * the best cost within the current rollout (reset after each rollout).

    When training ends, the best solution is written as text to
    ``<log_dir>/<instance_name>.<int(best_cost)>.tour``.

    Args:
        log_dir: Directory for the ``.tour`` file; created if missing.
        instance_name: Prefix of the ``.tour`` file name.
        verbose: When >= 1, improvements are also printed to stdout.
    """

    def __init__(self, log_dir: str, instance_name: str, verbose: int = 0):
        super().__init__(verbose=verbose)
        # Best over the entire training run.
        self.cur_best_cost = np.inf
        self.cur_best_sol = None
        # Best within the current rollout; reset in _on_rollout_end.
        self.rollout_best_cost = np.inf
        self.rollout_best_sol_at_step = 0
        self.log_dir = log_dir
        self.instance_name = instance_name

    def _init_callback(self) -> None:
        """Make sure the output directory exists before training starts."""
        os.makedirs(self.log_dir, exist_ok=True)

    def _on_step(self) -> bool:
        """Update the run-level and rollout-level bests; always continue training."""
        best_costs = self.training_env.get_attr('best_cost')
        best_cost_index = int(np.argmin(best_costs))
        best_cost = best_costs[best_cost_index]

        improves_run = self.cur_best_cost > best_cost
        improves_rollout = self.rollout_best_cost > best_cost
        if not (improves_run or improves_rollout):
            return True

        # Fetched once and shared by both branches below.
        best_sol_at_step = self.training_env.get_attr('best_sol_at_step')[best_cost_index]

        if improves_run:
            self.cur_best_cost = best_cost
            self.cur_best_sol = self.training_env.get_attr('best_solution')[best_cost_index]
            self.logger.record('best_cost', best_cost)
            self.logger.record('best_sol_at_step', best_sol_at_step)
            if self.verbose >= 1:
                print(f'Best solution cost: {best_cost}, found at {best_sol_at_step} step')

        if improves_rollout:
            self.rollout_best_cost = best_cost
            self.rollout_best_sol_at_step = best_sol_at_step

        return True

    def _on_rollout_end(self) -> None:
        """Log the rollout-level best and reset it for the next rollout."""
        self.logger.record('rollout_best_cost', self.rollout_best_cost)
        self.logger.record('rollout_best_sol_at_step', self.rollout_best_sol_at_step)
        if self.verbose >= 1:
            print(f'Rollout best solution cost: {self.rollout_best_cost}, found at {self.rollout_best_sol_at_step} step')
        self.rollout_best_cost = np.inf
        self.rollout_best_sol_at_step = 0

    def _on_training_end(self) -> None:
        """Write the best solution found during training to a ``.tour`` file."""
        # Compare against None explicitly: truthiness raises for multi-element
        # numpy arrays and would silently skip an empty sequence.
        if self.cur_best_sol is None:
            return
        tour_path = os.path.join(self.log_dir, f'{self.instance_name}.{int(self.cur_best_cost)}.tour')
        # Context manager closes the file even if the write fails.
        with open(tour_path, 'w') as tour_file:
            tour_file.write(str(self.cur_best_sol))


In [4]:
import numpy as np
from src.env import MultiODEnv
from src.problem import MultiODProblem
from src.utils import read_instance_data

In [5]:
# Directory holding the random-uniform instances, relative to the working directory.
instance_dir = os.path.join('data', 'tsppdlib', 'instances', 'random-uniform')
# os.listdir returns entries in arbitrary order; sort so the instance list is
# the same on every machine and every run.
instances = sorted(name for name in os.listdir(instance_dir) if name.endswith('.tsp'))
# NOTE(review): these look like the size field of the instance file names
# (e.g. 'random-010-05876.tsp') — confirm against where num_O is consumed.
num_O = ["010", "020", "050"]

In [6]:
# Build the path from instance_dir instead of a user-specific absolute path so
# the notebook runs from any checkout of the repository.
instance = os.path.join(instance_dir, 'random-010-05876.tsp')
locations = read_instance_data(instance)
problem = MultiODProblem(locations=locations, ignore_to_dummy_cost=False)
env = MultiODEnv(problem=problem, max_length=int(1e3))

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [7]:
# Fixed seed so the training run can be reproduced; PPO forwards it to the
# Python/NumPy/PyTorch RNGs, the action space and the environment.
SEED = 0

# The extractor returns the pooled solution embedding (width = last axis of
# the 'solution' space) concatenated with the 'problem' vector.
solution_dim = env.observation_space['solution'].shape[-1]
problem_dim = env.observation_space['problem'].shape[0]
features_dim = solution_dim + problem_dim

policy_kwargs = dict(
    features_extractor_class=PSExtractor,
    features_extractor_kwargs=dict(features_dim=features_dim, sol_input_dim=solution_dim),
)
model = PPO(
    "MultiInputPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    n_steps=int(1e3),
    batch_size=100,
    tensorboard_log='../tmp/ppo',
    seed=SEED,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
# Derive the name from the instance path that was actually loaded, so the
# saved .tour file cannot drift from the instance used for training.
instance_name = os.path.splitext(os.path.basename(instance))[0]
best_sol_callback = SaveBestSolCallback(log_dir='../tmp/ppo', instance_name=instance_name, verbose=1)
model.learn(int(1e4), callback=best_sol_callback)

Logging to ../tmp/ppo/PPO_9
Best solution cost: 9677.033544384652, found at 0 step
Best solution cost: 5812.027414470452, found at 7 step
Best solution cost: 5588.528333677959, found at 8 step
Best solution cost: 5580.56905629529, found at 12 step
Best solution cost: 4923.517463159359, found at 13 step
Best solution cost: 4642.525514890449, found at 172 step
Best solution cost: 4542.076768457993, found at 592 step
Best solution cost: 4514.604730870254, found at 852 step
Rollout best solution cost: 4514.604730870254, found at 852 step
---------------------------------------
| best_cost                | 4.51e+03 |
| best_sol_at_step         | 852      |
| rollout/                 |          |
|    ep_len_mean           | 1e+03    |
|    ep_rew_mean           | 6.58e+04 |
| rollout_best_cost        | 4.51e+03 |
| rollout_best_sol_at_step | 852      |
| time/                    |          |
|    fps                   | 424      |
|    iterations            | 1        |
|    time_elapsed   

<stable_baselines3.ppo.ppo.PPO at 0x7f5bca0777f0>