In [1]:
import os 
import numpy as np
import torch 
from stable_baselines3 import PPO, DQN

from src.env import MultiODEnv, SparseMultiODEnv
from src.problem import MultiODProblem
from src.utils import read_instance_data
from src.rl.stable_baselines3.nn import PSExtractor
from src.rl.stable_baselines3.callback import SaveBestSolCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the random-uniform instance files, relative to the
# notebook's working directory.
instance_dir = os.path.join('data', 'tsppdlib', 'instances', 'random-uniform')
# os.listdir returns entries in arbitrary order; sort so the list of `.tsp`
# files is identical on every run and every machine.
instances = sorted(i for i in os.listdir(instance_dir) if i.endswith('.tsp'))
# Available size tags and the one selected for this run.
# NOTE(review): presumably these match the numeric tag embedded in instance
# file names (e.g. 'random-020-...'); confirm against the data set.
num_Os = ["005", "010", "020", "050"]
num_O = '020'

In [3]:
# Episode limits handed to the environment constructor.
episode_max_time_length = int(1e3)
episode_max_length = int(2e4)
# Rollout length per PPO update: one maximum-length episode's worth of steps.
n_steps = episode_max_length
# Total number of environment steps to train for.
learn_total_steps = int(2e3) * episode_max_length
# Backward-compatible alias: the original, misspelled name is still read by
# the training cell, so keep it bound to the same value.
learn_totoal_steps = learn_total_steps
verbose = 1
batch_size = 1000
# Output locations for TensorBoard event files and for the best-solution
# callback, relative to the notebook's working directory.
tensorboard_log = '../tmp/ppo'
callback_log_dir = '../tmp/paths'

In [4]:
# Instance to solve and the cost passed to the callback as its target.
instance_name = 'random-020-12952.tsp'
target_cost = 6393
# Resolve the file through `instance_dir` (defined in the data-location cell)
# rather than a machine-specific absolute path, so the notebook runs from any
# checkout of the project.
instance = os.path.join(instance_dir, instance_name)
locations = read_instance_data(instance)
# Build the problem from the parsed locations, then wrap it in the environment
# with the episode limits from the configuration cell.
problem = MultiODProblem(locations=locations, ignore_to_dummy_cost=False)
env = MultiODEnv(problem=problem, max_length=episode_max_length, max_time_length=episode_max_time_length)

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [5]:
# Sizes read from the environment's dict observation space: the last axis of
# the 'solution' entry and the first axis of the 'problem' entry.
sol_input_dim = env.observation_space['solution'].shape[-1]
problem_dim = env.observation_space['problem'].shape[0]
features_dim = sol_input_dim + problem_dim

# Network and optimiser hyper-parameters.
hidden_dim = 64
num_heads = 4
lr = 0.001

# Keyword arguments forwarded to the custom feature extractor.
extractor_kwargs = {
    'features_dim': features_dim,
    'sol_input_dim': sol_input_dim,
    'hidden_dim': hidden_dim,
    'num_heads': num_heads,
}

# Policy configuration: custom extractor plus separate two-layer heads for the
# policy (pi) and the value function (vf).
policy_kwargs = {
    'features_extractor_class': PSExtractor,
    'features_extractor_kwargs': extractor_kwargs,
    'net_arch': {'pi': [64, 64], 'vf': [64, 64]},
}

model = PPO(
    "MultiInputPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=verbose,
    n_steps=n_steps,
    batch_size=batch_size,
    learning_rate=lr,
    tensorboard_log=tensorboard_log,
)
# Alternative learner kept for reference:
# model = DQN("MultiInputPolicy", env, policy_kwargs=policy_kwargs, verbose=verbose, train_freq=n_steps, batch_size=batch_size, tensorboard_log=tensorboard_log)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
# Run name: the instance file name up to (not including) the '.tsp' suffix.
tsp_suffix_start = instance_name.index('.tsp')
instance_save_as = instance_name[:tsp_suffix_start]

# Callback that receives the log directory, run name and target cost.
best_sol_callback = SaveBestSolCallback(
    log_dir=callback_log_dir,
    instance_name=instance_save_as,
    verbose=verbose,
    target_cost=target_cost,
)

# Train for the configured step budget, logging to TensorBoard under the run name.
model.learn(
    total_timesteps=learn_totoal_steps,
    tb_log_name=instance_save_as,
    callback=best_sol_callback,
)

Logging to ../tmp/ppo/random-020-12952_2
Best solution cost: 19293.63056263959, found at 1 step, 1.53 seconds used
Best solution cost: 19083.379162339475, found at 2 step, 1.55 seconds used
Best solution cost: 14130.878682037415, found at 3 step, 1.56 seconds used
Best solution cost: 12386.298489789795, found at 4 step, 1.57 seconds used
Best solution cost: 11585.866652596698, found at 7 step, 1.61 seconds used
Best solution cost: 8993.36976806637, found at 12 step, 1.63 seconds used
Best solution cost: 7214.991073170043, found at 13 step, 1.71 seconds used
Best solution cost: 7153.316205472792, found at 17 step, 1.76 seconds used
Best solution cost: 7018.93954486136, found at 131 step, 2.68 seconds used
Best solution cost: 6887.649184785583, found at 279 step, 3.52 seconds used
Best solution cost: 6869.884121503525, found at 383 step, 4.32 seconds used
Best solution cost: 6613.545453372177, found at 516 step, 5.30 seconds used
Best solution cost: 6545.163756974672, found at 1610 step,