In [1]:
# Load the TensorBoard IPython extension into this kernel session.
%load_ext tensorboard

In [2]:
import os 
import numpy as np
import torch 
from stable_baselines3 import PPO, DQN, HerReplayBuffer
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy

from src.env import MultiODEnv, SparseMultiODEnv
from src.solution import MultiODSolution
from src.problem import MultiODProblem
from src.utils import read_instance_data, get_lkh3_tour
from src.rl.stable_baselines3.nn import PSExtractor
from src.rl.stable_baselines3.callback import SaveBestSolCallback

  if not hasattr(tensorboard, "__version__") or LooseVersion(
  ) < LooseVersion("1.15"):


In [3]:
# Directory (relative to the working directory) that holds the random-uniform
# TSPPDLIB instance files.
instance_dir = os.path.join('data', 'tsppdlib', 'instances', 'random-uniform')

# os.listdir() returns entries in arbitrary order; sort the names so that the
# instances are always visited in the same order from run to run.
instances = sorted(name for name in os.listdir(instance_dir) if name.endswith('.tsp'))

# Candidate size tags, and the one selected for this run. The selected tag is
# matched against the instance file names as "-<tag>-".
num_Os = ["005", "010", "020", "050"]
num_O = '020'

In [4]:
# --- Episode limits handed to the environment -------------------------------
episode_max_length = 4_000
episode_max_time_length = 1_000

# --- Training schedule -------------------------------------------------------
# One rollout collects as many steps as the longest allowed episode, and the
# total budget is 500 such rollouts.
n_steps = episode_max_length
learn_totoal_steps = 500 * episode_max_length
batch_size = 100

# --- Logging -----------------------------------------------------------------
verbose = 1
tensorboard_log = '../tmp/ppo'
callback_log_dir = '../tmp/paths'

In [5]:
# Narrow the instance list down to the files whose name carries the selected
# size tag surrounded by dashes.
sub_instances = [name for name in instances if f'-{num_O}-' in name]

In [6]:
# Directory containing the LKH3 result files that serve as reference tours.
# The original machine-specific location stays the default; set the
# LKH3_RESULT_DIR environment variable to run the notebook elsewhere.
lkh3_dir = os.environ.get('LKH3_RESULT_DIR', os.path.join('/home/fangbowen/', 'U'))
lkh3_results = os.listdir(lkh3_dir)

In [6]:
# Train one PPO agent per selected instance, passing the LKH3 tour of that
# instance to the callback as the target tour.

# Network / optimiser hyper-parameters do not depend on the instance, so they
# are set once instead of on every iteration.
hidden_dim = 64
num_heads = 4
lr = 0.001

# Only consumed by the commented-out DQN + HER variant further down.
goal_selection_strategy = 'future'
replay_buffer_kwargs = dict(n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy)

for instance_name in sub_instances:
    # File name without the ".tsp" suffix; used for matching and for naming logs.
    instance_name_head = instance_name[:instance_name.index('.tsp')]
    print(f'instance: {instance_name_head}')

    # Locate the LKH3 result belonging to this instance. Fail with a clear
    # message instead of an opaque IndexError when there is none.
    matching_results = [name for name in lkh3_results if instance_name_head in name]
    if not matching_results:
        raise FileNotFoundError(
            f'no LKH3 result containing {instance_name_head!r} found in {lkh3_dir}'
        )
    lkh3_instance_result = matching_results[0]
    raw_lkh3_tour = get_lkh3_tour(os.path.join(lkh3_dir, lkh3_instance_result))

    # Read the instance from the same directory that was listed to build
    # `instances`, rather than from a second, hard-coded absolute path.
    instance = os.path.join(instance_dir, instance_name)
    locations = read_instance_data(instance)
    problem = MultiODProblem(locations=locations, ignore_to_dummy_cost=False)
    problem.convert_distance_matrix_to_int()

    # Wrap the raw tour in a solution object bound to this problem.
    lkh3_tour = MultiODSolution([raw_lkh3_tour], problem)

    env = MultiODEnv(problem=problem, max_length=episode_max_length, max_time_length=episode_max_time_length)
    # env = SparseMultiODEnv(target_cost=int(target_cost * (1 + 0.05)), problem=problem, max_length=episode_max_length, max_time_length=episode_max_time_length)

    # Feature width = last dim of the 'solution' space + first dim of the
    # 'problem' space.
    sol_input_dim = env.observation_space['solution'].shape[-1]
    features_dim = sol_input_dim + env.observation_space['problem'].shape[0]

    policy_kwargs = dict(
        features_extractor_class=PSExtractor,
        features_extractor_kwargs=dict(
            features_dim=features_dim,
            sol_input_dim=sol_input_dim,
            hidden_dim=hidden_dim,
            num_heads=num_heads,
        ),
        net_arch=[256, 256],
    )

    model = PPO(
        "MultiInputPolicy",
        env,
        policy_kwargs=policy_kwargs,
        verbose=verbose,
        n_steps=n_steps,
        batch_size=batch_size,
        learning_rate=lr,
        tensorboard_log=tensorboard_log,
    )
    # model = DQN("MultiInputPolicy", env, policy_kwargs=policy_kwargs, verbose=verbose,
    #             train_freq=n_steps, batch_size=batch_size, tensorboard_log=tensorboard_log,
    #             replay_buffer_class=HerReplayBuffer, replay_buffer_kwargs=replay_buffer_kwargs
    #             )

    # Same stem as instance_name_head; kept under its own name for the log settings.
    instance_save_as = instance_name_head
    model.learn(
        learn_totoal_steps,
        tb_log_name=instance_save_as,
        callback=SaveBestSolCallback(
            log_dir=callback_log_dir,
            instance_name=instance_save_as,
            verbose=verbose,
            target_tour=lkh3_tour,
        ),
    )


Processing num-005:   0%|                                | 0/25 [00:00<?, ?it/s]

instance: random-020-05654
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ../tmp/ppo/random-005-03942_2


  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Logging to ../tmp/ppo/random-020-05654_7
Target cost: 6287
Best solution cost: 18928, found at 1 step, 1.74 seconds used
Best solution cost: 17276, found at 2 step, 1.74 seconds used
Best solution cost: 16454, found at 3 step, 1.75 seconds used
Best solution cost: 14284, found at 4 step, 1.75 seconds used
Best solution cost: 10343, found at 7 step, 1.77 seconds used
Best solution cost: 7974, found at 16 step, 1.86 seconds used
Best solution cost: 7941, found at 20 step, 1.88 seconds used
Best solution cost: 7715, found at 50 step, 2.16 seconds used
Best solution cost: 7704, found at 53 step, 2.18 seconds used
Best solution cost: 6989, found at 69 step, 2.36 seconds used
Best solution cost: 6893, found at 283 step, 4.07 seconds used
Best solution cost: 6609, found at 296 step, 4.16 seconds used
Best solution cost: 6599, found at 2058 step, 17.23 seconds used
Rollout best solution cost: 6599, 
                  found at 2058 step, 17.15 seconds used. 
                  Convergence gap: 6

KeyboardInterrupt: 

In [None]:
# Number of iterations reserved for the evaluation loop.
test_epoch_length = 40_000

# Placeholder: the loop body is intentionally empty for now.
for _ in range(test_epoch_length):
    pass