In [1]:
%load_ext tensorboard

In [2]:
import os 
import numpy as np
import torch 
from stable_baselines3 import PPO, DQN
import re
from tqdm import tqdm

from src.env import MultiODEnv, SparseMultiODEnv
from src.problem import MultiODProblem
from src.utils import read_instance_data
from src.rl.stable_baselines3.nn import PSExtractor
from src.rl.stable_baselines3.callback import SaveBestSolCallback

  if not hasattr(tensorboard, "__version__") or LooseVersion(
  ) < LooseVersion("1.15"):


In [3]:
# Directory holding the random-uniform instance files, relative to the working directory.
instance_dir = os.path.join('data', 'tsppdlib', 'instances', 'random-uniform')
# os.listdir returns entries in arbitrary order; sort so that experiments are
# always processed in the same, reproducible order.
instances = sorted(name for name in os.listdir(instance_dir) if name.endswith('.tsp'))
# Zero-padded tags matched against the "-<tag>-" part of instance file names
# when selecting which instances to run.
num_Os = ["005", "010", "020", "050"]

In [4]:
def get_target_cost(instance, target_cost_dir=os.path.join('data', 'U')):
    """Look up the reference tour cost recorded for a problem instance.

    The instance tag (``random-<digits>-<digits>``) is extracted from ``instance``.
    The first ``.tour`` file (in sorted name order) inside ``target_cost_dir`` whose
    name contains that tag is selected, and the cost is parsed as the integer
    found between the first two dots of that file name.

    Args:
        instance: Path or file name containing a ``random-<digits>-<digits>`` tag.
        target_cost_dir: Directory that holds the ``.tour`` files.
            Defaults to ``data/U`` relative to the working directory.

    Returns:
        int: The cost encoded in the matching tour file name.

    Raises:
        ValueError: If ``instance`` carries no ``random-<digits>-<digits>`` tag,
            or the matching file name has no ``.<cost>.`` segment / a non-integer one.
        FileNotFoundError: If no ``.tour`` file for the instance exists in
            ``target_cost_dir`` (or the directory itself is missing).
    """
    tag_match = re.search(r"random-\d+-\d+", instance)
    if tag_match is None:
        raise ValueError(
            f"Cannot find a 'random-<n>-<id>' tag in instance name: {instance!r}"
        )
    target = tag_match.group()

    # Sorted so that the chosen file is deterministic if several names match.
    tour_files = sorted(
        file for file in os.listdir(target_cost_dir) if file.endswith('.tour')
    )
    found_file = next((file for file in tour_files if target in file), None)
    if found_file is None:
        raise FileNotFoundError(
            f"No .tour file containing {target!r} found in {target_cost_dir!r}"
        )

    cost_match = re.search(r"\.(.*?)\.", found_file)
    if cost_match is None:
        raise ValueError(
            f"Tour file name {found_file!r} does not contain a '.<cost>.' segment"
        )
    return int(cost_match.group(1))

In [5]:
def run_drl_experiment(instance,
                       episode_max_time_length=int(1e3),
                       episode_max_length=int(2e4),
                       learn_total_steps=None,
                       batch_size=1000,
                       hidden_dim=64,
                       num_heads=4,
                       lr=0.001,
                       verbose=1,
                       tensorboard_log='../tmp/ppo',
                       callback_log_dir='../tmp/paths'):
    """Train a PPO agent on one problem instance and return the trained model.

    The instance file is read into a ``MultiODProblem``, wrapped in a
    ``MultiODEnv`` and optimised with PPO using the ``PSExtractor`` feature
    extractor. Progress is logged to TensorBoard and best solutions are
    tracked by ``SaveBestSolCallback``.

    Args:
        instance: Path to the instance file; its name must contain a
            ``random-<digits>-<digits>`` tag, used as the run name.
        episode_max_time_length: Passed to the env as ``max_time_length``
            (presumably a per-episode time budget -- confirm units in ``MultiODEnv``).
        episode_max_length: Passed to the env as ``max_length``; also used as
            PPO's ``n_steps`` so each rollout spans ``episode_max_length`` steps.
        learn_total_steps: Total training timesteps. ``None`` (default) means
            ``2000 * episode_max_length``.
        batch_size: PPO minibatch size.
        hidden_dim: ``hidden_dim`` forwarded to ``PSExtractor``.
        num_heads: ``num_heads`` forwarded to ``PSExtractor``.
        lr: PPO learning rate.
        verbose: Verbosity level for both PPO and the callback.
        tensorboard_log: Directory PPO writes TensorBoard logs to.
        callback_log_dir: ``log_dir`` given to ``SaveBestSolCallback``.

    Returns:
        The trained ``PPO`` model.
    """
    if learn_total_steps is None:
        learn_total_steps = int(2e3) * episode_max_length
    # One PPO rollout collects exactly one maximum-length episode worth of steps.
    n_steps = episode_max_length

    target_cost = get_target_cost(instance)
    locations = read_instance_data(instance)
    problem = MultiODProblem(locations=locations, ignore_to_dummy_cost=False)
    env = MultiODEnv(problem=problem,
                     max_length=episode_max_length,
                     max_time_length=episode_max_time_length)

    # The extractor output width is the last dim of the 'solution' observation
    # plus the first dim of the 'problem' observation.
    sol_input_dim = env.observation_space['solution'].shape[-1]
    features_dim = sol_input_dim + env.observation_space['problem'].shape[0]

    policy_kwargs = dict(
        features_extractor_class=PSExtractor,
        features_extractor_kwargs=dict(features_dim=features_dim,
                                       sol_input_dim=sol_input_dim,
                                       hidden_dim=hidden_dim,
                                       num_heads=num_heads),
        net_arch=dict(pi=[64, 64], vf=[64, 64])
    )
    model = PPO("MultiInputPolicy",
                env,
                policy_kwargs=policy_kwargs,
                verbose=verbose,
                n_steps=n_steps,
                batch_size=batch_size,
                learning_rate=lr,
                tensorboard_log=tensorboard_log)

    # The instance tag doubles as the TensorBoard run name and the callback's instance name.
    instance_save_as = re.search(r"random-\d+-\d+", instance).group()
    model.learn(learn_total_steps,
                tb_log_name=instance_save_as,
                callback=SaveBestSolCallback(log_dir=callback_log_dir,
                                             instance_name=instance_save_as,
                                             verbose=verbose,
                                             target_cost=target_cost)
               )
    return model

In [6]:
# Run the DRL experiment on every instance, one size tag at a time.
for num in num_Os:
    # Keep only the file names that carry "-<num>-" somewhere in them.
    tag_pattern = re.compile(".*-" + num + "-.*")
    matching_files = list(filter(tag_pattern.match, instances))
    for file_name in tqdm(matching_files, desc=f"Processing num-{num}"):
        run_drl_experiment(os.path.join(instance_dir, file_name))

Processing num-005:   0%|                                | 0/25 [00:00<?, ?it/s]

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ../tmp/ppo/random-005-03942_2


  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Best solution cost: 5833.852399508786, found at 1 step, 0.00 seconds used
Best solution cost: 5319.505335000748, found at 3 step, 0.01 seconds used
Best solution cost: 4701.673969203509, found at 6 step, 0.01 seconds used
Best solution cost: 3755.511978843458, found at 10 step, 0.02 seconds used
Best solution cost: 3633.374948569405, found at 19 step, 0.02 seconds used
Best solution cost: 3599.6670062438134, found at 80 step, 0.06 seconds used
Rollout best solution cost: 3599.6670062438134, 
                  found at 80 step, 
                  0.06 seconds used. 

                  Convergence gap: 3599.6670062438134. Target gap: 0.6670062438133755
---------------------------------------------
| best/                          |          |
|    best_cost                   | 3.6e+03  |
|    best_sol_at_step            | 80       |
|    best_sol_found_time         | 0.0608   |
| rollout/                       |          |
|    convergence_gap             | 3.6e+03  |
|    ep_len_mean   

Rollout best solution cost: 3599.6670062438134, 
                  found at 593 step, 
                  0.31 seconds used. 

                  Convergence gap: 0.0. Target gap: 0.6670062438133755
-------------------------------------------------
| rollout/                       |              |
|    convergence_gap             | 0            |
|    ep_len_mean                 | 2e+04        |
|    ep_rew_mean                 | 2.01e+06     |
|    rollout_best_cost           | 3.6e+03      |
|    rollout_best_sol_at_step    | 593          |
|    rollout_best_sol_found_time | 0.312        |
|    target_gap                  | 0.667        |
| time/                          |              |
|    fps                         | 377          |
|    iterations                  | 7            |
|    time_elapsed                | 370          |
|    total_timesteps             | 140000       |
| train/                         |              |
|    approx_kl                   | 0.0046183593 |
|  

Processing num-005:   0%|                                | 0/25 [09:19<?, ?it/s]


KeyboardInterrupt: 

In [None]:
%tensorboard --logdir logs