In [1]:
import os 
import random 
import numpy as np
import torch 
from stable_baselines3 import PPO, DQN, HerReplayBuffer
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.vec_monitor import VecMonitor

from src.env import MultiODEnv, SparseMultiODEnv
from src import actions, operators
from src.solution import MultiODSolution
from src.problem import MultiODProblem
from src.utils import read_instance_data, get_lkh3_tour, get_ortools_tour
from src.rl.stable_baselines3.nn import PSExtractor
from src.rl.stable_baselines3.callback import SaveBestSolCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder with the TSPPDLIB "random-uniform" instances (relative to the working directory).
instance_dir = os.path.join('data', 'tsppdlib', 'instances', 'random-uniform')
# Keep only the `.tsp` files found in that folder.
instances = list(filter(lambda fname: fname.endswith('.tsp'), os.listdir(instance_dir)))
# Size tags embedded in the instance file names; `num_O` is the one used for this run.
num_Os = ["005", "010", "020", "030", "050"]
num_O = '030'

In [3]:
# Instances used for training: 5 per size tag. The lists below are fixed
# selections (per the original note, drawn with random.sample(sub_instances, k=5));
# the "050" list currently has only three active entries.
_fixed_selection = {
    "005": [
        'random-005-06203.tsp',
        'random-005-14680.tsp',
        'random-005-27025.tsp',
        'random-005-22010.tsp',
        'random-005-27053.tsp',
    ],
    "010": [
        'random-010-05876.tsp',
        'random-010-13200.tsp',
        'random-010-07248.tsp',
        'random-010-11763.tsp',
        'random-010-20971.tsp',
    ],
    "020": [
        'random-020-13151.tsp',
        'random-020-32388.tsp',
        'random-020-19723.tsp',
        'random-020-02593.tsp',
        'random-020-10770.tsp',
    ],
    "050": [
        # 'random-050-13219.tsp',
        # 'random-050-29393.tsp',
        'random-050-04371.tsp',
        'random-050-12086.tsp',
        'random-050-21722.tsp',
    ],
    "100": [
        'random-100-19642.tsp',
        'random-100-00562.tsp',
        'random-100-17825.tsp',
        'random-100-18734.tsp',
        'random-100-26486.tsp',
    ],
    "200": [
        'random-200-02527.tsp',
        'random-200-04236.tsp',
        'random-200-04282.tsp',
        'random-200-13517.tsp',
        'random-200-16462.tsp',
    ],
}

# All instances whose file name carries the selected size tag.
size_tag = '-' + num_O + '-'
sub_instances = [fname for fname in instances if size_tag in fname]
resample = False

# A size tag without a fixed selection is sampled at random, just like `resample=True`.
if resample or num_O not in _fixed_selection:
    sub_instances = random.sample(sub_instances, k=5)
else:
    sub_instances = list(_fixed_selection[num_O])

In [None]:
# Directories with the pre-computed LKH3 and OR-Tools result files that the
# training loop reads as reference tours.
# The defaults are the original machine-specific locations; set the
# LKH3_RESULT_DIR / ORTOOLS_RESULT_DIR environment variables to run elsewhere
# without editing the notebook.
lkh3_dir = os.environ.get('LKH3_RESULT_DIR', os.path.join('/home/fangbowen/', 'U'))
lkh3_results = os.listdir(lkh3_dir)

ortools_dir = os.environ.get('ORTOOLS_RESULT_DIR', os.path.join('/home/fangbowen/', 'tmp', 'ortools'))
ortools_results = os.listdir(ortools_dir)

In [None]:
# ---- network / optimiser hyper-parameters ----
hidden_dim = 256        # passed to the PSExtractor feature extractor
num_heads = 16          # passed to the PSExtractor feature extractor
lr = 1e-3               # learning rate handed to PPO
net_arch = [256, 256]   # policy network layout (policy_kwargs['net_arch'])
batch_size = 1000

# ---- environment / rollout settings ----
episode_max_time_length = 1_000
episode_max_length = 10_000
n_steps = episode_max_length                  # rollout length (PPO n_steps / DQN train_freq)
n_gradient_steps = 50                         # used as PPO n_epochs
learn_totoal_steps = 500 * episode_max_length  # total timesteps given to model.learn
k_recent = 5
nenv = 1

# ---- callback / logging ----
verbose = 1
early_stop = True
tensorboard_log = '../tmp/ppo'
callback_log_dir = '../tmp/paths'

# ---- algorithm switches ----
use_sparse_reward = False   # True -> SparseMultiODEnv, False -> MultiODEnv
use_her = False             # True -> DQN + HerReplayBuffer, False -> PPO

# ---- HER replay buffer options (only read when use_her is True) ----
n_sampled_goal = 4
goal_selection_strategy = 'future'
replay_buffer_kwargs = {
    'n_sampled_goal': n_sampled_goal,
    'goal_selection_strategy': goal_selection_strategy,
}

# action_dict
use_naive_action = True


def get_naive_action_dict(env_instance):
    """Build the reduced ("naive") action table for an environment.

    Args:
        env_instance: object exposing ``problem.generate_feasible_solution``.

    Returns:
        dict keyed by action index, inserted in the order 1, 2, 0:
        1 -> ``actions.PathAction(1, operator=operators.ExchangeOperator())``,
        2 -> ``actions.PathAction(2, operator=operators.InsertOperator())``,
        0 -> the bound method ``env_instance.problem.generate_feasible_solution``.
    """
    # Construct the actions directly instead of eval()-ing formatted source
    # strings: same objects, but no dynamic code execution.
    _operator_classes = [operators.ExchangeOperator, operators.InsertOperator]
    _action_dict = {
        idx: actions.PathAction(idx, operator=operator_cls())
        for idx, operator_cls in enumerate(_operator_classes, start=1)
    }
    _action_dict[0] = env_instance.problem.generate_feasible_solution
    return _action_dict


# The builder function itself (not a built dict) is handed to the environments;
# None is passed instead when the naive action set is disabled.
action_dict = get_naive_action_dict if use_naive_action else None

In [None]:
from copy import deepcopy

class MultiODEnvMaker:
    """Zero-argument callable that builds a fresh ``MultiODEnv`` on every call.

    The constructor arguments are stored and forwarded unchanged, except that
    each environment receives its own deep copy of ``problem``.
    NOTE(review): looks intended as the env factory for the (currently
    commented-out) SubprocVecEnv setup in the training loop — confirm.
    """

    def __init__(self, problem, action_dict, max_length, max_time_length, k_recent):
        """Remember the settings every created environment is built with."""
        self.problem, self.action_dict = problem, action_dict
        self.max_length, self.max_time_length = max_length, max_time_length
        self.k_recent = k_recent

    def __call__(self):
        """Return a new ``MultiODEnv`` built on a deep copy of the stored problem."""
        return MultiODEnv(
            problem=deepcopy(self.problem),
            action_dict=self.action_dict,
            max_length=self.max_length,
            max_time_length=self.max_time_length,
            k_recent=self.k_recent,
        )


In [None]:
# Train one agent per selected instance. The cheaper of the LKH3 / OR-Tools
# reference tours is handed to the callback as the target tour.
for instance_name in sub_instances:
    instance_name_head = instance_name[:instance_name.index('.tsp')]
    print(f'instance: {instance_name_head}')

    # Reference tours: first result file whose name contains the instance stem.
    lkh3_matches = [fname for fname in lkh3_results if instance_name_head in fname]
    lkh3_instance_result = lkh3_matches[0]
    lkh3_tour = get_lkh3_tour(os.path.join(lkh3_dir, lkh3_instance_result))
    ortools_matches = [fname for fname in ortools_results if instance_name_head in fname]
    ortools_instance_result = ortools_matches[0]
    ortools_tour = get_ortools_tour(os.path.join(ortools_dir, ortools_instance_result))

    # Problem definition for this instance.
    instance = os.path.join(instance_dir, instance_name)
    locations = read_instance_data(instance)
    problem = MultiODProblem(locations=locations,
                             ignore_to_dummy_cost=False,
                             ignore_from_dummy_cost=False)
    problem.convert_distance_matrix_to_int()

    # Wrap both tours as solutions of this problem and compare their costs.
    lkh3_tour = MultiODSolution([lkh3_tour], problem)
    ortools_tour = MultiODSolution([ortools_tour], problem)
    lkh3_cost = problem.calc_cost(lkh3_tour)
    ortools_cost = problem.calc_cost(ortools_tour)
    print(f'LKH3 cost: {lkh3_cost}, ortools cost: {ortools_cost}')
    lkh3_is_better = lkh3_cost < ortools_cost  # ties go to OR-Tools
    target_tour = lkh3_tour if lkh3_is_better else ortools_tour
    print('Target tour is LKH3' if lkh3_is_better else 'Target tour is ortools')

    # Environment: both variants share these keyword arguments.
    env_kwargs = dict(problem=problem,
                      action_dict=action_dict,
                      max_length=episode_max_length,
                      max_time_length=episode_max_time_length,
                      k_recent=k_recent)
    if use_sparse_reward:
        # Sparse variant gets a target cost 5% above the reference tour's cost.
        env = SparseMultiODEnv(target_cost=int(problem.calc_cost(target_tour) * (1 + 0.05)),
                               **env_kwargs)
    else:
        # env = VecMonitor(SubprocVecEnv([MultiODEnvMaker(problem, action_dict, episode_max_length, episode_max_time_length, k_recent) for _ in range(nenv)]))
        env = MultiODEnv(**env_kwargs)

    # Feature extractor sizes are read off the observation space.
    sol_input_dim = env.observation_space['solution'].shape[-1]
    features_dim = sol_input_dim + env.observation_space['problem'].shape[0]

    policy_kwargs = {
        'features_extractor_class': PSExtractor,
        'features_extractor_kwargs': {
            'features_dim': features_dim,
            'sol_input_dim': sol_input_dim,
            'hidden_dim': hidden_dim,
            'num_heads': num_heads,
        },
        'net_arch': net_arch,
    }

    if use_her:
        model = DQN("MultiInputPolicy", env,
                    policy_kwargs=policy_kwargs,
                    verbose=verbose,
                    train_freq=n_steps,
                    batch_size=batch_size,
                    tensorboard_log=tensorboard_log,
                    replay_buffer_class=HerReplayBuffer,
                    replay_buffer_kwargs=replay_buffer_kwargs)
    else:
        model = PPO("MultiInputPolicy", env,
                    policy_kwargs=policy_kwargs,
                    verbose=verbose,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    learning_rate=lr,
                    n_epochs=n_gradient_steps,
                    tensorboard_log=tensorboard_log)

    # The instance stem names both the tensorboard run and the callback output.
    instance_save_as = instance_name[:instance_name.index('.tsp')]
    best_sol_callback = SaveBestSolCallback(log_dir=callback_log_dir,
                                            instance_name=instance_save_as,
                                            verbose=verbose,
                                            target_tour=target_tour,
                                            early_stop=early_stop)
    model.learn(learn_totoal_steps,
                tb_log_name=instance_save_as,
                callback=best_sol_callback)

In [None]:
# Rebuild the problem and environment for one instance and load its saved model.
test_epoch_length = int(4e4)
instance_name = 'random-050-13219.tsp'
instance_name_head = instance_name[:instance_name.index('.tsp')]
instance = os.path.join(instance_dir, instance_name)
locations = read_instance_data(instance)
problem = MultiODProblem(locations=locations, ignore_to_dummy_cost=False, ignore_from_dummy_cost=False)
problem.convert_distance_matrix_to_int()

# The environment must be built like the training one (same `action_dict`),
# otherwise the loaded policy's action indices may not mean the same thing.
if use_sparse_reward:
    # Derive the target from THIS instance's reference tours. Reusing the
    # `target_tour` left over from the training loop would price a tour that
    # belongs to a different instance/problem.
    lkh3_file = [fname for fname in lkh3_results if instance_name_head in fname][0]
    ortools_file = [fname for fname in ortools_results if instance_name_head in fname][0]
    reference_tours = [
        MultiODSolution([get_lkh3_tour(os.path.join(lkh3_dir, lkh3_file))], problem),
        MultiODSolution([get_ortools_tour(os.path.join(ortools_dir, ortools_file))], problem),
    ]
    target_tour = min(reference_tours, key=problem.calc_cost)
    env = SparseMultiODEnv(target_cost=int(problem.calc_cost(target_tour) * (1 + 0.05)),
                           problem=problem,
                           action_dict=action_dict,
                           max_length=episode_max_length,
                           max_time_length=episode_max_time_length,
                           k_recent=k_recent)
else:
    env = MultiODEnv(problem=problem,
                     action_dict=action_dict,
                     max_length=episode_max_length,
                     max_time_length=episode_max_time_length,
                     k_recent=k_recent)

# Candidate files: names containing the instance stem and 'model'. The one with
# the smallest integer between the stem and '.model' is loaded.
# NOTE(review): that integer is presumably the cost of the saved solution —
# confirm against SaveBestSolCallback.
model_files = [fname for fname in os.listdir(callback_log_dir)
               if instance_name_head in fname and 'model' in fname]
if not model_files:
    raise FileNotFoundError(
        f'no saved model for {instance_name_head!r} found in {callback_log_dir!r}')
saved_best_model = min(model_files,
                       key=lambda x: int(x[len(instance_name_head) + 1: x.index('.model')]))

if use_her:
    model = DQN.load(os.path.join(callback_log_dir, saved_best_model), print_system_info=True)
else:
    model = PPO.load(os.path.join(callback_log_dir, saved_best_model), print_system_info=True)

In [None]:
from tqdm import tqdm 

# Roll the loaded policy for `test_epoch_length` steps and report the lowest
# `env.best_cost` seen over all episodes.
best_cost = np.inf
episode_in_progress = False  # True while the current episode has not ended yet
obs, info = env.reset()
for _ in tqdm(range(test_epoch_length)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, info = env.step(int(action))
    episode_in_progress = not (terminated or truncated)
    if not episode_in_progress:
        # Record the finished episode's result before reset() starts a new one.
        best_cost = min(best_cost, env.best_cost)
        obs, info = env.reset()

# The step budget can run out mid-episode; without this the unfinished final
# episode is ignored and `best_cost` may even stay at inf.
# NOTE(review): assumes env.best_cost is valid mid-episode — confirm in MultiODEnv.
if episode_in_progress:
    best_cost = min(best_cost, env.best_cost)
print(best_cost)