# Train the agents 

This file contains the code required to train the agents. Training is done using the stable-baselines library.

The thesis explores the use of two training algorithms, namely the A2C and PPO methods. 


In [None]:
import gym
import numpy as np

from stable_baselines.sac.policies import MlpPolicy , CnnPolicy , LnMlpPolicy 

from stable_baselines.common.policies import MlpPolicy as MlpPolicyCommon
from stable_baselines.common.policies import CnnPolicy as CnnPolicyCommon
from stable_baselines.common.policies import MlpLnLstmPolicy , CnnLnLstmPolicy , LstmPolicy 
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines import PPO1
from stable_baselines import SAC


from stable_baselines.common import make_vec_env
from stable_baselines import A2C
from stable_baselines import ACKTR

from stable_baselines.ddpg import NormalActionNoise
import matplotlib.pyplot as plt

from stable_baselines.common.policies import FeedForwardPolicy, register_policy

from stable_baselines.common.callbacks import CheckpointCallback
from pathlib import Path
from stable_baselines import PPO1

import os, logging



In [None]:
# To make this work, the Frankfurt-v0 environment must be installed and its data available.
import gym 
# NOTE(review): presumably importing `envs` is what registers 'Frankfurt-v0' with gym,
# so it has to run before gym.make() -- TODO confirm against the envs package.
import envs
# Instantiate the environment by its registered id.
env = gym.make('Frankfurt-v0')


In [None]:
# Network layout in stable-baselines `net_arch` form: a leading 256-unit entry
# followed by separate value (vf) and policy (pi) branches.
# Alternative layout kept for reference: [128, dict(vf=[256], pi=[16])]
# NOTE: there must be no trailing comma after the list -- a trailing comma
# turns the value into a one-element tuple wrapping the list, which is not a
# valid net_arch specification.
net_arch_final = [256, dict(vf=[128, 128], pi=[128, 128, 107])]
# The classes below define the custom policy networks used by the training algorithms.
class CustomPolicyPPO(FeedForwardPolicy):
    """Feed-forward (MLP) policy with a fixed layer layout.

    The layout is a 256-unit entry followed by separate value (``vf``) and
    policy (``pi``) branches. Every other positional and keyword argument is
    forwarded unchanged to ``FeedForwardPolicy``.
    """

    def __init__(self, *args, **kwargs):
        # Same layout as before, just spelled out as a named local.
        layer_spec = [256, {"vf": [128, 128], "pi": [128, 128, 107]}]
        super().__init__(
            *args,
            net_arch=layer_spec,
            feature_extraction="mlp",
            **kwargs
        )
class CustomPolicyCnnPPO(CnnPolicyCommon):
    """CNN policy that forwards a fixed ``net_arch`` layout to ``CnnPolicyCommon``.

    NOTE(review): stable-baselines' CNN feature extraction may not consume
    ``net_arch`` at all -- confirm that this layout actually takes effect.
    """

    def __init__(self, *args, **kwargs):
        layer_spec = [128, {"vf": [128, 128], "pi": [128, 128, 107]}]
        super().__init__(*args, net_arch=layer_spec, **kwargs)
        
class MLPLSTMPolicyFinal(LstmPolicy):
    """Layer-normalised LSTM policy using MLP feature extraction.

    Layout handed to ``LstmPolicy``: two 256-unit entries, the ``"lstm"``
    marker, then separate value (``vf``) and policy (``pi``) branches.
    ``n_lstm`` defaults to 64 here; remaining keyword arguments are passed
    through untouched.
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs):
        layer_spec = [256, 256, "lstm", {"vf": [128, 128], "pi": [256, 256, 107]}]
        super().__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
            net_arch=layer_spec,
            layer_norm=True,
            feature_extraction="mlp",
            **_kwargs
        )
        
        

In [None]:
def train(policy, checkpoint_dir, tensorboard_log_dir, name_prefix, tb_name, monitor_dir, steps=6000000, n_envs=4):
    """Train an A2C agent on ``Frankfurt-v0`` and save the final model.

    Args:
        policy: Policy class (or policy name) handed to ``A2C``.
        checkpoint_dir: Directory that receives the periodic checkpoints.
        tensorboard_log_dir: Directory for the TensorBoard logs.
        name_prefix: File-name prefix used for the periodic checkpoints.
        tb_name: TensorBoard run name; also used as the file name of the
            final model saved under ``../model_final/``.
        monitor_dir: Directory for the per-environment Monitor logs.
        steps: Total number of training timesteps.
        n_envs: Number of environment copies in the vectorized environment.

    Returns:
        The trained ``A2C`` model.
    """
    final_model_dir = "../model_final"

    # Create every output directory up front, including the one for the final
    # model, so a missing folder fails before training starts rather than at
    # the save step after the whole run.
    for directory in (checkpoint_dir, tensorboard_log_dir, monitor_dir, final_model_dir):
        Path(directory).mkdir(parents=True, exist_ok=True)

    checkpoint_callback = CheckpointCallback(save_freq=50000, save_path=checkpoint_dir, name_prefix=name_prefix)

    # Pass monitor_dir through so the Monitor logs actually land in the
    # directory the caller asked for (it was previously created but unused).
    # The local is called vec_env to avoid shadowing the imported `envs` module.
    vec_env = make_vec_env('Frankfurt-v0', n_envs=n_envs, monitor_dir=monitor_dir)

    model = A2C(policy, vec_env, gamma=1.0, verbose=1, seed=666, learning_rate=1e-5,
                full_tensorboard_log=True, tensorboard_log=tensorboard_log_dir)
    # Alternative algorithm explored in the thesis (runs on the single `env`):
    # model = PPO1(policy, env, gamma=1.0, verbose=1, seed=666, full_tensorboard_log=True,
    #              tensorboard_log=tensorboard_log_dir, n_cpu_tf_sess=4)
    model.learn(total_timesteps=steps, tb_log_name=tb_name, callback=checkpoint_callback, log_interval=10000)
    model.save("../model_final/{}".format(tb_name))

    return model

In [None]:
# Train the LSTM policy, save it, and keep the returned model in `m`.
# NOTE(review): the checkpoint prefix reads "prw2" while every other name uses
# "rw2" -- confirm whether that is intentional.
m = train(
    policy=MLPLSTMPolicyFinal,
    checkpoint_dir="../model_checkpoints/ppo_lstm_rw2_6M",
    tensorboard_log_dir="../tensorboard_logs/ppo_lstm_rw2_6M",
    name_prefix="ppo_lstm_prw2_6M",
    tb_name="ppo_lstm_rw2_6M",
    monitor_dir="../monitor/ppo_lstm_rw2_6M",
)
