In [1]:
# Make the project root importable so that `utils.*` (and other root-level
# packages) can be imported from this notebook.
# The location can be overridden with the MULTILEVEL_PPO_ROOT environment
# variable; the default is the original hard-coded checkout path.
import os
import sys

PROJECT_ROOT = os.environ.get('MULTILEVEL_PPO_ROOT', '/data/ad181/RemoteDir/multilevel_ppo')
# guard against duplicate entries when this cell is re-run in the same kernel
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from tqdm.notebook import trange, tqdm

import gym
from stable_baselines3.ppo import PPO, MlpPolicy
from stable_baselines3.ppo_multi_level import PPO_ML
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env.subproc_vec_multi_level_env import SubprocVecMultiLevelEnv
from stable_baselines3.common.envs.multi_level_ressim_env import MultiLevelRessimEnv
from stable_baselines3.common.logger import configure

from utils.custom_eval_callback import CustomEvalCallback, CustomEvalCallbackParallel
from utils.plot_functions import plot_learning
from utils.env_evaluate_functions import eval_actions

In [3]:
# Experiment configuration: default seed, case name, and output locations.
seed = 1
case = 'mlmc_analysis'
data_dir = './data'
# per-case output folder lives directly under data_dir
log_dir = f'{data_dir}/{case}'

In [4]:
# Create the output folders up front; folders that already exist are left as they are.
for folder in (data_dir, log_dir):
    os.makedirs(folder, exist_ok=True)

In [5]:
# Load the pickled dictionary of training environments.
# Later cells index it with 1-based integer keys and use its length as the finest level.
# NOTE: pickle.load can execute arbitrary code - only point this at a trusted file.
# The handle is deliberately NOT named `input`, which would shadow the builtin
# input() for the rest of the kernel session.
with open('../envs_params/env_data_v1/env_train_dict.pkl', 'rb') as env_file:
    env_ck_dict = pickle.load(env_file)

In [6]:
# Keep the `n_levels` highest-numbered entries of env_ck_dict and renumber them
# 1..n_levels in env_dict_ (printing each "new level -> source level" mapping).
n_levels = 3
fine_level = len(env_ck_dict)
first_source = fine_level - n_levels
env_dict_ = {}
for new_level in range(1, n_levels + 1):
    source_level = first_source + new_level
    print(new_level, '->', source_level)
    env_dict_[new_level] = env_ck_dict[source_level]

1 -> 3
2 -> 4
3 -> 5


In [None]:
# Train a multi-level PPO agent and periodically run the MLMC estimator analysis,
# once per seed. Results, logs and the saved model go to ./data/<case>/seed_<seed>.
# NOTE: this cell rebinds the notebook-level names `seed`, `log_dir` and `fine_level`.

# --- hyper-parameters (loop-invariant, so defined once) ---
T = {1:50, 2:50, 3:50} # n_steps
N = 50 # number of actors
M = {1:250, 2:250, 3:250} # minibatch size
I = 300 # number of iterations
K = 20 # number of epochs

# intervals are iteration counts, so keep them integral (floor division)
log_interval = I // 30
analysis_interval = I // 10

# finest level index inside env_dict_ (its keys are 1..len(env_dict_))
fine_level = len(env_dict_)

for seed in range(1,2):
    print(f'seed {seed}')
    log_dir = './data/'+case+'/seed_'+str(seed)
    os.makedirs(log_dir, exist_ok=True)

    print('generate callback ...')
    # NOTE(review): eval_callback is built but never passed to mlmc_analysis below;
    # it is kept only in case its constructor has side effects - confirm and remove
    # or wire it in.
    eval_callback = CustomEvalCallback( env_dict_[fine_level],
                                        best_model_save_path=None,
                                        n_eval_episodes=1,
                                        log_path=str(log_dir)+'/results_eval',
                                        eval_freq=log_interval*T[1]  )

    print('vectorize environment ...')

    # generate PPO_ML parameters for MLMC analysis.
    # we choose same n_steps and batch_size values on levels because only fine level values are used in the analysis
    env_dict = {}
    n_steps_dict = {}
    batch_size_dict = {}
    try:
        for level, env in env_dict_.items():
            print(f"vectorize env level {level}")
            env_dict[level] = make_vec_env( MultiLevelRessimEnv,
                                    n_envs=N,
                                    seed=seed,
                                    env_kwargs= {"ressim_params":env.ressim_params, "level":env.level},
                                    vec_env_cls=SubprocVecMultiLevelEnv )
            n_steps_dict[level] = T[level]
            batch_size_dict[level] = M[level]

        # report the fine-level observation space (explicit key, not a leaked loop variable)
        print(env_dict_[fine_level].observation_space)
        print('model definition ..')
        model = PPO_ML(policy=MlpPolicy,
                       env=env_dict,
                       learning_rate = 1e-5,
                       n_steps = n_steps_dict,
                       batch_size = batch_size_dict,
                       n_epochs = K,
                       clip_range = 0.1,
                       ent_coef = 0.001,
                       vf_coef = 0.5,
                       policy_kwargs = dict(net_arch=[150,100,80], log_std_init=-2.9),
                       verbose = 1,
                       seed = seed,
                       target_kl = 0.05,
                       device = "auto")
        # set logger for the model
        new_logger = configure(log_dir)
        model.set_logger(new_logger)
        print('policy learning and analysis ..')
        # e2 holds squared accuracy targets; the square root is what is passed as eps_array
        e2 = [1e-2, 1e-3, 1e-4]
        model.mlmc_analysis(total_timesteps=N*T[1]*I,
                            n_expt= 100000,
                            eps_array=np.sqrt(e2),
                            log_interval=log_interval,
                            analysis_interval=analysis_interval,
                            step_comp_time_dict={1:0.06,2:0.15,3:1.0})
        model.save(log_dir+'/PPO', exclude=['env_dict'])
        del model
    finally:
        # always shut down the vectorized envs (created with SubprocVecMultiLevelEnv),
        # even when training or analysis fails part-way through
        for vec_env in env_dict.values():
            vec_env.close()


seed 1
generate callback ...
vectorize environment ...
vectorize env level 1
vectorize env level 2
vectorize env level 3
Box(-1.0, 1.0, (96,), float64)
model definition ..
Using cuda device
Logging to ./data/mlmc_analysis/seed_1
policy learning and analysis ..
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.579       |
| time/                   |             |
|    fps                  | 10          |
|    iterations           | 10          |
|    time_elapsed         | 2412        |
|    total_timesteps      | 25000       |
| train/                  |             |
|    approx_kl            | 0.005613636 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.1         |
|    entropy_loss         | 94.8        |
|    explained_variance   | -0.664      |
|    learning_rate        | 1e-05       |
|    loss                 | 0.104       |
|    n_updates           

100%|██████████| 2000/2000 [2:46:49<00:00,  5.00s/it]  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.578        |
| time/                   |              |
|    fps                  | 4            |
|    iterations           | 30           |
|    time_elapsed         | 16782        |
|    total_timesteps      | 75000        |
| train/                  |              |
|    approx_kl            | 0.0050280183 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.1          |
|    entropy_loss         | 94.8         |
|    explained_variance   | 0.261        |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0883       |
|    n_updates            | 580          |
|    policy_gradient_loss | -0.00794     |
|    std                  | 0.055        |
|    value_loss           | 0.0234       |
------------------------------------------





analysis of MLMC estimator for 100000 number of experimets...
------------------------------------------------------------------
| expt_results/           |                                      |
|    C_l                  | [0.06, 0.15, 1.0]                    |
|    N                    | 100000                               |
|    P_l                  | [-0.0835, -0.0826, -0.0817]          |
|    V_l                  | [1.0, 1.0, 1.0]                      |
| mc_results/             |                                      |
|    C_mc                 | 1                                    |
|    N_mc                 | [200, 1997, 19963]                   |
|    P_mc                 | [-0.0352, -0.0462, -0.076]           |
|    V_mc                 | 1                                    |
|    eps_mc               | [0.1        0.03162278 0.01      ]   |
| mlmc_results/           |                                      |
|    C_ml                 | [[0.06, 0.21, 1.15], [0.06, 0.21,... |


100%|██████████| 2000/2000 [2:44:49<00:00,  4.94s/it]  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5            |
|    ep_rew_mean          | 0.594        |
| time/                   |              |
|    fps                  | 4            |
|    iterations           | 60           |
|    time_elapsed         | 33322        |
|    total_timesteps      | 150000       |
| train/                  |              |
|    approx_kl            | 0.0052863974 |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.1          |
|    entropy_loss         | 94.8         |
|    explained_variance   | 0.55         |
|    learning_rate        | 1e-05        |
|    loss                 | 0.0848       |
|    n_updates            | 1180         |
|    policy_gradient_loss | -0.00665     |
|    std                  | 0.055        |
|    value_loss           | 0.0143       |
------------------------------------------





analysis of MLMC estimator for 100000 number of experimets...
------------------------------------------------------------------
| expt_results/           |                                      |
|    C_l                  | [0.06, 0.15, 1.0]                    |
|    N                    | 100000                               |
|    P_l                  | [-0.0858, -0.0852, -0.0847]          |
|    V_l                  | [1.0, 1.0, 1.0]                      |
| mc_results/             |                                      |
|    C_mc                 | 1                                    |
|    N_mc                 | [201, 2006, 20053]                   |
|    P_mc                 | [-0.1652, -0.0765, -0.09]            |
|    V_mc                 | 1                                    |
|    eps_mc               | [0.1        0.03162278 0.01      ]   |
| mlmc_results/           |                                      |
|    C_ml                 | [[0.06, 0.21, 1.15], [0.06, 0.21,... |


100%|██████████| 2000/2000 [2:44:50<00:00,  4.95s/it]  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | 0.598       |
| time/                   |             |
|    fps                  | 4           |
|    iterations           | 90          |
|    time_elapsed         | 49864       |
|    total_timesteps      | 225000      |
| train/                  |             |
|    approx_kl            | 0.005297703 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.1         |
|    entropy_loss         | 94.8        |
|    explained_variance   | 0.666       |
|    learning_rate        | 1e-05       |
|    loss                 | 0.083       |
|    n_updates            | 1780        |
|    policy_gradient_loss | -0.00719    |
|    std                  | 0.055       |
|    value_loss           | 0.0114      |
-----------------------------------------





analysis of MLMC estimator for 100000 number of experimets...
