In [1]:
# to access functions from other locations
import sys

# NOTE(review): machine-specific absolute path; adjust it when running on another host.
_ENV_SRC_DIR = '/data/ad181/RemoteDir/k_variability_in_ressim_env/SPE10_like_envs/'
# Guard the append so that re-running this cell does not stack duplicate entries.
if _ENV_SRC_DIR not in sys.path:
    sys.path.append(_ENV_SRC_DIR)

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt

import gym
import torch as th
from stable_baselines3.a2c import A2C, MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.callbacks import CallbackList
from utils.custom_eval_callback import CustomEvalCallback
from typing import Callable
from utils.env_wrappers import StateCoarse

from utils.plot_functions import plot_learning

from model.ressim import Grid
from ressim_env import ResSimEnv_v0, ResSimEnv_v1, ResSimEnv_v2
from k_distributions.generate_constr_k import generate_cond_
from utils.env_wrappers import StepReset

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Run configuration: label of this experiment case and the RNG seed.
case = '1ph_v0_state_wells_sp'
seed = 1

In [4]:
# Create the shared data folder and the per-case folder (no error if they already exist).
for _out_dir in ('./data', './data/' + case):
    os.makedirs(_out_dir, exist_ok=True)

In [5]:
# Load the pickled training and evaluation environments for the base case.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
case_ = '1ph_v0'
with open(f'../envs_params/env_data_v1/env_{case_}_train_cluster.pkl', 'rb') as train_file:
    # The handle is deliberately not called `input`, which would shadow the
    # builtin for the rest of the kernel session.
    env_train = pickle.load(train_file)

# Indices selecting which entries of the evaluation env's k_list are kept.
rl_indices = [24,5,14,1,6,12,0,25,22]
with open(f'../envs_params/env_data_v1/env_{case_}_eval_cluster.pkl', 'rb') as eval_file:
    env_eval = pickle.load(eval_file)
# List-based indexing assumes k_list is a numpy array (TODO confirm).
k_list_rl = env_eval.k_list[rl_indices]
env_eval.set_k(k_list_rl)

In [7]:
# env wrapper to reduce state space:
# coordinates passed to StateCoarse, read from the training env's p_x / p_y
x_coords = env_train.p_x
y_coords = env_train.p_y

def env_wrappers(env, x_coords, y_coords):
    """
    Wrap ``env`` in StateCoarse (with ``include_well_pr=True``) and then StepReset.

    :param env: environment to wrap
    :param x_coords: forwarded to StateCoarse
    :param y_coords: forwarded to StateCoarse
    :return: the StepReset-wrapped environment
    """
    coarse_env = StateCoarse(env, x_coords, y_coords, include_well_pr=True)
    return StepReset(coarse_env)

In [8]:
def make_env(env, rank: int, seed: int = 0) -> Callable:
    """
    Utility function for multiprocessed env.

    Returns a thunk that gives a vectorised-env worker its own seeded copy
    of ``env``.

    :param env: environment instance used as the template for the worker
    :param rank: (int) index of the subprocess; added to ``seed``
    :param seed: (int) the initial seed for RNG
    :return: (Callable) zero-argument function returning the seeded environment
    """
    def _init() -> "gym.Env":
        # Local import keeps the helper self-contained.
        import copy
        # Copy instead of aliasing: otherwise every thunk built from the same
        # instance returns that one object, and in-process vectorised envs end
        # up stepping and re-seeding a single shared environment.
        env_ = copy.deepcopy(env)
        env_.seed(seed + rank)
        return env_
    return _init

In [None]:
# Train one A2C agent per seed; logs, model and learning curves go to
# ./data/<case>/seed_<seed>.
for seed in range(1,4):
    print(f'seed {seed}')
    log_dir = './data/'+case+'/seed_'+str(seed)
    os.makedirs(log_dir, exist_ok=True)
    num_cpu = 32
    env_train.seed(seed)
    env_eval.seed(seed)
    env_train_ = env_wrappers(env_train, x_coords, y_coords)
    env_eval_ = env_wrappers(env_eval, x_coords, y_coords)
    # Two evaluation callbacks: one on the training env, one on the eval env.
    # Only the eval callback stores a best model.
    train_callback = CustomEvalCallback(env_train_,
                                        best_model_save_path=None,
                                        n_eval_episodes=1,
                                        log_path=log_dir+'/results_train',
                                        eval_freq=100)
    eval_callback = CustomEvalCallback(env_eval_,
                                       best_model_save_path=log_dir+'/best_model_eval',
                                       n_eval_episodes=1,
                                       log_path=log_dir+'/results_eval',
                                       eval_freq=100)
    callback_list = [train_callback, eval_callback]
    callback = CallbackList(callback_list)
    env = SubprocVecEnv([make_env(env_train_, i, seed) for i in range(num_cpu)])
    try:
        print(env.observation_space)
        print(f'seed {seed}: model definition ..')
        model = A2C(policy=MlpPolicy,
                    env=env,
                    learning_rate = 1e-4,
                    n_steps = 5,
                    gamma = 0.99,
                    gae_lambda = 0.95,
                    ent_coef = 0.001,
                    vf_coef = 0.5,
                    max_grad_norm = 0.5,
                    use_sde= False,
                    create_eval_env= False,
                    policy_kwargs = dict(net_arch=[20,20],
                                         log_std_init=-2.9),
                    verbose = 1,
                    seed = seed,
                    device = "auto")
        # Zero the first shared-layer weights for all columns from index 4 on.
        # Presumably these columns are input features 4+ of the observation
        # (TODO confirm). This is an initialisation only; later gradient
        # updates can change these weights again.
        model.get_parameters()['policy']['mlp_extractor.shared_net.0.weight'][:,4:] = 0
        print(f'seed {seed}: learning ..')
        model.learn(total_timesteps=300000, callback=callback, log_interval=10)
        model.save(log_dir+'/A2C')
    finally:
        # Always shut down the worker subprocesses, even if training fails;
        # otherwise each seed leaves num_cpu processes behind.
        env.close()
    fig = plot_learning(log_dir, case='train')
    fig.savefig(log_dir+'/learn_train.png')
    fig = plot_learning(log_dir, case='eval')
    fig.savefig(log_dir+'/learn_eval.png')

seed 1
Box(-100000.0, 100000.0, (9,), float64)
seed 1: model definition ..
Using cuda device
seed 1: learning ..




------------------------------------
| time/                 |          |
|    fps                | 141      |
|    iterations         | 10       |
|    time_elapsed       | 11       |
|    total_timesteps    | 1600     |
| train/                |          |
|    entropy_loss       | 5.92     |
|    explained_variance | -10.9    |
|    learning_rate      | 0.0001   |
|    n_updates          | 9        |
|    policy_loss        | -0.938   |
|    std                | 0.055    |
|    value_loss         | 0.0664   |
------------------------------------
Eval num_timesteps=3200, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.62     |
| time/                 |          |
|    fps                | 105      |
|    iterat

------------------------------------
| time/                 |          |
|    fps                | 110      |
|    iterations         | 130      |
|    time_elapsed       | 188      |
|    total_timesteps    | 20800    |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | -0.133   |
|    learning_rate      | 0.0001   |
|    n_updates          | 129      |
|    policy_loss        | -0.204   |
|    std                | 0.055    |
|    value_loss         | 0.0168   |
------------------------------------
Eval num_timesteps=22400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=22400, episode_reward=0.64 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.637    |
| time/                 |          |
|    fps                | 107      |
|    iter

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 250      |
|    time_elapsed       | 365      |
|    total_timesteps    | 40000    |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | 0.398    |
|    learning_rate      | 0.0001   |
|    n_updates          | 249      |
|    policy_loss        | -0.235   |
|    std                | 0.055    |
|    value_loss         | 0.0142   |
------------------------------------
Eval num_timesteps=41600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=41600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.631    |
| time/                 |          |
|    fps                | 108      |
|    iterations         | 260      |
|    time_elapse

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 370      |
|    time_elapsed       | 542      |
|    total_timesteps    | 59200    |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | 0.635    |
|    learning_rate      | 0.0001   |
|    n_updates          | 369      |
|    policy_loss        | -0.11    |
|    std                | 0.055    |
|    value_loss         | 0.0109   |
------------------------------------
Eval num_timesteps=60800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=60800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.633    |
| time/                 |          |
|    fps                | 108      |
|    iterations         | 380  

------------------------------------
| time/                 |          |
|    fps                | 108      |
|    iterations         | 490      |
|    time_elapsed       | 719      |
|    total_timesteps    | 78400    |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | 0.683    |
|    learning_rate      | 0.0001   |
|    n_updates          | 489      |
|    policy_loss        | -0.23    |
|    std                | 0.0549   |
|    value_loss         | 0.0116   |
------------------------------------
Eval num_timesteps=80000, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.631    |
| time/                 |          |
|    fps                | 108      |
|    iterations         | 500  

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 610      |
|    time_elapsed       | 894      |
|    total_timesteps    | 97600    |
| train/                |          |
|    entropy_loss       | 5.94     |
|    explained_variance | 0.664    |
|    learning_rate      | 0.0001   |
|    n_updates          | 609      |
|    policy_loss        | -0.154   |
|    std                | 0.0548   |
|    value_loss         | 0.0109   |
------------------------------------
Eval num_timesteps=99200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=99200, episode_reward=0.64 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.635    |
| time/                 |          |
|    fps                | 108      |
|    iterations         | 620      |
|    time_elapse

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 730      |
|    time_elapsed       | 1069     |
|    total_timesteps    | 116800   |
| train/                |          |
|    entropy_loss       | 5.94     |
|    explained_variance | 0.542    |
|    learning_rate      | 0.0001   |
|    n_updates          | 729      |
|    policy_loss        | -0.164   |
|    std                | 0.0548   |
|    value_loss         | 0.0146   |
------------------------------------
Eval num_timesteps=118400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=118400, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.63     |
| time/                 |          |
|    fps                | 108      |
|    iterations         | 740      |
|    time_elap

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 850      |
|    time_elapsed       | 1242     |
|    total_timesteps    | 136000   |
| train/                |          |
|    entropy_loss       | 5.95     |
|    explained_variance | 0.719    |
|    learning_rate      | 0.0001   |
|    n_updates          | 849      |
|    policy_loss        | -0.136   |
|    std                | 0.0547   |
|    value_loss         | 0.00984  |
------------------------------------
Eval num_timesteps=137600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=137600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.632    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 860      |
|    time_elap

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 970      |
|    time_elapsed       | 1415     |
|    total_timesteps    | 155200   |
| train/                |          |
|    entropy_loss       | 5.95     |
|    explained_variance | 0.59     |
|    learning_rate      | 0.0001   |
|    n_updates          | 969      |
|    policy_loss        | -0.168   |
|    std                | 0.0546   |
|    value_loss         | 0.0134   |
------------------------------------
Eval num_timesteps=156800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=156800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.632    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 980      |
|    time_elap

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 1090     |
|    time_elapsed       | 1588     |
|    total_timesteps    | 174400   |
| train/                |          |
|    entropy_loss       | 5.96     |
|    explained_variance | 0.623    |
|    learning_rate      | 0.0001   |
|    n_updates          | 1089     |
|    policy_loss        | -0.107   |
|    std                | 0.0545   |
|    value_loss         | 0.0121   |
------------------------------------
Eval num_timesteps=176000, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=176000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.634    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 1100     |
|    time_elap

------------------------------------
| time/                 |          |
|    fps                | 109      |
|    iterations         | 1210     |
|    time_elapsed       | 1760     |
|    total_timesteps    | 193600   |
| train/                |          |
|    entropy_loss       | 5.96     |
|    explained_variance | 0.652    |
|    learning_rate      | 0.0001   |
|    n_updates          | 1209     |
|    policy_loss        | -0.166   |
|    std                | 0.0545   |
|    value_loss         | 0.012    |
------------------------------------
Eval num_timesteps=195200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=195200, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.632    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 1220     |
|    time_elap