In [1]:
# Extend the module search path so that code living outside this notebook's
# folder can be imported.
# NOTE(review): this is an absolute, machine-specific path; the project-local
# imports in the next cell presumably resolve through it — confirm, and prefer a
# configurable or relative location so the notebook runs on other machines.
import sys
sys.path.append('/data/ad181/RemoteDir/k_variability_in_ressim_env/SPE10_like_envs/')

In [2]:
%matplotlib notebook
import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt

import gym
import torch as th
from stable_baselines3.a2c import A2C, MlpPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.callbacks import CallbackList
from utils.custom_eval_callback import CustomEvalCallback
from typing import Callable
from utils.env_wrappers import StateCoarse

from utils.plot_functions import plot_learning

from model.ressim import Grid
from ressim_env import ResSimEnv_v0, ResSimEnv_v1, ResSimEnv_v2
from k_distributions.generate_constr_k import generate_cond_
from utils.env_wrappers import StepReset

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Experiment configuration: default RNG seed and the label of this case
# (the label is used to build the output folder name under ./data).
seed, case = 1, '1ph_v0_state_wells_sp_large_batch'

In [4]:
# Make sure the root data folder and this case's sub-folder exist
# (exist_ok=True: re-running the cell is harmless).
os.makedirs('./data', exist_ok=True)
os.makedirs(f'./data/{case}', exist_ok=True)

In [5]:
# Load the pickled training and evaluation environments for the base case.
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file;
# only load environment files that come from a trusted source.
case_ = '1ph_v0'
# The file handles are deliberately NOT called `input`: binding that name at
# notebook level would shadow the builtin input() for every later cell.
with open('../envs_params/env_data_v1/env_'+case_+'_train_cluster.pkl', 'rb') as train_file:
    env_train = pickle.load(train_file)

# Indices of the entries of env_eval.k_list that are kept for evaluation.
rl_indices = [24,5,14,1,6,12,0,25,22]
with open('../envs_params/env_data_v1/env_'+case_+'_eval_cluster.pkl', 'rb') as eval_file:
    env_eval = pickle.load(eval_file)
# Restrict the evaluation environment to the selected subset.
# NOTE(review): indexing with a list assumes k_list supports fancy indexing
# (e.g. a numpy array) — confirm against the environment class.
k_list_rl = env_eval.k_list[rl_indices]
env_eval.set_k(k_list_rl)

In [6]:
# env wrapper to reduce state space: the coordinates handed to the wrapper
# are read from the training environment's p_x / p_y attributes
x_coords = env_train.p_x
y_coords = env_train.p_y

def env_wrappers(env, x_coords, y_coords):
    """Apply the notebook's standard wrapper stack to ``env``.

    The environment is first wrapped in ``StateCoarse`` (constructed with
    ``x_coords``, ``y_coords`` and ``include_well_pr=True``) and the result is
    then wrapped in ``StepReset``.

    :param env: environment to wrap
    :param x_coords: passed through to ``StateCoarse``
    :param y_coords: passed through to ``StateCoarse``
    :return: the ``StepReset``-wrapped environment
    """
    return StepReset(StateCoarse(env, x_coords, y_coords, include_well_pr=True))

In [7]:
def make_env(env, rank: int, seed: int = 0) -> Callable:
    """
    Utility function for multiprocessed env.

    Returns a zero-argument factory. Every call of that factory works on its
    own deep copy of ``env``, seeds the copy with ``seed + rank`` and returns
    it. The object passed in as ``env`` is therefore never seeded or otherwise
    mutated by the factory, and factories built from the same ``env`` never
    hand out the same instance.

    :param env: environment instance used as the template for each copy
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    :return: (Callable) factory that builds the seeded environment copy
    """
    # Local import so the function does not depend on the notebook's import cell.
    import copy

    # The return annotation is a string so it is not evaluated when the
    # factory is defined.
    def _init() -> "gym.Env":
        # Copy first: seeding the shared template would let the last rank's
        # seed win whenever several factories run in one process.
        env_ = copy.deepcopy(env)
        env_.seed(seed + rank)
        return env_
    return _init

In [8]:
# Train, save and plot one A2C agent per seed (seeds 1, 2, 3).
# Each seed gets its own log folder ./data/<case>/seed_<seed>.
for seed in range(1,4):
    print(f'seed {seed}')
    log_dir = './data/'+case+'/seed_'+str(seed)
    os.makedirs(log_dir, exist_ok=True)
    num_cpu = 32  # number of worker environments in the SubprocVecEnv

    # Seed the base environments, then build fresh wrapper stacks around them.
    env_train.seed(seed)
    env_eval.seed(seed)
    env_train_ = env_wrappers(env_train, x_coords, y_coords)
    env_eval_ = env_wrappers(env_eval, x_coords, y_coords)

    # Two periodic evaluations: one on the training environment (results only)
    # and one on the evaluation environment (also stores the best model).
    train_callback = CustomEvalCallback(env_train_,
                                        best_model_save_path=None,
                                        n_eval_episodes=1,
                                        log_path=log_dir+'/results_train',
                                        eval_freq=100)
    eval_callback = CustomEvalCallback(env_eval_,
                                       best_model_save_path=log_dir+'/best_model_eval',
                                       n_eval_episodes=1,
                                       log_path=log_dir+'/results_eval',
                                       eval_freq=100)
    callback_list = [train_callback, eval_callback]
    callback = CallbackList(callback_list)

    env = SubprocVecEnv([make_env(env_train_, i, seed) for i in range(num_cpu)])
    try:
        print(env.observation_space)
        print(f'seed {seed}: model definition ..')
        model = A2C(policy=MlpPolicy,
                    env=env,
                    learning_rate = 1e-4,
                    n_steps = 50,
                    gamma = 0.99,
                    gae_lambda = 0.95,
                    ent_coef = 0.001,
                    vf_coef = 0.5,
                    max_grad_norm = 0.5,
                    use_sde= False,
                    create_eval_env= False,
                    policy_kwargs = dict(net_arch=[20,20],
                                         log_std_init=-2.9),
                    verbose = 1,
                    seed = seed,
                    device = "auto")
        # Zero every column with index >= 4 of the tensor stored under
        # 'mlp_extractor.shared_net.0.weight'.
        # NOTE(review): presumably this masks part of the observation in the
        # first shared layer, and only at initialisation (training may move
        # these weights away from zero) — confirm the intent and that the
        # in-place write reaches the live parameters.
        model.get_parameters()['policy']['mlp_extractor.shared_net.0.weight'][:,4:] = 0
        print(f'seed {seed}: learning ..')
        model.learn(total_timesteps=300000, callback=callback, log_interval=10)
        model.save(log_dir+'/A2C')
    finally:
        # Always shut the vectorised environment down; otherwise its worker
        # processes outlive this seed, even when training raises.
        env.close()

    # Learning curves are built from the results written under log_dir.
    fig = plot_learning(log_dir, case='train')
    fig.savefig(log_dir+'/learn_train.png')
    fig = plot_learning(log_dir, case='eval')
    fig.savefig(log_dir+'/learn_eval.png')

seed 1
Box(-100000.0, 100000.0, (9,), float64)
seed 1: model definition ..
Using cuda device
seed 1: learning ..




Eval num_timesteps=3200, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=9600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=9600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=16000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=16000, episode_reward=0.62 +/- 0.00
Episode le

Eval num_timesteps=89600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=92800, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=92800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=96000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=96000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.627    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 60       |
|    time_elapsed       | 880      |
|    total_timesteps    | 96000    |
| train/                |          |
|    entropy_loss       | 5.92     |
|    explained_variance | -2.06    |
|    learning_rate      | 0.0001   |
|    n_updates          | 59       |
|    policy_loss        | -0.397   |
|    std            

Eval num_timesteps=179200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=179200, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=182400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=182400, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=185600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=185600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=188800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=188800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=192000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=192000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    me

Eval num_timesteps=275200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=275200, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=278400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=278400, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=281600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=281600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=284800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=284800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=288000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=288000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.6

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

seed 2
Box(-100000.0, 100000.0, (9,), float64)
seed 2: model definition ..
Using cuda device
seed 2: learning ..




Eval num_timesteps=3200, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3200, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=6400, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=9600, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=9600, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12800, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12800, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=16000, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=16000, episode_reward=0.59 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------

Eval num_timesteps=92800, episode_reward=0.57 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=92800, episode_reward=0.58 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=96000, episode_reward=0.58 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=96000, episode_reward=0.58 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.582    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 60       |
|    time_elapsed       | 878      |
|    total_timesteps    | 96000    |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | -2.63    |
|    learning_rate      | 0.0001   |
|    n_updates          | 59       |
|    policy_loss        | -0.366   |
|    std                | 0.055    |
|    value_loss         | 0.0313   |
-------------------------------

Eval num_timesteps=176000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.608    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 110      |
|    time_elapsed       | 1607     |
|    total_timesteps    | 176000   |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | -0.455   |
|    learning_rate      | 0.0001   |
|    n_updates          | 109      |
|    policy_loss        | -0.227   |
|    std                | 0.055    |
|    value_loss         | 0.0191   |
------------------------------------
Eval num_timesteps=179200, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=179200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=182400, episode_reward=0.60 +/- 0.0

Eval num_timesteps=256000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=256000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.626    |
| time/                 |          |
|    fps                | 109      |
|    iterations         | 160      |
|    time_elapsed       | 2336     |
|    total_timesteps    | 256000   |
| train/                |          |
|    entropy_loss       | 5.93     |
|    explained_variance | 0.28     |
|    learning_rate      | 0.0001   |
|    n_updates          | 159      |
|    policy_loss        | -0.158   |
|    std                | 0.055    |
|    value_loss         | 0.013    |
------------------------------------
Eval num_timesteps=259200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=259200, episode_reward=0.62 +/- 0.0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

seed 3
Box(-100000.0, 100000.0, (9,), float64)
seed 3: model definition ..
Using cuda device
seed 3: learning ..




Eval num_timesteps=3200, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3200, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=9600, episode_reward=0.60 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=9600, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12800, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=16000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=16000, episode_reward=0.

Eval num_timesteps=86400, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=89600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=89600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=92800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=92800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=96000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=96000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.626    |
| time/                 |          |
|    fps                | 111      |
|    iterations         | 60       |
|    time_elapsed       | 860      |
|    total_timesteps    | 96000    |
| trai

Eval num_timesteps=176000, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=176000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.631    |
| time/                 |          |
|    fps                | 111      |
|    iterations         | 110      |
|    time_elapsed       | 1571     |
|    total_timesteps    | 176000   |
| train/                |          |
|    entropy_loss       | 5.92     |
|    explained_variance | 0.0171   |
|    learning_rate      | 0.0001   |
|    n_updates          | 109      |
|    policy_loss        | -0.226   |
|    std                | 0.055    |
|    value_loss         | 0.0152   |
------------------------------------
Eval num_timesteps=179200, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
New best mean reward!
Eval num_timesteps=179200, episo

Eval num_timesteps=259200, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=259200, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=262400, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=262400, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=265600, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=265600, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=268800, episode_reward=0.61 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=268800, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=272000, episode_reward=0.62 +/- 0.00
Episode length: 4.00 +/- 0.00
Eval num_timesteps=272000, episode_reward=0.63 +/- 0.00
Episode length: 4.00 +/- 0.00
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4        |
|    mean_reward        | 0.6

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>