In [1]:
import os

# Step up one directory unless the notebook is already running from the
# "irl-environment-design" folder (presumably so the `src.` imports resolve).
curr_dir = os.path.basename(os.getcwd())
if not curr_dir == "irl-environment-design":
    os.chdir(os.pardir)

import numpy as np

from src.utils.make_environment import (
    transition_matrix,
    Environment,
    insert_walls_into_T,
)

from src.utils.constants import GenParamTuple# candidate_environments_args["n_environments"] = 50


# Print arrays with lines up to 160 characters wide and 2 digits of float precision.
np.set_printoptions(linewidth=160, precision=2)

from src.utils.environment_design import EnvironmentDesign

## Make true environment

In [2]:
# 0. Setup
## 0.2 Setup the environment
N = M = 7

# Start by making the agent we want to learn the policy of
agent_gamma, p_true = 0.8, 0.9

# TODO describe world
# Rewards live on an N x M grid that is zero everywhere except two cells in the
# last row: reward 1 in the first column and reward 3 in the last column.
reward_grid = np.zeros((N, M))
reward_grid[-1, 0] = 1
reward_grid[-1, -1] = 3

# Flatten to one entry per grid cell; the indices of the non-zero rewards are
# used as the goal states.
agent_R = reward_grid.flatten()
goal_states = np.flatnonzero(agent_R)


wall_states = [14] #TODO: why do we need this

# Transition matrix of the true environment: the goal states are passed as
# absorbing states, then the wall indices are inserted.
# The walled result is bound to `T_true`. It used to be assigned to the
# mis-cased name `T_True`, which left `T_true` bound to the object returned by
# `transition_matrix` alone.
T_true = insert_walls_into_T(
    T=transition_matrix(N, M, p=p_true, absorbing_states=goal_states),
    wall_indices=wall_states,
)
T_True = T_true  # alias kept so any existing reference to the old spelling still works


def custom_transition_func(p):
    """Build the transition matrix of the walled grid world for a given ``p``.

    Reads the notebook-level grid size ``N`` x ``M``, ``goal_states`` (passed
    as absorbing states) and ``wall_states`` (passed as wall indices).

    Args:
        p: Value forwarded to ``transition_matrix`` as its ``p`` argument.

    Returns:
        The result of ``insert_walls_into_T`` applied to the matrix built by
        ``transition_matrix``.
    """
    # Use N and M rather than repeating the literal 7, so this function stays
    # in sync with the rest of the setup if the grid is resized.
    _T = transition_matrix(N=N, M=M, p=p, absorbing_states=goal_states)
    _T = insert_walls_into_T(T=_T, wall_indices=wall_states)
    return _T

def custom_gamma_func(gamma):
    """Return ``gamma`` unchanged (identity parametrization)."""
    return gamma

def custom_reward_func(*reward_func):
    """Return the positional arguments unchanged, as the tuple they were collected into."""
    return reward_func

#Create parameter ranges
resolution = 15

# Each range is a single row of `resolution` evenly spaced values from 0.5 to
# 0.95, i.e. an array of shape (1, resolution).
p_range = np.linspace(0.5, 0.95, resolution).reshape(1, resolution)
gamma_range = np.linspace(0.5, 0.95, resolution).reshape(1, resolution)

# Parameters of the true agent: the transition matrix built with p_true, the
# agent's gamma and the flattened reward vector.
true_params = GenParamTuple(
    T=custom_transition_func(p_true),
    gamma=agent_gamma,
    R=agent_R,
)




# Base environment: the reward, transition and gamma entries are the callables
# defined above rather than fixed values.
big_small = Environment(
    N=N,
    M=M,
    reward_function=custom_reward_func,
    transition_function=custom_transition_func,
    gamma=custom_gamma_func,
    wall_states=wall_states,
    start_state=0,
    goal_states=goal_states,
)

In [3]:
# Learn gamma and T over the supplied ranges; no range is supplied for R.
env_design = EnvironmentDesign(
    base_environment=big_small,
    user_params=true_params,
    learn_what=["gamma", "T"],
    parameter_ranges_R=None,
    parameter_ranges_gamma=gamma_range,
    parameter_ranges_T=p_range,
)

Generated parameter mesh of shape:  (15, 15)


In [4]:
# Options forwarded to run_n_episodes for generating candidate environments.
candidate_environments_args = {"generate_how": "entropy_BM"}

# Run 15 episodes with verbose progress output.
env_design.run_n_episodes(
    n_episodes=15,
    candidate_environments_args=candidate_environments_args,
    verbose=True,
)

Started episode 0.
Finished episode 0.
Started episode 1.
Beginning calculation of log-likelihood. Calculating 225 samples.
Mean Parameters: [0.7377035947972961, 0.7541818104319102]
Computed Region of Interest. Size = 0.72
Finished BM Search. Entropy: 1.141526885567019. Max Ent possible: 1.3862943611198906. Cover: {0: 0.22085889570552147, 1: 0.3987730061349693, 2: 0.018404907975460124, 3: 0.3619631901840491}.
Finished episode 1.
Started episode 2.
Beginning calculation of log-likelihood. Calculating 225 samples.
Mean Parameters: [0.760538045801546, 0.8118320718790552]
Computed Region of Interest. Size = 0.51
Finished BM Search. Entropy: 1.082255207979348. Max Ent possible: 1.0986122886681096. Cover: {0: 0.391304347826087, 1: 0.25217391304347825, 2: 0.3565217391304348}.
Finished episode 2.
Started episode 3.
Beginning calculation of log-likelihood. Calculating 225 samples.
Mean Parameters: [0.7709327645485535, 0.8536387330634915]
Computed Region of Interest. Size = 0.36
Finished BM Sear

In [6]:
# Display the diagnostics collected during the run; the recorded output is a dict
# whose keys include 'parameter_means' and 'region_of_interests'.
env_design.diagnostics

{'parameter_means': [[0.7377035947972961, 0.7541818104319102],
  [0.760538045801546, 0.8118320718790552],
  [0.7709327645485535, 0.8536387330634915],
  [0.8425356728790099, 0.8713079978879371],
  [0.8577706737892153, 0.8760925903792983],
  [0.8541341450296879, 0.8934699939720562],
  [0.8516242519444647, 0.9061194361200297],
  [0.8508533705639237, 0.9200963832695727],
  [0.8428047712241126, 0.9201634314399546],
  [0.8495463699382679, 0.9245979737596456],
  [0.840400590155589, 0.924491131101504],
  [0.8426685293995265, 0.9208159000944539],
  [0.8413425358276649, 0.9171242795795481],
  [0.8332866846104818, 0.911806643880059]],
 'region_of_interests': [array([164, 149, 179, 194, 209, 148, 163, 178, 224, 134, 193, 147, 162, 177, 208, 192, 133, 161, 176, 146, 223, 119, 207, 191, 160, 175, 132, 118, 104, 190,
         159, 206,  89, 174, 145, 103,  74, 222, 131, 117,  88,  59, 189, 102,  73, 173, 205, 116, 158,  44,  87,  58, 130, 188, 144, 101,  72,  29,  86, 221,
         115,  43, 204, 172

In [5]:
'''
user parameters consist of
- parametrizations of reward, transition and gamma that user supplies
- ranges for the unknown parameters
- resolution of the grid search
'''

'\nuser parameters consist of\n- parametrizations of reward, transition and gamma that user supplies\n- ranges for the unknown parameters\n- resolution of the grid search\n'