# Import Repo of Sepsis Simulator

In [None]:
!git clone https://github.com/clinicalml/gumbel-max-scm.git

Cloning into 'gumbel-max-scm'...
remote: Enumerating objects: 113, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 113 (delta 0), reused 0 (delta 0), pack-reused 110[K
Receiving objects: 100% (113/113), 1.48 MiB | 22.23 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [None]:
# Make the cloned simulator importable by putting its checkout on sys.path
import os, sys

simulator_path = os.path.abspath('gumbel-max-scm')
# Position 1 keeps sys.path[0] (the notebook's own directory) first
sys.path.insert(1, simulator_path)

In [None]:
!pip install pymdptoolbox

Collecting pymdptoolbox
  Downloading pymdptoolbox-4.0-b3.zip (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pymdptoolbox
  Building wheel for pymdptoolbox (setup.py) ... [?25l[?25hdone
  Created wheel for pymdptoolbox: filename=pymdptoolbox-4.0b3-py3-none-any.whl size=25657 sha256=cc7022cc7c8375fdd6dc285ed800d0eee464ead18bed3ea0d4a0178d4053f349
  Stored in directory: /root/.cache/pip/wheels/2b/e7/c7/d7abf9e309f3573a934fed2750c70bd75d9e9d901f7f16e183
Successfully built pymdptoolbox
Installing collected packages: pymdptoolbox
Successfully installed pymdptoolbox-4.0b3


**IMPORTANT NOTE:** At this stage, to reproduce our experiments, one must modify line 38 of `gumbel-max-scm/sepsisSimDiabetes/DataGenerator.py` so that it reads:

```
mdp = MDP(init_state_idx=%state%,
          policy_array=policy, policy_idx_type=policy_idx_type,
          p_diabetes=p_diabetes)

```

Here `%state%` is a placeholder for the integer index of the chosen initial state (e.g. `136` for the experiments below). Fixing the initial state lets us estimate the Q-function from that state. Additionally, line 58 of the same file must be modified to:

```
mdp.state = mdp.get_new_state(state_idx = %state%)
```

In [None]:
import numpy as np
import cf.counterfactual as cf
import cf.utils as utils
import pandas as pd
import pickle
import itertools as it
from tqdm import tqdm_notebook as tqdm
from scipy.linalg import block_diag

# Sepsis Simulator code
from sepsisSimDiabetes.State import State
from sepsisSimDiabetes.Action import Action
from sepsisSimDiabetes.DataGenerator import DataGenerator
import sepsisSimDiabetes.MDP as simulator

import mdptoolboxSrc.mdp as mdptools

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Set up Variables and Functions

Code taken from [Oberst and Sontag](https://github.com/clinicalml/gumbel-max-scm/blob/master/plots-main-paper.ipynb).

Set up important variables

In [None]:
# --- Reproducibility ---------------------------------------------------
SEED = 1
np.random.seed(SEED)

# --- Simulation sizes --------------------------------------------------
NSIMSAMPS = 100_000  # Number of trajectories drawn from the simulator
NSTEPS = 20          # Maximum length of a trajectory
NCFSAMPS = 5         # Counterfactual samples per observed sample

# --- Discounting / exploration -----------------------------------------
DISCOUNT_Pol = 0.99  # Discount used when computing optimal policies
DISCOUNT = 1         # Discount used when computing the actual reward
PHYS_EPSILON = 0.05  # Epsilon for sampling with the physician policy as eps-greedy

# --- Error bars --------------------------------------------------------
# Option 1: bootstrap with replacement from the original NSIMSAMPS samples
USE_BOOSTRAP = True
N_BOOTSTRAP = 100

# Option 2: repeated sampling (NSIMSAMPS fresh simulations each time);
# this is done in the appendix of the paper, but not in the main paper
N_REPEAT_SAMPLING = 1

# --- Simulator properties (do not change) ------------------------------
n_actions = Action.NUM_ACTIONS_TOTAL
n_components = 2

# Two extra absorbing states are appended after the observed states:
# the second-to-last index is "dead", the last one is "discharged"
n_states_abs = State.NUM_OBS_STATES + 2
deadStateIdx = n_states_abs - 2
discStateIdx = n_states_abs - 1

# --- OPE experiment sizes ----------------------------------------------
RUNS = 20  # Number of runs used to calculate the MSE
N = 1000   # Number of episodes averaged into one OPE estimate

In [None]:
# Human-readable labels for the discrete levels of each state component.
# Each list is indexed by the integer level found in the state vector
# (see the "State Analysis" cell, which indexes them with
# State.get_state_vector() entries 0-6 in the order below).
hr_state_mapping = ['Low', 'Normal', 'High']  # heart rate
sbp_state_mapping = ['Low', 'Normal', 'High']  # systolic blood pressure
o2_state_mapping = ['Low', 'Normal']  # percent oxygen
glu_state_mapping = ['Very Low', 'Low', 'Normal', 'High', 'Very High']  # glucose level
abx_state_mapping = ['Off', 'On']  # antibiotics
vaso_state_mapping = ['Off', 'On']  # vasopressors
vent_state_mapping = ['Off', 'On']  # ventilator
# NOTE(review): not indexed anywhere in the visible cells; presumably labels diabetic_idx
diab_state_mapping = ['No', 'Yes']

Set up base for behaviour and evaluation policies

In [None]:
import zipfile

# Unpack the replication transition/reward matrices next to the archive
with zipfile.ZipFile("gumbel-max-scm/data/diab_txr_mats-replication.zip") as archive:
    archive.extractall(path="gumbel-max-scm/data")

In [None]:
# Read the transition and reward matrices shipped with the simulator.
# NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
mats_path = "gumbel-max-scm/data/diab_txr_mats-replication.pkl"
with open(mats_path, "rb") as pkl_file:
    mdict = pickle.load(pkl_file)

tx_mat, r_mat = mdict["tx_mat"], mdict["r_mat"]

In [None]:
from scipy.linalg import block_diag

# Short aliases for the number of actions and of full states
A = n_actions
S = State.NUM_FULL_STATES

tx_mat_full = np.zeros((A, S, S))
r_mat_full = np.zeros((A, S, S))

# For every action, place the matrices of component 0 and component 1 on the
# diagonal of one (S, S) matrix
for a in range(A):
    tx_mat_full[a] = block_diag(tx_mat[0, a], tx_mat[1, a])
    r_mat_full[a] = block_diag(r_mat[0, a], r_mat[1, a])

In [None]:
# Policy obtained by policy iteration on the full MDP
fullMDP = cf.MatrixMDP(tx_mat_full, r_mat_full)
fullPol = fullMDP.policyIteration(discount=DISCOUNT_Pol, eval_type=1)

# The behaviour policy is fully random: every action gets the same
# probability in every state
n_policy_actions = fullPol.shape[1]
randPol = np.full(fullPol.shape, 1.0 / n_policy_actions)

In [None]:
# Expected immediate reward of starting in state s and taking action a:
#     R[s, a] = sum_{s'} P(s' | s, a) * r(a, s, s')
# The rewards must be weighted by the transition probabilities; a plain mean
# over the last axis would weight every next state equally, which is not an
# expectation.
# Assumes tx_mat_full[a, s, s'] holds P(s' | s, a) and r_mat_full[a, s, s']
# the matching reward - TODO confirm against the simulator.
# einsum produces the (state, action) layout directly, without materialising
# the (A, S, S) product.
R = np.einsum('ast,ast->sa', tx_mat_full, r_mat_full)
R.shape

(1440, 8)

In [None]:
def pad_policy(policy, val=1):
  """Append one extra column and then one extra row, both filled with ``val``.

  The added entries are the ones reached when the policy array is indexed
  with -1, which is how the trajectory arrays encode a missing state/action.

  Args:
    policy: 2-D array of shape (n_states, n_actions).
    val: fill value for the added column and row (default 1).

  Returns:
    A new array of shape (n_states + 1, n_actions + 1).
  """
  n_rows = policy.shape[0]
  # Extra column (reached by action index -1)
  with_column = np.hstack((policy, np.full((n_rows, 1), val)))
  # Extra row (reached by state index -1), as wide as the widened array
  extra_row = np.full((1, with_column.shape[1]), val)
  return np.vstack((with_column, extra_row))

# Load repo

In [None]:
!git clone https://github.com/ai4ai-lab/Factored-Action-Spaces-for-OPE.git

Cloning into 'Factored-Action-Spaces-for-OPE'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 93 (delta 20), reused 71 (delta 7), pack-reused 0[K
Receiving objects: 100% (93/93), 2.94 MiB | 19.32 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [None]:
# Make the cloned Factored-Action-Spaces-for-OPE code importable
import os, sys

main_folder = os.path.abspath('Factored-Action-Spaces-for-OPE')
# Position 1 keeps sys.path[0] (the notebook's own directory) first
sys.path.insert(1, main_folder)

# From Patient State 136, With Diabetes

In [None]:
#The patient has diabetes
# Passed as p_diabetes to DataGenerator.simulate in the cells below;
# presumably the probability that a simulated patient is diabetic.
PROB_DIAB = 1.0

### State Analysis

In [None]:
# Build the initial state from its index and decode it into a state vector
testState = State(state_idx = 136, diabetic_idx=1)
vec = testState.get_state_vector()

print(vec)

# One (label, level names) pair per state-vector entry, in vector order
labelled_mappings = [
    ('Heart Rate', hr_state_mapping),
    ('Systolic Blood Pressure', sbp_state_mapping),
    ('Percent Oxygen', o2_state_mapping),
    ('Glucose Level', glu_state_mapping),
    ('Antibiotics', abx_state_mapping),
    ('Vasopressors', vaso_state_mapping),
    ('Ventilator', vent_state_mapping),
]
for (label, mapping), level in zip(labelled_mappings, vec):
    print(f'{label}: {mapping[level]}')
print(f'Diabetes: {testState.diabetic_idx}')

[0 1 1 2 0 0 0]
Heart Rate: Low
Systolic Blood Pressure: Normal
Percent Oxygen: Normal
Glucose Level: Normal
Antibiotics: Off
Vasopressors: Off
Ventilator: Off
Diabetes: 1


### Generate Data From Behaviour Policy

Run the data generator

In [None]:
# Simulate NSIMSAMPS trajectories of at most NSTEPS steps under the
# behaviour (uniformly random) policy
dgen = DataGenerator()
sim_outputs = dgen.simulate(
    NSIMSAMPS,
    NSTEPS,
    policy=randPol,
    policy_idx_type='full',
    p_diabetes=PROB_DIAB,
    use_tqdm=False,
)
states, actions, lengths, rewards, diab, emp_tx_totals, emp_r_totals = sim_outputs

obs_samps = utils.format_dgen_samps(states, actions, rewards, diab, NSTEPS, NSIMSAMPS)

Convert data into array format

In [None]:
# Time-step index 0..NSTEPS-1, repeated for every trajectory: (NSIMSAMPS, NSTEPS, 1)
time = np.arange(NSTEPS)
times = np.tile(time, (NSIMSAMPS, 1))[:, :, np.newaxis]

# Last axis: [time, state, action, reward, next state]
trajectory_columns = [times, states[:, :NSTEPS, :], actions, rewards, states[:, 1:, :]]
nf_tr_b = np.concatenate(trajectory_columns, axis=2)
nf_tr_b.shape

(100000, 20, 5)

In [None]:
# Each row is [time step, state, action, reward, next state], the order in
# which the columns were concatenated when nf_tr_b was built.
print(nf_tr_b)

[[[  0. 136.   0.  -1. 168.]
  [  1. 168.  -1.   0.  -1.]
  [  2.  -1.  -1.   0.  -1.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   3.  -1. 227.]
  [  1. 227.  -1.   0.  -1.]
  [  2.  -1.  -1.   0.  -1.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   7.   0. 223.]
  [  1. 223.   3.   0. 219.]
  [  2. 219.   1.   0. 218.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 ...

 [[  0. 136.   2.   0. 377.]
  [  1. 377.   4.   0. 372.]
  [  2. 372.   7.   0. 463.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   5.   0. 222.]
  [  1. 222.   1.   0. 218.]
  [  2. 218.   7.  -1. 231.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   7.  -1. 231.]
  [  1. 231.  -1.   0.  -1.]
  [  2.  -1.  -1.   0.  -1

In [None]:
# Pad once only: the shape check keeps this cell idempotent, so re-running it
# does not append a second extra row/column to the behaviour policy.
if randPol.shape == fullPol.shape:
    randPol = pad_policy(randPol)

### Varying Episodes $\epsilon_{e} = 0.4$ (Policy Divergence $4.8^{20}$)

Set up evaluation policy, generate data and convert into factored format

In [None]:
EVAL_EPSILON = 0.4

# Epsilon-soften fullPol: the action it selects (entry == 1) keeps probability
# 1 - EVAL_EPSILON, and EVAL_EPSILON is shared equally by the other actions.
# Both masks are taken from fullPol up front so the second assignment can
# never overwrite the first (which happened when 1 - EVAL_EPSILON == 0).
greedy_mask = fullPol == 1
other_mask = fullPol == 0

evalPolSoft = np.copy(fullPol)
evalPolSoft[greedy_mask] = 1 - EVAL_EPSILON
evalPolSoft[other_mask] = EVAL_EPSILON / (n_actions - 1)

In [None]:
# Calculate policy divergence from Voloshin et al.: the largest ratio
# pi_e(a|s) / pi_b(a|s) over all real states and actions.
# randPol carries one padded row and column at this point, hence the "- 1".
n_real_states = randPol.shape[0] - 1
n_real_actions = randPol.shape[1] - 1
ratios = (evalPolSoft[:n_real_states, :n_real_actions]
          / randPol[:n_real_states, :n_real_actions])
# Vectorised maximum; the 0 floor matches the running maximum's start value
D = max(0, ratios.max())
print(D)
shorter_D = round(D,2)

4.8


In [None]:
# Simulate NSIMSAMPS trajectories of at most NSTEPS steps under the softened
# evaluation policy
dgen = DataGenerator()
sim_outputs = dgen.simulate(
    NSIMSAMPS,
    NSTEPS,
    policy=evalPolSoft,
    policy_idx_type='full',
    p_diabetes=PROB_DIAB,
    use_tqdm=False,
)
states, actions, lengths, rewards, diab, emp_tx_totals, emp_r_totals = sim_outputs

obs_samps = utils.format_dgen_samps(states, actions, rewards, diab, NSTEPS, NSIMSAMPS)

In [None]:
# Time-step index 0..NSTEPS-1, repeated for every trajectory: (NSIMSAMPS, NSTEPS, 1)
time = np.arange(NSTEPS)
times = np.tile(time, (NSIMSAMPS, 1))[:, :, np.newaxis]

# Last axis: [time, state, action, reward, next state]
trajectory_columns = [times, states[:, :NSTEPS, :], actions, rewards, states[:, 1:, :]]
nf_tr_e = np.concatenate(trajectory_columns, axis=2)
nf_tr_e.shape

(100000, 20, 5)

In [None]:
# Each row is [time step, state, action, reward, next state], the order in
# which the columns were concatenated when nf_tr_e was built.
print(nf_tr_e)

[[[  0. 136.   0.   0. 136.]
  [  1. 136.   2.   0.  57.]
  [  2.  57.   3.  -1. 227.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   2.   0.  57.]
  [  1.  57.   4.  -1.  68.]
  [  2.  68.  -1.   0.  -1.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   2.   0. 449.]
  [  1. 449.   5.   0. 462.]
  [  2. 462.   7.   0. 471.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 ...

 [[  0. 136.   2.   0. 145.]
  [  1. 145.   3.   0. 147.]
  [  2. 147.   2.   0.  57.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   7.   0. 223.]
  [  1. 223.   6.   0. 221.]
  [  2. 221.   4.   0. 132.]
  ...
  [ 17.  -1.  -1.   0.  -1.]
  [ 18.  -1.  -1.   0.  -1.]
  [ 19.  -1.  -1.   0.  -1.]]

 [[  0. 136.   2.   0.  57.]
  [  1.  57.   3.   0. 147.]
  [  2. 147.   2.   0. 145

In [None]:
# Imported here rather than with the other imports because it is presumably
# provided by the Factored-Action-Spaces-for-OPE checkout, which is only put
# on sys.path after that repository has been cloned.
import policy_estimators as pe

#Obtain on policy estimate
# Computed from the evaluation-policy trajectories; it is later used as the
# regression target (true_val) for the network.
# NOTE(review): this discounts with DISCOUNT_Pol (0.99), not DISCOUNT (1),
# which the config cell describes as the discount for the actual reward - confirm.
on_policy_estimate = pe.on_policy_Q_estimate(nf_tr_e, DISCOUNT_Pol)

Plan

- Given data
- Objective: lower MSE with respect to on-policy estimate

- Evaluate on-policy estimate (true value)
- Construct a neural network that accepts the state and action and outputs the decomposed policy values and decomposed rewards (2D outputs)
- Pass the data through the network in batches (runs) and compute the MSE for each batch
- Backpropagate through the network to ultimately obtain the best mapping.

In [None]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Regression target for the network, moved to the chosen device
true_val = torch.as_tensor(on_policy_estimate).to(device)

Using cuda device


In [None]:
class FactorNetwork(nn.Module):
    """Small MLP mapping a 2-feature input to ``3 * D`` outputs.

    The network has three hidden layers of width 10 with ReLU activations and
    a final linear layer; no activation is applied to the output.

    Args:
        D: number of factors; the output layer has ``3 * D`` units.
    """

    def __init__(self, D):
        super().__init__()
        hidden_width = 10
        layers = [
            nn.Linear(2, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 3 * D),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw ``3 * D`` outputs for input ``x`` (last dim of size 2)."""
        return self.linear_relu_stack(x)

In [None]:
def train_loop(model, loss_fn, optimizer, discount_factor):
    """Run one pass over the behaviour trajectories, updating ``model`` once per batch.

    Reads the notebook globals ``nf_tr_b`` (behaviour trajectories whose last
    axis is [time, state, action, reward, next state]), ``NSTEPS``, ``device``
    and ``true_val`` (the regression target).

    Args:
        model: network mapping (state, action) rows to ``3 * D`` outputs, read
            as D behaviour-policy factors, D evaluation-policy factors and
            D reward factors, in that order.
        loss_fn: callable invoked as ``loss_fn(true_val, G)``.
        optimizer: optimizer over ``model``'s parameters; stepped once per batch.
        discount_factor: per-step discount applied to the predicted rewards.

    Returns:
        None. The batch loss is printed every 10 batches.
    """
    BATCH_SIZE = 1000
    # reshape raises unless the number of trajectories is a multiple of BATCH_SIZE
    train_data = nf_tr_b.reshape((-1, BATCH_SIZE, NSTEPS, 5))
    n_batches = train_data.shape[0]
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()

    # Loop-invariant helpers, created once instead of once per episode
    relu = nn.ReLU()
    one = torch.ones((1), dtype=torch.float32).to(device)

    for run in range(n_batches):
        loss = torch.zeros((1), dtype=torch.float32).to(device)
        batch = train_data[run, :, :, :]
        for n in range(BATCH_SIZE):
            episode = batch[n, :, :]
            # Filter out the steps whose action is -1
            episode = episode[episode[:, 2] != -1, :]
            if episode.shape[0] == 0:
                # No usable step; skipping also avoids dividing the penalty by zero
                continue
            states_and_actions = torch.as_tensor(episode[:, 1:3], dtype=torch.float32).to(device)

            # Split the network output into its three groups of D columns
            factored_pol_reward = model(states_and_actions)
            D = factored_pol_reward.size(-1) // 3
            factored_pi_b = factored_pol_reward[:, :D]
            factored_pi_e = factored_pol_reward[:, D:2*D]
            factored_reward = factored_pol_reward[:, 2*D:]

            # Penalty for negative behaviour / evaluation policy values
            penalty_neg_b = torch.sum(relu(torch.neg(factored_pi_b)))
            penalty_neg_e = torch.sum(relu(torch.neg(factored_pi_e)))
            total_penalty1 = torch.add(penalty_neg_b, penalty_neg_e)

            # Penalty for the policy values (summed over all steps and factors
            # of the episode) not adding up to 1
            penalty_sum_b = torch.abs(torch.sub(one, torch.sum(factored_pi_b)))
            penalty_sum_e = torch.abs(torch.sub(one, torch.sum(factored_pi_e)))
            total_penalty2 = torch.add(penalty_sum_b, penalty_sum_e)

            total_penalty = torch.add(total_penalty1, total_penalty2).div(episode.shape[0])

            # NOTE(review): IS_ratios is computed but never enters G or the
            # loss, so the policy outputs are trained through the penalty
            # terms only - confirm whether the return should be weighted by it.
            pointwise_IS_ratios = torch.div(factored_pi_e, factored_pi_b)
            IS_ratios = torch.prod(pointwise_IS_ratios, 0)

            # Time index of every step, repeated for each of the D reward factors
            times = torch.as_tensor(np.repeat(np.expand_dims(episode[:, 0], axis=1), D, axis=1)).to(device)
            # Discounted sum of the predicted factored rewards over the episode
            gamma = torch.full(times.shape, discount_factor).to(device)
            G = torch.mul(factored_reward, torch.pow(gamma, times)).sum()

            # Accumulate the batch-averaged loss (regression term + penalties)
            loss = torch.add(loss, torch.add(loss_fn(true_val, G), total_penalty).div(BATCH_SIZE))

        # Backpropagation (skipped if every episode in the batch was empty,
        # in which case the loss has no graph to differentiate)
        if loss.requires_grad:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        if run % 10 == 0:
            print(f"loss: {loss.item():>7f}  [{run:>5d} /{n_batches:>5d}]")

In [None]:
# Seed torch so the weight initialisation, and therefore the run, is repeatable
torch.manual_seed(SEED)
model1 = FactorNetwork(2).to(device)

loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.SGD(model1.parameters(), lr=0.00001)

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(model1, loss_fn, optimizer, DISCOUNT_Pol)

#Test it out on a single (state, action) pair
model1.eval()
test = torch.tensor([136.0, 0.0]).to(device)
# Inference only: no autograd graph is needed for this sanity check
with torch.no_grad():
    test_output = model1(test)
test_output

Epoch 1
-------------------------------
loss: 1.335135  [    0 /  100]
loss: 1.278470  [   10 /  100]
loss: 1.342532  [   20 /  100]
loss: 1.285794  [   30 /  100]
loss: 1.259601  [   40 /  100]
loss: 1.300566  [   50 /  100]
loss: 1.274568  [   60 /  100]
loss: 1.278864  [   70 /  100]
loss: 1.249542  [   80 /  100]
loss: 1.270595  [   90 /  100]
Epoch 2
-------------------------------
loss: 1.294299  [    0 /  100]
loss: 1.236093  [   10 /  100]
loss: 1.300161  [   20 /  100]
loss: 1.245162  [   30 /  100]
loss: 1.213984  [   40 /  100]
loss: 1.258779  [   50 /  100]
loss: 1.231544  [   60 /  100]
loss: 1.233908  [   70 /  100]
loss: 1.208391  [   80 /  100]
loss: 1.228957  [   90 /  100]
Epoch 3
-------------------------------
loss: 1.255375  [    0 /  100]
loss: 1.195838  [   10 /  100]
loss: 1.259844  [   20 /  100]
loss: 1.206623  [   30 /  100]
loss: 1.170749  [   40 /  100]
loss: 1.219081  [   50 /  100]
loss: 1.190683  [   60 /  100]
loss: 1.191433  [   70 /  100]
loss: 1.1694

tensor([ 0.5042,  0.0189,  0.1691,  0.1965,  0.6506, -0.6988], device='cuda:0',
       grad_fn=<AddBackward0>)

In [None]:
# Seed torch so the weight initialisation, and therefore the run, is repeatable
torch.manual_seed(SEED)
model2 = FactorNetwork(3).to(device)

loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.SGD(model2.parameters(), lr=0.00001)

epochs = 40
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(model2, loss_fn, optimizer, DISCOUNT_Pol)

#Test it out on a single (state, action) pair
model2.eval()
test = torch.tensor([136.0, 0.0]).to(device)
# Inference only: no autograd graph is needed for this sanity check
with torch.no_grad():
    test_output = model2(test)
test_output

Epoch 1
-------------------------------
loss: 121.918434  [    0 /  100]
loss: 85.112808  [   10 /  100]
loss: 68.127640  [   20 /  100]
loss: 51.087219  [   30 /  100]
loss: 42.952618  [   40 /  100]
loss: 38.361416  [   50 /  100]
loss: 34.953259  [   60 /  100]
loss: 32.426785  [   70 /  100]
loss: 30.143805  [   80 /  100]
loss: 28.798235  [   90 /  100]
Epoch 2
-------------------------------
loss: 27.612373  [    0 /  100]
loss: 26.265516  [   10 /  100]
loss: 25.568386  [   20 /  100]
loss: 24.301170  [   30 /  100]
loss: 23.491777  [   40 /  100]
loss: 22.669216  [   50 /  100]
loss: 21.819134  [   60 /  100]
loss: 20.983740  [   70 /  100]
loss: 20.019804  [   80 /  100]
loss: 19.277946  [   90 /  100]
Epoch 3
-------------------------------
loss: 18.544365  [    0 /  100]
loss: 17.535997  [   10 /  100]
loss: 16.957306  [   20 /  100]
loss: 15.855524  [   30 /  100]
loss: 15.214669  [   40 /  100]
loss: 14.591391  [   50 /  100]
loss: 13.922743  [   60 /  100]
loss: 13.276759

tensor([ 0.0074,  0.0374, -0.0400, -0.0485, -0.0236, -0.0033, -0.0809,  0.1385,
         0.0095], device='cuda:0', grad_fn=<AddBackward0>)

Generate a new batch of behaviour data

Run the data generator

In [None]:
# By now randPol may already have been padded with an extra row and column of
# ones, so its rows no longer sum to 1. Slice back to the real state-action
# block (the shape of fullPol) before handing it to the simulator; the slice
# is a no-op when randPol has not been padded.
behaviour_policy = randPol[:fullPol.shape[0], :fullPol.shape[1]]

states, actions, lengths, rewards, diab, emp_tx_totals, emp_r_totals = dgen.simulate(
    NSIMSAMPS, NSTEPS, policy=behaviour_policy, policy_idx_type='full',
    p_diabetes=PROB_DIAB, use_tqdm=False)

obs_samps = utils.format_dgen_samps(
    states, actions, rewards, diab, NSTEPS, NSIMSAMPS)

Convert data into array format

In [None]:
# Time-step index 0..NSTEPS-1, repeated for every trajectory: (NSIMSAMPS, NSTEPS, 1)
time = np.arange(NSTEPS)
times = np.tile(time, (NSIMSAMPS, 1))[:, :, np.newaxis]

# Last axis: [time, state, action, reward, next state]
trajectory_columns = [times, states[:, :NSTEPS, :], actions, rewards, states[:, 1:, :]]
nf_tr_b2 = np.concatenate(trajectory_columns, axis=2)
nf_tr_b2.shape

(100000, 20, 5)

### Varying Episodes $\epsilon_{e} = 0.6$ (Policy Divergence $3.2^{20}$)

Set up evaluation policy, generate data and convert into factored format

In [None]:
EVAL_EPSILON = 0.6

# Epsilon-soften fullPol: the action it selects (entry == 1) keeps probability
# 1 - EVAL_EPSILON, and EVAL_EPSILON is shared equally by the other actions.
# Both masks are taken from fullPol up front so the second assignment can
# never overwrite the first (which happened when 1 - EVAL_EPSILON == 0).
greedy_mask = fullPol == 1
other_mask = fullPol == 0

evalPolSoft = np.copy(fullPol)
evalPolSoft[greedy_mask] = 1 - EVAL_EPSILON
evalPolSoft[other_mask] = EVAL_EPSILON / (n_actions - 1)

In [None]:
# Calculate policy divergence from Voloshin et al.: the largest ratio
# pi_e(a|s) / pi_b(a|s) over all real states and actions.
# randPol carries one padded row and column at this point, hence the "- 1".
n_real_states = randPol.shape[0] - 1
n_real_actions = randPol.shape[1] - 1
ratios = (evalPolSoft[:n_real_states, :n_real_actions]
          / randPol[:n_real_states, :n_real_actions])
# Vectorised maximum; the 0 floor matches the running maximum's start value
D = max(0, ratios.max())
print(D)
shorter_D = round(D,2)

3.2


In [None]:
# Simulate NSIMSAMPS trajectories of at most NSTEPS steps under the softened
# evaluation policy
dgen = DataGenerator()
sim_outputs = dgen.simulate(
    NSIMSAMPS,
    NSTEPS,
    policy=evalPolSoft,
    policy_idx_type='full',
    p_diabetes=PROB_DIAB,
    use_tqdm=False,
)
states, actions, lengths, rewards, diab, emp_tx_totals, emp_r_totals = sim_outputs

obs_samps = utils.format_dgen_samps(states, actions, rewards, diab, NSTEPS, NSIMSAMPS)

In [None]:
# Time-step index 0..NSTEPS-1, repeated for every trajectory: (NSIMSAMPS, NSTEPS, 1)
time = np.arange(NSTEPS)
times = np.tile(time, (NSIMSAMPS, 1))[:, :, np.newaxis]

# Last axis: [time, state, action, reward, next state]
trajectory_columns = [times, states[:, :NSTEPS, :], actions, rewards, states[:, 1:, :]]
nf_tr_e = np.concatenate(trajectory_columns, axis=2)
nf_tr_e.shape

(100000, 20, 5)

In [None]:
# Pad once only: the shape check keeps this cell idempotent, so re-running it
# does not append a second extra row/column to the evaluation policy.
if evalPolSoft.shape == fullPol.shape:
    evalPolSoft = pad_policy(evalPolSoft)

### Varying Episodes $\epsilon_{e} = 0.8$ (Policy Divergence $1.6^{20}$)



Set up evaluation policy, generate data and convert into factored format

In [None]:
EVAL_EPSILON = 0.8

# Epsilon-soften fullPol: the action it selects (entry == 1) keeps probability
# 1 - EVAL_EPSILON, and EVAL_EPSILON is shared equally by the other actions.
# Both masks are taken from fullPol up front so the second assignment can
# never overwrite the first (which happened when 1 - EVAL_EPSILON == 0).
greedy_mask = fullPol == 1
other_mask = fullPol == 0

evalPolSoft = np.copy(fullPol)
evalPolSoft[greedy_mask] = 1 - EVAL_EPSILON
evalPolSoft[other_mask] = EVAL_EPSILON / (n_actions - 1)

In [None]:
# Calculate policy divergence from Voloshin et al.: the largest ratio
# pi_e(a|s) / pi_b(a|s) over all real states and actions.
# randPol carries one padded row and column at this point, hence the "- 1".
n_real_states = randPol.shape[0] - 1
n_real_actions = randPol.shape[1] - 1
ratios = (evalPolSoft[:n_real_states, :n_real_actions]
          / randPol[:n_real_states, :n_real_actions])
# Vectorised maximum; the 0 floor matches the running maximum's start value
D = max(0, ratios.max())
print(D)
shorter_D = round(D,2)

1.5999999999999996


In [None]:
# Simulate NSIMSAMPS trajectories of at most NSTEPS steps under the softened
# evaluation policy
dgen = DataGenerator()
sim_outputs = dgen.simulate(
    NSIMSAMPS,
    NSTEPS,
    policy=evalPolSoft,
    policy_idx_type='full',
    p_diabetes=PROB_DIAB,
    use_tqdm=False,
)
states, actions, lengths, rewards, diab, emp_tx_totals, emp_r_totals = sim_outputs

obs_samps = utils.format_dgen_samps(states, actions, rewards, diab, NSTEPS, NSIMSAMPS)

In [None]:
# Time-step index 0..NSTEPS-1, repeated for every trajectory: (NSIMSAMPS, NSTEPS, 1)
time = np.arange(NSTEPS)
times = np.tile(time, (NSIMSAMPS, 1))[:, :, np.newaxis]

# Last axis: [time, state, action, reward, next state]
trajectory_columns = [times, states[:, :NSTEPS, :], actions, rewards, states[:, 1:, :]]
nf_tr_e = np.concatenate(trajectory_columns, axis=2)
nf_tr_e.shape

(100000, 20, 5)

In [None]:
# Pad once only: the shape check keeps this cell idempotent, so re-running it
# does not append a second extra row/column to the evaluation policy.
if evalPolSoft.shape == fullPol.shape:
    evalPolSoft = pad_policy(evalPolSoft)

### Varying Episode Length $T$ With $\epsilon_{e} = 0.8$ (Policy Divergence $1.6^{T}$)

Set up evaluation policy, generate data and convert into factored format

In [None]:
EVAL_EPSILON = 0.8

# Epsilon-soften fullPol: the action it selects (entry == 1) keeps probability
# 1 - EVAL_EPSILON, and EVAL_EPSILON is shared equally by the other actions.
# Both masks are taken from fullPol up front so the second assignment can
# never overwrite the first (which happened when 1 - EVAL_EPSILON == 0).
greedy_mask = fullPol == 1
other_mask = fullPol == 0

evalPolSoft = np.copy(fullPol)
evalPolSoft[greedy_mask] = 1 - EVAL_EPSILON
evalPolSoft[other_mask] = EVAL_EPSILON / (n_actions - 1)

In [None]:
# Calculate policy divergence from Voloshin et al.: the largest ratio
# pi_e(a|s) / pi_b(a|s) over all real states and actions.
# randPol carries one padded row and column at this point, hence the "- 1".
n_real_states = randPol.shape[0] - 1
n_real_actions = randPol.shape[1] - 1
ratios = (evalPolSoft[:n_real_states, :n_real_actions]
          / randPol[:n_real_states, :n_real_actions])
# Vectorised maximum; the 0 floor matches the running maximum's start value
D = max(0, ratios.max())
print(D)
shorter_D = round(D,2)

1.5999999999999996


In [None]:
# Simulate NSIMSAMPS trajectories of at most NSTEPS steps under the softened
# evaluation policy
dgen = DataGenerator()
sim_outputs = dgen.simulate(
    NSIMSAMPS,
    NSTEPS,
    policy=evalPolSoft,
    policy_idx_type='full',
    p_diabetes=PROB_DIAB,
    use_tqdm=False,
)
states, actions, lengths, rewards, diab, emp_tx_totals, emp_r_totals = sim_outputs

obs_samps = utils.format_dgen_samps(states, actions, rewards, diab, NSTEPS, NSIMSAMPS)

In [None]:
# Time-step index 0..NSTEPS-1, repeated for every trajectory: (NSIMSAMPS, NSTEPS, 1)
time = np.arange(NSTEPS)
times = np.tile(time, (NSIMSAMPS, 1))[:, :, np.newaxis]

# Last axis: [time, state, action, reward, next state]
trajectory_columns = [times, states[:, :NSTEPS, :], actions, rewards, states[:, 1:, :]]
nf_tr_e = np.concatenate(trajectory_columns, axis=2)
nf_tr_e.shape

(100000, 20, 5)

In [None]:
# Pad once only: the shape check keeps this cell idempotent, so re-running it
# does not append a second extra row/column to the evaluation policy.
if evalPolSoft.shape == fullPol.shape:
    evalPolSoft = pad_policy(evalPolSoft)