<a href="https://colab.research.google.com/github/ajagota7/Shaping/blob/main/Lifegate_straight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import numpy as np
import os
from google.colab import drive
import pickle
# np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
from scipy.optimize import minimize
import random
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import torch
import sys
import plotly.graph_objects as go

# deadend dependencies

In [2]:
# !git clone https://github.com/microsoft/med-deadend.git


# Lifegate class play

In [3]:
import os
from copy import deepcopy
import pygame
import numpy as np
import click


pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
# RGB colors
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)
BLUE = (0, 100, 255)
GREEN = (0, 255, 0)
WALL = (80, 80, 80)
YELLOW = (255, 255, 0)



class LifeGate(object):
    def __init__(self, state_mode, rng, death_drag, max_steps=100, fixed_life=True, rendering=False, image_saving=False, render_dir=None):
        self.rng = rng
        self.state_dtype = np.float32
        self.frame_skip = 1  # for env consistency
        self.fixed_life = fixed_life
        self.blue = BLUE
        self.death_drag = death_drag
        self.legal_actions = [0, 1, 2, 3, 4]
        self.action_meanings = ['no-op', 'up', 'down', 'left', 'right']
        self.reward_scheme = {'death': -1.0, 'recovery': +1.0, 'step': 0.0, 'barrier': 0.0}
        self.nb_actions = len(self.legal_actions)
        self.player_pos_x = None
        self.player_pos_y = None
        self.agent_init_pos = None
        self.state_mode = state_mode    # how the returned state look like ('pixel' or '1hot' or 'multi-head')
        # self.scr_w = None
        # self.scr_h = None
        # self.possible_recoveries = []
        self.recovery_observablity = True
        # self.observability_switch_point = None  # where to turn observability off
        # self.rendering_scale = None
        # self.barriers = None
        self.recoveries = None
        self.deaths = None
        # self.dead_ends = None
        self._rendering = rendering
        # self.state_shape = None
        self.init_subclass()
        if rendering:
            self._init_pygame()
        self.image_saving = image_saving
        self.render_dir_main = render_dir
        self.render_dir = None
        self.state = None
        self.step_id = 0
        self.game_over = False

        self.max_steps = max_steps

        self.reset()

    def init_subclass(self):
        # should implement sizes, barriers, recoveries, deaths, init_player(), and rendering_scale
        self.scr_w, self.scr_h = 10, 10
        self.tabular_state_shape = (self.scr_w, self.scr_h)
        self.state_shape = [24]
        self.rendering_scale = 30
        self.barriers = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [1, 5], [2, 5], [3, 5], [4, 5]]
        self.possible_recoveries = [[5, 0], [6, 0], [7, 0]]
        self.main_deaths = [[self.scr_w - 1, k] for k in range(self.scr_h)] + [[8,0]]
        self.dead_ends = [[x, y] for x in range(self.scr_w // 2, self.scr_w - 1) for y in range(self.scr_w // 2, self.scr_w)]
        self.observability_switch_point = [0, 5]

    @property
    def rendering(self):
        return self._rendering

    @rendering.setter
    def rendering(self, flag):
        if flag is True:
            if self._rendering is False:
                self._init_pygame()
                self._rendering = True
        else:
            self.close()
            self._rendering = False

    def _init_pygame(self):
        pygame.init()
        size = [self.rendering_scale * self.scr_w, self.rendering_scale * self.scr_h]
        self.screen = pygame.display.set_mode(size)
        pygame.display.set_caption("LifeGate")

    def _init_rendering_folder(self):
        if self.render_dir_main is None:
            self.render_dir_main = 'render'
        if not os.path.exists(os.path.join(os.getcwd(), self.render_dir_main)):
            os.mkdir(os.path.join(os.getcwd(), self.render_dir_main))
        i = 0
        while os.path.exists(os.path.join(os.getcwd(), self.render_dir_main, 'render' + str(i))):
            i += 1
        self.render_dir = os.path.join(os.getcwd(), self.render_dir_main, 'render' + str(i))
        os.mkdir(self.render_dir)

    def reset(self):
        if self.image_saving:
            self._init_rendering_folder()
        self.game_over = False
        self.step_id = 0
        self.recovery_observablity = True
        self.blue = BLUE
        state = self.init_episode()
        return state

    def init_episode(self):
        # should implement reconfigurations at the beginning of each episode
        self.player_pos_x, self.player_pos_y = 2, self.scr_h - 1
        targets = deepcopy(self.possible_recoveries)
        # if self.fixed_life == True:
        #     rec = targets.pop(2)  # fixed life-gate for DQN
        # else:
        #     rec = targets.pop(self.rng.randint(len(targets)))
        self.recoveries = targets #[rec]
        self.deaths = self.main_deaths #+ targets
        return self.get_obs(self.state_mode)

    def render(self):
        if not self.rendering:
            return
        pygame.event.pump()
        self.screen.fill(BLACK)
        size = [self.rendering_scale, self.rendering_scale]
        for pos in self.dead_ends:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, YELLOW, rec1)
        player = pygame.Rect(self.rendering_scale * self.player_pos_x, self.rendering_scale * self.player_pos_y,
                             size[0], size[1])
        pygame.draw.rect(self.screen, WHITE, player)
        for pos in self.deaths:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, RED, rec1)
        for pos in self.recoveries:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, self.blue, rec1)  # self.blue will change if reach obs point
        for pos in self.barriers:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, WALL, rec1)
        pygame.display.flip()

        if self.image_saving:
            self.save_image()

    def save_image(self):
        if self.rendering and self.render_dir is not None:
            pygame.image.save(self.screen, self.render_dir + '/render' + str(self.step_id) + '.jpg')
        else:
            raise ValueError('env.rendering is False and/or environment has not been reset.')

    def close(self):
        if self.rendering:
            pygame.quit()

    def _move_player(self, action):
        x, y = (self.player_pos_x, self.player_pos_y)
        # dead-end:
        if [x, y] in self.dead_ends:
            if self.rng.binomial(1, 0.70):
                action = 4  # forceful right
            else:
                action = 0  # no-op
        else:
            # natural risk of death
            if self.rng.binomial(1, self.death_drag):  # say with 25% if death_drag==0.25
                action = 4

        if action == 4:    # right
            x += 1
        elif action == 3:  # left
            x -= 1
        elif action == 2:  # down
            y += 1
        elif action == 1:  # up
            y -= 1
        # updating the position
        if [x, y] in self.barriers or x < 0 or y < 0 or y >= self.scr_h:
            return
        else:
            self.player_pos_x, self.player_pos_y = x, y

    def _get_status(self):
        # check the current situation
        if [self.player_pos_x, self.player_pos_y] in self.deaths:
            return 'death'
        elif [self.player_pos_x, self.player_pos_y] in self.recoveries:
            return 'recovery'

    def step(self, action):
        assert action in self.legal_actions, 'Illegal action.'
        if self.step_id >= self.max_steps - 1:
            self.game_over = True
            return self.get_obs(self.state_mode), 0., self.game_over, {}
        self.step_id += 1
        self._move_player(action)
        if [self.player_pos_x, self.player_pos_y] == self.observability_switch_point and self.recovery_observablity == True:
            self.recovery_observablity = False
            self.blue = BLACK
        status = self._get_status()
        if status == 'death':
            self.game_over = True
            reward = self.reward_scheme['death']
        elif status == 'recovery':
            self.game_over = True
            reward = self.reward_scheme['recovery']
        else:
            reward = self.reward_scheme['step']
        return self.get_obs(self.state_mode), reward, self.game_over, {}

    def get_lives(self):
        if self.game_over == True:
            return 0
        else:
            return 1

    def get_state(self):
        return self.get_obs(self.state_mode)

    def get_obs(self, method):
        if method == 'vector':
            return self._get_vec_obs()
        elif method == 'pixel':
            return self._get_pixel_obs()
        elif method == 'tabular':
            return self._get_tabular_obs()
        else:
            raise ValueError('Unknown observation method.')

    def _get_vec_obs(self):
        x = np.zeros(self.scr_w + self.scr_h + len(self.possible_recoveries), dtype=self.state_dtype)
        x[self.player_pos_x] = 1.0
        x[self.player_pos_y + self.scr_w] = 1.0
        if self.recovery_observablity == True or self.fixed_life == True:
            for k in self.recoveries:
                x[k[0] - 5 + self.scr_w + self.scr_h] = 1.0
        return x

    def _get_tabular_obs(self):
        return np.array([self.player_pos_x, self.player_pos_y])

    def _get_pixel_obs(self):
        raise NotImplementedError

# shaping dependencies

In [5]:
!git clone https://github.com/ajagota7/Shaping.git

Cloning into 'Shaping'...
remote: Enumerating objects: 144, done.[K
remote: Counting objects: 100% (144/144), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 144 (delta 73), reused 107 (delta 43), pack-reused 0[K
Receiving objects: 100% (144/144), 12.00 MiB | 15.21 MiB/s, done.
Resolving deltas: 100% (73/73), done.


In [6]:
# %cd /content/Shaping

In [7]:
# !git pull origin main

In [8]:
# cd /content/

In [9]:
# %cd /content/Shaping

import zipfile

with zipfile.ZipFile('/content/Shaping/lifegate_1.zip', 'r') as zip_ref:
    # zip_ref.extractall('/content/med-deadend/lifegate/results/lifegate_1')
    zip_ref.extractall('/content/Shaping/')

In [10]:
import sys
# sys.path.append('/content/med-deadend/lifegate')
sys.path.append('/content/Shaping/')



In [11]:
import q_networks

In [12]:
# %cd /content/med-deadend/lifegate


# results_dir = 'results/lifegate_1/'
results_dir = '/content/Shaping/'
# Load the Q tables from the primary learning agent, Q_D and Q_R value functions
with open(results_dir+'tabular_qnet.pkl', 'rb') as fq:
    ai = pickle.load(fq)

with open(results_dir+'tabular_qd.pkl', 'rb') as fd:
    ai_d = pickle.load(fd)

with open(results_dir+'tabular_qr.pkl', 'rb') as fr:
    ai_r = pickle.load(fr)

In [13]:
q_table = np.zeros((10, 10, 5))
q_d = np.zeros_like(q_table)
q_r = np.zeros_like(q_table)


for i in range(10):
    for j in range(10):
        for a in range(5):
            key = tuple([j, i, a])
            try:
                q_table[i,j,a] = ai.q[key]
                q_d[i,j,a] = ai_d.q[key]
                q_r[i,j,a] = ai_r.q[key]
            except:
                pass

In [14]:
import yaml
import random
# from lifegate import LifeGate
params = yaml.safe_load(open(results_dir+'config.yaml', 'r'))
np.random.seed(seed=params['random_seed'])
random.seed(params['random_seed'])
random_state = np.random.RandomState(params['random_seed'])

# env

In [15]:
env = LifeGate(max_steps=params['episode_max_len'], state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.0)

In [16]:
env_30 = LifeGate(max_steps=30, state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.1)

In [17]:
env_50 = LifeGate(max_steps=50, state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.1)

In [18]:
import Shaping
from Shaping import *
# %cd /content/Shaping

from choose_actions import action_probs_top_n_epsilon
from shaping_features import *
from gen_policies import *
from IS import *
from subset_policies import *
from v_pi_e import *
from optimization import *
from neural_net import *
from prep_variance import *
from SCOPE_variance import SCOPE_variance

In [19]:
import torch.nn.functional as F

# Test model with l2 reg







In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class CustomizableFeatureNet(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_prob=0.2, l2_lambda=0.01, dtype=torch.float32):
        super(CustomizableFeatureNet, self).__init__()
        self.hidden_layers = nn.ModuleList()

        # Create the hidden layers based on the provided sizes
        for in_dim, out_dim in zip([input_dim] + hidden_dims, hidden_dims):
            layer = nn.Linear(in_dim, out_dim).to(dtype)
            self.hidden_layers.append(layer)

        self.output_layer = nn.Linear(hidden_dims[-1], output_dim).to(dtype)
        self.dropout = nn.Dropout(dropout_prob)
        self.l2_lambda = l2_lambda

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

    def l2_regularization(self):
        l2_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l2_reg += torch.norm(layer.weight)
        l2_reg += torch.norm(self.output_layer.weight)
        return self.l2_lambda * l2_reg


# SCOPE straight

In [159]:
class SCOPE_straight(object):

  def __init__(self, model, gamma, num_bootstraps, pi_b, P_pi_b, P_pi_e, percent_to_estimate_phi, dtype):
        self.model = model
        self.gamma = gamma
        self.num_bootstraps = num_bootstraps
        self.pi_b = pi_b
        self.P_pi_b = P_pi_b
        self.P_pi_e = P_pi_e
        self.dtype = dtype

        self.percent_to_estimate_phi = percent_to_estimate_phi
        # self.num_epochs = num_epochs

  def subset_policies(self):
    # seed_value = 0
    # np.random.seed(seed_value)
    num_policies = len(self.pi_b)
    num_policies_to_estimate_phi = int(num_policies * self.percent_to_estimate_phi)

    policies_for_scope = self.pi_b[num_policies_to_estimate_phi:]
    policies_for_phi = self.pi_b[:num_policies_to_estimate_phi]

    return policies_for_phi, policies_for_scope


  # ---------------
  # Pre-processing
  # ---------------

  def prep_policies(self, chosen_policies):
      # Initialize lists to store axis data for each policy
      timesteps = []
      # states = []
      # state_first = []
      # state_last = []
      actions = []
      rewards = []
      # gamma_last = []
      # weight_last = []
      # weight_first = []
      # all_weights_temp, weights = calculate_importance_weights(P_pi_e, P_pi_b, pi_b)
      weights = calculate_importance_weights(self.P_pi_e, self.P_pi_b, chosen_policies)
      psi = []

      states_current = []
      states_next = []
      states_all = []

      states_last = []
      psi_last = []

      for index, policy in enumerate(chosen_policies):
          policy_array = np.array(policy)

          timesteps.append(policy_array['timestep'].astype(int))
          actions.append(policy_array['action'])
          rewards.append(policy_array['reward'].astype(float))

          state_last = policy_array['state_next'][-1]
          last_psi = smallest_distance_to_deadend(state_last, env)
          states_last.append(state_last)
          psi_last.append(last_psi)

          # Concatenate psi array with last_psi
          # all_psi = np.concatenate((policy_array['psi'], [last_psi]))
          # psi.append(all_psi)
          psi.append(policy_array['psi'])

          states_next.append(policy_array['state_next'])
          states_current.append(policy_array['state'])
          # all_states = policy_array['state'] + policy_array['state_next'][-1]
          all_states = np.vstack((policy_array['state'],policy_array['state_next'][-1]))
          states_all.append(all_states)

          # states_all.append(np.concatenate((policy_array['state'], policy_array['state_next'][-1])))



      return timesteps, rewards, states_next, states_current, weights, actions, psi, states_last, psi_last

  def padding_IS_terms(self, timesteps, actions, rewards, weights):

    # Find the maximum length among all lists
    max_length = max(len(traj) for traj in timesteps)

    # Define the padding values
    zero_padding = 0

    # Pad each list to match the maximum length
    padded_timesteps = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in timesteps]
    padded_rewards = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in rewards]
    padded_actions = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in actions]
    padded_weights = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in weights]

    return padded_timesteps, padded_rewards, padded_actions, padded_weights


  def tensorize_IS_terms(self, padded_timesteps, padded_rewards, padded_weights):

    padded_timestep_tensors = torch.tensor(padded_timesteps, dtype = self.dtype)
    padded_reward_tensors = torch.tensor(padded_rewards, dtype = self.dtype)
    padded_weight_tensors = torch.tensor(padded_weights, dtype = self.dtype)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors

  def padding_states_all(self, states_all, psi):
    max_length = max(len(trajectory) for trajectory in states_all)

    zero_padding = 0

    # Pad each trajectory to make them all the same length
    padded_states_all = [
        [list(item) for item in trajectory] + [[0, 0]] * (max_length - len(trajectory))
        for trajectory in states_all
    ]

    padded_psi = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in psi]
    mask = [[1] * len(trajectory) + [0] * (max_length - len(trajectory)) for trajectory in states_all]

    return padded_states_all, padded_psi, mask



  def padding_states(self, states_next, states_current, psi):
    # Find the maximum length of trajectories
    max_length = max(len(trajectory) for trajectory in states_current)

    zero_padding = 0

    # Pad each trajectory to make them all the same length
    padded_states_next = [
        [list(item) for item in trajectory] + [[0, 0]] * (max_length - len(trajectory))
        for trajectory in states_next
    ]

    # Pad each trajectory to make them all the same length
    padded_states_current = [
        [list(item) for item in trajectory] + [[0, 0]] * (max_length - len(trajectory))
        for trajectory in states_current
    ]

    padded_psi = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in psi]

    # Create mask
    mask = [[1] * len(trajectory) + [0] * (max_length - len(trajectory)) for trajectory in states_current]

    return padded_states_next, padded_states_current, padded_psi, mask


  def tensorize_padded_terms(self, padded_states_next, padded_states_current, padded_psi,mask):
    padded_states_next_tensors = torch.tensor(padded_states_next, dtype = self.dtype)
    padded_states_current_tensors = torch.tensor(padded_states_current, dtype = self.dtype)
    padded_psi_tensors = torch.tensor(padded_psi, dtype = self.dtype)

    mask_tensor = torch.tensor(mask, dtype = self.dtype)
    return padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor

  def tensorize_all_states_psi(self, padded_states_all, padded_psi, mask):
    padded_states_all_tensors = torch.tensor(padded_states_all, dtype = self.dtype)
    padded_psi_tensors = torch.tensor(padded_psi, dtype = self.dtype)
    mask_tensor = torch.tensor(mask, dtype = self.dtype)

    return padded_states_all_tensors, padded_psi_tensors, mask_tensor

  def tensorize_last_states_psi(self, states_last, psi_last):
    states_last_tensor = torch.tensor(states_last, dtype = self.dtype)
    psi_last_tensor = torch.tensor(psi_last, dtype = self.dtype)

    return states_last_tensor, psi_last_tensor

  #-----------------------
  # Preparation Functions
  # ----------------------

  def prepare_IS(self):
    timesteps, rewards, states_next, states_current, weights, actions,_,_,_ = self.prep_policies(self.pi_b)
    padded_timesteps, padded_rewards, padded_actions, padded_weights = self.padding_IS_terms(timesteps, actions, rewards, weights)
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors = self.tensorize_IS_terms(padded_timesteps, padded_rewards, padded_weights)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors

  def prepare_SCOPE(self, policies):
    timesteps, rewards, states_next, states_current, weights, actions, psi,states_last, psi_last = self.prep_policies(policies)
    padded_timesteps, padded_rewards, padded_actions, padded_weights = self.padding_IS_terms(timesteps, actions, rewards, weights)
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors = self.tensorize_IS_terms(padded_timesteps, padded_rewards, padded_weights)
    padded_states_next, padded_states_current, padded_psi, mask = self.padding_states(states_next, states_current, psi)
    padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor = self.tensorize_padded_terms(padded_states_next, padded_states_current, padded_psi, mask)
    states_last_tensor, psi_last_tensor = self.tensorize_last_states_psi(states_last, psi_last)
    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor

  def prepare_SCOPE_phi(self):
    phi_set,_ = self.subset_policies()
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor = self.prepare_SCOPE(phi_set)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor

  def prepare_SCOPE_test(self):
    _, scope_set = self.subset_policies()
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors,_,_,_,_ = self.prepare_SCOPE(scope_set)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors


  # ----------------
  # IS Calculations
  # ----------------


  def bootstrap_IS(self, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors):
    seed = 42
    torch.manual_seed(seed)

    num_samples = self.num_bootstraps
    num_bootstraps_lin = num_samples*padded_timestep_tensors.shape[0]

    # Sample indices with replacement
    sampled_indices = torch.randint(0, len(padded_timestep_tensors), size=(num_bootstraps_lin,), dtype=torch.long)

    reshaped_size = (num_samples, padded_timestep_tensors.shape[0], padded_timestep_tensors.shape[1])

    padded_IS = self.gamma**(padded_timestep_tensors)*padded_weight_tensors*padded_reward_tensors

    IS_bootstraps = padded_IS[sampled_indices].view(reshaped_size)

    # timestep_bootstraps = padded_timestep_tensors[sampled_indices].view(reshaped_size)
    # rewards_bootstraps = padded_reward_tensors[sampled_indices].view(reshaped_size)
    # weights_bootstraps = padded_weight_tensors[sampled_indices].view(reshaped_size)
    # return timestep_bootstraps, rewards_bootstraps, weights_bootstraps, IS_bootstraps
    return IS_bootstraps


  def calc_var_IS(self, IS_bootstraps):
    # Step 1: Sum along the third dimension
    sum_IS_trajectories = torch.sum(IS_bootstraps, dim=2)  # Shape: [1000, 1000]

    # Step 2: Take the mean along the second dimension
    mean_IS_sum = torch.mean(sum_IS_trajectories, dim=1)  # Shape: [1000]

    # Step 3: Calculate the variance across the first dimension
    IS_variance = torch.var(mean_IS_sum)  # A single scalar value

    IS_mean = torch.mean(mean_IS_sum) # A single scalar value

    return IS_mean, IS_variance


  def IS_pipeline(self):
    padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS = self.prepare_IS()
    # timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
    IS_bootstraps = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
    # IS_mean, IS_variance = self.calc_variance_IS(timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS)
    IS_mean, IS_variance = self.calc_var_IS(IS_bootstraps)

    return IS_mean, IS_variance



  # ---------------------
  # SCOPE calculations
  # ---------------------

  def pass_states(self, padded_states_next_tensors, padded_states_current_tensors):
    states_next_output = self.model(padded_states_next_tensors)
    states_current_output = self.model(padded_states_current_tensors)

    return states_next_output.squeeze(), states_current_output.squeeze()

  def bootstrap_straight(self, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output):
      seed = 42
      torch.manual_seed(seed)

      num_samples = self.num_bootstraps
      num_bootstraps_lin = num_samples*padded_timestep_tensors.shape[0]

      # Sample indices with replacement
      sampled_indices = torch.randint(0, len(padded_timestep_tensors), size=(num_bootstraps_lin,), dtype=torch.long)

      reshaped_size = (num_samples, padded_timestep_tensors.shape[0], padded_timestep_tensors.shape[1])

      padded_scope = self.gamma**(padded_timestep_tensors)*padded_weight_tensors*(padded_reward_tensors +self.gamma*states_next_output - states_current_output)
      scope_bootstraps = padded_scope[sampled_indices].view(reshaped_size)

      return scope_bootstraps

  def pass_then_boostraps(self, padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors):
    states_next_output, states_current_output = self.pass_states(padded_states_next_tensors, padded_states_current_tensors)
    # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
    scope_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
    # return timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps
    return scope_bootstraps

  def calc_var_straight(self, scope_bootstraps):

    # Step 1: Sum along the third dimension
    sum_scope_trajectories = torch.sum(scope_bootstraps, dim=2)  # Shape: [1000, 1000]

    # Step 2: Take the mean along the second dimension
    mean_scope_sum = torch.mean(sum_scope_trajectories, dim=1)  # Shape: [1000]

    # Step 3: Calculate the variance across the first dimension
    scope_variance = torch.var(mean_scope_sum)  # A single scalar value

    scope_mean = torch.mean(mean_scope_sum) # A single scalar value

    return scope_mean, scope_variance

  def train_var_scope(self, num_epochs, learning_rate, scope_weight=1, mse_weight=1):

      # IS terms for comparison to SCOPE
      padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS = self.prepare_IS()
      # timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
      # IS_mean, IS_variance = self.calc_variance_IS(timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS)

      IS_bootstraps = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
      IS_mean, IS_variance = self.calc_var_IS(IS_bootstraps)

      # SCOPE terms for training phi
      padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor = self.prepare_SCOPE_phi()


      self.model.train()

      # Enable anomaly detection
      torch.autograd.set_detect_anomaly(True)

      optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

      for epoch in range(num_epochs):
          total_loss = 0


          # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.pass_then_boostraps(padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)

          states_next_output, states_current_output = self.pass_states(padded_states_next_tensors, padded_states_current_tensors)
          # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
          # SCOPE_mean, SCOPE_variance = self.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

          scope_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
          SCOPE_mean, SCOPE_variance = self.calc_var_straight(scope_bootstraps)

          # mse_loss = F.mse_loss(states_current_output, 0.2*padded_psi_tensors)
          mse_loss = F.mse_loss(states_current_output, 0.1*padded_psi_tensors, reduction='none')
          masked_mse_loss = mse_loss * mask_tensor

          states_last_output = self.model(states_last_tensor)
          mse_states_last_loss = F.mse_loss(states_last_output.squeeze(),0.1*psi_last_tensor, reduction = 'none')

          # mean_mse_loss = masked_mse_loss.mean()
          sum_mse_loss = torch.sum(masked_mse_loss, dim = 1)

          mean_mse_loss = torch.mean(sum_mse_loss + mse_states_last_loss)


          print(f"Epoch {epoch+1}")
          print("IS variance: ", IS_variance)
          print("SCOPE Var loss: ", SCOPE_variance)
          print("MSE loss: ", mean_mse_loss.item())

          # Testing evaluaton
          scope_mean, scope_var = self.evaluate_scope()
          print(f"SCOPE mean: {scope_mean}, SCOPE var: {scope_var}")
          self.model.train()


          # tot = SCOPE_variance
          # tot = SCOPE_variance + mse_loss
          tot = scope_weight*SCOPE_variance + mse_weight*mean_mse_loss

          optimizer.zero_grad()

          # Retain the graph to avoid clearing it before backward pass
          tot.backward(retain_graph=True)

          optimizer.step()

          total_loss += tot.item()

          print(f"Total Loss: {total_loss}")
          print("-" * 40)

      # Disable anomaly detection after running the code
      torch.autograd.set_detect_anomaly(False)

      for name, param in self.model.named_parameters():
          if param.requires_grad:
              print(f"Parameter name: {name}")
              print(f"Weights: {param.data}")

      return self.model #, sum_mse_loss, mse_states_last_loss

  def evaluate_scope(self):
    self.model.eval()
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors = self.prepare_SCOPE_test()
    # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.pass_then_boostraps(padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
    # SCOPE_mean, SCOPE_variance = self.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

    scope_bootstraps = self.pass_then_boostraps(padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
    SCOPE_mean, SCOPE_variance = self.calc_var_straight(scope_bootstraps)

    return SCOPE_mean, SCOPE_variance


  # -----------------------
  # Heatmaps for lifegate
  # -----------------------
  def get_model_output_dict(self):

    self.model.eval()

    # Initialize an empty dictionary to store data
    data = {}

    # Loop through all combinations from [0,0] to [9,9]
    for i in range(10):
      for j in range(10):
          # Prepare input data
          input_data = torch.tensor([i, j], dtype=torch.float64)

          # Pass input through the self.model
          output = self.model(input_data)

          # Store data in the dictionary
          data[(i, j)] = output.item()

    return data

  def plot_heatmap(self, data):
    values = np.zeros((10, 10))
    for (x, y), value in data.items():
        values[y, x] = value

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(z=values, colorscale='viridis'))

    # Add colorbar
    fig.update_layout(coloraxis_colorbar=dict(title='Values',
                                              ticks='outside',
                                              tickvals=[np.min(values), np.max(values)],
                                              ticktext=[np.min(values), np.max(values)]))

    # Add labels and title
    fig.update_layout(xaxis=dict(tickvals=np.arange(10), ticktext=list(range(10)), title='X'),
                      yaxis=dict(tickvals=np.arange(9, -1, -1), ticktext=list(range(9, -1, -1)), title='Y', autorange="reversed"),
                      title='Heatmap')

    fig.show()

  def get_heatmap(self):
    data = self.get_model_output_dict()
    self.plot_heatmap(data)

  # ---------------------
  # State Visitation Heatmap
  # ---------------------

  def count_state_visits(self):
    state_visit_counts = {}
    for trajectory in self.pi_b:
        for data_point in trajectory:
            state = tuple(data_point['state'])
            if state not in state_visit_counts:
                state_visit_counts[state] = 0
            state_visit_counts[state] += 1

        # Include last state_next of the trajectory
        last_state_next = tuple(trajectory[-1]['state_next'])
        if last_state_next not in state_visit_counts:
            state_visit_counts[last_state_next] = 0
        state_visit_counts[last_state_next] += 1

    return state_visit_counts

  def create_state_visit_dict(self):
      state_visit_dict = {}
      for i in range(10):
          for j in range(10):
              state_visit_dict[(i, j)] = 0
      return state_visit_dict

  def fill_state_visit_dict(self,state_visit_counts):
      state_visit_dict = self.create_state_visit_dict()
      for state, count in state_visit_counts.items():
          state_visit_dict[state] = count
      return state_visit_dict


  def plot_state_visitations_heatmap(self, state_visit_dict):
    # Create lists to store x, y, and z values
    x = []
    y = []
    z = []

    # Iterate through the state visit dictionary
    for state, count in state_visit_dict.items():
        x.append(state[0])
        y.append(9 - state[1])  # Flip y-axis to have (0, 0) at the bottom-left
        z.append(count)

    # Create the heatmap trace
    trace = go.Heatmap(
        x=x,
        y=y,
        z=z,
        colorscale='Viridis',  # Choose a colorscale
        colorbar=dict(title='Visits'),
        zmin=0,
        zmax=max(z)  # Set maximum value for the color scale
    )

    # Create layout
    layout = go.Layout(
        title='State Visitations Heatmap',
        xaxis=dict(title='X-axis'),
        yaxis=dict(title='Y-axis', tickvals=list(range(10)), ticktext=list(range(9, -1, -1))),
    )

    # Create figure
    fig = go.Figure(data=[trace], layout=layout)
    fig.show()


  def get_state_visitation_heatmap(self):

    # Count state visits
    state_visit_counts = self.count_state_visits()

    # Fill state visit dictionary
    state_visit_dict = self.fill_state_visit_dict(state_visit_counts)

    # Assuming state_visit_dict is your dictionary with state visitations
    self.plot_state_visitations_heatmap(state_visit_dict)




# Test class

In [22]:
env.possible_recoveries

[[5, 0], [6, 0], [7, 0]]

In [149]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e = experiment_actions(1000, env, P_pi_e)
model_200_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_200_0p99 = SCOPE_straight(model_200_0p99, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [112]:
timesteps, rewards, states_next, states_current, weights, actions, psi, states_last, psi_last = test_200_0p99.prep_policies(pi_b)

In [150]:
model, masked_mean_set, last_set = test_200_0p99.train_var_scope(2, 0.001)

Epoch 1
IS variance:  tensor(6.6645e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0617, dtype=torch.float64, grad_fn=<VarBackward0>)
SCOPE mean: 0.03283755622152981, SCOPE var: 0.005437360955003497
Total Loss: 1.0616657317194855
----------------------------------------
Epoch 2
IS variance:  tensor(6.6645e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0278, dtype=torch.float64, grad_fn=<VarBackward0>)
SCOPE mean: 0.04225211428680145, SCOPE var: 0.005846893896544516
Total Loss: 1.0278019486643444
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[-0.5256, -0.1831],
        [ 0.4874, -0.2204],
        [ 0.3107,  0.1398],
        [ 0.0890, -0.3095],
        [ 0.6320,  0.2276],
        [-0.2607, -0.6321],
        [-0.7033, -0.3404],
        [-0.5834, -0.1342],
        [ 0.0957, -0.5969],
        [-0.4036,  0.4857],
        [-0.4621, -0.6912],
        [ 0.6774, -0.3109],
        [ 0.2194, -0.6755],
        [ 0.4790,  0.3212],
        

In [157]:
masked_mean_set

tensor([ 9.1995,  5.6653,  6.6862,  5.5858, 10.7349, 11.2185,  8.0980, 11.1943,
         7.5683,  5.2833, 10.5943,  8.1137,  8.2862,  7.7130,  7.1801,  5.7873,
        13.8663,  6.1387,  4.9187,  6.9743,  9.1391, 10.0840,  6.5897,  4.5218,
         7.6901,  4.7775, 14.2380,  8.5109,  4.9762,  6.2620,  7.8673,  5.6961,
         5.3566,  5.9950,  1.8361,  6.7066, 11.8596,  4.8783, 10.2837,  5.2763,
         5.0988,  3.5132,  5.1391,  9.1433,  6.4432,  5.8137,  5.3222,  7.7351,
         8.4064,  5.9786,  9.1857,  6.0923,  5.1522,  8.4583,  3.8289,  6.1480,
         8.8692,  9.6334, 14.3951,  3.6950], dtype=torch.float64,
       grad_fn=<SumBackward1>)

In [158]:
torch.mean(masked_mean_set+last_set)

tensor(7.7470, dtype=torch.float64, grad_fn=<MeanBackward0>)

In [154]:
last_set

tensor([6.2488e-02, 1.0773e-01, 6.2056e-01, 4.2670e-01, 7.4674e-01, 8.5925e-02,
        4.5368e-01, 6.7436e-01, 7.9104e-03, 4.9821e-02, 1.0558e+00, 6.7005e-01,
        1.4596e-01, 4.0665e-01, 6.6734e-01, 3.5449e-01, 1.8626e-01, 6.1782e-01,
        1.3178e+00, 2.2973e-01, 6.7766e-01, 3.6193e-01, 1.4151e-02, 1.1771e-01,
        3.3000e-01, 9.4572e-02, 1.7433e-02, 9.4038e-02, 2.8302e-01, 5.1651e-01,
        3.3207e-01, 3.1182e-01, 1.6128e-03, 1.7661e-01, 2.1079e-01, 2.4753e-01,
        3.5967e-01, 8.3684e-01, 3.0507e-04, 1.6805e+00, 1.0348e+00, 2.6982e-01,
        2.9373e-01, 8.5336e-01, 4.3011e-01, 3.8731e-01, 1.8092e-01, 1.7744e-01,
        2.8587e-01, 1.6206e-01, 1.3928e-01, 5.9762e-01, 1.2352e+00, 1.3601e-02,
        8.0091e-02, 4.7974e-01, 1.9591e-01, 2.5214e-01, 6.2668e-01, 1.6981e-01],
       dtype=torch.float64, grad_fn=<MseLossBackward0>)

In [121]:
model_200_0p99(torch.tensor(states_last)).squeeze()

tensor([ 3.5931e-01,  3.1535e-01,  1.1579e-02,  4.1809e-01,  5.3519e-01,
         8.3596e-02, -2.7350e-01, -7.7953e-02,  1.6937e-01,  7.5656e-02,
        -5.9883e-01,  1.6519e-01,  5.4058e-01,  1.2196e-01,  7.0372e-03,
        -1.7420e-02, -3.2525e-01,  3.3452e-01,  2.4492e-01,  3.1173e-01,
        -9.0382e-02, -1.4154e-01, -1.5056e-01, -2.8426e-02, -4.4593e-01,
        -2.8394e-01,  3.5727e-01, -5.7754e-01, -1.0026e-01, -9.1278e-02,
         3.4936e-02,  3.1954e-01, -4.4189e-01, -1.0707e-01,  7.4240e-02,
        -2.0949e-01, -1.4405e-01,  1.8865e-01, -2.4926e-01,  8.7673e-03,
         2.9567e-02,  2.9281e-01,  1.1581e-01, -7.9609e-04,  2.1255e-01,
        -6.5637e-02,  5.3519e-01,  3.3169e-01, -3.1210e-01,  2.9727e-01,
         2.4818e-02,  4.5850e-01, -6.3380e-01,  1.9436e-01, -1.9333e-01,
         2.6602e-02,  1.5739e-01,  1.6537e-01,  3.7224e-01, -1.5389e-01,
         4.4906e-01, -1.0166e-01,  5.0598e-01, -3.7695e-01,  4.6480e-01,
         2.2792e-01,  7.8922e-02, -4.3059e-01,  7.1

In [91]:
psi_last

[5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0

In [75]:
len(states_current[0])

29

In [78]:
len(states_all[0])

30

In [79]:
len(psi[0])

30

In [82]:
states_all

[array([[2., 9.],
        [2., 9.],
        [2., 9.],
        [2., 9.],
        [1., 9.],
        [1., 9.],
        [0., 9.],
        [0., 8.],
        [0., 7.],
        [0., 6.],
        [0., 6.],
        [0., 5.],
        [0., 4.],
        [0., 5.],
        [0., 5.],
        [0., 5.],
        [0., 5.],
        [0., 4.],
        [1., 4.],
        [1., 3.],
        [1., 4.],
        [1., 4.],
        [1., 3.],
        [2., 3.],
        [3., 3.],
        [3., 2.],
        [3., 1.],
        [4., 1.],
        [5., 1.],
        [5., 0.]]),
 array([[2., 9.],
        [1., 9.],
        [0., 9.],
        [0., 8.],
        [0., 7.],
        [0., 6.],
        [0., 7.],
        [0., 6.],
        [0., 5.],
        [0., 4.],
        [1., 4.],
        [1., 3.],
        [2., 3.],
        [2., 4.],
        [2., 3.],
        [2., 2.],
        [2., 2.],
        [2., 1.],
        [2., 1.],
        [2., 2.],
        [3., 2.],
        [3., 2.],
        [3., 1.],
        [4., 1.],
        [4., 1.],
        

In [68]:
states_next[0][-1]

array([5., 0.])

In [70]:
np.vstack((states_current[0],states_next[0][-1]))

array([[2., 9.],
       [2., 8.],
       [1., 8.],
       [1., 7.],
       [0., 7.],
       [0., 7.],
       [0., 6.],
       [0., 5.],
       [0., 5.],
       [0., 4.],
       [1., 4.],
       [1., 3.],
       [2., 3.],
       [2., 3.],
       [2., 4.],
       [2., 4.],
       [2., 4.],
       [2., 3.],
       [2., 3.],
       [2., 2.],
       [3., 2.],
       [4., 2.],
       [4., 1.],
       [5., 1.],
       [5., 0.]])

In [47]:
psi[0]

array([ 8.,  8.,  9.,  9., 10., 10., 10., 10., 10., 11., 10., 11., 10.,
       10.,  9.,  9.,  9., 10., 10., 11., 10.,  9., 10.,  9.])

In [43]:
len(states_next[0])

24

In [36]:
padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS = test_200_0p99.prepare_IS()
timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS, IS_boostraps = test_200_0p99.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
test_200_0p99.calc_variance_IS(timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS)


(tensor(0.1634, dtype=torch.float64), tensor(0.0052, dtype=torch.float64))

In [37]:
test_200_0p99.calc_var_IS(IS_boostraps)

(tensor(0.1634, dtype=torch.float64), tensor(0.0052, dtype=torch.float64))

In [40]:
padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor = test_200_0p99.prepare_SCOPE_phi()

states_next_output, states_current_output = test_200_0p99.pass_states(padded_states_next_tensors, padded_states_current_tensors)
timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps, scope_bootstraps = test_200_0p99.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
test_200_0p99.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

(tensor(0.2590, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0194, dtype=torch.float64, grad_fn=<VarBackward0>))

In [96]:
test_200_0p99.calc_var_straight(scope_bootstraps)

NameError: name 'scope_bootstraps' is not defined

In [33]:
IS_variance

tensor(0.0052, dtype=torch.float64)

In [None]:
test_200_0p99.get_state_visitation_heatmap()

In [None]:
test_200_0p99.IS_pipeline()

(tensor(0.0040, dtype=torch.float64), tensor(5.4213e-06, dtype=torch.float64))

In [None]:
pi_e = experiment_actions(1000, env_30, P_pi_e)


In [None]:
calc_V_pi_e(pi_e)

0.02618155036170724

In [None]:
test_200_0p99.evaluate_scope()

(tensor(0.1136, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>))

In [125]:
model, masked_mean_set, last_set = test_200_0p99.train_var_scope(2, 0.001)

TypeError: SCOPE_straight.pass_states() missing 1 required positional argument: 'states_last_tensor'

In [119]:
torch.sum(masked_mean_set, dim = 1)

tensor([ 8.5748,  4.6997,  9.4879,  5.7368,  8.7992, 12.2325,  9.0167,  7.4956,
         8.9889,  5.5547,  7.1841,  6.8971, 11.3291,  4.0391, 10.0364,  7.8897,
        10.0546,  5.6250,  4.2488,  5.0116,  6.4645,  8.6754,  9.2830,  5.0178,
         7.2064,  4.4252,  5.2096,  9.4458,  5.3339, 12.6139, 14.0995,  5.3023,
         5.8995,  5.9573,  7.1260,  6.6979, 12.6700,  8.7708, 14.1373,  8.8810,
         7.4070,  5.6319,  1.4903,  6.4208,  6.5101,  5.2310,  5.2161, 14.1081,
         6.2500,  7.6385,  8.8202,  5.3136,  4.6834,  4.8666,  4.5748,  8.1876,
         7.9586,  6.6502,  6.2471,  4.7426], dtype=torch.float64,
       grad_fn=<SumBackward1>)

In [None]:
test_200_0p99.get_heatmap()

# Test random policy

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_30, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 5, 0.05)
# pi_e = experiment_actions(200, env_30, P_pi_e)
model_200_random = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_200_random = SCOPE_straight(model_200_random, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
test_200_random.get_state_visitation_heatmap()

In [None]:
test_200_random.train_var_scope(200, 0.001)

Epoch 1
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0005, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0005218583187486271
----------------------------------------
Epoch 2
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(9.8199e-06, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 9.819862798753365e-06
----------------------------------------
Epoch 3
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(1.0250e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 1.024958128773479e-05
----------------------------------------
Epoch 4
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(9.3286e-06, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 9.328558541237725e-06
----------------------------------------
Epoch 5
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(8.3393e-06, dtype=torch.float64, grad_fn=<VarB

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_200_random.get_heatmap()

In [None]:
test_200_random.IS_pipeline()

(tensor(-0.0130, dtype=torch.float64), tensor(8.9415e-05, dtype=torch.float64))

In [None]:
pi_e = experiment_actions(1000, env_30, P_pi_e)


In [None]:
calc_V_pi_e(pi_e)

-0.12203056843017494

In [None]:
test_200_random.evaluate_scope()

(tensor(-0.0521, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0003, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test random pi_b

In [125]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_200_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_200_random_pi_b = SCOPE_straight(model_200_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [126]:
test_200_random_pi_b.IS_pipeline()

(tensor(2.6041, dtype=torch.float64), tensor(6.1715, dtype=torch.float64))

In [103]:
test_200_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_200_random_pi_b.train_var_scope(300, 0.001)

Epoch 1
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.1993, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04945267717328408
Total Loss: 0.24875341891612565
----------------------------------------
Epoch 2
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04854473065112503
Total Loss: 0.05144933898137734
----------------------------------------
Epoch 3
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04864399770972223
Total Loss: 0.05152271293468217
----------------------------------------
Epoch 4
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04866784514199147
Total Loss: 0.051536243945871946
----------------------------------------
Epoch 5
IS variance:  tenso

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_200_random_pi_b.get_heatmap()

In [None]:
test_200_random_pi_b.IS_pipeline()

(tensor(-4.8178e-10, dtype=torch.float64),
 tensor(1.6278e-19, dtype=torch.float64))

In [None]:
calc_V_pi_e(pi_e)

0.1871443974984857

In [None]:
test_200_random_pi_b.evaluate_scope()

(tensor(0.4028, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0122, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test random 400 pi_b

In [151]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(400, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_400_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_400_random_pi_b = SCOPE_straight(model_400_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [152]:
test_400_random_pi_b.IS_pipeline()

(tensor(0.7103, dtype=torch.float64), tensor(0.0590, dtype=torch.float64))

In [None]:
test_400_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_400_random_pi_b.train_var_scope(500, 0.001)

Epoch 1
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2298, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2863851925761847
Total Loss: 0.5162089638144078
----------------------------------------
Epoch 2
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.5702, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2473059801237863
Total Loss: 0.8175554880907143
----------------------------------------
Epoch 3
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.5554, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2326082591574042
Total Loss: 0.7880451023448775
----------------------------------------
Epoch 4
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.5384, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.21986669348691343
Total Loss: 0.7582925337594453
----------------------------------------
Epoch 5
IS variance:  tensor(4.4092

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_400_random_pi_b.get_heatmap()

In [None]:
calc_V_pi_e(pi_e)

0.15634452293280188

In [None]:
test_400_random_pi_b.evaluate_scope()

(tensor(0.1504, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0006, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test 600 pi_b top 2

In [93]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(600, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_600_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_600_random_pi_b = SCOPE_straight(model_600_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [94]:
test_600_random_pi_b.IS_pipeline()

(tensor(24.2014, dtype=torch.float64), tensor(134.2654, dtype=torch.float64))

In [None]:
test_600_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_600_random_pi_b.train_var_scope(300, 0.001)

Epoch 1
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0033, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.008124469012146312
SCOPE mean: 0.11593467505750471, SCOPE var: 0.001134831577478478
Total Loss: 0.011470035771407587
----------------------------------------
Epoch 2
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.007877572700959606
SCOPE mean: 0.11402489523061764, SCOPE var: 0.0011566205224887104
Total Loss: 0.008660720224668825
----------------------------------------
Epoch 3
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.007835659068336106
SCOPE mean: 0.1149561133499077, SCOPE var: 0.0011602695166762504
Total Loss: 0.00864455892775779
----------------------------------------
Epoch 4
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Va

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_600_random_pi_b.get_heatmap()

In [None]:
calc_V_pi_e(pi_e)

0.15891747325670808

In [None]:
test_600_random_pi_b.evaluate_scope()

(tensor(0.1061, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0004, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test model with l2 reg







In [166]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class CustomizableFeatureNet(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_prob=0.2, l2_lambda=0.01, dtype=torch.float32):
        super(CustomizableFeatureNet, self).__init__()
        self.hidden_layers = nn.ModuleList()

        # Create the hidden layers based on the provided sizes
        for in_dim, out_dim in zip([input_dim] + hidden_dims, hidden_dims):
            layer = nn.Linear(in_dim, out_dim).to(dtype)
            self.hidden_layers.append(layer)

        self.output_layer = nn.Linear(hidden_dims[-1], output_dim).to(dtype)
        self.dropout = nn.Dropout(dropout_prob)
        self.l2_lambda = l2_lambda

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

    def l2_regularization(self):
        l2_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l2_reg += torch.norm(layer.weight)
        l2_reg += torch.norm(self.output_layer.weight)
        return self.l2_lambda * l2_reg


# Test 800 pi_b top 2

In [165]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(800, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_800_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l2_lambda=0.001)
test_800_random_pi_b = SCOPE_straight(model_800_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [166]:
test_800_random_pi_b.IS_pipeline()

(tensor(0.5781, dtype=torch.float64), tensor(0.0772, dtype=torch.float64))

In [167]:
test_800_random_pi_b.get_state_visitation_heatmap()

In [181]:
test_800_random_pi_b.train_var_scope(50, 0.0005)

Epoch 1
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0044, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.004218989771201134
SCOPE mean: 0.22285763285074944, SCOPE var: 0.016886603818618454
Total Loss: 0.008665766088156722
----------------------------------------
Epoch 2
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.004043985329936416
SCOPE mean: 0.21677965178133518, SCOPE var: 0.01643719341992325
Total Loss: 0.004883264246003726
----------------------------------------
Epoch 3
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.004032811828319944
SCOPE mean: 0.2141315282737515, SCOPE var: 0.016257919950321238
Total Loss: 0.00487736908343905
----------------------------------------
Epoch 4
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=10, bias=True)
  )
  (output_layer): Linear(in_features=10, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [182]:
test_800_random_pi_b.get_heatmap()

In [33]:
calc_V_pi_e(pi_e)

15.720270087636253

In [None]:
test_800_random_pi_b.evaluate_scope()

(tensor(0.1058, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0007, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test 800 pi_b weighted mse

In [167]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(800, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_800_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
test_800_random_pi_b = SCOPE_straight(model_800_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [43]:
model_800_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)


In [168]:
test_800_random_pi_b.IS_pipeline()

(tensor(0.2171, dtype=torch.float64), tensor(0.0069, dtype=torch.float64))

In [169]:
test_800_random_pi_b.get_state_visitation_heatmap()

In [174]:
test_800_random_pi_b.train_var_scope(50, 0.001, 1.05, 1)

Epoch 1
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0035, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2715759773210386
SCOPE mean: 0.2696508115449471, SCOPE var: 0.006249119573549508
Total Loss: 0.27522511218355766
----------------------------------------
Epoch 2
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0039, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.26279331083988405
SCOPE mean: 0.2643576825346261, SCOPE var: 0.006085705937020623
Total Loss: 0.2669093205259913
----------------------------------------
Epoch 3
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0039, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.26132147548679246
SCOPE mean: 0.25856304251984974, SCOPE var: 0.005944873514962677
Total Loss: 0.2654164315113085
----------------------------------------
Epoch 4
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0039, dty

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [171]:
test_800_random_pi_b.get_heatmap()

In [25]:
calc_V_pi_e(pi_e)

0.1603494445055814

In [45]:
test_800_random_pi_b.evaluate_scope()

(tensor(1.9744, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(1.6041, dtype=torch.float64, grad_fn=<VarBackward0>))

In [None]:
# Get the state_dict of the model
model_state_dict = test_200_0p99.model.state_dict()

# Print the keys to see the structure of the state_dict
print(model_state_dict.keys())

# Extract and print the weights of each layer
for name, param in model_state_dict.items():
    if 'weight' in name:
        print(f"Layer: {name}")
        print(param)

odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'output_layer.weight', 'output_layer.bias'])
Layer: hidden_layers.0.weight
tensor([[-0.5992, -0.0774],
        [ 0.4740, -0.2568],
        [ 0.3057,  0.1028],
        [ 0.1205, -0.2533],
        [ 0.6123,  0.1611],
        [-0.2607, -0.6321],
        [-0.7033, -0.3404],
        [-0.5834, -0.1342],
        [ 0.1821, -0.5556],
        [-0.4225,  0.3797],
        [-0.4621, -0.6912],
        [ 0.6232, -0.4203],
        [ 0.2262, -0.6769],
        [ 0.5225,  0.2460],
        [ 0.6153, -0.0401],
        [-0.1569, -0.2980]], dtype=torch.float64)
Layer: hidden_layers.1.weight
tensor([[ 1.0516e-01, -1.5570e-01, -8.9501e-02,  2.2105e-01,  2.2795e-01,
          6.3637e-02,  3.5722e-02, -1.2352e-01, -1.2532e-01, -1.9321e-01,
          1.7001e-01,  2.7690e-01,  5.0010e-02,  4.4123e-02, -1.7390e-01,
         -3.3544e-02],
        [ 2.9563e-02, -1.6539e-01,  1.8811e-01,  2.4416e-02, -2.0411

In [None]:
scope_testing = SCOPE_straight(model, 0.9, 10000, pi_b, P_pi_b, P_pi_e, dtype = torch.float64)

In [None]:
padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors = scope_testing.prepare()
timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = scope_testing.pass_then_boostraps(model, padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
IS_variance, scope_variance = scope_testing.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

# train var scope

In [None]:
def train_var_scope(model, num_epochs, learning_rate, test1):

    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors = test1.prepare()

    model.train()

    # Enable anomaly detection
    torch.autograd.set_detect_anomaly(True)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0

        # Forward pass
        # states_output, states_first_output, states_last_output = test1.pass_states(model, padded_state_tensors, states_first_tensor, states_last_tensor)
        # sums_states_weight_diff = test1.states_weight_diff_sums(states_output, padded_weight_diff_tensors)
        # gamma_weights_states_last_sub_states_first = test1.last_first_terms_operations(gamma_weights_last_tensor, states_last_output, states_first_output, weight_first_tensor)
        # # sample_sums_states_weight_diff, samples_gamma_weight_states_last_sub_states_first, samples_all_shaping, samples_IS_SCOPE = test1.bootstrap_shaping_terms(sums_states_weight_diff, gamma_weights_states_last_sub_states_first, IS_tensor)

        # samples_IS, sample_sums_states_weight_diff, samples_gamma_weight_states_last_sub_states_first, samples_all_shaping, samples_IS_SCOPE = test1.bootstrap_all_terms(sums_states_weight_diff, gamma_weights_states_last_sub_states_first, IS_tensor, padded_psi_tensors)


        # Calculate MSE loss between states_output and padded_state_tensors
        # mse_loss = F.mse_loss(states_output, padded_state_tensors)

        # E_IS_sq, E_IS_all_sq, E_s_wdiff_sq, E_s_wdiff_all_sq, E_IS_SCOPE, E_IS_E_SCOPE, _, variance_loss, E_IS, E_SCOPE = calculate_shaped_variance_play(samples_IS, sample_sums_states_weight_diff, samples_gamma_weight_states_last_sub_states_first, samples_all_shaping, samples_IS_SCOPE)

        timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = test1.pass_then_boostraps(model, padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
        IS_variance, variance_loss = test1.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)
        print(f"Epoch {epoch+1}")
        print("IS variance: ", IS_variance)
        print("SCOPE Var loss: ", variance_loss)
        # print("MSE loss: ", mse_loss.item())


        tot = variance_loss
        # tot = variance_loss + mse_loss

        optimizer.zero_grad()

        # Retain the graph to avoid clearing it before backward pass
        tot.backward(retain_graph=True)

        optimizer.step()

        total_loss += tot.item()

        print(f"Total Loss: {total_loss}")
        print("-" * 40)

    # Disable anomaly detection after running the code
    torch.autograd.set_detect_anomaly(False)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"Parameter name: {name}")
            print(f"Weights: {param.data}")

    return model


# Test

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
pi_e = experiment_actions(200, env, P_pi_e)

In [None]:
model_200 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 32], output_dim=1, dtype = torch.float64)

In [None]:
test_200 = SCOPE_straight(model_200, 0.9, 10000, pi_b, P_pi_b, P_pi_e, dtype = torch.float64)

In [None]:
model_200 = train_var_scope(model_200, 1000, 0.0005, test_200)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0008492256656322879
----------------------------------------
Epoch 22
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.000832036997470311
----------------------------------------
Epoch 23
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0008178091443168292
----------------------------------------
Epoch 24
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0008067588044517235
----------------------------------------
Epoch 25
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tenso

# Test 200 0.99

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e = experiment_actions(200, env, P_pi_e)

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_30, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e = experiment_actions(200, env, P_pi_e)
model_200_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float64)
test_200_0p99 = SCOPE_straight(model_200_0p99, 0.99, 10000, pi_b, P_pi_b, P_pi_e, dtype = torch.float64)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.8547, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.8546716723749704
----------------------------------------
Epoch 7
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.8345, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.8344950296004069
----------------------------------------
Epoch 8
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.8157, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.815702575610253
----------------------------------------
Epoch 9
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.7972, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.7971985257757953
----------------------------------------
Epoch 10
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.7790, dtype

In [None]:
model_200_0p99 = train_var_scope(model_200_0p99, 200, 0.001, test_200_0p99)

Epoch 1
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(6.2321e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 6.232071548283213e-05
----------------------------------------
Epoch 2
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(5.8212e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 5.821238477719147e-05
----------------------------------------
Epoch 3
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(5.4394e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 5.439401945160914e-05
----------------------------------------
Epoch 4
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(5.0930e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 5.093024913095342e-05
----------------------------------------
Epoch 5
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(4.7785e-05, dtype=torch.float64, grad_fn=<

# Test 400 0.99

In [None]:
P_pi_b_400 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_400 = experiment_actions(400, env, P_pi_b_400)
P_pi_e_400 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_400 = experiment_actions(400, env, P_pi_e_400)
model_400_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float64)
test_400_0p99 = SCOPE_straight(model_400_0p99, 0.99, 1000, pi_b_400, P_pi_b_400, P_pi_e_400, dtype = torch.float64)
model_400_0p99 = train_var_scope(model_400_0p99, 5, 0.001, test_400_0p99)

Epoch 1
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0864, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.08636927251674584
----------------------------------------
Epoch 2
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2360, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.23601228060459742
----------------------------------------
Epoch 3
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2284, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.22842130839536168
----------------------------------------
Epoch 4
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2203, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.22034274292801542
----------------------------------------
Epoch 5
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2122, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.21219201524204384
-

# Test 600 0.99

In [None]:
P_pi_b_600 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_600 = experiment_actions(600, env_30, P_pi_b_600)
P_pi_e_600 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_600 = experiment_actions(600, env, P_pi_e_600)
model_600_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float32)
test_600_0p99 = SCOPE_straight(model_600_0p99, 0.99, 10000, pi_b_600, P_pi_b_600, P_pi_e_600, dtype = torch.float32)
model_600_0p99 = train_var_scope(model_600_0p99, 5, 0.001, test_600_0p99)

Epoch 1
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0505, grad_fn=<VarBackward0>)
Total Loss: 0.05046245828270912
----------------------------------------
Epoch 2
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0493, grad_fn=<VarBackward0>)
Total Loss: 0.0493154413998127
----------------------------------------
Epoch 3
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0481, grad_fn=<VarBackward0>)
Total Loss: 0.04812745749950409
----------------------------------------
Epoch 4
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0469, grad_fn=<VarBackward0>)
Total Loss: 0.046937569975852966
----------------------------------------
Epoch 5
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0458, grad_fn=<VarBackward0>)
Total Loss: 0.04576045647263527
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[ 0.1204,  0.1797],
        [ 0.1732,  0.2432],
        [ 0.6242,  0.4461],
        [ 0.4436,  0.0618],
        [ 

# Test 800 0.99

In [None]:
P_pi_b_800 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_800 = experiment_actions(800, env_30, P_pi_b_800)
P_pi_e_800 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_800 = experiment_actions(800, env_30, P_pi_e_800)
model_800_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float32)
test_800_0p99 = SCOPE_straight(model_800_0p99, 0.99, 10000, pi_b_800, P_pi_b_800, P_pi_e_800, dtype = torch.float32)
model_800_0p99 = train_var_scope(model_800_0p99, 5, 0.001, test_800_0p99)

Epoch 1
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1004, grad_fn=<VarBackward0>)
Total Loss: 0.10040785372257233
----------------------------------------
Epoch 2
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1327, grad_fn=<VarBackward0>)
Total Loss: 0.13271035254001617
----------------------------------------
Epoch 3
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1297, grad_fn=<VarBackward0>)
Total Loss: 0.12969112396240234
----------------------------------------
Epoch 4
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1267, grad_fn=<VarBackward0>)
Total Loss: 0.12670785188674927
----------------------------------------
Epoch 5
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1238, grad_fn=<VarBackward0>)
Total Loss: 0.12376594543457031
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[ 0.1256,  0.1763],
        [ 0.1731,  0.2431],
        [ 0.6243,  0.4461],
        [ 0.4445, 

# Test 1000 0.99

In [None]:
P_pi_b_1000 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_1000 = experiment_actions(1000, env_30, P_pi_b_1000)
P_pi_e_1000 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_1000 = experiment_actions(1000, env, P_pi_e_1000)
model_1000_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float32)
test_1000_0p99 = SCOPE_straight(model_1000_0p99, 0.90, 10000, pi_b_1000, P_pi_b_1000, P_pi_e_1000, dtype = torch.float32)
model_1000_0p99 = train_var_scope(model_1000_0p99, 5, 0.001, test_1000_0p99)

Epoch 1
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0071, grad_fn=<VarBackward0>)
Total Loss: 0.0071372101083397865
----------------------------------------
Epoch 2
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0059, grad_fn=<VarBackward0>)
Total Loss: 0.005885153077542782
----------------------------------------
Epoch 3
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0057, grad_fn=<VarBackward0>)
Total Loss: 0.005744975060224533
----------------------------------------
Epoch 4
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0056, grad_fn=<VarBackward0>)
Total Loss: 0.005605767946690321
----------------------------------------
Epoch 5
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0055, grad_fn=<VarBackward0>)
Total Loss: 0.005469260271638632
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[-0.3398,  0.4284],
        [ 0.6470, -0.5601],
        [-0.0836, -0.5419],
        [ 0.