<a href="https://colab.research.google.com/github/ajagota7/Shaping/blob/main/Lifegate_straight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [5]:
import numpy as np
import os
from google.colab import drive
import pickle
# np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
from scipy.optimize import minimize
import random
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import torch
import sys
import plotly.graph_objects as go

# deadend dependencies

In [6]:
# !git clone https://github.com/microsoft/med-deadend.git


# Lifegate class play

In [7]:
import os
from copy import deepcopy
import pygame
import numpy as np
import click


pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [8]:
# RGB colors
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)
BLUE = (0, 100, 255)
GREEN = (0, 255, 0)
WALL = (80, 80, 80)
YELLOW = (255, 255, 0)



class LifeGate(object):
    def __init__(self, state_mode, rng, death_drag, max_steps=100, fixed_life=True, rendering=False, image_saving=False, render_dir=None):
        self.rng = rng
        self.state_dtype = np.float32
        self.frame_skip = 1  # for env consistency
        self.fixed_life = fixed_life
        self.blue = BLUE
        self.death_drag = death_drag
        self.legal_actions = [0, 1, 2, 3, 4]
        self.action_meanings = ['no-op', 'up', 'down', 'left', 'right']
        self.reward_scheme = {'death': -1.0, 'recovery': +1.0, 'step': 0.0, 'barrier': 0.0}
        self.nb_actions = len(self.legal_actions)
        self.player_pos_x = None
        self.player_pos_y = None
        self.agent_init_pos = None
        self.state_mode = state_mode    # how the returned state look like ('pixel' or '1hot' or 'multi-head')
        # self.scr_w = None
        # self.scr_h = None
        # self.possible_recoveries = []
        self.recovery_observablity = True
        # self.observability_switch_point = None  # where to turn observability off
        # self.rendering_scale = None
        # self.barriers = None
        self.recoveries = None
        self.deaths = None
        # self.dead_ends = None
        self._rendering = rendering
        # self.state_shape = None
        self.init_subclass()
        if rendering:
            self._init_pygame()
        self.image_saving = image_saving
        self.render_dir_main = render_dir
        self.render_dir = None
        self.state = None
        self.step_id = 0
        self.game_over = False

        self.max_steps = max_steps

        self.reset()

    def init_subclass(self):
        # should implement sizes, barriers, recoveries, deaths, init_player(), and rendering_scale
        self.scr_w, self.scr_h = 10, 10
        self.tabular_state_shape = (self.scr_w, self.scr_h)
        self.state_shape = [24]
        self.rendering_scale = 30
        self.barriers = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [1, 5], [2, 5], [3, 5], [4, 5]]
        self.possible_recoveries = [[5, 0], [6, 0], [7, 0]]
        self.main_deaths = [[self.scr_w - 1, k] for k in range(self.scr_h)] + [[8,0]]
        self.dead_ends = [[x, y] for x in range(self.scr_w // 2, self.scr_w - 1) for y in range(self.scr_w // 2, self.scr_w)]
        self.observability_switch_point = [0, 5]

    @property
    def rendering(self):
        return self._rendering

    @rendering.setter
    def rendering(self, flag):
        if flag is True:
            if self._rendering is False:
                self._init_pygame()
                self._rendering = True
        else:
            self.close()
            self._rendering = False

    def _init_pygame(self):
        pygame.init()
        size = [self.rendering_scale * self.scr_w, self.rendering_scale * self.scr_h]
        self.screen = pygame.display.set_mode(size)
        pygame.display.set_caption("LifeGate")

    def _init_rendering_folder(self):
        if self.render_dir_main is None:
            self.render_dir_main = 'render'
        if not os.path.exists(os.path.join(os.getcwd(), self.render_dir_main)):
            os.mkdir(os.path.join(os.getcwd(), self.render_dir_main))
        i = 0
        while os.path.exists(os.path.join(os.getcwd(), self.render_dir_main, 'render' + str(i))):
            i += 1
        self.render_dir = os.path.join(os.getcwd(), self.render_dir_main, 'render' + str(i))
        os.mkdir(self.render_dir)

    def reset(self):
        if self.image_saving:
            self._init_rendering_folder()
        self.game_over = False
        self.step_id = 0
        self.recovery_observablity = True
        self.blue = BLUE
        state = self.init_episode()
        return state

    def init_episode(self):
        # should implement reconfigurations at the beginning of each episode
        self.player_pos_x, self.player_pos_y = 2, self.scr_h - 1
        targets = deepcopy(self.possible_recoveries)
        # if self.fixed_life == True:
        #     rec = targets.pop(2)  # fixed life-gate for DQN
        # else:
        #     rec = targets.pop(self.rng.randint(len(targets)))
        self.recoveries = targets #[rec]
        self.deaths = self.main_deaths #+ targets
        return self.get_obs(self.state_mode)

    def render(self):
        if not self.rendering:
            return
        pygame.event.pump()
        self.screen.fill(BLACK)
        size = [self.rendering_scale, self.rendering_scale]
        for pos in self.dead_ends:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, YELLOW, rec1)
        player = pygame.Rect(self.rendering_scale * self.player_pos_x, self.rendering_scale * self.player_pos_y,
                             size[0], size[1])
        pygame.draw.rect(self.screen, WHITE, player)
        for pos in self.deaths:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, RED, rec1)
        for pos in self.recoveries:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, self.blue, rec1)  # self.blue will change if reach obs point
        for pos in self.barriers:
            p = [self.rendering_scale * pos[0], self.rendering_scale * pos[1]]
            rec1 = pygame.Rect(p[0], p[1], size[0], size[1])
            pygame.draw.rect(self.screen, WALL, rec1)
        pygame.display.flip()

        if self.image_saving:
            self.save_image()

    def save_image(self):
        if self.rendering and self.render_dir is not None:
            pygame.image.save(self.screen, self.render_dir + '/render' + str(self.step_id) + '.jpg')
        else:
            raise ValueError('env.rendering is False and/or environment has not been reset.')

    def close(self):
        if self.rendering:
            pygame.quit()

    def _move_player(self, action):
        x, y = (self.player_pos_x, self.player_pos_y)
        # dead-end:
        if [x, y] in self.dead_ends:
            if self.rng.binomial(1, 0.70):
                action = 4  # forceful right
            else:
                action = 0  # no-op
        else:
            # natural risk of death
            if self.rng.binomial(1, self.death_drag):  # say with 25% if death_drag==0.25
                action = 4

        if action == 4:    # right
            x += 1
        elif action == 3:  # left
            x -= 1
        elif action == 2:  # down
            y += 1
        elif action == 1:  # up
            y -= 1
        # updating the position
        if [x, y] in self.barriers or x < 0 or y < 0 or y >= self.scr_h:
            return
        else:
            self.player_pos_x, self.player_pos_y = x, y

    def _get_status(self):
        # check the current situation
        if [self.player_pos_x, self.player_pos_y] in self.deaths:
            return 'death'
        elif [self.player_pos_x, self.player_pos_y] in self.recoveries:
            return 'recovery'

    def step(self, action):
        assert action in self.legal_actions, 'Illegal action.'
        if self.step_id >= self.max_steps - 1:
            self.game_over = True
            return self.get_obs(self.state_mode), 0., self.game_over, {}
        self.step_id += 1
        self._move_player(action)
        if [self.player_pos_x, self.player_pos_y] == self.observability_switch_point and self.recovery_observablity == True:
            self.recovery_observablity = False
            self.blue = BLACK
        status = self._get_status()
        if status == 'death':
            self.game_over = True
            reward = self.reward_scheme['death']
        elif status == 'recovery':
            self.game_over = True
            reward = self.reward_scheme['recovery']
        else:
            reward = self.reward_scheme['step']
        return self.get_obs(self.state_mode), reward, self.game_over, {}

    def get_lives(self):
        if self.game_over == True:
            return 0
        else:
            return 1

    def get_state(self):
        return self.get_obs(self.state_mode)

    def get_obs(self, method):
        if method == 'vector':
            return self._get_vec_obs()
        elif method == 'pixel':
            return self._get_pixel_obs()
        elif method == 'tabular':
            return self._get_tabular_obs()
        else:
            raise ValueError('Unknown observation method.')

    def _get_vec_obs(self):
        x = np.zeros(self.scr_w + self.scr_h + len(self.possible_recoveries), dtype=self.state_dtype)
        x[self.player_pos_x] = 1.0
        x[self.player_pos_y + self.scr_w] = 1.0
        if self.recovery_observablity == True or self.fixed_life == True:
            for k in self.recoveries:
                x[k[0] - 5 + self.scr_w + self.scr_h] = 1.0
        return x

    def _get_tabular_obs(self):
        return np.array([self.player_pos_x, self.player_pos_y])

    def _get_pixel_obs(self):
        raise NotImplementedError

# shaping dependencies

In [9]:
!git clone https://github.com/ajagota7/Shaping.git

Cloning into 'Shaping'...
remote: Enumerating objects: 156, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 156 (delta 80), reused 109 (delta 45), pack-reused 0[K
Receiving objects: 100% (156/156), 12.34 MiB | 25.53 MiB/s, done.
Resolving deltas: 100% (80/80), done.


In [10]:
# %cd /content/Shaping

In [11]:
# !git pull origin main

In [12]:
# cd /content/

In [13]:
# %cd /content/Shaping

import zipfile

with zipfile.ZipFile('/content/Shaping/lifegate_1.zip', 'r') as zip_ref:
    # zip_ref.extractall('/content/med-deadend/lifegate/results/lifegate_1')
    zip_ref.extractall('/content/Shaping/')

In [14]:
import sys
# sys.path.append('/content/med-deadend/lifegate')
sys.path.append('/content/Shaping/')



In [15]:
import q_networks

In [16]:
# %cd /content/med-deadend/lifegate


# results_dir = 'results/lifegate_1/'
results_dir = '/content/Shaping/'
# Load the Q tables from the primary learning agent, Q_D and Q_R value functions
with open(results_dir+'tabular_qnet.pkl', 'rb') as fq:
    ai = pickle.load(fq)

with open(results_dir+'tabular_qd.pkl', 'rb') as fd:
    ai_d = pickle.load(fd)

with open(results_dir+'tabular_qr.pkl', 'rb') as fr:
    ai_r = pickle.load(fr)

In [17]:
q_table = np.zeros((10, 10, 5))
q_d = np.zeros_like(q_table)
q_r = np.zeros_like(q_table)


for i in range(10):
    for j in range(10):
        for a in range(5):
            key = tuple([j, i, a])
            try:
                q_table[i,j,a] = ai.q[key]
                q_d[i,j,a] = ai_d.q[key]
                q_r[i,j,a] = ai_r.q[key]
            except:
                pass

In [18]:
import yaml
import random
# from lifegate import LifeGate
params = yaml.safe_load(open(results_dir+'config.yaml', 'r'))
np.random.seed(seed=params['random_seed'])
random.seed(params['random_seed'])
random_state = np.random.RandomState(params['random_seed'])

In [19]:
params['random_seed']

1234

In [20]:
random_state = np.random.RandomState(1234)

# env

In [21]:
env = LifeGate(max_steps=params['episode_max_len'], state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.0)

In [22]:
env_30 = LifeGate(max_steps=30, state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.1)

In [23]:
env_50 = LifeGate(max_steps=50, state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.1)

In [24]:
env_100 = LifeGate(max_steps=100, state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = 0.1)

In [25]:
import Shaping
from Shaping import *
# %cd /content/Shaping

from choose_actions import action_probs_top_n_epsilon
from shaping_features import *
from gen_policies import *
from IS import *
from subset_policies import *
from v_pi_e import *
from optimization import *
from neural_net import *
from prep_variance import *
from SCOPE_variance import SCOPE_variance

In [26]:
import torch.nn.functional as F

# Test model with l2 reg







In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class CustomizableFeatureNet(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_prob=0.2, l2_lambda=0.01, dtype=torch.float32):
        super(CustomizableFeatureNet, self).__init__()
        self.hidden_layers = nn.ModuleList()

        # Create the hidden layers based on the provided sizes
        for in_dim, out_dim in zip([input_dim] + hidden_dims, hidden_dims):
            layer = nn.Linear(in_dim, out_dim).to(dtype)
            self.hidden_layers.append(layer)

        self.output_layer = nn.Linear(hidden_dims[-1], output_dim).to(dtype)
        self.dropout = nn.Dropout(dropout_prob)
        self.l2_lambda = l2_lambda

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

    def l2_regularization(self):
        l2_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l2_reg += torch.norm(layer.weight)
        l2_reg += torch.norm(self.output_layer.weight)
        return self.l2_lambda * l2_reg


# Test model with l1_reg

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN_l1_reg(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_prob=0.2, l1_lambda=0.01, dtype=torch.float32):
        super(NN_l1_reg, self).__init__()
        self.hidden_layers = nn.ModuleList()

        # Create the hidden layers based on the provided sizes
        for in_dim, out_dim in zip([input_dim] + hidden_dims, hidden_dims):
            layer = nn.Linear(in_dim, out_dim).to(dtype)
            self.hidden_layers.append(layer)

        self.output_layer = nn.Linear(hidden_dims[-1], output_dim).to(dtype)
        self.dropout = nn.Dropout(dropout_prob)
        self.l1_lambda = l1_lambda

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

    def l1_regularization(self):
        l1_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l1_reg += torch.norm(layer.weight, p=1)
        l1_reg += torch.norm(self.output_layer.weight, p=1)
        return self.l1_lambda * l1_reg


# L1 L2 reg

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN_l1_l2_reg(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_prob=0.2, l1_lambda=0.01, l2_lambda=0.01, dtype=torch.float32):
        super(NN_l1_l2_reg, self).__init__()
        self.hidden_layers = nn.ModuleList()

        # Create the hidden layers based on the provided sizes
        for in_dim, out_dim in zip([input_dim] + hidden_dims, hidden_dims):
            layer = nn.Linear(in_dim, out_dim).to(dtype)
            self.hidden_layers.append(layer)

        self.output_layer = nn.Linear(hidden_dims[-1], output_dim).to(dtype)
        self.dropout = nn.Dropout(dropout_prob)
        self.l1_lambda = l1_lambda
        self.l2_lambda = l2_lambda

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

    def l1_regularization(self):
        l1_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l1_reg += torch.norm(layer.weight, p=1)
        l1_reg += torch.norm(self.output_layer.weight, p=1)
        return self.l1_lambda * l1_reg

    def l2_regularization(self):
        l2_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l2_reg += torch.norm(layer.weight, p=2)
        l2_reg += torch.norm(self.output_layer.weight, p=2)
        return self.l2_lambda * l2_reg

    def total_regularization(self):
        return self.l1_regularization() + self.l2_regularization()

# SCOPE straight

In [30]:
from datetime import datetime

class SCOPE_straight(object):

  def __init__(self, model, gamma, num_bootstraps, pi_b, P_pi_b, pi_e, P_pi_e, percent_to_estimate_phi, dtype):
        self.model = model
        self.gamma = gamma
        self.num_bootstraps = num_bootstraps
        self.pi_b = pi_b
        self.P_pi_b = P_pi_b
        self.P_pi_e = P_pi_e
        self.pi_e = pi_e
        self.dtype = dtype

        self.percent_to_estimate_phi = percent_to_estimate_phi
        # self.num_epochs = num_epochs

  def subset_policies(self):
    # seed_value = 0
    # np.random.seed(seed_value)
    num_policies = len(self.pi_b)
    num_policies_to_estimate_phi = int(num_policies * self.percent_to_estimate_phi)

    policies_for_scope = self.pi_b[num_policies_to_estimate_phi:]
    policies_for_phi = self.pi_b[:num_policies_to_estimate_phi]

    return policies_for_phi, policies_for_scope


  # ---------------
  # Pre-processing
  # ---------------

  def prep_policies(self, chosen_policies):
      # Initialize lists to store axis data for each policy
      timesteps = []
      # states = []
      # state_first = []
      # state_last = []
      actions = []
      rewards = []
      # gamma_last = []
      # weight_last = []
      # weight_first = []
      # all_weights_temp, weights = calculate_importance_weights(P_pi_e, P_pi_b, pi_b)
      weights = calculate_importance_weights(self.P_pi_e, self.P_pi_b, chosen_policies)
      psi = []

      states_current = []
      states_next = []
      states_all = []

      states_last = []
      psi_last = []

      for index, policy in enumerate(chosen_policies):
          policy_array = np.array(policy)

          timesteps.append(policy_array['timestep'].astype(int))
          actions.append(policy_array['action'])
          rewards.append(policy_array['reward'].astype(float))

          state_last = policy_array['state_next'][-1]
          last_psi = smallest_distance_to_deadend(state_last, env)
          states_last.append(state_last)
          psi_last.append(last_psi)

          # Concatenate psi array with last_psi
          # all_psi = np.concatenate((policy_array['psi'], [last_psi]))
          # psi.append(all_psi)
          psi.append(policy_array['psi'])

          states_next.append(policy_array['state_next'])
          states_current.append(policy_array['state'])
          # all_states = policy_array['state'] + policy_array['state_next'][-1]
          all_states = np.vstack((policy_array['state'],policy_array['state_next'][-1]))
          states_all.append(all_states)

          # states_all.append(np.concatenate((policy_array['state'], policy_array['state_next'][-1])))



      return timesteps, rewards, states_next, states_current, weights, actions, psi, states_last, psi_last

  def padding_IS_terms(self, timesteps, actions, rewards, weights):

    # Find the maximum length among all lists
    max_length = max(len(traj) for traj in timesteps)

    # Define the padding values
    zero_padding = 0

    # Pad each list to match the maximum length
    padded_timesteps = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in timesteps]
    padded_rewards = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in rewards]
    padded_actions = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in actions]
    padded_weights = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in weights]

    return padded_timesteps, padded_rewards, padded_actions, padded_weights


  def tensorize_IS_terms(self, padded_timesteps, padded_rewards, padded_weights):

    padded_timestep_tensors = torch.tensor(padded_timesteps, dtype = self.dtype)
    padded_reward_tensors = torch.tensor(padded_rewards, dtype = self.dtype)
    padded_weight_tensors = torch.tensor(padded_weights, dtype = self.dtype)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors

  def padding_states_all(self, states_all, psi):
    max_length = max(len(trajectory) for trajectory in states_all)

    zero_padding = 0

    # Pad each trajectory to make them all the same length
    padded_states_all = [
        [list(item) for item in trajectory] + [[0, 0]] * (max_length - len(trajectory))
        for trajectory in states_all
    ]

    padded_psi = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in psi]
    mask = [[1] * len(trajectory) + [0] * (max_length - len(trajectory)) for trajectory in states_all]

    return padded_states_all, padded_psi, mask



  def padding_states(self, states_next, states_current, psi):
    # Find the maximum length of trajectories
    max_length = max(len(trajectory) for trajectory in states_current)

    zero_padding = 0

    # Pad each trajectory to make them all the same length
    padded_states_next = [
        [list(item) for item in trajectory] + [[0, 0]] * (max_length - len(trajectory))
        for trajectory in states_next
    ]

    # Pad each trajectory to make them all the same length
    padded_states_current = [
        [list(item) for item in trajectory] + [[0, 0]] * (max_length - len(trajectory))
        for trajectory in states_current
    ]

    padded_psi = [np.concatenate([traj, [zero_padding] * (max_length - len(traj))]) for traj in psi]

    # Create mask
    mask = [[1] * len(trajectory) + [0] * (max_length - len(trajectory)) for trajectory in states_current]

    return padded_states_next, padded_states_current, padded_psi, mask


  def tensorize_padded_terms(self, padded_states_next, padded_states_current, padded_psi,mask):
    padded_states_next_tensors = torch.tensor(padded_states_next, dtype = self.dtype)
    padded_states_current_tensors = torch.tensor(padded_states_current, dtype = self.dtype)
    padded_psi_tensors = torch.tensor(padded_psi, dtype = self.dtype)

    mask_tensor = torch.tensor(mask, dtype = self.dtype)
    return padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor

  def tensorize_all_states_psi(self, padded_states_all, padded_psi, mask):
    padded_states_all_tensors = torch.tensor(padded_states_all, dtype = self.dtype)
    padded_psi_tensors = torch.tensor(padded_psi, dtype = self.dtype)
    mask_tensor = torch.tensor(mask, dtype = self.dtype)

    return padded_states_all_tensors, padded_psi_tensors, mask_tensor

  def tensorize_last_states_psi(self, states_last, psi_last):
    states_last_tensor = torch.tensor(states_last, dtype = self.dtype)
    psi_last_tensor = torch.tensor(psi_last, dtype = self.dtype)

    return states_last_tensor, psi_last_tensor

  #-----------------------
  # Preparation Functions
  # ----------------------

  def prepare_IS(self):
    timesteps, rewards, states_next, states_current, weights, actions,_,_,_ = self.prep_policies(self.pi_b)
    padded_timesteps, padded_rewards, padded_actions, padded_weights = self.padding_IS_terms(timesteps, actions, rewards, weights)
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors = self.tensorize_IS_terms(padded_timesteps, padded_rewards, padded_weights)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors

  def prepare_SCOPE(self, policies):
    timesteps, rewards, states_next, states_current, weights, actions, psi,states_last, psi_last = self.prep_policies(policies)
    padded_timesteps, padded_rewards, padded_actions, padded_weights = self.padding_IS_terms(timesteps, actions, rewards, weights)
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors = self.tensorize_IS_terms(padded_timesteps, padded_rewards, padded_weights)
    padded_states_next, padded_states_current, padded_psi, mask = self.padding_states(states_next, states_current, psi)
    padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor = self.tensorize_padded_terms(padded_states_next, padded_states_current, padded_psi, mask)
    states_last_tensor, psi_last_tensor = self.tensorize_last_states_psi(states_last, psi_last)
    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor

  def prepare_SCOPE_phi(self):
    phi_set,_ = self.subset_policies()
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor = self.prepare_SCOPE(phi_set)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor

  def prepare_SCOPE_test(self):
    _, scope_set = self.subset_policies()
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors,_,_,_,_ = self.prepare_SCOPE(scope_set)

    return padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors


  # ----------------
  # IS Calculations
  # ----------------


  def bootstrap_IS(self, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors):
    seed = 42
    torch.manual_seed(seed)

    num_samples = self.num_bootstraps
    num_bootstraps_lin = num_samples*padded_timestep_tensors.shape[0]

    # Sample indices with replacement
    sampled_indices = torch.randint(0, len(padded_timestep_tensors), size=(num_bootstraps_lin,), dtype=torch.long)

    reshaped_size = (num_samples, padded_timestep_tensors.shape[0], padded_timestep_tensors.shape[1])

    padded_IS = self.gamma**(padded_timestep_tensors)*padded_weight_tensors*padded_reward_tensors

    IS_bootstraps = padded_IS[sampled_indices].view(reshaped_size)

    # timestep_bootstraps = padded_timestep_tensors[sampled_indices].view(reshaped_size)
    # rewards_bootstraps = padded_reward_tensors[sampled_indices].view(reshaped_size)
    # weights_bootstraps = padded_weight_tensors[sampled_indices].view(reshaped_size)
    # return timestep_bootstraps, rewards_bootstraps, weights_bootstraps, IS_bootstraps
    return IS_bootstraps


  def calc_var_IS(self, IS_bootstraps):
    # Step 1: Sum along the third dimension
    sum_IS_trajectories = torch.sum(IS_bootstraps, dim=2)  # Shape: [1000, 1000]

    # Step 2: Take the mean along the second dimension
    mean_IS_sum = torch.mean(sum_IS_trajectories, dim=1)  # Shape: [1000]

    # Step 3: Calculate the variance across the first dimension
    IS_variance = torch.var(mean_IS_sum)  # A single scalar value

    IS_mean = torch.mean(mean_IS_sum) # A single scalar value

    return IS_mean, IS_variance


  def IS_pipeline(self):
    padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS = self.prepare_IS()
    # timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
    IS_bootstraps = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
    # IS_mean, IS_variance = self.calc_variance_IS(timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS)
    IS_mean, IS_variance = self.calc_var_IS(IS_bootstraps)

    return IS_mean, IS_variance



  # ---------------------
  # SCOPE calculations
  # ---------------------

  def pass_states(self, padded_states_next_tensors, padded_states_current_tensors):
    states_next_output = self.model(padded_states_next_tensors)
    states_current_output = self.model(padded_states_current_tensors)

    return states_next_output.squeeze(), states_current_output.squeeze()

  def bootstrap_straight(self, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output):
      seed = 42
      torch.manual_seed(seed)

      num_samples = self.num_bootstraps
      num_bootstraps_lin = num_samples*padded_timestep_tensors.shape[0]

      # Sample indices with replacement
      sampled_indices = torch.randint(0, len(padded_timestep_tensors), size=(num_bootstraps_lin,), dtype=torch.long)

      reshaped_size = (num_samples, padded_timestep_tensors.shape[0], padded_timestep_tensors.shape[1])

      padded_scope = self.gamma**(padded_timestep_tensors)*padded_weight_tensors*(padded_reward_tensors +self.gamma*states_next_output - states_current_output)
      scope_bootstraps = padded_scope[sampled_indices].view(reshaped_size)

      return scope_bootstraps

  def pass_then_boostraps(self, padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors):
    states_next_output, states_current_output = self.pass_states(padded_states_next_tensors, padded_states_current_tensors)
    # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
    scope_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
    # return timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps
    return scope_bootstraps

  def calc_var_straight(self, scope_bootstraps):

    # Step 1: Sum along the third dimension
    sum_scope_trajectories = torch.sum(scope_bootstraps, dim=2)  # Shape: [1000, 1000]

    # Step 2: Take the mean along the second dimension
    mean_scope_sum = torch.mean(sum_scope_trajectories, dim=1)  # Shape: [1000]

    # Step 3: Calculate the variance across the first dimension
    scope_variance = torch.var(mean_scope_sum)  # A single scalar value

    scope_mean = torch.mean(mean_scope_sum) # A single scalar value

    return scope_mean, scope_variance

  def train_var_scope(self, num_epochs, learning_rate, shaping_coefficient, scope_weight=1, mse_weight=1): #, folder_name, filename)
      # Get the current date
      # current_date = datetime.now().strftime("%Y-%m-%d")

      # Folder name with current date
      # folder_name = f"{folder}_{current_date}"

      # Create folder if it doesn't exist
      # if not os.path.exists(folder_name):
      #     os.makedirs(folder_name)

      # IS terms for comparison to SCOPE
      padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS = self.prepare_IS()
      # timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
      # IS_mean, IS_variance = self.calc_variance_IS(timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS)

      IS_bootstraps = self.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
      IS_mean, IS_variance = self.calc_var_IS(IS_bootstraps)

      # SCOPE terms for training phi
      padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor, states_last_tensor, psi_last_tensor = self.prepare_SCOPE_phi()


      self.model.train()

      # Enable anomaly detection
      torch.autograd.set_detect_anomaly(True)

      optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
      # Initialize empty list to store metrics and model state
      all_metrics = []
      model_states = []

      for epoch in range(num_epochs):
          total_loss = 0


          # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.pass_then_boostraps(padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)

          states_next_output, states_current_output = self.pass_states(padded_states_next_tensors, padded_states_current_tensors)
          # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
          # SCOPE_mean, SCOPE_variance = self.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

          scope_bootstraps = self.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
          SCOPE_mean, SCOPE_variance = self.calc_var_straight(scope_bootstraps)

          # mse_loss = F.mse_loss(states_current_output, 0.2*padded_psi_tensors)
          mse_loss = F.mse_loss(states_current_output, shaping_coefficient*padded_psi_tensors, reduction='none')
          masked_mse_loss = mse_loss * mask_tensor

          states_last_output = self.model(states_last_tensor)
          mse_states_last_loss = F.mse_loss(states_last_output.squeeze(),shaping_coefficient*psi_last_tensor, reduction = 'none')

          # mean_mse_loss = masked_mse_loss.mean()
          sum_mse_loss = torch.sum(masked_mse_loss, dim = 1)

          mean_mse_loss = torch.mean(sum_mse_loss + mse_states_last_loss)


          print(f"Epoch {epoch+1}")
          print(f"IS mean: {IS_mean},IS variance: {IS_variance}")
          print("SCOPE Var loss: ", SCOPE_variance)
          print("MSE loss: ", mean_mse_loss.item())


          # Testing evaluaton
          self.model.eval()
          scope_mean, scope_var = self.evaluate_scope()
          print(f"SCOPE mean: {scope_mean}, SCOPE var: {scope_var}")

          self.model.train()


          # tot = SCOPE_variance
          # tot = SCOPE_variance + mse_loss
          tot = scope_weight*SCOPE_variance + mse_weight*mean_mse_loss

          optimizer.zero_grad()

          # Retain the graph to avoid clearing it before backward pass
          tot.backward(retain_graph=True)

          optimizer.step()

          total_loss += tot.item()

          print(f"Total Loss: {total_loss}")
          print("-" * 40)
          # Append metrics to the list
          epoch_metrics = {
              "epoch": epoch + 1,
              # "IS_mean": IS_mean.item(),
              # "IS_variance": IS_variance.item(),
              "Train_mean": SCOPE_mean.item(),
              "Train_variance": SCOPE_variance.item(),
              "Train_mse_loss": mean_mse_loss.item(),
              "total_loss": total_loss,
              "Test_mean": scope_mean.item(),
              "Test_variance": scope_var.item()
          }

          all_metrics.append(epoch_metrics)

          # temporary_model_weights = self.model.state_dict()
          # model_weight_epoch = temporary_model_weights.copy()

          model_state = self.model.state_dict()

          # print(self.model.state_dict())
          # print(model_state)
          # Save model weights every 25 epochs
          if (epoch + 1) % 25 == 0:
              # Append model state to the model states list
              model_states.append({"epoch": epoch + 1, "model_state":  copy.deepcopy(model_state)})

      experiment_metrics = {"per_epoch": all_metrics, "model_weights": model_states}
      # Save all metrics and model states to a single file
      # if filename is not None:
      #     torch.save({"metrics": all_metrics, "model_states": model_states}, os.path.join(folder_name, f"{filename}_all_metrics.pt"))

      # Disable anomaly detection after running the code
      torch.autograd.set_detect_anomaly(False)

      # for name, param in self.model.named_parameters():
      #     if param.requires_grad:
      #         print(f"Parameter name: {name}")
      #         print(f"Weights: {param.data}")

      return experiment_metrics #all_metrics #,self.model #, sum_mse_loss, mse_states_last_loss, all_metrics

  def evaluate_scope(self):
    self.model.eval()
    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors = self.prepare_SCOPE_test()
    # timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = self.pass_then_boostraps(padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
    # SCOPE_mean, SCOPE_variance = self.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

    scope_bootstraps = self.pass_then_boostraps(padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
    SCOPE_mean, SCOPE_variance = self.calc_var_straight(scope_bootstraps)

    return SCOPE_mean, SCOPE_variance


  '''
  On Policy Calculations
  '''
  def calc_V_pi_e(self):
      all_timesteps = []
      gamma = self.gamma
      for j in range(len(self.pi_e)):
          Timestep_values = []
          for i in range(len(self.pi_e[j])):
            # print(i)
            timestep = gamma ** (i) * self.pi_e[j][i][2]
            Timestep_values.append(timestep)

          all_timesteps.append(Timestep_values)

      V_est = sum([sum(sublist) for sublist in all_timesteps])/len(self.pi_e)
      return V_est


  '''
  Visualizations:
  Requirements:
    model (for phi heatmap), pi_b (for s-v frequency heatmap)
  Consider extricating to its own visualization class or wthin experiment loading class
  maybe just utils functions
  '''

  # -----------------------
  # Heatmaps for lifegate
  # -----------------------
  def get_model_output_dict(self):

    self.model.eval()

    # Initialize an empty dictionary to store data
    data = {}

    # Loop through all combinations from [0,0] to [9,9]
    for i in range(10):
      for j in range(10):
          # Prepare input data
          input_data = torch.tensor([i, j], dtype=torch.float64)

          # Pass input through the self.model
          output = self.model(input_data)

          # Store data in the dictionary
          data[(i, j)] = output.item()

    return data

  def plot_heatmap(self, data):
    values = np.zeros((10, 10))
    for (x, y), value in data.items():
        values[y, x] = value

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(z=values, colorscale='viridis'))

    # Add colorbar
    fig.update_layout(coloraxis_colorbar=dict(title='Values',
                                              ticks='outside',
                                              tickvals=[np.min(values), np.max(values)],
                                              ticktext=[np.min(values), np.max(values)]))

    # Add labels and title
    fig.update_layout(xaxis=dict(tickvals=np.arange(10), ticktext=list(range(10)), title='X'),
                      yaxis=dict(tickvals=np.arange(9, -1, -1), ticktext=list(range(9, -1, -1)), title='Y', autorange="reversed"),
                      title='Heatmap')

    fig.show()

  def get_heatmap(self):
    data = self.get_model_output_dict()
    self.plot_heatmap(data)

  # ---------------------
  # State Visitation Heatmap
  # ---------------------

  def count_state_visits(self):
    state_visit_counts = {}
    for trajectory in self.pi_b:
        for data_point in trajectory:
            state = tuple(data_point['state'])
            if state not in state_visit_counts:
                state_visit_counts[state] = 0
            state_visit_counts[state] += 1

        # Include last state_next of the trajectory
        last_state_next = tuple(trajectory[-1]['state_next'])
        if last_state_next not in state_visit_counts:
            state_visit_counts[last_state_next] = 0
        state_visit_counts[last_state_next] += 1

    return state_visit_counts

  def create_state_visit_dict(self):
      state_visit_dict = {}
      for i in range(10):
          for j in range(10):
              state_visit_dict[(i, j)] = 0
      return state_visit_dict

  def fill_state_visit_dict(self,state_visit_counts):
      state_visit_dict = self.create_state_visit_dict()
      for state, count in state_visit_counts.items():
          state_visit_dict[state] = count
      return state_visit_dict


  def plot_state_visitations_heatmap(self, state_visit_dict):
    # Create lists to store x, y, and z values
    x = []
    y = []
    z = []

    # Iterate through the state visit dictionary
    for state, count in state_visit_dict.items():
        x.append(state[0])
        y.append(9 - state[1])  # Flip y-axis to have (0, 0) at the bottom-left
        z.append(count)

    # Create the heatmap trace
    trace = go.Heatmap(
        x=x,
        y=y,
        z=z,
        colorscale='Viridis',  # Choose a colorscale
        colorbar=dict(title='Visits'),
        zmin=0,
        zmax=max(z)  # Set maximum value for the color scale
    )

    # Create layout
    layout = go.Layout(
        title='State Visitations Heatmap',
        xaxis=dict(title='X-axis'),
        yaxis=dict(title='Y-axis', tickvals=list(range(10)), ticktext=list(range(9, -1, -1))),
    )

    # Create figure
    fig = go.Figure(data=[trace], layout=layout)
    fig.show()


  def get_state_visitation_heatmap(self):

    # Count state visits
    state_visit_counts = self.count_state_visits()

    # Fill state visit dictionary
    state_visit_dict = self.fill_state_visit_dict(state_visit_counts)

    # Assuming state_visit_dict is your dictionary with state visitations
    self.plot_state_visitations_heatmap(state_visit_dict)




# Experiment Class

In [38]:
from typing import List, Union
import uuid
from datetime import datetime
import copy
from typing import List, Callable


class SCOPE_experiment():
    def __init__(self,
                #  Parameters related to policy generation
                 pi_b_top_k: int,
                 pi_b_epsilon: float,
                 pi_e_top_k: int,
                 pi_e_epsilon: float,
                 q_table,#: List[List[float]],
                 gamma: float,
                 num_trajectories: int,
                 num_bootstraps: int,
                 percent_to_estimate_phi: float,
                 shaping_feature: Callable,
                 shaping_coefficient: float,
                #  Parameters related to neural network architecture and training
                 hidden_dims: List[int],
                 learning_rate: float,
                 dropout_prob: float,
                 l1_reg: float,
                 l2_reg: float,
                 scope_weight: float,
                 mse_weight: float,
                 num_epochs: int,
                #  Parameters related to environment
                 max_length: int,
                 death_drag: float,

                #  Other general parameters
                 dtype: str,
                 experiment_type: str,
                 folder_path: str):
                # folder_name):

        self.pi_b_top_k = pi_b_top_k
        self.pi_b_epsilon = pi_b_epsilon
        self.pi_e_top_k = pi_e_top_k
        self.pi_e_epsilon = pi_e_epsilon
        self.q_table = q_table
        self.gamma = gamma
        self.num_trajectories = num_trajectories
        self.num_bootstraps = num_bootstraps
        self.percent_to_estimate_phi = percent_to_estimate_phi
        self.shaping_feature = shaping_feature
        self.shaping_coefficient = shaping_coefficient

        self.hidden_dims = hidden_dims
        self.learning_rate = learning_rate
        self.dropout_prob = dropout_prob
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.scope_weight = scope_weight
        self.mse_weight = mse_weight
        self.num_epochs = num_epochs

        self.max_length = max_length
        self.death_drag = death_drag

        self.dtype = dtype
        self.experiment_type = experiment_type
        self.folder_path = folder_path
        # self.folder_name = folder_name

    '''
    Infrastructure for generating policies
    Generating transition distributions
    Choosing actions
      Chosen shaping features
    Traversing trajectory
    '''

    def action_probs_top_n_epsilon(self, top_k, epsilon):
      """
      Calculate action probabilities with epsilon-greedy strategy for top actions.

      Parameters:
      - n: Number of top actions
      - epsilon: Exploration-exploitation trade-off parameter

      Returns:
      - action_probs: Calculated action probabilities
      """

      num_actions = self.q_table.shape[-1]

      # Initialize a 2D array to represent action probabilities
      action_probs = np.zeros_like(self.q_table)

      # For each state, set the probability for the top two actions
      for i in range(self.q_table.shape[0]):
          for j in range(self.q_table.shape[1]):
              sorted_actions = np.argsort(self.q_table[j, i])  # Get the indices of all actions, sorted by Q-value
              top_actions = sorted_actions[-top_k:]  # Get the indices of the top two actions
              non_top_actions = sorted_actions[:-top_k]
              action_probs[i, j, top_actions] = (1 - epsilon) / top_k + epsilon/num_actions  # Split the probability evenly between the top two actions
              action_probs[i, j, non_top_actions] = epsilon/num_actions

      return action_probs


    def initialize_env(self):
      # Initalize lifegate class
      env = LifeGate(max_steps=self.max_length, state_mode='tabular',
                        rendering=True, image_saving=False, render_dir=None, rng=random_state, death_drag = self.death_drag)

      return env


    def choose_action(self, state, action_probs):
        # Get the probability distribution for the given state
        state_probs = action_probs[state]

        # Choose an action based on the probabilities
        action = np.random.choice(len(state_probs), p=state_probs)

        return action


    def experiment_actions(self, nb_trajectories, env, action_probs):#, shaping_feature):
        """
        Run the experiment for a specified number of episodes.

        Parameters:
        - nb_trajectories: Number of episodes
        - env: Experiment environment
        - action_probs: Action probabilities

        Returns:
        - policies: List of policies (pi_b or pi_e)
        """
        # Define the dtype for the structured array
        dtype = [
            ('state', np.float64, (2,)),
            ('action', np.int64),
            ('reward', np.float64),
            ('state_next', np.float64, (2,)),
            ('timestep', np.int64),
            ('psi', np.float64)

        ]

        policies = []
        for i in range(nb_trajectories):
            trajectory = np.empty(0, dtype=dtype)
            s = env.reset()
            env.render()
            term = False
            timestep = 0
            while not term:
                state_last = s
                action = self.choose_action(tuple(s), action_probs)
                s, r, term, _ = env.step(action)


                # psi = smallest_distance_to_deadend(state_last, env)
                psi = self.shaping_feature(state_last, env)

                data_point = np.array([(state_last, action, r, s, timestep, psi)], dtype=dtype)
                trajectory = np.append(trajectory, data_point)
                timestep += 1

            policies.append(trajectory)

        # with open('policies.pkl', 'wb') as f:
        #     pickle.dump(policies, f)

        return policies


    '''
    Running experiment:
      Preare environment
      Prepare model
      Prepare filename
      Running experiment
    '''

    def initalize_model(self):
      model = NN_l1_l2_reg(input_dim=2, hidden_dims=self.hidden_dims, dropout_prob=self.dropout_prob, output_dim=1, dtype = torch.float64, l1_lambda=self.l1_reg, l2_lambda = self.l2_reg)

    def prepare_experiment(self):
      env = self.initialize_env()
      P_pi_b = self.action_probs_top_n_epsilon(self.pi_b_top_k, self.pi_b_epsilon)
      P_pi_e = self.action_probs_top_n_epsilon(self.pi_e_top_k, self.pi_e_epsilon)
      pi_b = self.experiment_actions(self.num_trajectories, env, P_pi_b)
      pi_e = self.experiment_actions(1000, env, P_pi_e)

      # consider changing this to method within class
      model = NN_l1_l2_reg(input_dim=2, hidden_dims=self.hidden_dims, output_dim=1, dtype = self.dtype, l1_lambda=self.l1_reg, l2_lambda = self.l2_reg)

      experiment_class = SCOPE_straight(model, self.gamma, self.num_bootstraps, pi_b, P_pi_b, pi_e, P_pi_e, self.percent_to_estimate_phi, self.dtype)

      return pi_b, pi_e, model, experiment_class

    def generate_file_name(self):
      shaping_function = self.shaping_feature.__name__
      hidden_dims_str = '_'.join(map(str, self.hidden_dims))  # Convert hidden_dims to a string
      return f"{self.num_trajectories}_{self.gamma}_{self.percent_to_estimate_phi}_{shaping_function}_{self.shaping_coefficient}_{hidden_dims_str}_{self.dropout_prob}_{self.learning_rate}_{self.l1_reg}_{self.l2_reg}_{self.scope_weight}_{self.mse_weight}_{self.max_length}"

    def run_experiment(self):
      filename = self.generate_file_name()

      # generate file path with folder and filename
      file_path = os.path.join(self.folder_path, f"{filename}.pt")

      # Check if experiment exists
      if os.path.exists(file_path):
        print(f"The file {filename}.pt already exists in the folder.")
      else:
        pi_b, pi_e, model, experiment_class = self.prepare_experiment()
        all_metrics = experiment_class.train_var_scope(self.num_epochs, self.learning_rate, self.shaping_coefficient, self.scope_weight, self.mse_weight)
        on_policy_estimate = experiment_class.calc_V_pi_e()
        IS_mean, IS_variance = experiment_class.IS_pipeline()


        experiment_data = {
            "Experiment Parameters": self.__dict__,
            "Experiment Metrics": all_metrics,
            "On Policy Estimate": on_policy_estimate,
            "pi_b": pi_b,
            "pi_e": pi_e,
            "IS Estimate": {"Estimate": IS_mean.item(), "Variance": IS_variance.item()}
        }

        # torch.save(experiment_data, f"{self.folder_path}/{filename}.pt")
        torch.save(experiment_data, file_path)

    def load_experiment(self):
      filename = self.generate_file_name()
      # generate file path with folder and filename
      file_path = os.path.join(self.folder_path, f"{filename}.pt")

      loaded_data = torch.load(file_path)

      return loaded_data


    '''
    ------------------
    Stored experiments
    Save experiment
    Load experiment
    ------------------
    '''

    # def save_experiment(self):
    #   # filename stuff

    #   '''
    #   Save all parameters
    #   Save experiment run info:
    #   Weights every ~25 epochs
    #   train and test losses segmented by variance and shaping mse loss
    #     need to modify train_var_scope function in SCOPE_straight


    #   '''

    # def load_data(self):
    #   '''
    #   load saved files
    #   load data from saved file
    #   '''

    # def create_filename():
    #   return

    # # def create_experiment_description():

    # '''
    # Visualizing experiments
    # Over trajectories
    # Over train/test/splits
    # Heatmaps for each
    # State Visitation for each
    # Varying features

    # '''

    # def visualize_experiment(self):
    #   '''
    #   first load data
    #   visualize all experiment information
    #   heatmaps, loss plots, state_visitation frequencies
    #   variance plots over trajectories
    #   plots over varying train/test splits

    #   '''





# Load and visualize saved experiments

In [50]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

class existing_experiments(object):

    def __init__(self, experiment_instance, folder_path):
      # self.folder_name = folder_name
      # self.filename = filename
      self.experiment_instance = experiment_instance
      self.folder_path = folder_path

    # Individual Experiments
    '''
    Loading data
    '''
    def load_experiment_metrics(self):
        '''
        Load saved files and data from a saved file
        '''
        filename = self.experiment_instance.generate_file_name()
        # filename = self.generate_file_name()

        # generate file path with folder and filename
        file_path = os.path.join(self.folder_path, f"{filename}.pt")

        # Load data from the .pt file
        # loaded_data = torch.load(f"{self.folder_name}/{self.filename}.pt")
        # loaded_data = torch.load(f"{self.folder_path}/{filename}.pt")
        loaded_data = torch.load(file_path)

        # Access specific data from the loaded dictionary
        experiment_metrics = loaded_data["Experiment Metrics"]

        per_epoch = experiment_metrics["per_epoch"]
        model_weights = experiment_metrics["model_weights"]

        return per_epoch, model_weights

    def load_pi_b(self):
        filename = self.experiment_instance.generate_file_name()
        # Load data from the .pt file
        # loaded_data = torch.load(f"{self.folder_name}/{self.filename}.pt")
        # loaded_data = torch.load(f"{self.folder_path}/{filename}.pt")

        file_path = os.path.join(self.folder_path, f"{filename}.pt")
        loaded_data = torch.load(file_path)


        pi_b = loaded_data["pi_b"]

        return pi_b

    def load_on_policy_estimate(self):
       # Load data from the .pt file
        # loaded_data = torch.load(f"{self.folder_name}/{self.filename}.pt")
        filename = self.experiment_instance.generate_file_name()
        # loaded_data = torch.load(f"{self.folder_path}/{filename}.pt")
        file_path = os.path.join(self.folder_path, f"{filename}.pt")
        loaded_data = torch.load(file_path)

        on_policy_estimate = loaded_data["On Policy Estimate"]
        return on_policy_estimate

    def load_IS_estimate(self):
      filename = self.experiment_instance.generate_file_name()
      # loaded_data = torch.load(f"{self.folder_path}/{filename}.pt")
      file_path = os.path.join(self.folder_path, f"{filename}.pt")
      loaded_data = torch.load(file_path)

      IS_estimate = loaded_data['IS Estimate']['Estimate']
      IS_variance = loaded_data['IS Estimate']['Variance']
      return IS_estimate, IS_variance

    def preprocess_epoch_metrics(self):
        per_epoch, _ = self.load_experiment_metrics()

        # Extract metrics
        # IS_mean = np.zeros(len(per_epoch))
        # IS_variance = np.zeros(len(per_epoch))
        Train_mean = np.zeros(len(per_epoch))
        Train_variance = np.zeros(len(per_epoch))
        Train_mse_loss = np.zeros(len(per_epoch))
        total_loss = np.zeros(len(per_epoch))
        Test_mean = np.zeros(len(per_epoch))
        Test_variance = np.zeros(len(per_epoch))

        for i, epoch_data in enumerate(per_epoch):
            # IS_mean[i] = epoch_data['IS_mean']
            # IS_variance[i] = epoch_data['IS_variance']
            Train_mean[i] = epoch_data['Train_mean']
            Train_variance[i] = epoch_data['Train_variance']
            Train_mse_loss[i] = epoch_data['Train_mse_loss']
            total_loss[i] = epoch_data['total_loss']
            Test_mean[i] = epoch_data['Test_mean']
            Test_variance[i] = epoch_data['Test_variance']

        return Train_mean, Train_variance, Train_mse_loss, total_loss, Test_mean, Test_variance



    def calculate_bias(self, estimate):
      on_policy_estimate = self.load_on_policy_estimate()
      bias = estimate - on_policy_estimate
      return bias

    def calculate_mse(self, variance, estimate):
      bias = self.calculate_bias(estimate)
      mse = variance + bias**2
      return mse


    def load_model(self, epoch = None):
      filename = self.experiment_instance.generate_file_name()
      loaded_data = torch.load(f"{filename}.pt")
      # loaded_data = torch.load(f"{self.folder_name}/{self.filename}.pt")
      experiment_parameters = loaded_data["Experiment Parameters"]
      _, model_weights = self.load_experiment_metrics()

      if epoch is not None:
          # Find the index of the model weights corresponding to the specified epoch
          index = next((i for i, item in enumerate(model_weights) if item["epoch"] == epoch), None)
          if index is not None:
              chosen_model_state = model_weights[index]['model_state']
          else:
              raise ValueError(f"No weights found for epoch {epoch}")
      else:
          # If epoch is not specified, load the final weights
          chosen_model_state = model_weights[-1]['model_state']

      model = NN_l1_l2_reg(input_dim=2,
                           hidden_dims=experiment_parameters["hidden_dims"],
                           output_dim=1, dtype = experiment_parameters["dtype"],
                           l1_lambda=experiment_parameters["l1_reg"],
                           l2_lambda = experiment_parameters["l2_reg"])

      # Load the final weights into the model
      model.load_state_dict(chosen_model_state)

      return model


    '''
    Heatmap for model output
    '''

    # -----------------------
    # Heatmaps for lifegate
    # -----------------------
    def get_model_output_dict(self, epoch):
      model = self.load_model(epoch)
      model.eval()

      # Initialize an empty dictionary to store data
      data = {}

      # Loop through all combinations from [0,0] to [9,9]
      for i in range(10):
        for j in range(10):
            # Prepare input data
            input_data = torch.tensor([i, j], dtype=torch.float64)

            # Pass input through the self.model
            output = model(input_data)

            # Store data in the dictionary
            data[(i, j)] = output.item()

      return data

    def plot_heatmap(self, data):
      values = np.zeros((10, 10))
      for (x, y), value in data.items():
          values[y, x] = value

      # Create the heatmap
      fig = go.Figure(data=go.Heatmap(z=values, colorscale='viridis'))

      # Add colorbar
      fig.update_layout(coloraxis_colorbar=dict(title='Values',
                                                ticks='outside',
                                                tickvals=[np.min(values), np.max(values)],
                                                ticktext=[np.min(values), np.max(values)]))

      # Add labels and title
      fig.update_layout(xaxis=dict(tickvals=np.arange(10), ticktext=list(range(10)), title='X'),
                        yaxis=dict(tickvals=np.arange(9, -1, -1), ticktext=list(range(9, -1, -1)), title='Y', autorange="reversed"),
                        title='Heatmap')

      fig.show()

    def get_heatmap(self, epoch = None):
      data = self.get_model_output_dict(epoch)
      self.plot_heatmap(data)



    # ---------------------
    # State Visitation Heatmap
    # ---------------------

    def count_state_visits(self, pi_b):
      state_visit_counts = {}
      for trajectory in pi_b:
          for data_point in trajectory:
              state = tuple(data_point['state'])
              if state not in state_visit_counts:
                  state_visit_counts[state] = 0
              state_visit_counts[state] += 1

          # Include last state_next of the trajectory
          last_state_next = tuple(trajectory[-1]['state_next'])
          if last_state_next not in state_visit_counts:
              state_visit_counts[last_state_next] = 0
          state_visit_counts[last_state_next] += 1

      return state_visit_counts

    def create_state_visit_dict(self):
        state_visit_dict = {}
        for i in range(10):
            for j in range(10):
                state_visit_dict[(i, j)] = 0
        return state_visit_dict

    def fill_state_visit_dict(self,state_visit_counts):
        state_visit_dict = self.create_state_visit_dict()
        for state, count in state_visit_counts.items():
            state_visit_dict[state] = count
        return state_visit_dict


    def plot_state_visitations_heatmap(self, state_visit_dict):
      # Create lists to store x, y, and z values
      x = []
      y = []
      z = []

      # Iterate through the state visit dictionary
      for state, count in state_visit_dict.items():
          x.append(state[0])
          y.append(9 - state[1])  # Flip y-axis to have (0, 0) at the bottom-left
          z.append(count)

      # Create the heatmap trace
      trace = go.Heatmap(
          x=x,
          y=y,
          z=z,
          colorscale='Viridis',  # Choose a colorscale
          colorbar=dict(title='Visits'),
          zmin=0,
          zmax=max(z)  # Set maximum value for the color scale
      )

      # Create layout
      layout = go.Layout(
          title='State Visitations Heatmap',
          xaxis=dict(title='X-axis'),
          yaxis=dict(title='Y-axis', tickvals=list(range(10)), ticktext=list(range(9, -1, -1))),
      )

      # Create figure
      fig = go.Figure(data=[trace], layout=layout)
      fig.show()


    def get_state_visitation_heatmap(self):
      pi_b = self.load_pi_b()
      # Count state visits
      state_visit_counts = self.count_state_visits(pi_b)

      # Fill state visit dictionary
      state_visit_dict = self.fill_state_visit_dict(state_visit_counts)

      # Assuming state_visit_dict is your dictionary with state visitations
      self.plot_state_visitations_heatmap(state_visit_dict)

    def epoch_specific_values(self, epoch=None):
        Train_mean, Train_variance, Train_mse_loss, total_loss, Test_mean, Test_variance = self.preprocess_epoch_metrics()
        IS_mean, IS_variance = self.load_IS_estimate()

        IS_bias = self.calculate_bias(IS_mean)
        Train_bias = self.calculate_bias(Train_mean)
        Test_bias = self.calculate_bias(Test_mean)

        # Calculate MSE for IS
        IS_mse = self.calculate_mse(IS_variance, IS_mean)

        # Calculate MSE for Train and Test separately
        Train_mse = self.calculate_mse(Train_variance, Train_mean)
        Test_mse = self.calculate_mse(Test_variance, Test_mean)

        if epoch is None:
            # If no epoch is specified, use the values of the final epoch for Train and Test
            return IS_bias, Train_bias[-1], Test_bias[-1], IS_variance, Train_variance[-1], Test_variance[-1], IS_mse, Train_mse[-1], Test_mse[-1]
        else:
            # If epoch is specified, ensure it's within the range of available epochs
            if epoch < 1 or epoch > len(Train_mse):
                raise ValueError("Epoch out of range.")

            # Return values for the specified epoch for Train and Test
            return IS_bias, Train_bias[epoch-1], Test_bias[epoch-1], IS_variance, Train_variance[epoch-1], Test_variance[epoch-1], IS_mse, Train_mse[epoch-1], Test_mse[epoch-1]

    # def choose_mse_by_epoch(self):



    def mse_plots(self):
        Train_mean, Train_variance, Train_mse_loss, total_loss, Test_mean, Test_variance = self.preprocess_epoch_metrics()
        IS_mean, IS_variance = self.load_IS_estimate()

        IS_mse = self.calculate_mse(IS_variance, IS_mean)
        Train_mse = self.calculate_mse(Train_variance, Train_mean)
        Test_mse = self.calculate_mse(Test_variance, Test_mean)

        epochs = np.arange(1, len(Train_mean) + 1)
        fig = make_subplots(rows=1, cols=1, subplot_titles=("MSE over Epochs"))

        # Add traces for IS, Train, and Test MSE
        # fig.add_trace(go.Scatter(x=epochs, y=IS_mse, mode='lines+markers', name='IS MSE'), row=1, col=1)
        IS_line = [IS_mse] * len(epochs)
        fig.add_trace(go.Scatter(x=epochs, y=IS_line, mode='lines', name='IS MSE'), row=1, col=1)

        fig.add_trace(go.Scatter(x=epochs, y=Train_mse, mode='lines+markers', name='Train MSE'), row=1, col=1)
        fig.add_trace(go.Scatter(x=epochs, y=Test_mse, mode='lines+markers', name='Test MSE'), row=1, col=1)

        fig.update_xaxes(title_text="Epoch", row=1, col=1)
        fig.update_yaxes(title_text="MSE", row=1, col=1)

        fig.update_layout(title_text="MSE over Epochs", showlegend=True)
        fig.show()

    def plot_metrics_test(self):
    # def plot_metrics_test(self, IS_mean, Train_mean, Train_variance, Train_mse_loss, total_loss, Test_mean, Test_variance):
      '''
      Plot metrics over epochs
      '''
      Train_mean, Train_variance, Train_mse_loss, total_loss, Test_mean, Test_variance = self.preprocess_epoch_metrics()

      epochs = np.arange(1, len(Train_mean) + 1)

      on_policy_estimate = self.load_on_policy_estimate()
      # Create a list representing on-policy estimate for each epoch
      on_policy_line = [on_policy_estimate] * len(epochs)

      IS_mean, IS_variance = self.load_IS_estimate()



      fig = make_subplots(rows=2, cols=2, subplot_titles=("Estimate over Epochs",
                                                          "Variance over Epochs", "Train MSE Loss over Epochs",
                                                          "Total Loss over Epochs"))
      IS_mean_line = [IS_mean] * len(epochs)
      fig.add_trace(go.Scatter(x=epochs, y=IS_mean_line, mode='lines', name='IS Estimate'), row=1, col=1)
      # fig.add_trace(go.Scatter(x=epochs, y=IS_mean, mode='lines+markers', name='IS Estimate'), row=1, col=1)
      fig.add_trace(go.Scatter(x=epochs, y=Train_mean, mode='lines+markers', name='Train Estimate'), row=1, col=1)
      fig.add_trace(go.Scatter(x=epochs, y=Test_mean, mode='lines+markers', name='Test Estimate'), row=1, col=1)
      fig.add_trace(go.Scatter(x=epochs, y=on_policy_line, mode='lines', name='On-policy Estimate'), row=1, col=1)

      IS_var_line = [IS_variance] * len(epochs)
      fig.add_trace(go.Scatter(x=epochs, y=IS_var_line, mode='lines', name='IS Variance'), row=1, col=2)
      fig.add_trace(go.Scatter(x=epochs, y=Train_variance, mode='lines+markers', name='Train Variance'), row=1, col=2)
      fig.add_trace(go.Scatter(x=epochs, y=Test_variance, mode='lines+markers', name='Test Variance'), row=1, col=2)

      fig.add_trace(go.Scatter(x=epochs, y=Train_mse_loss, mode='lines+markers', name='Train MSE Loss'), row=2, col=2)
      fig.add_trace(go.Scatter(x=epochs, y=total_loss, mode='lines+markers', name='Total Loss'), row=2, col=1)

      fig.update_xaxes(title_text="Epoch", row=1, col=1)
      fig.update_xaxes(title_text="Epoch", row=1, col=2)
      fig.update_xaxes(title_text="Epoch", row=2, col=1)
      fig.update_xaxes(title_text="Epoch", row=2, col=2)

      fig.update_yaxes(title_text="Estimate", row=1, col=1)
      fig.update_yaxes(title_text="Variance", row=1, col=2)
      fig.update_yaxes(title_text="Total Loss", row=2, col=2)
      fig.update_yaxes(title_text="MSE Loss", row=2, col=1)

      fig.update_layout(title_text="Metrics over Epochs", showlegend=True)
      fig.show()

    # def loss_plotting(self):


    '''
      Multi-Experiment loading and visualization
    '''


    '''
    Visualizing experiments
    Over trajectories
    Over train/test/splits
    Heatmaps for each and over epochs within experiment
    State Visitation for each
    Varying features

    '''

    # def visualize_experiment(self):
    #   '''
    #   first load data
    #   visualize all experiment information
    #   heatmaps, loss plots, state_visitation frequencies
    #   variance plots over trajectories
    #   plots over varying train/test splits

    #   '''


# Visualize multiple experiments

In [None]:
# def chosen_mse(epoch=None):
#     if epoch is None:
#         # If no epoch is specified, use the final epoch
#         mse = self.calculate_mse(Train_variance[-1], Train_mean[-1])
#     else:
#         # Calculate MSE for the chosen epoch
#         if epoch < 1 or epoch > len(Train_mean):
#             raise ValueError("Epoch out of range.")
#         mse = self.calculate_mse(Train_variance[epoch - 1], Train_mean[epoch - 1])

#     return mse

In [48]:
num_trajectories = [200, 400, 600, 800, 1000]

In [60]:
# MSE/variance/bias over trajectories

# Initialize lists to store values for IS, Train, and Test
IS_bias_values = []
Train_bias_values = []
Test_bias_values = []
IS_variance_values = []
Train_variance_values = []
Test_variance_values = []
IS_mse_values = []
Train_mse_values = []
Test_mse_values = []

for i in num_trajectories:
    # Create experiment instance and load data
    test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, i, 10000, 0.3, smallest_distance_to_deadend,0.1, [16], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 300, 50, 0.0, torch.float64, "varying_num_trajectories", "/content/drive/MyDrive/Lifegate_experiments")
    test_load = existing_experiments(test_experiment,"/content/drive/MyDrive/Lifegate_experiments")

    # Get epoch-specific values for IS, Train, and Test
    IS_bias, Train_bias, Test_bias, IS_variance, Train_variance, Test_variance, IS_mse, Train_mse, Test_mse = test_load.epoch_specific_values()

    # Append values to respective lists
    IS_bias_values.append(IS_bias)
    Train_bias_values.append(Train_bias)
    Test_bias_values.append(Test_bias)
    IS_variance_values.append(IS_variance)
    Train_variance_values.append(Train_variance)
    Test_variance_values.append(Test_variance)
    IS_mse_values.append(IS_mse)
    Train_mse_values.append(Train_mse)
    Test_mse_values.append(Test_mse)


# # Plot bias over trajectories
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=num_trajectories, y=IS_bias_values, mode='lines', name='IS Bias'))
# fig.add_trace(go.Scatter(x=num_trajectories, y=Train_bias_values, mode='lines', name='Train Bias'))
# fig.add_trace(go.Scatter(x=num_trajectories, y=Test_bias_values, mode='lines', name='Test Bias'))
# fig.update_layout(title='Bias over Trajectories', xaxis_title='Number of Trajectories', yaxis_title='Bias')
# fig.show()

# # Plot variance over trajectories
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=num_trajectories, y=IS_variance_values, mode='lines', name='IS Variance'))
# fig.add_trace(go.Scatter(x=num_trajectories, y=Train_variance_values, mode='lines', name='Train Variance'))
# fig.add_trace(go.Scatter(x=num_trajectories, y=Test_variance_values, mode='lines', name='Test Variance'))
# fig.update_layout(title='Variance over Trajectories', xaxis_title='Number of Trajectories', yaxis_title='Variance')
# fig.show()

# # Plot MSE over trajectories
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=num_trajectories, y=IS_mse_values, mode='lines', name='IS MSE'))
# fig.add_trace(go.Scatter(x=num_trajectories, y=Train_mse_values, mode='lines', name='Train MSE'))
# fig.add_trace(go.Scatter(x=num_trajectories, y=Test_mse_values, mode='lines', name='Test MSE'))
# fig.update_layout(title='MSE over Trajectories', xaxis_title='Number of Trajectories', yaxis_title='MSE')
# fig.show()

# Create subplots
fig = make_subplots(rows=3, cols=1, subplot_titles=("Bias over Trajectories", "Variance over Trajectories", "MSE over Trajectories"))

# Add traces for bias
fig.add_trace(go.Scatter(x=num_trajectories, y=IS_bias_values, mode='lines', name='IS Bias'), row=1, col=1)
fig.add_trace(go.Scatter(x=num_trajectories, y=Train_bias_values, mode='lines', name='Train Bias'), row=1, col=1)
fig.add_trace(go.Scatter(x=num_trajectories, y=Test_bias_values, mode='lines', name='Test Bias'), row=1, col=1)

# Add traces for variance
fig.add_trace(go.Scatter(x=num_trajectories, y=IS_variance_values, mode='lines', name='IS Variance'), row=2, col=1)
fig.add_trace(go.Scatter(x=num_trajectories, y=Train_variance_values, mode='lines', name='Train Variance'), row=2, col=1)
fig.add_trace(go.Scatter(x=num_trajectories, y=Test_variance_values, mode='lines', name='Test Variance'), row=2, col=1)

# Add traces for MSE
fig.add_trace(go.Scatter(x=num_trajectories, y=IS_mse_values, mode='lines', name='IS MSE'), row=3, col=1)
fig.add_trace(go.Scatter(x=num_trajectories, y=Train_mse_values, mode='lines', name='Train MSE'), row=3, col=1)
fig.add_trace(go.Scatter(x=num_trajectories, y=Test_mse_values, mode='lines', name='Test MSE'), row=3, col=1)

# Update layout with increased subplot height
fig.update_layout(title_text="Metrics over Trajectories", showlegend=True, height=1000)

# Show plot
fig.show()

In [57]:
train_set = [0.1, 0.2, 0.3, 0.4, 0.5]

# MSE/variance/bias over train_set

# Initialize lists to store values for IS, Train, and Test
IS_bias_values = []
Train_bias_values = []
Test_bias_values = []
IS_variance_values = []
Train_variance_values = []
Test_variance_values = []
IS_mse_values = []
Train_mse_values = []
Test_mse_values = []

for j in train_set:
    # Create experiment instance and load data
    test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, 1000, 10000, j, smallest_distance_to_deadend,0.1, [16], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 300, 50, 0.0, torch.float64, "varying_train_set", "/content/drive/MyDrive/Lifegate_experiments")
    test_load = existing_experiments(test_experiment,"/content/drive/MyDrive/Lifegate_experiments")

    # Get epoch-specific values for IS, Train, and Test
    IS_bias, Train_bias, Test_bias, IS_variance, Train_variance, Test_variance, IS_mse, Train_mse, Test_mse = test_load.epoch_specific_values()

    # Append values to respective lists
    IS_bias_values.append(IS_bias)
    Train_bias_values.append(Train_bias)
    Test_bias_values.append(Test_bias)
    IS_variance_values.append(IS_variance)
    Train_variance_values.append(Train_variance)
    Test_variance_values.append(Test_variance)
    IS_mse_values.append(IS_mse)
    Train_mse_values.append(Train_mse)
    Test_mse_values.append(Test_mse)


# Plot bias over train set
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_set, y=IS_bias_values, mode='lines', name='IS Bias'))
fig.add_trace(go.Scatter(x=train_set, y=Train_bias_values, mode='lines', name='Train Bias'))
fig.add_trace(go.Scatter(x=train_set, y=Test_bias_values, mode='lines', name='Test Bias'))
fig.update_layout(title='Bias over train set', xaxis_title='Proportion for Train Set', yaxis_title='Bias')
fig.show()

# Plot variance over train set
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_set, y=IS_variance_values, mode='lines', name='IS Variance'))
fig.add_trace(go.Scatter(x=train_set, y=Train_variance_values, mode='lines', name='Train Variance'))
fig.add_trace(go.Scatter(x=train_set, y=Test_variance_values, mode='lines', name='Test Variance'))
fig.update_layout(title='Variance over train set', xaxis_title='Proportion for Train Set', yaxis_title='Variance')
fig.show()

# Plot MSE over train set
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_set, y=IS_mse_values, mode='lines', name='IS MSE'))
fig.add_trace(go.Scatter(x=train_set, y=Train_mse_values, mode='lines', name='Train MSE'))
fig.add_trace(go.Scatter(x=train_set, y=Test_mse_values, mode='lines', name='Test MSE'))
fig.update_layout(title='MSE over train set', xaxis_title='Proportion for Train Set', yaxis_title='MSE')
fig.show()



In [None]:
# MSE over train set sizes

# MSE over max trajectory lengths

# Testing experiment class

In [None]:
loaded_data = torch.load("/content/200_0.99_0.3_smallest_distance_to_deadend_0.1_8_8_0.2_0.001_1e-05_1e-05_1.0_1.0_50.pt")

In [None]:
loaded_data["On Policy Estimate"]

0.8515307428837143

In [None]:
experiment_metrics = loaded_data["Experiment Metrics"]
per_epoch = experiment_metrics["per_epoch"]
model_weights = experiment_metrics["model_weights"]

## Running experiment class (testing)

In [33]:
trajectory_lengths = [200, 400, 600, 800, 1000]

In [None]:
for i in trajectory_lengths:
  print(f"{i} Trajectories")
  print("-" * 100)
  test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, i, 10000, 0.3, smallest_distance_to_deadend,0.1, [8,8], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 300, 50, 0.0, torch.float64, "varying_num_trajectories")
  test_experiment.run_experiment()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
SCOPE Var loss:  tensor(0.0487, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  1.9890555897495208
SCOPE mean: 0.06774221542453844, SCOPE var: 0.010271504460539042
Total Loss: 2.0377178725521934
----------------------------------------
Epoch 188
IS mean: 0.20638617178220056,IS variance: 0.004659974317752809
SCOPE Var loss:  tensor(0.0485, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  1.9785429637158964
SCOPE mean: 0.06853031892834112, SCOPE var: 0.010256813250759255
Total Loss: 2.027009544487155
----------------------------------------
Epoch 189
IS mean: 0.20638617178220056,IS variance: 0.004659974317752809
SCOPE Var loss:  tensor(0.0483, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  1.9680467729356712
SCOPE mean: 0.0693115221406825, SCOPE var: 0.010242231829801458
Total Loss: 2.0163194960043875
----------------------------------------
Epoch 190
IS mean: 0.20638617178220056,IS variance: 0.00465

## Loading experiment class (testing)

In [None]:
test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, 400, 10000, 0.3, smallest_distance_to_deadend,0.1, [8,8], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 26, 50, 0.0, torch.float64, "varying_num_trajectories")
test_load = existing_experiments(test_experiment)

In [None]:
test_load.get_heatmap()

In [None]:
test_load.mse_plots()

In [None]:
test_load.plot_metrics_test()

In [None]:
test_load.mse_plots()

## Saving to drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!ls /content/drive/MyDrive/Lifegate_experiments

In [34]:
trajectory_lengths = [200, 400, 600, 800, 1000]

In [46]:
for i in trajectory_lengths:
  print(f"{i} Trajectories")
  print("-" * 100)
  test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, i, 10000, 0.3, smallest_distance_to_deadend,0.1, [6,6], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 300, 50, 0.0, torch.float64, "varying_num_trajectories", "/content/drive/MyDrive/Lifegate_experiments")
  test_experiment.run_experiment()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
SCOPE Var loss:  tensor(1.1216, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  1.63184073317128
SCOPE mean: 0.5794265121511369, SCOPE var: 0.24984630440004613
Total Loss: 2.7534623783318364
----------------------------------------
Epoch 188
IS mean: 1.432540426555102,IS variance: 0.3979464949761987
SCOPE Var loss:  tensor(1.1163, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  1.6236692966481683
SCOPE mean: 0.5812486679145888, SCOPE var: 0.24985791519079836
Total Loss: 2.739954887352365
----------------------------------------
Epoch 189
IS mean: 1.432540426555102,IS variance: 0.3979464949761987
SCOPE Var loss:  tensor(1.1110, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  1.6156045686228164
SCOPE mean: 0.5830685410008792, SCOPE var: 0.2498674291874376
Total Loss: 2.7265584562649012
----------------------------------------
Epoch 190
IS mean: 1.432540426555102,IS variance: 0.3979464949761987
SCOPE 

In [47]:
num_trajectories = [200, 400, 600, 800, 1000]
train_set = [0.1, 0.2, 0.3, 0.4, 0.5]
for i in num_trajectories:
  for j in train_set:
    print(f"{i} Trajectories")
    print(f"{j} Train set")
    print("-" * 70)
    test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, i, 10000, j, smallest_distance_to_deadend,0.1, [16], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 300, 50, 0.0, torch.float64, "varying_num_trajectories", "/content/drive/MyDrive/Lifegate_experiments")
    test_experiment.run_experiment()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IS mean: 0.955726589351799,IS variance: 0.14930341867057947
SCOPE Var loss:  tensor(0.3845, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  2.490662032939283
SCOPE mean: 0.4514640291481441, SCOPE var: 0.06932195639901796
Total Loss: 2.875128769148885
----------------------------------------
Epoch 189
IS mean: 0.955726589351799,IS variance: 0.14930341867057947
SCOPE Var loss:  tensor(0.3824, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  2.478728139696601
SCOPE mean: 0.45087829796552686, SCOPE var: 0.06910489266886219
Total Loss: 2.8610872640253313
----------------------------------------
Epoch 190
IS mean: 0.955726589351799,IS variance: 0.14930341867057947
SCOPE Var loss:  tensor(0.3803, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  2.4668238815038825
SCOPE mean: 0.4503347943802111, SCOPE var: 0.06889393754114455
Total Loss: 2.847132633753705
----------------------------------------
Epoch 191
IS

## Load from Drive

In [41]:
test_experiment = SCOPE_experiment(1, 0.4, 1, 0.05, q_table, 0.99, 200, 10000, 0.3, smallest_distance_to_deadend,0.1, [16], 0.001, 0.2, 0.00001, 0.00001, 1.0, 1.0, 300, 50, 0.0, torch.float64, "varying_num_trajectories", "/content/drive/MyDrive/Lifegate_experiments")

In [43]:
test_load = existing_experiments(test_experiment,"/content/drive/MyDrive/Lifegate_experiments")

In [45]:
test_load.plot_metrics_test()

# Plotting over trajectories

In [None]:
P_pi_b_200 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_200 = experiment_actions(200, env_50, P_pi_b_200)
P_pi_e_200 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_200 = experiment_actions(1000, env_50, P_pi_e_200)
model_200_random_pi_b_200 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
test_200_random_pi_b_200 = SCOPE_straight(model_200_random_pi_b_200, 0.99, 10000, pi_b_200, P_pi_b_200, P_pi_e_200, 0.3, dtype = torch.float64)
test_200_random_pi_b_200.train_var_scope(400, 0.001, 1, 0)

Experiments
- Trajectory length
- Number of trajectories
- Train/test split
- shaping features

Information to save
- architecture
- Hyperparameters (regularization, etc.)
- weights every 50 epochs
- training loss every epoch
- test loss every epoch
- chosen feature

Plots (for each experiment)
- train vs test loss over time
- variance loss, mse loss (train,test?)
- heatmap
-

Thoughts on naming and running experiments
- manual input names I think
- have a few different pipelines
- Folders for each set of experiments

# Test class

In [None]:
env.possible_recoveries

[[5, 0], [6, 0], [7, 0]]

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e = experiment_actions(1000, env, P_pi_e)
model_200_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_200_0p99 = SCOPE_straight(model_200_0p99, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
timesteps, rewards, states_next, states_current, weights, actions, psi, states_last, psi_last = test_200_0p99.prep_policies(pi_b)

In [None]:
model, masked_mean_set, last_set = test_200_0p99.train_var_scope(2, 0.001)

Epoch 1
IS variance:  tensor(6.6645e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0617, dtype=torch.float64, grad_fn=<VarBackward0>)
SCOPE mean: 0.03283755622152981, SCOPE var: 0.005437360955003497
Total Loss: 1.0616657317194855
----------------------------------------
Epoch 2
IS variance:  tensor(6.6645e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0278, dtype=torch.float64, grad_fn=<VarBackward0>)
SCOPE mean: 0.04225211428680145, SCOPE var: 0.005846893896544516
Total Loss: 1.0278019486643444
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[-0.5256, -0.1831],
        [ 0.4874, -0.2204],
        [ 0.3107,  0.1398],
        [ 0.0890, -0.3095],
        [ 0.6320,  0.2276],
        [-0.2607, -0.6321],
        [-0.7033, -0.3404],
        [-0.5834, -0.1342],
        [ 0.0957, -0.5969],
        [-0.4036,  0.4857],
        [-0.4621, -0.6912],
        [ 0.6774, -0.3109],
        [ 0.2194, -0.6755],
        [ 0.4790,  0.3212],
        

In [None]:
masked_mean_set

tensor([ 9.1995,  5.6653,  6.6862,  5.5858, 10.7349, 11.2185,  8.0980, 11.1943,
         7.5683,  5.2833, 10.5943,  8.1137,  8.2862,  7.7130,  7.1801,  5.7873,
        13.8663,  6.1387,  4.9187,  6.9743,  9.1391, 10.0840,  6.5897,  4.5218,
         7.6901,  4.7775, 14.2380,  8.5109,  4.9762,  6.2620,  7.8673,  5.6961,
         5.3566,  5.9950,  1.8361,  6.7066, 11.8596,  4.8783, 10.2837,  5.2763,
         5.0988,  3.5132,  5.1391,  9.1433,  6.4432,  5.8137,  5.3222,  7.7351,
         8.4064,  5.9786,  9.1857,  6.0923,  5.1522,  8.4583,  3.8289,  6.1480,
         8.8692,  9.6334, 14.3951,  3.6950], dtype=torch.float64,
       grad_fn=<SumBackward1>)

In [None]:
torch.mean(masked_mean_set+last_set)

tensor(7.7470, dtype=torch.float64, grad_fn=<MeanBackward0>)

In [None]:
last_set

tensor([6.2488e-02, 1.0773e-01, 6.2056e-01, 4.2670e-01, 7.4674e-01, 8.5925e-02,
        4.5368e-01, 6.7436e-01, 7.9104e-03, 4.9821e-02, 1.0558e+00, 6.7005e-01,
        1.4596e-01, 4.0665e-01, 6.6734e-01, 3.5449e-01, 1.8626e-01, 6.1782e-01,
        1.3178e+00, 2.2973e-01, 6.7766e-01, 3.6193e-01, 1.4151e-02, 1.1771e-01,
        3.3000e-01, 9.4572e-02, 1.7433e-02, 9.4038e-02, 2.8302e-01, 5.1651e-01,
        3.3207e-01, 3.1182e-01, 1.6128e-03, 1.7661e-01, 2.1079e-01, 2.4753e-01,
        3.5967e-01, 8.3684e-01, 3.0507e-04, 1.6805e+00, 1.0348e+00, 2.6982e-01,
        2.9373e-01, 8.5336e-01, 4.3011e-01, 3.8731e-01, 1.8092e-01, 1.7744e-01,
        2.8587e-01, 1.6206e-01, 1.3928e-01, 5.9762e-01, 1.2352e+00, 1.3601e-02,
        8.0091e-02, 4.7974e-01, 1.9591e-01, 2.5214e-01, 6.2668e-01, 1.6981e-01],
       dtype=torch.float64, grad_fn=<MseLossBackward0>)

In [None]:
model_200_0p99(torch.tensor(states_last)).squeeze()

tensor([ 3.5931e-01,  3.1535e-01,  1.1579e-02,  4.1809e-01,  5.3519e-01,
         8.3596e-02, -2.7350e-01, -7.7953e-02,  1.6937e-01,  7.5656e-02,
        -5.9883e-01,  1.6519e-01,  5.4058e-01,  1.2196e-01,  7.0372e-03,
        -1.7420e-02, -3.2525e-01,  3.3452e-01,  2.4492e-01,  3.1173e-01,
        -9.0382e-02, -1.4154e-01, -1.5056e-01, -2.8426e-02, -4.4593e-01,
        -2.8394e-01,  3.5727e-01, -5.7754e-01, -1.0026e-01, -9.1278e-02,
         3.4936e-02,  3.1954e-01, -4.4189e-01, -1.0707e-01,  7.4240e-02,
        -2.0949e-01, -1.4405e-01,  1.8865e-01, -2.4926e-01,  8.7673e-03,
         2.9567e-02,  2.9281e-01,  1.1581e-01, -7.9609e-04,  2.1255e-01,
        -6.5637e-02,  5.3519e-01,  3.3169e-01, -3.1210e-01,  2.9727e-01,
         2.4818e-02,  4.5850e-01, -6.3380e-01,  1.9436e-01, -1.9333e-01,
         2.6602e-02,  1.5739e-01,  1.6537e-01,  3.7224e-01, -1.5389e-01,
         4.4906e-01, -1.0166e-01,  5.0598e-01, -3.7695e-01,  4.6480e-01,
         2.2792e-01,  7.8922e-02, -4.3059e-01,  7.1

In [None]:
psi_last

[5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0

In [None]:
len(states_current[0])

29

In [None]:
len(states_all[0])

30

In [None]:
len(psi[0])

30

In [None]:
states_all

[array([[2., 9.],
        [2., 9.],
        [2., 9.],
        [2., 9.],
        [1., 9.],
        [1., 9.],
        [0., 9.],
        [0., 8.],
        [0., 7.],
        [0., 6.],
        [0., 6.],
        [0., 5.],
        [0., 4.],
        [0., 5.],
        [0., 5.],
        [0., 5.],
        [0., 5.],
        [0., 4.],
        [1., 4.],
        [1., 3.],
        [1., 4.],
        [1., 4.],
        [1., 3.],
        [2., 3.],
        [3., 3.],
        [3., 2.],
        [3., 1.],
        [4., 1.],
        [5., 1.],
        [5., 0.]]),
 array([[2., 9.],
        [1., 9.],
        [0., 9.],
        [0., 8.],
        [0., 7.],
        [0., 6.],
        [0., 7.],
        [0., 6.],
        [0., 5.],
        [0., 4.],
        [1., 4.],
        [1., 3.],
        [2., 3.],
        [2., 4.],
        [2., 3.],
        [2., 2.],
        [2., 2.],
        [2., 1.],
        [2., 1.],
        [2., 2.],
        [3., 2.],
        [3., 2.],
        [3., 1.],
        [4., 1.],
        [4., 1.],
        

In [None]:
states_next[0][-1]

array([5., 0.])

In [None]:
np.vstack((states_current[0],states_next[0][-1]))

array([[2., 9.],
       [2., 8.],
       [1., 8.],
       [1., 7.],
       [0., 7.],
       [0., 7.],
       [0., 6.],
       [0., 5.],
       [0., 5.],
       [0., 4.],
       [1., 4.],
       [1., 3.],
       [2., 3.],
       [2., 3.],
       [2., 4.],
       [2., 4.],
       [2., 4.],
       [2., 3.],
       [2., 3.],
       [2., 2.],
       [3., 2.],
       [4., 2.],
       [4., 1.],
       [5., 1.],
       [5., 0.]])

In [None]:
psi[0]

array([ 8.,  8.,  9.,  9., 10., 10., 10., 10., 10., 11., 10., 11., 10.,
       10.,  9.,  9.,  9., 10., 10., 11., 10.,  9., 10.,  9.])

In [None]:
len(states_next[0])

24

In [None]:
padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS = test_200_0p99.prepare_IS()
timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS, IS_boostraps = test_200_0p99.bootstrap_IS(padded_timestep_tensors_IS, padded_reward_tensors_IS, padded_weight_tensors_IS)
test_200_0p99.calc_variance_IS(timestep_bootstraps_IS, rewards_bootstraps_IS, weights_bootstraps_IS)


(tensor(0.1634, dtype=torch.float64), tensor(0.0052, dtype=torch.float64))

In [None]:
test_200_0p99.calc_var_IS(IS_boostraps)

(tensor(0.1634, dtype=torch.float64), tensor(0.0052, dtype=torch.float64))

In [None]:
padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors, padded_psi_tensors, mask_tensor = test_200_0p99.prepare_SCOPE_phi()

states_next_output, states_current_output = test_200_0p99.pass_states(padded_states_next_tensors, padded_states_current_tensors)
timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps, scope_bootstraps = test_200_0p99.bootstrap_straight(padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, states_next_output, states_current_output)
test_200_0p99.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

(tensor(0.2590, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0194, dtype=torch.float64, grad_fn=<VarBackward0>))

In [None]:
test_200_0p99.calc_var_straight(scope_bootstraps)

NameError: name 'scope_bootstraps' is not defined

In [None]:
IS_variance

tensor(0.0052, dtype=torch.float64)

In [None]:
test_200_0p99.get_state_visitation_heatmap()

In [None]:
test_200_0p99.IS_pipeline()

(tensor(0.0040, dtype=torch.float64), tensor(5.4213e-06, dtype=torch.float64))

In [None]:
pi_e = experiment_actions(1000, env_30, P_pi_e)


In [None]:
calc_V_pi_e(pi_e)

0.02618155036170724

In [None]:
test_200_0p99.evaluate_scope()

(tensor(0.1136, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>))

In [None]:
model, masked_mean_set, last_set = test_200_0p99.train_var_scope(2, 0.001)

TypeError: SCOPE_straight.pass_states() missing 1 required positional argument: 'states_last_tensor'

In [None]:
torch.sum(masked_mean_set, dim = 1)

tensor([ 8.5748,  4.6997,  9.4879,  5.7368,  8.7992, 12.2325,  9.0167,  7.4956,
         8.9889,  5.5547,  7.1841,  6.8971, 11.3291,  4.0391, 10.0364,  7.8897,
        10.0546,  5.6250,  4.2488,  5.0116,  6.4645,  8.6754,  9.2830,  5.0178,
         7.2064,  4.4252,  5.2096,  9.4458,  5.3339, 12.6139, 14.0995,  5.3023,
         5.8995,  5.9573,  7.1260,  6.6979, 12.6700,  8.7708, 14.1373,  8.8810,
         7.4070,  5.6319,  1.4903,  6.4208,  6.5101,  5.2310,  5.2161, 14.1081,
         6.2500,  7.6385,  8.8202,  5.3136,  4.6834,  4.8666,  4.5748,  8.1876,
         7.9586,  6.6502,  6.2471,  4.7426], dtype=torch.float64,
       grad_fn=<SumBackward1>)

In [None]:
test_200_0p99.get_heatmap()

# Test random policy

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_30, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 5, 0.05)
# pi_e = experiment_actions(200, env_30, P_pi_e)
model_200_random = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_200_random = SCOPE_straight(model_200_random, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
test_200_random.get_state_visitation_heatmap()

In [None]:
test_200_random.train_var_scope(200, 0.001)

Epoch 1
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0005, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0005218583187486271
----------------------------------------
Epoch 2
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(9.8199e-06, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 9.819862798753365e-06
----------------------------------------
Epoch 3
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(1.0250e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 1.024958128773479e-05
----------------------------------------
Epoch 4
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(9.3286e-06, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 9.328558541237725e-06
----------------------------------------
Epoch 5
IS variance:  tensor(8.9415e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(8.3393e-06, dtype=torch.float64, grad_fn=<VarB

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_200_random.get_heatmap()

In [None]:
test_200_random.IS_pipeline()

(tensor(-0.0130, dtype=torch.float64), tensor(8.9415e-05, dtype=torch.float64))

In [None]:
pi_e = experiment_actions(1000, env_30, P_pi_e)


In [None]:
calc_V_pi_e(pi_e)

-0.12203056843017494

In [None]:
test_200_random.evaluate_scope()

(tensor(-0.0521, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0003, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test random pi_b

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_200_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_200_random_pi_b = SCOPE_straight(model_200_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
test_200_random_pi_b.IS_pipeline()

(tensor(2.6041, dtype=torch.float64), tensor(6.1715, dtype=torch.float64))

In [None]:
test_200_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_200_random_pi_b.train_var_scope(300, 0.001)

Epoch 1
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.1993, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04945267717328408
Total Loss: 0.24875341891612565
----------------------------------------
Epoch 2
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04854473065112503
Total Loss: 0.05144933898137734
----------------------------------------
Epoch 3
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04864399770972223
Total Loss: 0.05152271293468217
----------------------------------------
Epoch 4
IS variance:  tensor(1.6278e-19, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0029, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.04866784514199147
Total Loss: 0.051536243945871946
----------------------------------------
Epoch 5
IS variance:  tenso

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_200_random_pi_b.get_heatmap()

In [None]:
test_200_random_pi_b.IS_pipeline()

(tensor(-4.8178e-10, dtype=torch.float64),
 tensor(1.6278e-19, dtype=torch.float64))

In [None]:
calc_V_pi_e(pi_e)

0.1871443974984857

In [None]:
test_200_random_pi_b.evaluate_scope()

(tensor(0.4028, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0122, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test random 400 pi_b

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(400, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_400_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_400_random_pi_b = SCOPE_straight(model_400_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
test_400_random_pi_b.IS_pipeline()

(tensor(0.7103, dtype=torch.float64), tensor(0.0590, dtype=torch.float64))

In [None]:
test_400_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_400_random_pi_b.train_var_scope(500, 0.001)

Epoch 1
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2298, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2863851925761847
Total Loss: 0.5162089638144078
----------------------------------------
Epoch 2
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.5702, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2473059801237863
Total Loss: 0.8175554880907143
----------------------------------------
Epoch 3
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.5554, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.2326082591574042
Total Loss: 0.7880451023448775
----------------------------------------
Epoch 4
IS variance:  tensor(4.4092e-11, dtype=torch.float64)
SCOPE Var loss:  tensor(0.5384, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.21986669348691343
Total Loss: 0.7582925337594453
----------------------------------------
Epoch 5
IS variance:  tensor(4.4092

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_400_random_pi_b.get_heatmap()

In [None]:
calc_V_pi_e(pi_e)

0.15634452293280188

In [None]:
test_400_random_pi_b.evaluate_scope()

(tensor(0.1504, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0006, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test 600 pi_b top 2

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(600, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_600_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64)
test_600_random_pi_b = SCOPE_straight(model_600_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
test_600_random_pi_b.IS_pipeline()

(tensor(24.2014, dtype=torch.float64), tensor(134.2654, dtype=torch.float64))

In [None]:
test_600_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_600_random_pi_b.train_var_scope(300, 0.001)

Epoch 1
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0033, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.008124469012146312
SCOPE mean: 0.11593467505750471, SCOPE var: 0.001134831577478478
Total Loss: 0.011470035771407587
----------------------------------------
Epoch 2
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.007877572700959606
SCOPE mean: 0.11402489523061764, SCOPE var: 0.0011566205224887104
Total Loss: 0.008660720224668825
----------------------------------------
Epoch 3
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.007835659068336106
SCOPE mean: 0.1149561133499077, SCOPE var: 0.0011602695166762504
Total Loss: 0.00864455892775779
----------------------------------------
Epoch 4
IS variance:  tensor(4.8629e-09, dtype=torch.float64)
SCOPE Va

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_600_random_pi_b.get_heatmap()

In [None]:
calc_V_pi_e(pi_e)

0.15891747325670808

In [None]:
test_600_random_pi_b.evaluate_scope()

(tensor(0.1061, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0004, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test model with l2 reg







In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class CustomizableFeatureNet(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_prob=0.2, l2_lambda=0.01, dtype=torch.float32):
        super(CustomizableFeatureNet, self).__init__()
        self.hidden_layers = nn.ModuleList()

        # Create the hidden layers based on the provided sizes
        for in_dim, out_dim in zip([input_dim] + hidden_dims, hidden_dims):
            layer = nn.Linear(in_dim, out_dim).to(dtype)
            self.hidden_layers.append(layer)

        self.output_layer = nn.Linear(hidden_dims[-1], output_dim).to(dtype)
        self.dropout = nn.Dropout(dropout_prob)
        self.l2_lambda = l2_lambda

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

    def l2_regularization(self):
        l2_reg = torch.tensor(0., device=self.output_layer.weight.device)
        for layer in self.hidden_layers:
            l2_reg += torch.norm(layer.weight)
        l2_reg += torch.norm(self.output_layer.weight)
        return self.l2_lambda * l2_reg


# Test 800 pi_b top 2

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(800, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_800_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l2_lambda=0.001)
test_800_random_pi_b = SCOPE_straight(model_800_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
test_800_random_pi_b.IS_pipeline()

(tensor(0.5781, dtype=torch.float64), tensor(0.0772, dtype=torch.float64))

In [None]:
test_800_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_800_random_pi_b.train_var_scope(50, 0.0005)

Epoch 1
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0044, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.004218989771201134
SCOPE mean: 0.22285763285074944, SCOPE var: 0.016886603818618454
Total Loss: 0.008665766088156722
----------------------------------------
Epoch 2
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.004043985329936416
SCOPE mean: 0.21677965178133518, SCOPE var: 0.01643719341992325
Total Loss: 0.004883264246003726
----------------------------------------
Epoch 3
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.004032811828319944
SCOPE mean: 0.2141315282737515, SCOPE var: 0.016257919950321238
Total Loss: 0.00487736908343905
----------------------------------------
Epoch 4
IS variance:  tensor(0.0772, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=10, bias=True)
  )
  (output_layer): Linear(in_features=10, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_800_random_pi_b.get_heatmap()

In [None]:
calc_V_pi_e(pi_e)

15.720270087636253

In [None]:
test_800_random_pi_b.evaluate_scope()

(tensor(0.1058, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0007, dtype=torch.float64, grad_fn=<VarBackward0>))

# Test 800 pi_b weighted mse

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(800, env_50, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e = experiment_actions(1000, env_50, P_pi_e)
model_800_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
test_800_random_pi_b = SCOPE_straight(model_800_random_pi_b, 0.99, 10000, pi_b, P_pi_b, P_pi_e, 0.3, dtype = torch.float64)


In [None]:
model_800_random_pi_b = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)


In [None]:
test_800_random_pi_b.IS_pipeline()

(tensor(0.2171, dtype=torch.float64), tensor(0.0069, dtype=torch.float64))

In [None]:
test_800_random_pi_b.get_state_visitation_heatmap()

In [None]:
test_800_random_pi_b.train_var_scope(10, 0.001, 1, 0)

Epoch 1
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0014, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.634099954946919
SCOPE mean: 0.17437900171241522, SCOPE var: 0.003946551824085135
Total Loss: 0.0014133758367131921
----------------------------------------
Epoch 2
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0005, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.6583666656390397
SCOPE mean: 0.17819248489278255, SCOPE var: 0.00406334930797007
Total Loss: 0.00047821039044280424
----------------------------------------
Epoch 3
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0005, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.6769835730717948
SCOPE mean: 0.18028760100370606, SCOPE var: 0.004121662484450746
Total Loss: 0.00048106496886156304
----------------------------------------
Epoch 4
IS variance:  tensor(0.0069, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_800_random_pi_b.get_heatmap()

In [None]:
calc_V_pi_e(pi_e)

0.1603494445055814

In [None]:
test_800_random_pi_b.evaluate_scope()

(tensor(1.9744, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(1.6041, dtype=torch.float64, grad_fn=<VarBackward0>))

In [None]:
# Get the state_dict of the model
model_state_dict = test_200_0p99.model.state_dict()

# Print the keys to see the structure of the state_dict
print(model_state_dict.keys())

# Extract and print the weights of each layer
for name, param in model_state_dict.items():
    if 'weight' in name:
        print(f"Layer: {name}")
        print(param)

odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'output_layer.weight', 'output_layer.bias'])
Layer: hidden_layers.0.weight
tensor([[-0.5992, -0.0774],
        [ 0.4740, -0.2568],
        [ 0.3057,  0.1028],
        [ 0.1205, -0.2533],
        [ 0.6123,  0.1611],
        [-0.2607, -0.6321],
        [-0.7033, -0.3404],
        [-0.5834, -0.1342],
        [ 0.1821, -0.5556],
        [-0.4225,  0.3797],
        [-0.4621, -0.6912],
        [ 0.6232, -0.4203],
        [ 0.2262, -0.6769],
        [ 0.5225,  0.2460],
        [ 0.6153, -0.0401],
        [-0.1569, -0.2980]], dtype=torch.float64)
Layer: hidden_layers.1.weight
tensor([[ 1.0516e-01, -1.5570e-01, -8.9501e-02,  2.2105e-01,  2.2795e-01,
          6.3637e-02,  3.5722e-02, -1.2352e-01, -1.2532e-01, -1.9321e-01,
          1.7001e-01,  2.7690e-01,  5.0010e-02,  4.4123e-02, -1.7390e-01,
         -3.3544e-02],
        [ 2.9563e-02, -1.6539e-01,  1.8811e-01,  2.4416e-02, -2.0411

In [None]:
scope_testing = SCOPE_straight(model, 0.9, 10000, pi_b, P_pi_b, P_pi_e, dtype = torch.float64)

In [None]:
padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors = scope_testing.prepare()
timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = scope_testing.pass_then_boostraps(model, padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
IS_variance, scope_variance = scope_testing.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)

# Experiment 1 (without mse)

In [None]:
P_pi_b_200 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_200 = experiment_actions(200, env_50, P_pi_b_200)
P_pi_e_200 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_200 = experiment_actions(1000, env_50, P_pi_e_200)
# model_200_random_pi_b_200 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_200_random_pi_b_200 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_200_random_pi_b_200 = SCOPE_straight(model_200_random_pi_b_200, 0.99, 10000, pi_b_200, P_pi_b_200, P_pi_e_200, 0.3, dtype = torch.float64)
test_200_random_pi_b_200.train_var_scope(300, 0.001, 1, 0)

Epoch 1
IS mean: 0.5079378512789022,IS variance: 0.07086930484469886
SCOPE Var loss:  tensor(1.1927, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  27.441370946239815
SCOPE mean: 0.7002709791760221, SCOPE var: 0.09223498645812357
Total Loss: 1.19270345724481
----------------------------------------
Epoch 2
IS mean: 0.5079378512789022,IS variance: 0.07086930484469886
SCOPE Var loss:  tensor(0.7008, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  25.891039921865286
SCOPE mean: 0.6613629756193857, SCOPE var: 0.08769328295555842
Total Loss: 0.700843741427873
----------------------------------------
Epoch 3
IS mean: 0.5079378512789022,IS variance: 0.07086930484469886
SCOPE Var loss:  tensor(0.6421, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  24.98436251935475
SCOPE mean: 0.626444735604516, SCOPE var: 0.08371144524064618
Total Loss: 0.6420935133411058
----------------------------------------
Epoch 4
IS mean: 0.5079378512789022,IS variance: 0.07086930484469886
SCO

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_400 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_400 = experiment_actions(400, env_50, P_pi_b_400)
P_pi_e_400 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_400 = experiment_actions(1000, env_50, P_pi_e_400)
# model_400_random_pi_b_400 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_400_random_pi_b_400 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_400_random_pi_b_400 = SCOPE_straight(model_400_random_pi_b_400, 0.99, 10000, pi_b_400, P_pi_b_400, P_pi_e_400, 0.3, dtype = torch.float64)
test_400_random_pi_b_400.train_var_scope(400, 0.001, 1, 0)

Epoch 1
IS mean: 0.22463288236461731,IS variance: 0.020959556927272208
SCOPE Var loss:  tensor(0.0538, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.879882049994008
SCOPE mean: 0.2813944638295345, SCOPE var: 0.03804615043113175
Total Loss: 0.05384612892349079
----------------------------------------
Epoch 2
IS mean: 0.22463288236461731,IS variance: 0.020959556927272208
SCOPE Var loss:  tensor(0.0205, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.64835634280359
SCOPE mean: 0.2771221231141596, SCOPE var: 0.037947585553909785
Total Loss: 0.020525783240966614
----------------------------------------
Epoch 3
IS mean: 0.22463288236461731,IS variance: 0.020959556927272208
SCOPE Var loss:  tensor(0.0196, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.600753818380567
SCOPE mean: 0.27897801446152903, SCOPE var: 0.03797091717547304
Total Loss: 0.019638155283731886
----------------------------------------
Epoch 4
IS mean: 0.22463288236461731,IS variance: 0.020959

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_600 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_600 = experiment_actions(600, env_50, P_pi_b_600)
P_pi_e_600 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_600 = experiment_actions(1000, env_50, P_pi_e_600)
# model_600_random_pi_b_600 = CustomizableFeatureNet(input_dim=2, hidden_dims=[8, 8], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
# model_600_random_pi_b_600 = NN_l1_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.001)
model_600_random_pi_b_600 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_600_random_pi_b_600 = SCOPE_straight(model_600_random_pi_b_600, 0.99, 10000, pi_b_600, P_pi_b_600, P_pi_e_600, 0.3, dtype = torch.float64)
test_600_random_pi_b_600.train_var_scope(400, 0.001, 1, 0)

Epoch 1
IS mean: 0.29707760471436423,IS variance: 0.009117349795339989
SCOPE Var loss:  tensor(0.3839, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  17.549395655377467
SCOPE mean: 0.523331442128463, SCOPE var: 0.01740940735402775
Total Loss: 0.38393649853529094
----------------------------------------
Epoch 2
IS mean: 0.29707760471436423,IS variance: 0.009117349795339989
SCOPE Var loss:  tensor(0.6394, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  17.42150262673749
SCOPE mean: 0.5181575529268448, SCOPE var: 0.01714722937961731
Total Loss: 0.639440108041088
----------------------------------------
Epoch 3
IS mean: 0.29707760471436423,IS variance: 0.009117349795339989
SCOPE Var loss:  tensor(0.6074, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  17.332147204681455
SCOPE mean: 0.5129178096276976, SCOPE var: 0.016856468346866972
Total Loss: 0.6074116654510248
----------------------------------------
Epoch 4
IS mean: 0.29707760471436423,IS variance: 0.0091173497

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_600_random_pi_b_600.get_heatmap()

In [None]:
P_pi_b_800 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_800 = experiment_actions(800, env_50, P_pi_b_800)
P_pi_e_800 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_800 = experiment_actions(1000, env_50, P_pi_e_800)
# model_800_random_pi_b_800 = CustomizableFeatureNet(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
# model_800_random_pi_b_800 = NN_l1_reg(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l1_lambda=0.0001)
model_800_random_pi_b_800 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_800_random_pi_b_800 = SCOPE_straight(model_800_random_pi_b_800, 0.99, 10000, pi_b_800, P_pi_b_800, P_pi_e_800, 0.3, dtype = torch.float64)
test_800_random_pi_b_800.train_var_scope(500, 0.001, 1, 0)

Epoch 1
IS mean: 0.3141882937476792,IS variance: 0.008870786239980513
SCOPE Var loss:  tensor(0.0458, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  17.37958175927478
SCOPE mean: 0.48076314709695417, SCOPE var: 0.014904131022434408
Total Loss: 0.04575112640269493
----------------------------------------
Epoch 2
IS mean: 0.3141882937476792,IS variance: 0.008870786239980513
SCOPE Var loss:  tensor(0.0854, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  17.31908477727003
SCOPE mean: 0.4703620959062186, SCOPE var: 0.014648459593509848
Total Loss: 0.08538300643807667
----------------------------------------
Epoch 3
IS mean: 0.3141882937476792,IS variance: 0.008870786239980513
SCOPE Var loss:  tensor(0.0827, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  16.965231508387408
SCOPE mean: 0.46285641303280634, SCOPE var: 0.0143920540816015
Total Loss: 0.08270453477739553
----------------------------------------
Epoch 4
IS mean: 0.3141882937476792,IS variance: 0.008870786

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_1000 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_1000 = experiment_actions(1000, env_50, P_pi_b_1000)
P_pi_e_1000 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_1000 = experiment_actions(1000, env_50, P_pi_e_1000)
# model_1000_random_pi_b_1000 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_1000_random_pi_b_1000 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_1000_random_pi_b_1000 = SCOPE_straight(model_1000_random_pi_b_1000, 0.99, 10000, pi_b_1000, P_pi_b_1000, P_pi_e_1000, 0.3, dtype = torch.float64)
test_1000_random_pi_b_1000.train_var_scope(300, 0.001, 1, 0)

Epoch 1
IS mean: 0.3306886857857864,IS variance: 0.009833739625625476
SCOPE Var loss:  tensor(0.0077, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.532842730124387
SCOPE mean: 0.26637211052973625, SCOPE var: 0.01961388496049461
Total Loss: 0.007724452937809946
----------------------------------------
Epoch 2
IS mean: 0.3306886857857864,IS variance: 0.009833739625625476
SCOPE Var loss:  tensor(0.0062, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.66889178570887
SCOPE mean: 0.2779599594741044, SCOPE var: 0.01972888639278707
Total Loss: 0.006216754765374113
----------------------------------------
Epoch 3
IS mean: 0.3306886857857864,IS variance: 0.009833739625625476
SCOPE Var loss:  tensor(0.0058, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.756625176817343
SCOPE mean: 0.28679747731278044, SCOPE var: 0.01980254514010637
Total Loss: 0.005833856841163596
----------------------------------------
Epoch 4
IS mean: 0.3306886857857864,IS variance: 0.009833739

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_1000_random_pi_b_1000.get_heatmap()

# Experiment 2 (with mse)

In [None]:
P_pi_b_200_mse = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_200_mse = experiment_actions(200, env_50, P_pi_b_200_mse)
P_pi_e_200 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_200 = experiment_actions(1000, env_50, P_pi_e_200)
# model_200_random_pi_b_200_mse = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_200_random_pi_b_200_mse = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_200_random_pi_b_200_mse = SCOPE_straight(model_200_random_pi_b_200_mse, 0.99, 10000, pi_b_200_mse, P_pi_b_200_mse, P_pi_e_200, 0.3, dtype = torch.float64)
test_200_random_pi_b_200_mse.train_var_scope(300, 0.001, 1, 1)

Epoch 1
IS mean: 2.9507531033554093,IS variance: 3.5003270241318494
SCOPE Var loss:  tensor(0.2611, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.9192405820903025
SCOPE mean: 3.03623013914548, SCOPE var: 5.074455600234516
Total Loss: 8.180356407581256
----------------------------------------
Epoch 2
IS mean: 2.9507531033554093,IS variance: 3.5003270241318494
SCOPE Var loss:  tensor(0.1700, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.172540383266576
SCOPE mean: 3.095641933637867, SCOPE var: 5.167460307377982
Total Loss: 7.342494274568023
----------------------------------------
Epoch 3
IS mean: 2.9507531033554093,IS variance: 3.5003270241318494
SCOPE Var loss:  tensor(0.1676, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  6.799803227779146
SCOPE mean: 3.149987360695076, SCOPE var: 5.2514796584297985
Total Loss: 6.9673979348802115
----------------------------------------
Epoch 4
IS mean: 2.9507531033554093,IS variance: 3.5003270241318494
SCOPE Var loss:

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_400_mse = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_400_mse = experiment_actions(400, env_50, P_pi_b_400_mse)
P_pi_e_400 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_400 = experiment_actions(1000, env_50, P_pi_e_400)
# model_400_random_pi_b_400_mse = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_400_random_pi_b_400_mse = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0005)
test_400_random_pi_b_400_mse = SCOPE_straight(model_400_random_pi_b_400_mse, 0.99, 10000, pi_b_400_mse, P_pi_b_400_mse, P_pi_e_400, 0.3, dtype = torch.float64)
test_400_random_pi_b_400_mse.train_var_scope(600, 0.001, 1, 1)

Epoch 1
IS mean: 0.3143338730512621,IS variance: 0.02523737293606626
SCOPE Var loss:  tensor(0.3585, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  8.026483337294023
SCOPE mean: -0.3274251616380668, SCOPE var: 0.018818881618883634
Total Loss: 8.384968384201166
----------------------------------------
Epoch 2
IS mean: 0.3143338730512621,IS variance: 0.02523737293606626
SCOPE Var loss:  tensor(0.1559, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.740368690579542
SCOPE mean: -0.3001916661651302, SCOPE var: 0.01861144103359731
Total Loss: 7.896251552620683
----------------------------------------
Epoch 3
IS mean: 0.3143338730512621,IS variance: 0.02523737293606626
SCOPE Var loss:  tensor(0.1548, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.3163314681292
SCOPE mean: -0.27308848557382137, SCOPE var: 0.018401832793966046
Total Loss: 7.471105743491664
----------------------------------------
Epoch 4
IS mean: 0.3143338730512621,IS variance: 0.02523737293606626


NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_600_mse = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_600_mse = experiment_actions(600, env_50, P_pi_b_600_mse)
P_pi_e_600 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_600 = experiment_actions(1000, env_50, P_pi_e_600)
# model_600_random_pi_b_600_mse = CustomizableFeatureNet(input_dim=2, hidden_dims=[8, 8], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
# model_600_random_pi_b_600_mse = NN_l1_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.001)
model_600_random_pi_b_600_mse = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_600_random_pi_b_600_mse = SCOPE_straight(model_600_random_pi_b_600_mse, 0.99, 10000, pi_b_600_mse, P_pi_b_600_mse, P_pi_e_600, 0.3, dtype = torch.float64)
test_600_random_pi_b_600_mse.train_var_scope(200, 0.001, 1, 0.5)

Epoch 1
IS mean: 1.2158595644451584,IS variance: 0.3042718972439603
SCOPE Var loss:  tensor(1.3600, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  16.85394573179525
SCOPE mean: 1.233982608018241, SCOPE var: 0.1606939214573137
Total Loss: 9.786936898906507
----------------------------------------
Epoch 2
IS mean: 1.2158595644451584,IS variance: 0.3042718972439603
SCOPE Var loss:  tensor(4.2376, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  16.160336248059092
SCOPE mean: 1.2210136292921026, SCOPE var: 0.16082977309307356
Total Loss: 12.317754979410687
----------------------------------------
Epoch 3
IS mean: 1.2158595644451584,IS variance: 0.3042718972439603
SCOPE Var loss:  tensor(4.2058, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  15.502035759824468
SCOPE mean: 1.2070742898746634, SCOPE var: 0.16067468243348831
Total Loss: 11.956863218297187
----------------------------------------
Epoch 4
IS mean: 1.2158595644451584,IS variance: 0.3042718972439603
SCOPE 

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_800_mse = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_800_mse = experiment_actions(800, env_50, P_pi_b_800_mse)
P_pi_e_800 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_800 = experiment_actions(1000, env_50, P_pi_e_800)
model_800_random_pi_b_800_mse = CustomizableFeatureNet(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
# model_800_random_pi_b_800_mse = NN_l1_reg(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l1_lambda=0.0001)
# model_800_random_pi_b_800_mse = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_800_random_pi_b_800_mse = SCOPE_straight(model_800_random_pi_b_800_mse, 0.99, 10000, pi_b_800_mse, P_pi_b_800_mse, P_pi_e_800, 0.3, dtype = torch.float64)
test_800_random_pi_b_800_mse.train_var_scope(300, 0.001, 1, 1)

Epoch 1
IS mean: 0.3930144894842784,IS variance: 0.01232476689934019
SCOPE Var loss:  tensor(0.0088, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  20.77788605297548
SCOPE mean: 0.5641474175565288, SCOPE var: 0.026994582664913284
Total Loss: 20.78665849821228
----------------------------------------
Epoch 2
IS mean: 0.3930144894842784,IS variance: 0.01232476689934019
SCOPE Var loss:  tensor(0.0214, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  20.40859379743519
SCOPE mean: 0.5696977644839282, SCOPE var: 0.027384408647234945
Total Loss: 20.430008994417012
----------------------------------------
Epoch 3
IS mean: 0.3930144894842784,IS variance: 0.01232476689934019
SCOPE Var loss:  tensor(0.0219, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  20.070576188003972
SCOPE mean: 0.5740741616282801, SCOPE var: 0.027672732431925605
Total Loss: 20.09244975068282
----------------------------------------
Epoch 4
IS mean: 0.3930144894842784,IS variance: 0.01232476689934019

CustomizableFeatureNet(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=10, bias=True)
  )
  (output_layer): Linear(in_features=10, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_1000_mse = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_1000_mse = experiment_actions(1000, env_50, P_pi_b_1000_mse)
P_pi_e_1000 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_1000 = experiment_actions(1000, env_50, P_pi_e_1000)
# model_1000_random_pi_b_1000_mse = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_1000_random_pi_b_1000_mse = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
test_1000_random_pi_b_1000_mse = SCOPE_straight(model_1000_random_pi_b_1000_mse, 0.99, 10000, pi_b_1000_mse, P_pi_b_1000_mse, P_pi_e_1000, 0.3, dtype = torch.float64)
test_1000_random_pi_b_1000_mse.train_var_scope(300, 0.001, 1, 0.25)

Epoch 1
IS mean: 1.106270715478129,IS variance: 0.12391107861689143
SCOPE Var loss:  tensor(0.0875, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.725384057721419
SCOPE mean: 1.1199675594962764, SCOPE var: 0.23508798774489986
Total Loss: 2.0188408086472145
----------------------------------------
Epoch 2
IS mean: 1.106270715478129,IS variance: 0.12391107861689143
SCOPE Var loss:  tensor(0.0873, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.469155030957495
SCOPE mean: 1.1290194914259117, SCOPE var: 0.23862165890666512
Total Loss: 1.9546099559807464
----------------------------------------
Epoch 3
IS mean: 1.106270715478129,IS variance: 0.12391107861689143
SCOPE Var loss:  tensor(0.0881, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  7.189022368268439
SCOPE mean: 1.1373710890824933, SCOPE var: 0.24198196383120613
Total Loss: 1.8853453551105308
----------------------------------------
Epoch 4
IS mean: 1.106270715478129,IS variance: 0.12391107861689143
SCOPE

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
test_1000_random_pi_b_1000_mse.get_heatmap()

# Experiment 3 env_100

In [None]:
test_200_random_pi_b_200_env_100.evaluate_scope()

(tensor(0.4748, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(0.0947, dtype=torch.float64, grad_fn=<VarBackward0>))

In [None]:
P_pi_b_200 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_200 = experiment_actions(200, env_100, P_pi_b_200)
P_pi_e_200 = action_probs_top_n_epsilon(q_table, 1, 0.05)
# pi_e_200 = experiment_actions(1000, env_100, P_pi_e_200)
# model_200_random_pi_b_200_env_100 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_200_random_pi_b_200_env_100 = NN_l1_l2_reg(input_dim=2, hidden_dims=[6], output_dim=1, dtype = torch.float64, l1_lambda=0.00001, l2_lambda = 0.00001)
test_200_random_pi_b_200_env_100 = SCOPE_straight(model_200_random_pi_b_200_env_100, 0.99, 10000, pi_b_200, P_pi_b_200, P_pi_e_200, 0.3, dtype = torch.float64)
test_200_random_pi_b_200_env_100.train_var_scope(300, 0.001, 1, 0.1)

Epoch 1
IS mean: 0.14880267027988744,IS variance: 0.0061515626555271935
SCOPE Var loss:  tensor(0.0065, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  11.952126796991125
SCOPE mean: -0.10654215033536209, SCOPE var: 0.016902695762594084
Total Loss: 1.2017277958598398
----------------------------------------
Epoch 2
IS mean: 0.14880267027988744,IS variance: 0.0061515626555271935
SCOPE Var loss:  tensor(0.0289, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  12.041383567392199
SCOPE mean: -0.10107118088490599, SCOPE var: 0.016076063682742302
Total Loss: 1.233047946838456
----------------------------------------
Epoch 3
IS mean: 0.14880267027988744,IS variance: 0.0061515626555271935
SCOPE Var loss:  tensor(0.0277, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  11.797057650302548
SCOPE mean: -0.0955841077017358, SCOPE var: 0.015279902652995808
Total Loss: 1.2074291263107775
----------------------------------------
Epoch 4
IS mean: 0.14880267027988744,IS variance: 0

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=6, bias=True)
  )
  (output_layer): Linear(in_features=6, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_400 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_400 = experiment_actions(400, env_100, P_pi_b_400)
P_pi_e_400 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_400 = experiment_actions(1000, env_100, P_pi_e_400)
# model_400_random_pi_b_400_env_100 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_400_random_pi_b_400_env_100 = NN_l1_l2_reg(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l1_lambda=0.0002, l2_lambda = 0.0001)
test_400_random_pi_b_400_env_100 = SCOPE_straight(model_400_random_pi_b_400_env_100, 0.99, 10000, pi_b_400, P_pi_b_400, P_pi_e_400, 0.3, dtype = torch.float64)
test_400_random_pi_b_400_env_100.train_var_scope(400, 0.001, 1, 0)

Epoch 1
IS mean: 0.9031099370141558,IS variance: 0.3119340165290428
SCOPE Var loss:  tensor(0.0889, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  20.362240639987892
SCOPE mean: 1.100768566131586, SCOPE var: 0.6797449399930097
Total Loss: 0.08887433149744364
----------------------------------------
Epoch 2
IS mean: 0.9031099370141558,IS variance: 0.3119340165290428
SCOPE Var loss:  tensor(0.7305, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  20.542493045851415
SCOPE mean: 1.0917943729755988, SCOPE var: 0.6700620990008974
Total Loss: 0.730538071979331
----------------------------------------
Epoch 3
IS mean: 0.9031099370141558,IS variance: 0.3119340165290428
SCOPE Var loss:  tensor(0.6992, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  20.562039665563347
SCOPE mean: 1.0809234196303648, SCOPE var: 0.6619065042285946
Total Loss: 0.6992122502132285
----------------------------------------
Epoch 4
IS mean: 0.9031099370141558,IS variance: 0.3119340165290428
SCOPE 

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=10, bias=True)
  )
  (output_layer): Linear(in_features=10, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
# P_pi_b_600 = action_probs_top_n_epsilon(q_table, 1, 0.4)
# pi_b_600 = experiment_actions(600, env_100, P_pi_b_600)
# P_pi_e_600 = action_probs_top_n_epsilon(q_table, 1, 0.05)
# pi_e_600 = experiment_actions(1000, env_100, P_pi_e_600)
# # model_600_random_pi_b_600_env_100 = CustomizableFeatureNet(input_dim=2, hidden_dims=[8, 8], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
# # model_600_random_pi_b_600_env_100 = NN_l1_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.001)
# model_600_random_pi_b_600_env_100 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
# test_600_random_pi_b_600_env_100 = SCOPE_straight(model_600_random_pi_b_600_env_100, 0.99, 10000, pi_b_600, P_pi_b_600, P_pi_e_600, 0.3, dtype = torch.float64)
test_600_random_pi_b_600_env_100.train_var_scope(50, 0.001, 1, 0.15)

In [None]:
# P_pi_b_800 = action_probs_top_n_epsilon(q_table, 1, 0.4)
# pi_b_800 = experiment_actions(800, env_100, P_pi_b_800)
# P_pi_e_800 = action_probs_top_n_epsilon(q_table, 1, 0.05)
# pi_e_800 = experiment_actions(1000, env_100, P_pi_e_800)
# # model_800_random_pi_b_800_env_100 = CustomizableFeatureNet(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
# # model_800_random_pi_b_800_env_100 = NN_l1_reg(input_dim=2, hidden_dims=[10, 10], output_dim=1, dtype = torch.float64, l1_lambda=0.0001)
# model_800_random_pi_b_800_env_100 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0001, l2_lambda = 0.0001)
# test_800_random_pi_b_800_env_100 = SCOPE_straight(model_800_random_pi_b_800_env_100, 0.99, 10000, pi_b_800, P_pi_b_800, P_pi_e_800, 0.3, dtype = torch.float64)
test_800_random_pi_b_800_env_100.train_var_scope(100, 0.001, 1, 0.3)

Epoch 1
IS mean: 0.39155172544201666,IS variance: 0.03464493922526116
SCOPE Var loss:  tensor(0.3102, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.7104331187633571
SCOPE mean: 0.25145097682447576, SCOPE var: 0.005720949296649719
Total Loss: 0.523304287323519
----------------------------------------
Epoch 2
IS mean: 0.39155172544201666,IS variance: 0.03464493922526116
SCOPE Var loss:  tensor(0.0144, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.693729122102974
SCOPE mean: 0.23986316648905373, SCOPE var: 0.005580951116453046
Total Loss: 0.22250727909176887
----------------------------------------
Epoch 3
IS mean: 0.39155172544201666,IS variance: 0.03464493922526116
SCOPE Var loss:  tensor(0.0145, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  0.6932282166881837
SCOPE mean: 0.23705876587684002, SCOPE var: 0.00552501777462655
Total Loss: 0.22251044319185878
----------------------------------------
Epoch 4
IS mean: 0.39155172544201666,IS variance: 0.0346449

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
P_pi_b_1000 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_1000 = experiment_actions(1000, env_100, P_pi_b_1000)
P_pi_e_1000 = action_probs_top_n_epsilon(q_table, 1, 0.05)
pi_e_1000 = experiment_actions(1000, env_100, P_pi_e_1000)
# model_1000_random_pi_b_1000_env_100 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l2_lambda=0.002)
model_1000_random_pi_b_1000_env_100 = NN_l1_l2_reg(input_dim=2, hidden_dims=[16, 16], output_dim=1, dtype = torch.float64, l1_lambda=0.0002, l2_lambda = 0.0001)
test_1000_random_pi_b_1000_env_100 = SCOPE_straight(model_1000_random_pi_b_1000_env_100, 0.99, 10000, pi_b_1000, P_pi_b_1000, P_pi_e_1000, 0.3, dtype = torch.float64)
test_1000_random_pi_b_1000_env_100.train_var_scope(300, 0.001, 1, 0.1)

Epoch 1
IS mean: 0.46514684963012803,IS variance: 0.014499969252461947
SCOPE Var loss:  tensor(0.0427, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  6.337769062885827
SCOPE mean: 0.41989625769625855, SCOPE var: 0.03388364029488501
Total Loss: 0.6764456763066073
----------------------------------------
Epoch 2
IS mean: 0.46514684963012803,IS variance: 0.014499969252461947
SCOPE Var loss:  tensor(0.0402, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  6.120085835006211
SCOPE mean: 0.4182087240737377, SCOPE var: 0.03439496238293219
Total Loss: 0.6522064849802002
----------------------------------------
Epoch 3
IS mean: 0.46514684963012803,IS variance: 0.014499969252461947
SCOPE Var loss:  tensor(0.0393, dtype=torch.float64, grad_fn=<VarBackward0>)
MSE loss:  5.931176821983479
SCOPE mean: 0.4164676401268267, SCOPE var: 0.034900649650177674
Total Loss: 0.6324637253841687
----------------------------------------
Epoch 4
IS mean: 0.46514684963012803,IS variance: 0.0144999692

NN_l1_l2_reg(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
  )
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

# train var scope

In [None]:
def train_var_scope(model, num_epochs, learning_rate, test1):

    padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors, padded_states_next_tensors, padded_states_current_tensors = test1.prepare()

    model.train()

    # Enable anomaly detection
    torch.autograd.set_detect_anomaly(True)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0

        # Forward pass
        # states_output, states_first_output, states_last_output = test1.pass_states(model, padded_state_tensors, states_first_tensor, states_last_tensor)
        # sums_states_weight_diff = test1.states_weight_diff_sums(states_output, padded_weight_diff_tensors)
        # gamma_weights_states_last_sub_states_first = test1.last_first_terms_operations(gamma_weights_last_tensor, states_last_output, states_first_output, weight_first_tensor)
        # # sample_sums_states_weight_diff, samples_gamma_weight_states_last_sub_states_first, samples_all_shaping, samples_IS_SCOPE = test1.bootstrap_shaping_terms(sums_states_weight_diff, gamma_weights_states_last_sub_states_first, IS_tensor)

        # samples_IS, sample_sums_states_weight_diff, samples_gamma_weight_states_last_sub_states_first, samples_all_shaping, samples_IS_SCOPE = test1.bootstrap_all_terms(sums_states_weight_diff, gamma_weights_states_last_sub_states_first, IS_tensor, padded_psi_tensors)


        # Calculate MSE loss between states_output and padded_state_tensors
        # mse_loss = F.mse_loss(states_output, padded_state_tensors)

        # E_IS_sq, E_IS_all_sq, E_s_wdiff_sq, E_s_wdiff_all_sq, E_IS_SCOPE, E_IS_E_SCOPE, _, variance_loss, E_IS, E_SCOPE = calculate_shaped_variance_play(samples_IS, sample_sums_states_weight_diff, samples_gamma_weight_states_last_sub_states_first, samples_all_shaping, samples_IS_SCOPE)

        timestep_bootstraps, rewards_bootstraps, weights_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps = test1.pass_then_boostraps(model, padded_states_next_tensors, padded_states_current_tensors, padded_timestep_tensors, padded_reward_tensors, padded_weight_tensors)
        IS_variance, variance_loss = test1.calc_variance_straight(timestep_bootstraps, weights_bootstraps, rewards_bootstraps, phi_states_next_bootstraps, phi_states_current_bootstraps)
        print(f"Epoch {epoch+1}")
        print("IS variance: ", IS_variance)
        print("SCOPE Var loss: ", variance_loss)
        # print("MSE loss: ", mse_loss.item())


        tot = variance_loss
        # tot = variance_loss + mse_loss

        optimizer.zero_grad()

        # Retain the graph to avoid clearing it before backward pass
        tot.backward(retain_graph=True)

        optimizer.step()

        total_loss += tot.item()

        print(f"Total Loss: {total_loss}")
        print("-" * 40)

    # Disable anomaly detection after running the code
    torch.autograd.set_detect_anomaly(False)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"Parameter name: {name}")
            print(f"Weights: {param.data}")

    return model


# Test

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
pi_e = experiment_actions(200, env, P_pi_e)

In [None]:
model_200 = CustomizableFeatureNet(input_dim=2, hidden_dims=[16, 32], output_dim=1, dtype = torch.float64)

In [None]:
test_200 = SCOPE_straight(model_200, 0.9, 10000, pi_b, P_pi_b, P_pi_e, dtype = torch.float64)

In [None]:
model_200 = train_var_scope(model_200, 1000, 0.0005, test_200)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0008492256656322879
----------------------------------------
Epoch 22
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.000832036997470311
----------------------------------------
Epoch 23
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0008178091443168292
----------------------------------------
Epoch 24
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0008, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.0008067588044517235
----------------------------------------
Epoch 25
IS variance:  tensor(4.4229e-05, dtype=torch.float64)
SCOPE Var loss:  tenso

# Test 200 0.99

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e = experiment_actions(200, env, P_pi_e)

In [None]:
P_pi_b = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b = experiment_actions(200, env_30, P_pi_b)
P_pi_e = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e = experiment_actions(200, env, P_pi_e)
model_200_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float64)
test_200_0p99 = SCOPE_straight(model_200_0p99, 0.99, 10000, pi_b, P_pi_b, P_pi_e, dtype = torch.float64)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.8547, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.8546716723749704
----------------------------------------
Epoch 7
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.8345, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.8344950296004069
----------------------------------------
Epoch 8
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.8157, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.815702575610253
----------------------------------------
Epoch 9
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.7972, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.7971985257757953
----------------------------------------
Epoch 10
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(0.7790, dtype

In [None]:
model_200_0p99 = train_var_scope(model_200_0p99, 200, 0.001, test_200_0p99)

Epoch 1
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(6.2321e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 6.232071548283213e-05
----------------------------------------
Epoch 2
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(5.8212e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 5.821238477719147e-05
----------------------------------------
Epoch 3
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(5.4394e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 5.439401945160914e-05
----------------------------------------
Epoch 4
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(5.0930e-05, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 5.093024913095342e-05
----------------------------------------
Epoch 5
IS variance:  tensor(6.4534e-06, dtype=torch.float64)
SCOPE Var loss:  tensor(4.7785e-05, dtype=torch.float64, grad_fn=<

# Test 400 0.99

In [None]:
P_pi_b_400 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_400 = experiment_actions(400, env, P_pi_b_400)
P_pi_e_400 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_400 = experiment_actions(400, env, P_pi_e_400)
model_400_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float64)
test_400_0p99 = SCOPE_straight(model_400_0p99, 0.99, 1000, pi_b_400, P_pi_b_400, P_pi_e_400, dtype = torch.float64)
model_400_0p99 = train_var_scope(model_400_0p99, 5, 0.001, test_400_0p99)

Epoch 1
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.0864, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.08636927251674584
----------------------------------------
Epoch 2
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2360, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.23601228060459742
----------------------------------------
Epoch 3
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2284, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.22842130839536168
----------------------------------------
Epoch 4
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2203, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.22034274292801542
----------------------------------------
Epoch 5
IS variance:  tensor(0.0002, dtype=torch.float64)
SCOPE Var loss:  tensor(0.2122, dtype=torch.float64, grad_fn=<VarBackward0>)
Total Loss: 0.21219201524204384
-

# Test 600 0.99

In [None]:
P_pi_b_600 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_600 = experiment_actions(600, env_30, P_pi_b_600)
P_pi_e_600 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_600 = experiment_actions(600, env, P_pi_e_600)
model_600_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float32)
test_600_0p99 = SCOPE_straight(model_600_0p99, 0.99, 10000, pi_b_600, P_pi_b_600, P_pi_e_600, dtype = torch.float32)
model_600_0p99 = train_var_scope(model_600_0p99, 5, 0.001, test_600_0p99)

Epoch 1
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0505, grad_fn=<VarBackward0>)
Total Loss: 0.05046245828270912
----------------------------------------
Epoch 2
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0493, grad_fn=<VarBackward0>)
Total Loss: 0.0493154413998127
----------------------------------------
Epoch 3
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0481, grad_fn=<VarBackward0>)
Total Loss: 0.04812745749950409
----------------------------------------
Epoch 4
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0469, grad_fn=<VarBackward0>)
Total Loss: 0.046937569975852966
----------------------------------------
Epoch 5
IS variance:  tensor(0.0007)
SCOPE Var loss:  tensor(0.0458, grad_fn=<VarBackward0>)
Total Loss: 0.04576045647263527
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[ 0.1204,  0.1797],
        [ 0.1732,  0.2432],
        [ 0.6242,  0.4461],
        [ 0.4436,  0.0618],
        [ 

# Test 800 0.99

In [None]:
P_pi_b_800 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_800 = experiment_actions(800, env_30, P_pi_b_800)
P_pi_e_800 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_800 = experiment_actions(800, env_30, P_pi_e_800)
model_800_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float32)
test_800_0p99 = SCOPE_straight(model_800_0p99, 0.99, 10000, pi_b_800, P_pi_b_800, P_pi_e_800, dtype = torch.float32)
model_800_0p99 = train_var_scope(model_800_0p99, 5, 0.001, test_800_0p99)

Epoch 1
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1004, grad_fn=<VarBackward0>)
Total Loss: 0.10040785372257233
----------------------------------------
Epoch 2
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1327, grad_fn=<VarBackward0>)
Total Loss: 0.13271035254001617
----------------------------------------
Epoch 3
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1297, grad_fn=<VarBackward0>)
Total Loss: 0.12969112396240234
----------------------------------------
Epoch 4
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1267, grad_fn=<VarBackward0>)
Total Loss: 0.12670785188674927
----------------------------------------
Epoch 5
IS variance:  tensor(6.2439e-07)
SCOPE Var loss:  tensor(0.1238, grad_fn=<VarBackward0>)
Total Loss: 0.12376594543457031
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[ 0.1256,  0.1763],
        [ 0.1731,  0.2431],
        [ 0.6243,  0.4461],
        [ 0.4445, 

# Test 1000 0.99

In [None]:
P_pi_b_1000 = action_probs_top_n_epsilon(q_table, 1, 0.4)
pi_b_1000 = experiment_actions(1000, env_30, P_pi_b_1000)
P_pi_e_1000 = action_probs_top_n_epsilon(q_table, 2, 0.05)
# pi_e_1000 = experiment_actions(1000, env, P_pi_e_1000)
model_1000_0p99 = CustomizableFeatureNet(input_dim=2, hidden_dims=[6, 6], output_dim=1, dtype = torch.float32)
test_1000_0p99 = SCOPE_straight(model_1000_0p99, 0.90, 10000, pi_b_1000, P_pi_b_1000, P_pi_e_1000, dtype = torch.float32)
model_1000_0p99 = train_var_scope(model_1000_0p99, 5, 0.001, test_1000_0p99)

Epoch 1
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0071, grad_fn=<VarBackward0>)
Total Loss: 0.0071372101083397865
----------------------------------------
Epoch 2
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0059, grad_fn=<VarBackward0>)
Total Loss: 0.005885153077542782
----------------------------------------
Epoch 3
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0057, grad_fn=<VarBackward0>)
Total Loss: 0.005744975060224533
----------------------------------------
Epoch 4
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0056, grad_fn=<VarBackward0>)
Total Loss: 0.005605767946690321
----------------------------------------
Epoch 5
IS variance:  tensor(1.3628e-08)
SCOPE Var loss:  tensor(0.0055, grad_fn=<VarBackward0>)
Total Loss: 0.005469260271638632
----------------------------------------
Parameter name: hidden_layers.0.weight
Weights: tensor([[-0.3398,  0.4284],
        [ 0.6470, -0.5601],
        [-0.0836, -0.5419],
        [ 0.