In [None]:
import numpy as np
import os
from google.colab import drive
import pickle
import warnings
try:
    # NumPy >= 1.25 exposes the category here; NumPy 2.0 removed the top-level alias.
    from numpy.exceptions import VisibleDeprecationWarning as _VisibleDeprecationWarning
except ImportError:
    # Older NumPy releases only provide the top-level name.
    _VisibleDeprecationWarning = np.VisibleDeprecationWarning
# `np.warnings` was an accidental re-export of the stdlib module and was removed in
# NumPy 1.24, so the stdlib `warnings` module is used directly.
warnings.filterwarnings('ignore', category=_VisibleDeprecationWarning)
from scipy.optimize import minimize
import random
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Creating Environment

In [None]:

class GridWorld:
    """Rectangular grid environment with per-cell rewards.

    States are ``(x, y)`` tuples with ``0 <= x < width`` and ``0 <= y < height``.
    Rewards are fixed at construction time (see ``generate_state_rewards``) and
    are paid for the cell the agent *arrives* in after each step.

    Parameters
    ----------
    height, width : int
        Grid dimensions.
    start, end : tuple
        Start and terminal cells, compared by equality against ``(x, y)`` tuples.
    bad_regions, good_regions : collection of tuple
        Cells that receive ``bad_region_reward`` / ``good_region_reward``.
    good_region_reward, bad_region_reward, final_reward : number
        Rewards for good cells, bad cells and the terminal cell.
    sparsity : float
        Probability that an otherwise unremarkable cell gets reward 0.5
        instead of 0.0.
    """

    def __init__(self, height, width, start, end, bad_regions, good_regions, good_region_reward, bad_region_reward, final_reward, sparsity):
        self.height = height
        self.width = width
        self.start = start
        self.end = end
        self.bad_regions = bad_regions
        self.good_regions = good_regions
        self.good_region_reward = good_region_reward
        self.bad_region_reward = bad_region_reward
        self.final_reward = final_reward
        self.sparsity = sparsity

        self.state_rewards = self.generate_state_rewards()
        self.reset()

    def reset(self):
        """Move the agent back to the start cell."""
        self.agent_position = self.start

    def generate_state_rewards(self):
        """Build the ``{(x, y): reward}`` table for every cell of the grid.

        Priority is good region, bad region, start (0), end (``final_reward``),
        then a random 0.5/0.0 draw. One ``np.random.random()`` call is made per
        remaining cell, iterating x first and y second, so the table is
        reproducible under a fixed NumPy seed.
        """
        state_rewards = {}
        for x in range(self.width):
            for y in range(self.height):
                cell = (x, y)
                if cell in self.good_regions:
                    state_rewards[cell] = self.good_region_reward
                elif cell in self.bad_regions:
                    state_rewards[cell] = self.bad_region_reward
                elif cell == self.start:
                    state_rewards[cell] = 0
                elif cell == self.end:
                    state_rewards[cell] = self.final_reward
                else:
                    state_rewards[cell] = 0.5 if np.random.random() < self.sparsity else 0.0
        return state_rewards

    def step(self, action):
        """Apply ``action`` and return ``(new_position, reward, done)``.

        ``action`` is one of ``"up"``, ``"down"``, ``"left"``, ``"right"``; "up"
        increases ``y`` and "right" increases ``x``. A move that would leave the
        grid (or an unrecognised action) keeps the agent in place. The reward is
        the one stored for the cell the agent ends up in, and ``done`` is True
        exactly when that cell equals ``self.end``.
        """
        x, y = self.agent_position

        if action == "up" and y < self.height - 1:
            y += 1
        elif action == "down" and y > 0:
            y -= 1
        elif action == "left" and x > 0:
            x -= 1
        elif action == "right" and x < self.width - 1:
            x += 1

        self.agent_position = (x, y)

        # Reward of the cell just entered; cells missing from the table pay 0.
        reward = self.state_rewards.get(self.agent_position, 0)

        # Termination is an equality test against the end cell. (A membership
        # test ``position in self.end`` would compare the tuple against the
        # individual coordinates of ``end`` and never match.)
        done = (self.agent_position == self.end)

        return self.agent_position, reward, done



In [None]:
import numpy as np

class Agent:
    """Epsilon-greedy wrapper around an externally supplied policy callable."""

    def __init__(self, epsilon=0.0):
        # Probability of ignoring the policy and acting uniformly at random.
        self.epsilon = epsilon

    def select_action(self, policy_func):
        """Return an action name.

        One uniform draw decides whether to explore. When exploring, one of the
        four moves is picked uniformly at random; otherwise the zero-argument
        callable ``policy_func`` is invoked and its result returned.
        """
        explore = np.random.uniform() < self.epsilon
        if not explore:
            # Defer to the caller-provided policy.
            return policy_func()
        # Exploration branch: uniform choice over the four moves.
        return np.random.choice(["up", "down", "left", "right"])

# Define different policy functions outside the class

def random_policy():
    """Pick one of the four grid moves uniformly at random."""
    moves = ["up", "down", "left", "right"]
    return np.random.choice(moves)

# def behavior_policy(behav_policy):
#     action_probs = behav_policy
#     return np.random.choice(list(action_probs.keys()), p=list(action_probs.values()))

# def evaluation_policy(eval_policy):
#     action_probs = eval_policy
#     return np.random.choice(list(action_probs.keys()), p=list(action_probs.values()))

def run_policy(policy):
    """Sample an action from ``policy``, a mapping of action name -> probability."""
    actions = list(policy.keys())
    probabilities = list(policy.values())
    return np.random.choice(actions, p=probabilities)


def manhattan_distance(pos1, pos2):
    """Return the L1 distance between two positions, using their first two coordinates."""
    dx = abs(pos1[0] - pos2[0])
    dy = abs(pos1[1] - pos2[1])
    return dx + dy

# Action distributions shared by the whole notebook: the evaluation policy
# favours "up"/"right", the behaviour policy is uniform over the four moves.
eval_policy = dict(up=0.4, down=0.1, left=0.1, right=0.4)
behav_policy = dict(up=0.25, down=0.25, left=0.25, right=0.25)

# Generating Policy data

In [None]:
# Gridworld layout: a 5x5 board with the start and goal in opposite corners
height, width = 5, 5
start, end = (0, 0), (4, 4)

In [None]:

good_regions = [(3, 3)]
bad_regions = [(1, 1), (2, 2)]

# Seed NumPy first: GridWorld draws the sparse 0.5/0.0 rewards while it is constructed.
np.random.seed(42)

# Build the shared environment used by the rest of the notebook
gridworld = GridWorld(
    height,
    width,
    start,
    end,
    bad_regions,
    good_regions,
    good_region_reward=1,
    bad_region_reward=-2,
    final_reward=3,
    sparsity=0.5,
)

# Reward table keyed by (x, y)
rewards = gridworld.state_rewards

# Show the reward assigned to every cell
for state, reward in rewards.items():
    print(f"State: {state}, Reward: {reward}")


In [None]:
def create_policy_set(env, policy_func, policy, num_episodes, max_steps=None):
    """Roll out ``num_episodes`` episodes in ``env`` and return their trajectories.

    Parameters
    ----------
    env : environment exposing ``reset()``, ``step(action)``, ``agent_position``,
        ``good_regions`` and ``bad_regions``.
    policy_func : callable
        Called as ``policy_func(policy)`` to sample an action.
    policy : object
        Passed through unchanged to ``policy_func``.
    num_episodes : int
        Number of episodes to simulate.
    max_steps : int or None, optional
        Upper bound on the number of steps per episode. ``None`` (the default)
        keeps the original behaviour of running until the environment reports
        ``done``, which never returns if the policy cannot reach the goal.

    Returns
    -------
    list of list of tuple
        One list per episode; each step is
        ``(state, action, reward, next_state, good_region_distances,
        bad_region_distances)``, where the distance lists are Manhattan
        distances from ``state`` (the pre-step position) to every good / bad
        region.
    """
    # The agent holds no per-episode state (epsilon only), so one instance suffices.
    agent = Agent(epsilon=0.0)

    policies = []
    for _ in range(num_episodes):
        env.reset()
        done = False
        trajectory = []
        while not done and (max_steps is None or len(trajectory) < max_steps):
            state = env.agent_position
            action = agent.select_action(lambda: policy_func(policy))
            next_state, reward, done = env.step(action)

            # Feature values are measured from the state the action was taken in.
            good_region_distances = [manhattan_distance(state, gr) for gr in env.good_regions]
            bad_region_distances = [manhattan_distance(state, br) for br in env.bad_regions]

            trajectory.append((state, action, reward, next_state, good_region_distances, bad_region_distances))

        policies.append(trajectory)

    return policies


In [None]:
# Behaviour data set: 200 episodes rolled out on `gridworld`, sampling actions with
# run_policy from behav_policy. Later cells split this list with subset_policies.
pi_b = create_policy_set(gridworld, run_policy, behav_policy, 200)

In [None]:
# Peek at the first step of the first trajectory: tuple slots 4 and 5 hold the
# good-region and bad-region distance lists, concatenated here into one list.
pi_b[0][0][4]+pi_b[0][0][5]

In [None]:
def calc_V_pi_e(evaluation_policies, gamma=0.9):
    """Monte-Carlo estimate of a policy's value from its own rollouts.

    Parameters
    ----------
    evaluation_policies : sequence of trajectories
        Each trajectory is a sequence of step tuples whose element ``[2]`` is
        the reward received at that step.
    gamma : float, optional
        Discount factor applied as ``gamma ** t`` to the reward at step ``t``.
        Defaults to 0.9, the value that was previously hard-coded.

    Returns
    -------
    float
        Mean over trajectories of the discounted return
        ``sum_t gamma**t * reward_t``.

    Raises
    ------
    ValueError
        If ``evaluation_policies`` is empty (the mean would be undefined).
    """
    num_trajectories = len(evaluation_policies)
    if num_trajectories == 0:
        raise ValueError("calc_V_pi_e requires at least one trajectory")

    discounted_returns = [
        sum(gamma ** t * step[2] for t, step in enumerate(trajectory))
        for trajectory in evaluation_policies
    ]
    return sum(discounted_returns) / num_trajectories

# Saving and Loading Data

In [None]:
def filename(env, behav_policy, eval_policy, num_episodes, train_split, sparsity):
    """Build the cache-file name that identifies one experiment configuration.

    The name encodes both policies' probabilities (two decimals, in dict order),
    the environment's good/bad region coordinates and rewards, the number of
    trajectories, the train split and the sparsity.
    """
    def join_probs(policy):
        # Probabilities in the policy's own key order, two decimals each.
        return "_".join(f"{prob:.2f}" for prob in policy.values())

    good_regions_str = "_".join(f"gr_{pos[0]}_{pos[1]}" for pos in env.good_regions)
    bad_regions_str = "_".join(f"br_{pos[0]}_{pos[1]}" for pos in env.bad_regions)

    behav_probs_str = join_probs(behav_policy)
    eval_probs_str = join_probs(eval_policy)

    return f"pi_b_{behav_probs_str}_pi_e_{eval_probs_str}_{good_regions_str}_{env.good_region_reward}_{bad_regions_str}_{env.bad_region_reward}_trajectories_{num_episodes}_train_split_{train_split}_sparsity_{sparsity}.txt"

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:

# # Define the path to your desired folder
# folder_path = '/content/drive/MyDrive/gridworld_same_reward_OPE_experiments'

# # Change the working directory to the specified folder
# os.chdir(folder_path)


In [None]:

def save_data_to_file(data, filename):
    """Pickle ``data`` to the path ``filename``, overwriting any existing file."""
    with open(filename, 'wb') as out_handle:
        pickle.dump(data, out_handle)

def load_data_from_file(filename):
    """Read and return the pickled object stored at ``filename``.

    Unpickling can execute arbitrary code, so this should only be pointed at
    files written by this notebook (or another trusted source).
    """
    with open(filename, 'rb') as in_handle:
        return pickle.load(in_handle)


# OPE Calculations

## Importance Weights

In [None]:
def calculate_importance_weights(eval_policy, behav_policy, behavior_policies):
    """Cumulative importance ratios for every step of every trajectory.

    For each trajectory the result holds, at position ``t``, the product over
    steps ``0..t`` of ``eval_policy[action] / behav_policy[action]``, where the
    action is element ``[1]`` of the step tuple. Returns a list of such lists,
    one per trajectory.
    """
    def cumulative_ratios(trajectory):
        running = 1
        history = []
        for step in trajectory:
            action = step[1]
            running = running * (eval_policy[action] / behav_policy[action])
            history.append(running)
        return history

    return [cumulative_ratios(trajectory) for trajectory in behavior_policies]

In [None]:

# def per_step_IS(scope_set, num_bootstraps):
#     all_timesteps = []
#     gamma = 0.9
#     # scope_set,_ = subset_policies(scope_set, phi_trajectories)
#     scope_weights = calculate_importance_weights(eval_policy, behav_policy, scope_set)
#     for j in range(len(scope_weights)):
#         Timestep_values = []
#         for i in range(len(scope_weights[j]) - 1):
#             timestep = gamma ** (i) * scope_weights[j][i] * scope_set[j][i][2]
#             Timestep_values.append(timestep)

#         all_timesteps.append(Timestep_values)

#     V_per_traj = [sum(sublist) for sublist in all_timesteps]

#     # seed_value = 42
#     # np.random.seed(seed_value)

#     num_trajectories_to_sample = max(1, len(V_per_traj))

#     seed_value = 0
#     np.random.seed(seed_value)

#     bootstrap_samples = [np.random.choice(V_per_traj, size=num_trajectories_to_sample, replace=True)
#                          for _ in range(num_bootstraps)]

#     V_per_sample = [sum(sample) / len(scope_set) for sample in bootstrap_samples]
#     V_per_sample = np.array(V_per_sample)

#     std_deviation = np.std(V_per_sample)
#     quartiles = np.percentile(V_per_sample, [0,25, 50, 75,100])
#     max_value = np.max(V_per_sample)
#     min_value = np.min(V_per_sample)
#     mean = np.mean(V_per_sample)

#     return {
#         'std_deviation': std_deviation,
#         'quartiles': quartiles,
#         'max_value': max_value,
#         'min_value': min_value,
#         'mean': mean
#     }


## IS

In [None]:

# def per_step_IS1(scope_set, num_bootstraps):
#     all_timesteps = []
#     gamma = 0.9
#     # scope_set,_ = subset_policies(scope_set, phi_trajectories)
#     scope_weights = calculate_importance_weights(eval_policy, behav_policy, scope_set)
#     for j in range(len(scope_weights)):
#         Timestep_values = []
#         for i in range(len(scope_weights[j]) - 1):
#             timestep = gamma ** (i) * scope_weights[j][i] * scope_set[j][i][2]
#             Timestep_values.append(timestep)

#         all_timesteps.append(Timestep_values)

#     V_per_traj = [sum(sublist) for sublist in all_timesteps]


#     num_trajectories_to_sample = max(1, len(V_per_traj))


#     V_per_traj = [sum(sublist) for sublist in all_timesteps]
#     num_trajectories_to_sample = max(1, len(V_per_traj))

#     std_devs = []
#     means = []

#     seed_value = 0
#     np.random.seed(seed_value)

#     for i in range(5):

#       bootstrap_samples = [np.random.choice(V_per_traj, size=num_trajectories_to_sample, replace=True)
#                             for _ in range(num_bootstraps)]

#       V_per_sample = [sum(sample)/len(scope_set) for sample in bootstrap_samples]
#       V_per_sample = np.array(V_per_sample)

#       std_deviation = np.std(V_per_sample)
#       quartiles = np.percentile(V_per_sample, [0,25, 50, 75,100])
#       max_value = np.max(V_per_sample)
#       min_value = np.min(V_per_sample)
#       mean = np.mean(V_per_sample)

#       std_devs.append(std_deviation)
#       means.append(mean)

#     return {
#         'std_devs_list': std_devs,
#         'mean_list': means
#     }



In [None]:

# def SCOPE(scope_policies, beta, num_bootstraps):
#     all_timesteps = []
#     gamma = 0.9
#     # scope_policies,_ = subset_policies(scope_policies, phi_trajectories)
#     scope_weights = calculate_importance_weights(eval_policy, behav_policy, scope_policies)
#     for j in range(len(scope_weights)):
#         Timestep_values = []
#         for i in range(len(scope_weights[j]) - 1):
#             features = scope_policies[j][i][5] + scope_policies[j][i][6]
#             features_next = scope_policies[j][i + 1][5] + scope_policies[j][i + 1][6]
#             timestep = gamma ** (i) * scope_weights[j][i] * (scope_policies[j][i][2] + gamma * phi(features_next, beta) - phi(features, beta))
#             Timestep_values.append(timestep)

#         all_timesteps.append(Timestep_values)


#     V_per_traj = [sum(sublist) for sublist in all_timesteps]


#     num_trajectories_to_sample = max(1, len(V_per_traj))

#     bootstrap_samples = [np.random.choice(V_per_traj, size=num_trajectories_to_sample, replace=True)
#                          for _ in range(num_bootstraps)]

#     V_per_sample = [sum(sample)/len(scope_policies) for sample in bootstrap_samples]
#     V_per_sample = np.array(V_per_sample)

#     std_deviation = np.std(V_per_sample)
#     quartiles = np.percentile(V_per_sample, [0,25, 50, 75,100])
#     max_value = np.max(V_per_sample)
#     min_value = np.min(V_per_sample)
#     mean = np.mean(V_per_sample)

#     return {
#         'std_deviation': std_deviation,
#         'quartiles': quartiles,
#         'max_value': max_value,
#         'min_value': min_value,
#         'mean': mean
#     }


## SCOPE

In [None]:

# def SCOPE1(scope_policies, beta, num_bootstraps):
#     all_timesteps = []
#     gamma = 0.9
#     # scope_policies,_ = subset_policies(scope_policies, phi_trajectories)
#     scope_weights = calculate_importance_weights(eval_policy, behav_policy, scope_policies)
#     for j in range(len(scope_weights)):
#         Timestep_values = []
#         for i in range(len(scope_weights[j]) - 1):
#             features = scope_policies[j][i][5] + scope_policies[j][i][6]
#             features_next = scope_policies[j][i + 1][5] + scope_policies[j][i + 1][6]
#             timestep = gamma ** (i) * scope_weights[j][i] * (scope_policies[j][i][2] + gamma * phi(features_next, beta) - phi(features, beta))
#             Timestep_values.append(timestep)

#         all_timesteps.append(Timestep_values)


#     V_per_traj = [sum(sublist) for sublist in all_timesteps]
#     num_trajectories_to_sample = max(1, len(V_per_traj))

#     std_devs = []
#     means = []
#     seed_value = 0
#     np.random.seed(seed_value)
#     for i in range(5):

#       bootstrap_samples = [np.random.choice(V_per_traj, size=num_trajectories_to_sample, replace=True)
#                             for _ in range(num_bootstraps)]

#       V_per_sample = [sum(sample)/len(scope_policies) for sample in bootstrap_samples]
#       V_per_sample = np.array(V_per_sample)

#       std_deviation = np.std(V_per_sample)
#       quartiles = np.percentile(V_per_sample, [0,25, 50, 75,100])
#       max_value = np.max(V_per_sample)
#       min_value = np.min(V_per_sample)
#       mean = np.mean(V_per_sample)

#       std_devs.append(std_deviation)
#       means.append(mean)

#     return {
#         'std_devs_list': std_devs,
#         'mean_list': means
#     }


# Variance Preparation and Calculation

## Phi functions

In [None]:
# def phi(features, beta):
#   features = np.array(features)
#   beta = np.array(beta)
#   phi_linear = np.dot(beta,features)
#   return phi_linear


In [None]:
# def phi2(features, beta):
#     features = np.array(features)
#     beta = np.array(beta)

#     # Calculate the matrix multiplication and then element-wise multiplication
#     phi_quadratic = np.dot(features, np.dot(beta, features.T))

#     return phi_quadratic

## Subset Policies

In [None]:

def subset_policies(policies, percent_to_estimate_phi):
    """Split ``policies`` into ``(policies_for_scope, policies_for_phi)``.

    The first ``int(len(policies) * percent_to_estimate_phi)`` entries form the
    phi-estimation set; the remainder forms the scope set. The split is a plain
    positional slice. As a side effect the global NumPy RNG is re-seeded with 0
    on every call.
    """
    np.random.seed(0)
    split_at = int(len(policies) * percent_to_estimate_phi)
    return policies[split_at:], policies[:split_at]


## Variance Terms

In [None]:
# import random
# # gamma = 0.9
# # beta = [random.random() for _ in range(3)]
# def variance_terms(policy_set,gamma, beta):
#   all_weights = calculate_importance_weights(eval_policy, behav_policy, policy_set)
#   y_w_r_all = 0
#   r_all = 0
#   f_a = 0
#   for n in range(len(policy_set)):
#     y_w_r = 0
#     r = 0
#     for t in range(len(policy_set[n])-1):
#       features = policy_set[n][t][0]
#       y_w_r += gamma**(t)*all_weights[n][t]*policy_set[n][t][2]
#       if t>0:
#         r += phi(features, beta)*(all_weights[n][t-1]-all_weights[n][t])
#     features_last = policy_set[n][-1][0]
#     features_first = policy_set[n][0][0]
#     y_w_r_all += y_w_r
#     f_a +=  gamma**(len(policy_set[n]))*all_weights[n][-1]*phi(features_last,beta) - phi(features_first, beta)
#     r_all += r

#   IS = y_w_r_all/len(policy_set)
#   R = r_all/len(policy_set)
#   F = f_a/len(policy_set)
#   return IS, R, F


In [None]:
def variance_terms(policy_set, gamma, feature_network):
    """Compute the three per-sample averages (IS, R, F) used by calc_variance.

    Args:
        policy_set: indexable collection of trajectories; each step is indexed
            as step[0] (state), step[1] (action, read by
            calculate_importance_weights) and step[2] (reward).
        gamma: discount factor.
        feature_network: callable applied to a float32 tensor built from a state.

    Returns:
        (IS, R, F), each the corresponding accumulated sum divided by
        len(policy_set).

    Notes:
        Importance weights are computed from the module-level ``eval_policy``
        and ``behav_policy``, not from arguments. ``torch`` must already be
        imported when this function is called.
    """
    all_weights = calculate_importance_weights(eval_policy, behav_policy, policy_set)
    y_w_r_all = 0
    r_all = 0
    f_a = 0
    for n in range(len(policy_set)):
        y_w_r = 0
        r = 0
        # The range stops one short, so the final step of each trajectory is
        # excluded from both the discounted-reward and the r accumulation.
        for t in range(len(policy_set[n]) - 1):
            state = policy_set[n][t][0]
            # Evaluated for every t, but only consumed below when t > 0.
            distances = feature_network(torch.tensor(state, dtype=torch.float32))
            y_w_r += gamma**(t) * all_weights[n][t] * policy_set[n][t][2]
            if t > 0:
                # Network output weighted by the change in cumulative weight between t-1 and t.
                # NOTE(review): no gamma**t factor is applied here - confirm against the derivation.
                r += distances * (all_weights[n][t-1] - all_weights[n][t])
        state_last = policy_set[n][-1][0]
        state_first = policy_set[n][0][0]
        y_w_r_all += y_w_r
        # By operator precedence this is (gamma**T * w_last * f(last_state)) - f(first_state).
        f_a += gamma**(len(policy_set[n])) * all_weights[n][-1] * feature_network(torch.tensor(state_last, dtype=torch.float32)) - feature_network(torch.tensor(state_first, dtype=torch.float32))
        r_all += r

    IS = y_w_r_all / len(policy_set)
    R = r_all / len(policy_set)
    F = f_a / len(policy_set)
    return IS, R, F


In [None]:
def calc_variance(phi_policies, gamma, beta, num_bootstrap_samples):
    """Bootstrap estimate of the SCOPE and IS estimator variances.

    Parameters
    ----------
    phi_policies : sequence of trajectories
        Trajectories to resample from (with replacement).
    gamma : float
        Discount factor forwarded to ``variance_terms``. Previously this
        argument was ignored and 0.9 was always used.
    beta : object
        Forwarded unchanged as the third argument of ``variance_terms``.
    num_bootstrap_samples : int
        Number of bootstrap resamples; each has ``max(1, len(phi_policies))``
        trajectories.

    Returns
    -------
    (variance_scope, variance_is)

    Notes
    -----
    The global NumPy RNG is re-seeded with 0 on every call, so repeated calls
    draw the same resamples.
    NOTE(review): aggregation uses ``np.mean``, so results are NumPy values even
    if ``variance_terms`` returns torch tensors - confirm this is intended
    wherever the result is used as a differentiable loss.
    """
    np.random.seed(0)
    num_trajectories_to_sample = max(1, len(phi_policies))

    # Resample trajectory *indices* rather than the trajectories themselves:
    # np.random.choice needs a 1-D array and cannot take a (possibly ragged)
    # list of trajectories. Index sampling consumes the same random draws.
    bootstrap_samples = []
    for _ in range(num_bootstrap_samples):
        picked = np.random.choice(len(phi_policies), size=num_trajectories_to_sample, replace=True)
        bootstrap_samples.append([phi_policies[i] for i in picked])

    IS_all = []
    R_all = []
    F_all = []
    for pol in bootstrap_samples:
        IS, R, F = variance_terms(pol, gamma, beta)
        IS_all.append(IS)
        R_all.append(R)
        F_all.append(F)

    # Second-moment terms.
    IS_sq = np.mean([num**2 for num in IS_all])
    IS_R_F = 2*np.mean([IS_all[i]*(R_all[i]+F_all[i]) for i in range(len(IS_all))])
    R_sq = np.mean([num**2 for num in R_all])
    # Squared-mean terms subtracted from them.
    IS_sq_all = (np.mean(IS_all))**2
    IS_r_t_f = 2*np.mean(IS_all)*np.mean([R_all[i]+F_all[i] for i in range(len(R_all))])
    R_sq_all = (np.mean(R_all))**2

    variance_scope = IS_sq + IS_R_F + R_sq - IS_sq_all - IS_r_t_f - R_sq_all
    variance_is = IS_sq - IS_sq_all
    return variance_scope, variance_is

In [None]:
# def calc_variance1(phi_policies, gamma, beta, num_bootstrap_samples):
#   # Set the seed value (you can use any integer value)
#   # seed_value = 42
#   # np.random.seed(seed_value)
#   num_trajectories_to_sample = max(1, len(phi_policies))

#   bootstrap_samples = [np.random.choice(phi_policies, size=num_trajectories_to_sample, replace=True)
#                          for _ in range(num_bootstrap_samples)]
#   IS_all = []
#   R_all = []
#   F_all = []

#   for pol in bootstrap_samples:
#     IS, R, F = variance_terms(pol,0.9,beta)
#     IS_all.append(IS)
#     R_all.append(R)
#     F_all.append(F)
#   IS_sq = np.mean([num**2 for num in IS_all])
#   IS_R_F = 2*np.mean([IS_all[i]*(R_all[i]+F_all[i]) for i in range(len(IS_all))])
#   R_sq = np.mean([num**2 for num in R_all])
#   IS_sq_all = (np.mean(IS_all))**2
#   IS_r_t_f = 2*np.mean(IS_all)*np.mean([R_all[i]+F_all[i] for i in range(len(R_all))])
#   R_sq_all = (np.mean(R_all))**2

#   variance_scope = IS_sq + IS_R_F + R_sq - IS_sq_all - IS_r_t_f - R_sq_all
#   variance_is = IS_sq - IS_sq_all
#   return variance_scope, variance_is

An example of an initial guess for phi is shown below; as you can see, the resulting SCOPE variance is not ideal.

In [None]:
# scope_set, phi_set = subset_policies(behavior_policies, 0.3)
# variance_scope, variance_is = calc_variance(phi_set,0.9,[-0.1,.1,.1], 100, 0.3)
# print("Var SCOPE: ",variance_scope)
# print("Var IS: ",variance_is)
# print("Percent change in variance: ",((variance_scope-variance_is)/variance_is)*100)

# Optimization

Here we aim to optimize beta to minimize SCOPE variance.

In [None]:
# # Define the objective function to minimize variance_scope
# def objective_function(beta, phi_set):
#     # scope_set, phi_set = subset_policies(phi_set, phi_trajectories)
#     variance_scope, variance_is = calc_variance(phi_set, 0.9, beta, 100)
#     return variance_scope

# # Set the initial values of beta
# # initial_beta = np.array([ 0.2610704,   0.30396575, -0.43850237])


# def optimize_variance_scope(initial_beta, phi_set, phi_trajectories):
#     # Lists to store beta and variance_scope values at each iteration
#     all_betas = []
#     all_variance_scopes = []

#     # Callback function to record beta and variance_scope values at each iteration
#     def callback_function(beta):
#         all_betas.append(beta.copy())
#         variance_scope = objective_function(beta, phi_set)
#         all_variance_scopes.append(variance_scope)
#         print("Iteration:", len(all_betas))
#         print("Beta:", beta)
#         print("Variance Scope:", variance_scope)
#         print("----------")

#     # Run the optimization with the callback
#     result = minimize(
#         objective_function,
#         initial_beta,
#         args=(phi_set),
#         method='L-BFGS-B',
#         callback=callback_function
#     )

#     # Extract the optimal beta values
#     optimal_beta = result.x

#     return optimal_beta


# Reward Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# ... (previous code)

class FeatureNetwork(nn.Module):
    """Three fully connected layers, each followed by a ReLU.

    Maps an input of size ``input_size`` to a non-negative feature vector of
    size ``hidden_size``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Layers are created in fc1, fc2, fc3 order (this also fixes the order
        # in which their random initial weights are drawn).
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> ReLU and return the result."""
        activations = x
        for layer in (self.fc1, self.fc2, self.fc3):
            activations = torch.relu(layer(activations))
        return activations



In [None]:

# Training loop: fit the feature network by minimising the bootstrap SCOPE
# variance returned by calc_variance on the phi-estimation split of pi_b.
input_size = 2  # x, y coordinates
hidden_size = 3  # width of every layer, and therefore of the network's output

feature_network = FeatureNetwork(input_size, hidden_size)
optimizer = optim.Adam(feature_network.parameters(), lr=0.001)

num_epochs = 10
# subset_policies returns (scope split, phi split); only the phi split (first 30%) is used here.
_,policies_for_phi = subset_policies(pi_b, 0.3)
for epoch in range(num_epochs):
    for trajectory in policies_for_phi:
        for step in trajectory:
            state = step[0]
            # step[3] is the next_state slot of the trajectory tuple; this value is not used below.
            target_distances = step[3]

            # Forward pass on the current state; the result is not used by the loss below.
            distances = feature_network(torch.tensor(state, dtype=torch.float32))

            # The loss is computed over the whole phi split and does not depend on `step`,
            # so every inner iteration repeats the same 500-sample bootstrap.
            # NOTE(review): calc_variance aggregates with np.mean, so `loss` is presumably a
            # NumPy value without autograd history - confirm that loss.backward() can run.
            loss, _ = calc_variance(policies_for_phi, 0.9, feature_network, 500)

            # Zero the gradients, perform backpropagation, and update weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Reports the loss of the last inner iteration of the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")



In [None]:
# Apply the trained feature network to the held-out trajectories and compare the
# SCOPE variance with the plain IS variance on the same bootstrap resamples.
num_bootstrap_samples = 500  # same bootstrap count the training loss uses

# subset_policies returns the pair (scope/test split, phi/train split); unpack the
# pair itself rather than its first element.
policies_for_test, _ = subset_policies(pi_b, 0.3)

# variance_terms calls its third argument on state tensors, so the network itself
# (not a weight matrix) must be passed. No gradients are needed for evaluation.
with torch.no_grad():
    variance_scope_after_optimization, variance_is_test = calc_variance(policies_for_test, 0.9, feature_network, num_bootstrap_samples)

# No pre-training SCOPE variance is recorded anywhere in this notebook, so the
# reduction is reported against the IS baseline returned by the same call.
print(f"Variance reduction after optimization (vs. IS baseline): {variance_is_test - variance_scope_after_optimization}")

# ... (rest of the code)

# Playground to run individual trajectories

In [None]:
# env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 1, -2, 3, 0.1)
# behavior_policies = create_policy_set(env, run_policy,behav_policy, 400)
# initial_beta = [random.uniform(-0.5, 0.5) for _ in range(len(env.good_regions + env.bad_regions))]
# scope_set, phi_set = subset_policies(behavior_policies, 0.3)
# optimal_beta = optimize_variance_scope(initial_beta, phi_set, 0.3)
# variance_scope, variance_is = calc_variance(phi_set,0.9,optimal_beta, 500)
# print("Var SCOPE_phi: ",variance_scope)
# print("Var IS_phi: ",variance_is)
# print("Percent change in variance: ",((variance_scope-variance_is)/variance_is)*100)

In [None]:
# scope_results = SCOPE(scope_set,optimal_beta,500)
# IS_results = per_step_IS(scope_set,500)
# print("SCOPE results: ", scope_results)
# print("IS results: ", IS_results)
# evaluation_policies = create_policy_set(env, run_policy,eval_policy, 1000)
# true_evaluation = calc_V_pi_e(evaluation_policies)
# print("true eval: ", true_evaluation)

# Modify Data

In [None]:
def modify_data(env, num_episodes, behav_policy, eval_policy, phi_traj, sparsity):
  """Recompute SCOPE/IS results and true evaluations for a cached experiment.

  Loads the cache file for this configuration, recomputes ``scope_results`` and
  ``IS_results`` on the scope split with the stored ``optimal_beta``, refreshes
  ``'True Evaluations'`` from five batches of 1000 evaluation-policy rollouts,
  writes the updated dictionary back to the same file and returns it.

  Returns:
    The updated dictionary, or ``None`` when no cache file exists for this
    configuration (nothing is computed or written in that case).

  NOTE(review): ``SCOPE1`` and ``per_step_IS1`` appear only as commented-out
  code in the visible part of this notebook; they must be defined before this
  function is called.
  """
  file = filename(env, behav_policy, eval_policy, num_episodes, phi_traj, sparsity)
  # Nothing to modify when the experiment has not been cached yet.
  if not os.path.exists(file):
    return None

  loaded_data = load_data_from_file(file)
  behavior_policies = loaded_data['policy_set']
  scope_set, phi_set = subset_policies(behavior_policies, phi_traj)
  beta = loaded_data['optimal_beta']

  scope_results_new = SCOPE1(scope_set,beta,500)
  IS_results_new = per_step_IS1(scope_set,500)
  loaded_data['scope_results'] = scope_results_new
  loaded_data['IS_results'] = IS_results_new

  true_evals = []
  for i in range(5):
    evaluation_policies = create_policy_set(env, run_policy,eval_policy, 1000)
    true_evaluation = calc_V_pi_e(evaluation_policies)
    true_evals.append(true_evaluation)
  loaded_data['True Evaluations'] = true_evals

  save_data_to_file(loaded_data, file)
  print("modifying... ")
  print("scope new: ",scope_results_new)
  print("IS new: ", IS_results_new)

  # Hand the refreshed results back so a caller can assign them
  # (``loaded_data = modify_data(...)``).
  return loaded_data



In [None]:
# def run_experiment(env, num_episodes, behav_policy, eval_policy, phi_traj, sparsity):
#   file = filename(env, behav_policy, eval_policy, num_episodes, phi_traj, sparsity)
#   # Check if the file already exists
#   if os.path.exists(file):
#     loaded_data = load_data_from_file(file)
#     # modify_data(env, num_episodes, behav_policy, eval_policy, phi_traj, sparsity)
#     return loaded_data


#   behavior_policies = create_policy_set(env, run_policy,behav_policy, num_episodes)
#   initial_beta = [random.uniform(-0.5, 0.5) for _ in range(len(env.good_regions + env.bad_regions))]
#   scope_set, phi_set = subset_policies(behavior_policies, phi_traj)
#   optimal_beta = optimize_variance_scope(initial_beta, phi_set, phi_traj)
#   variance_scope, variance_is = calc_variance(phi_set,0.9,optimal_beta, 500)
#   print("Var SCOPE_phi: ",variance_scope)
#   print("Var IS_phi: ",variance_is)
#   print("Percent change in variance: ",((variance_scope-variance_is)/variance_is)*100)
#   scope_results = SCOPE(scope_set,optimal_beta,500)
#   IS_results = per_step_IS(scope_set,500)
#   print("SCOPE results: ", scope_results)
#   print("IS results: ", IS_results)
#   true_evals = []
#   for i in range(5):
#     evaluation_policies = create_policy_set(env, run_policy,eval_policy, 1000)
#     true_evaluation = calc_V_pi_e(evaluation_policies)
#     true_evals.append(true_evaluation)
#   print("true eval: ", np.mean(np.array(true_evals)))
#   data_to_save = {
#     'policy_set': behavior_policies,
#     'optimal_beta': optimal_beta,
#     'variance_scope_train': variance_scope,
#     'variance_IS_train': variance_is,
#     'scope_results': scope_results,
#     'IS_results': IS_results,
#     'True Evaluations': true_evals
#   }
#   save_data_to_file(data_to_save, file)

#   return data_to_save



# Experiment Function

In [None]:
def run_experiment1(env, num_episodes, behav_policy, eval_policy, phi_traj, sparsity):
  """Run (or load from cache) one full off-policy-evaluation experiment.

  If a cache file for this configuration exists, its contents are returned
  unchanged. Otherwise behaviour trajectories are generated, beta is optimised
  on the phi split, SCOPE/IS results are computed on the scope split, five
  true-value estimates are drawn from evaluation-policy rollouts, and
  everything is pickled to the cache file and returned.

  Returns:
    dict with keys 'policy_set', 'optimal_beta', 'variance_scope_train',
    'variance_IS_train', 'scope_results', 'IS_results', 'True Evaluations'.

  NOTE(review): ``optimize_variance_scope``, ``SCOPE1`` and ``per_step_IS1``
  appear only as commented-out code in the visible part of this notebook, and
  ``calc_variance`` forwards ``optimal_beta`` to ``variance_terms``, which calls
  its third argument as a function - confirm this path still runs.
  """
  file = filename(env, behav_policy, eval_policy, num_episodes, phi_traj, sparsity)
  # Check if the file already exists
  if os.path.exists(file):
    loaded_data = load_data_from_file(file)
    # loaded_data = modify_data(env, num_episodes, behav_policy, eval_policy, phi_traj, sparsity)
    return loaded_data

  behavior_policies = create_policy_set(env, run_policy,behav_policy, num_episodes)
  # One random initial coefficient per good/bad region.
  initial_beta = [random.uniform(-0.5, 0.5) for _ in range(len(env.good_regions + env.bad_regions))]
  # Note: subset_policies re-seeds the global NumPy RNG with 0.
  scope_set, phi_set = subset_policies(behavior_policies, phi_traj)
  optimal_beta = optimize_variance_scope(initial_beta, phi_set, phi_traj)
  variance_scope, variance_is = calc_variance(phi_set,0.9,optimal_beta, 500)
  print("Var SCOPE_phi: ",variance_scope)
  print("Var IS_phi: ",variance_is)
  # Relative change of the SCOPE variance with respect to the IS variance (undefined if variance_is is 0).
  print("Percent change in variance: ",((variance_scope-variance_is)/variance_is)*100)
  scope_results = SCOPE1(scope_set,optimal_beta,500)
  IS_results = per_step_IS1(scope_set,500)
  print("SCOPE results: ", scope_results)
  print("IS results: ", IS_results)
  # Five independent Monte-Carlo estimates of the evaluation policy's value, 1000 rollouts each.
  true_evals = []
  for i in range(5):
    evaluation_policies = create_policy_set(env, run_policy,eval_policy, 1000)
    true_evaluation = calc_V_pi_e(evaluation_policies)
    true_evals.append(true_evaluation)
  print("true eval: ", np.mean(np.array(true_evals)))
  data_to_save = {
    'policy_set': behavior_policies,
    'optimal_beta': optimal_beta,
    'variance_scope_train': variance_scope,
    'variance_IS_train': variance_is,
    'scope_results': scope_results,
    'IS_results': IS_results,
    'True Evaluations': true_evals
  }
  save_data_to_file(data_to_save, file)

  return data_to_save



In [None]:
def run_loaded(env, num_episodes, behav_policy, eval_policy, phi_traj, sparsity):
  """Return the cached experiment dictionary for this configuration.

  Returns ``None`` when no cache file exists; nothing is computed in that case.
  """
  cache_path = filename(env, behav_policy, eval_policy, num_episodes, phi_traj, sparsity)
  # Guard clause: no cached results for this configuration.
  if not os.path.exists(cache_path):
    return None
  return load_data_from_file(cache_path)

In [None]:
# def plot_rewards_over_trajectories(env, num_trajectories, behav_policy, eval_policy, phi_traj, sparsity):
#     combined_scope = []
#     combined_is = []

#     for i in range(len(num_trajectories)):
#         results = run_experiment(env, num_trajectories[i], behav_policy, eval_policy, phi_traj, sparsity)
#         if num_trajectories[i] == 200:
#             true_val = results['True Evaluations']

#         scope_results = results['scope_results']
#         is_results = results['IS_results']

#         quartiles_scope = scope_results['quartiles']
#         quartiles_is = is_results['quartiles']

#         combined_scope.append(quartiles_scope)
#         combined_is.append(quartiles_is)


#     # Transpose the quartiles data for compatibility with boxplot
#     combined_scope = np.array(combined_scope).T
#     combined_is = np.array(combined_is).T

#     # Create box and whisker plots for both SCOPE and IS data on the same plot
#     plt.boxplot(combined_scope, positions=np.array(range(len(num_trajectories))) * 2 - 0.4, labels=num_trajectories, widths=0.4, patch_artist=True, boxprops=dict(facecolor='blue'), vert=True)
#     plt.boxplot(combined_is, positions=np.array(range(len(num_trajectories))) * 2 + 0.4, labels=num_trajectories, widths=0.4, patch_artist=True, boxprops=dict(facecolor='orange'), vert=True)

#     plt.xlabel('Number of Trajectories')
#     plt.ylabel('Value Estimate')
#     plt.title('SCOPE and stepIS Box and Whisker Plots vs. Number of Trajectories')

#     # Add horizontal line for true_val
#     plt.axhline(y=true_val, color='green', linestyle='--', label='True Value')

#     # Create custom legend handles and labels
#     custom_legend_handles = [
#         Line2D([0], [0], color='blue', marker='s', markersize=10, label='SCOPE'),
#         Line2D([0], [0], color='orange', marker='s', markersize=10, label='stepIS'),
#         Line2D([0], [0], color='green', linestyle='--', label='True Value')
#     ]

#     plt.legend(handles=custom_legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left')

#     plt.grid(True)
#     plt.tight_layout()  # Ensures proper spacing and avoids clipping
#     plt.show()



# Plotting Functions

In [None]:
def _summarize_estimator_runs(estimator_results, true_val):
    """Reduce one estimator's per-run outputs to the statistics that get plotted.

    Args:
        estimator_results: mapping with a 'mean_list' entry (value estimates)
            and a 'std_devs_list' entry (standard deviations).
        true_val: reference value(s) subtracted from the estimates; combined
            with 'mean_list' through NumPy broadcasting.

    Returns:
        dict with the mean/std of the estimates ('mean', 'mean_sd'), of the
        squared standard deviations ('var', 'var_sd'), of the estimate errors
        ('bias', 'bias_sd'), and 'mse' = mean variance + (mean error) ** 2.
    """
    estimates = np.array(estimator_results['mean_list'])
    variances = np.array(estimator_results['std_devs_list']) ** 2
    errors = estimates - np.array(true_val)
    return {
        'mean': np.mean(estimates),
        'mean_sd': np.std(estimates),
        'var': np.mean(variances),
        'var_sd': np.std(variances),
        'bias': np.mean(errors),
        'bias_sd': np.std(errors),
        'mse': np.mean(variances) + np.mean(errors) ** 2,
    }


def _show_scope_vs_stepis_plot(num_trajectories, scope_values, is_values, ylabel, title,
                               scope_err=None, is_err=None, true_value=None):
    """Draw and show one SCOPE-vs-stepIS figure.

    Error bars are drawn when `scope_err`/`is_err` are given, otherwise the
    points are joined with a plain line plot. When `true_value` is given a
    dashed green horizontal line is added together with a legend entry.
    """
    plt.figure()
    if scope_err is None:
        plt.plot(num_trajectories, scope_values, color='blue', marker='s', label='SCOPE')
        plt.plot(num_trajectories, is_values, color='black', marker='o', label='stepIS')
    else:
        plt.errorbar(num_trajectories, scope_values, yerr=scope_err, fmt='bs', label='SCOPE')
        plt.errorbar(num_trajectories, is_values, yerr=is_err, fmt='ko', label='stepIS')

    plt.xlabel('Number of Trajectories')
    plt.ylabel(ylabel)
    plt.title(title)

    # The legend is built from explicit handles so every figure uses identical markers.
    legend_handles = [
        Line2D([0], [0], color='blue', marker='s', markersize=10, label='SCOPE'),
        Line2D([0], [0], color='black', marker='o', markersize=10, label='stepIS'),
    ]
    if true_value is not None:
        plt.axhline(y=true_value, color='green', linestyle='--', label='True Value')
        legend_handles.append(Line2D([0], [0], color='green', linestyle='--', label='True Value'))

    plt.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()  # Ensures proper spacing and avoids clipping
    plt.show()


def plot_rewards_over_trajectories1(env, num_trajectories, behav_policy, eval_policy, phi_traj, sparsity):
    """Plot variance, bias, value estimate and MSE of SCOPE and stepIS per trajectory count.

    For every entry of `num_trajectories` the results are obtained from
    `run_loaded` (defined elsewhere in this notebook) and the keys
    'True Evaluations', 'optimal_beta', 'scope_results' and 'IS_results' are
    read from them. Four figures are shown in this order: variance, bias,
    value estimate (with a dashed line at the true value) and MSE.

    Args:
        env, behav_policy, eval_policy, phi_traj, sparsity: forwarded
            unchanged to `run_loaded`.
        num_trajectories: non-empty sequence of trajectory counts (x axis).

    Raises:
        ValueError: if `num_trajectories` is empty (there would be nothing to
            plot and no true value for the reference line).
    """
    if len(num_trajectories) == 0:
        raise ValueError("num_trajectories must contain at least one trajectory count")

    scope_stats = []
    is_stats = []
    true_value = None

    for n_traj in num_trajectories:
        results = run_loaded(env, n_traj, behav_policy, eval_policy, phi_traj, sparsity)
        print("Trajectories: ", n_traj)
        true_val = results['True Evaluations']
        print("True Val: ", true_val)

        optimal_beta = results['optimal_beta']
        print("Optimal Beta: ", optimal_beta)

        # The reference line drawn later uses the value from the last trajectory count.
        true_value = np.mean(np.array(true_val))
        scope_results = results['scope_results']
        is_results = results['IS_results']

        print('SCOPE Results: ', scope_results)
        print('IS Results: ', is_results)

        scope_stats.append(_summarize_estimator_runs(scope_results, true_val))
        is_stats.append(_summarize_estimator_runs(is_results, true_val))

    def column(stats, key):
        # Pull one statistic out of every per-count summary, in x-axis order.
        return [entry[key] for entry in stats]

    _show_scope_vs_stepis_plot(
        num_trajectories, column(scope_stats, 'var'), column(is_stats, 'var'),
        'Variance', 'SCOPE and stepIS Variance Plots vs. Number of Trajectories',
        scope_err=column(scope_stats, 'var_sd'), is_err=column(is_stats, 'var_sd'))

    _show_scope_vs_stepis_plot(
        num_trajectories, column(scope_stats, 'bias'), column(is_stats, 'bias'),
        'Bias', 'SCOPE and stepIS Bias Plots vs. Number of Trajectories',
        scope_err=column(scope_stats, 'bias_sd'), is_err=column(is_stats, 'bias_sd'))

    _show_scope_vs_stepis_plot(
        num_trajectories, column(scope_stats, 'mean'), column(is_stats, 'mean'),
        'Value Estimate', 'SCOPE and stepIS Value Estimate Plots vs. Number of Trajectories',
        scope_err=column(scope_stats, 'mean_sd'), is_err=column(is_stats, 'mean_sd'),
        true_value=true_value)

    _show_scope_vs_stepis_plot(
        num_trajectories, column(scope_stats, 'mse'), column(is_stats, 'mse'),
        'MSE', 'SCOPE and stepIS MSE Plots vs. Number of Trajectories')




In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

def plot_mse_over_trajectories(env, num_trajectories, behav_policy, eval_policy, phi_traj, sparsity):
    """Plot bias, variance and MSE of SCOPE and stepIS against trajectory count.

    `run_experiment` (defined elsewhere in this notebook) is called once per
    entry of `num_trajectories`; the 'mean' and 'std_deviation' entries of its
    'scope_results' and 'IS_results' are turned into bias (estimate minus true
    value, the same sign convention as plot_rewards_over_trajectories1),
    variance (std ** 2) and MSE (variance + bias ** 2). Three figures are shown.

    The true value is taken from the 200-trajectory run; until such a run has
    been seen, the value from the first run is used so that lists that do not
    start with 200 no longer fail on an unassigned variable.

    Args:
        env, behav_policy, eval_policy, phi_traj, sparsity: forwarded
            unchanged to `run_experiment`.
        num_trajectories: sequence of trajectory counts (x axis).
    """
    bias_scope, bias_is = [], []
    variance_scope, variance_is = [], []
    mse_scope, mse_is = [], []

    true_val = None
    for n_traj in num_trajectories:
        results = run_experiment(env, n_traj, behav_policy, eval_policy, phi_traj, sparsity)
        # NOTE(review): plot_rewards_over_trajectories1 reads 'True Evaluations' (plural)
        # from run_loaded; confirm which key run_experiment actually returns.
        if n_traj == 200 or true_val is None:
            true_val = results['True Evaluation']

        scope_results = results['scope_results']
        is_results = results['IS_results']

        scope_variance = scope_results['std_deviation'] ** 2
        is_variance = is_results['std_deviation'] ** 2

        scope_bias = scope_results['mean'] - true_val
        is_bias = is_results['mean'] - true_val

        bias_scope.append(scope_bias)
        bias_is.append(is_bias)

        variance_scope.append(scope_variance)
        variance_is.append(is_variance)

        mse_scope.append(scope_variance + scope_bias ** 2)
        mse_is.append(is_variance + is_bias ** 2)

    panels = [
        ('Bias', 'SCOPE and stepIS Bias vs. Number of Trajectories', bias_scope, bias_is),
        ('Variance', 'SCOPE and stepIS Variance vs. Number of Trajectories', variance_scope, variance_is),
        ('MSE', 'SCOPE and stepIS MSE vs. Number of Trajectories', mse_scope, mse_is),
    ]
    for ylabel, title, scope_series, is_series in panels:
        plt.figure()
        plt.plot(num_trajectories, scope_series, marker='o', label='SCOPE')
        plt.plot(num_trajectories, is_series, marker='x', label='stepIS')

        plt.xlabel('Number of Trajectories')
        plt.ylabel(ylabel)
        plt.title(title)

        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()


# Compare Trajectories

In [None]:
def compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, phi_traj, sparsity):
    """Run the experiment for each trajectory count, then plot the comparison.

    `run_experiment1` is invoked once per entry of `num_trajectories` (its
    return value is not used here), after which
    `plot_rewards_over_trajectories1` is called with the full list.
    """
    for n_traj in num_trajectories:
        run_experiment1(env, n_traj, behav_policy, eval_policy, phi_traj, sparsity)

    # Only the reward/variance/bias/MSE plots are produced; the older
    # plot_mse_over_trajectories view is intentionally not called.
    plot_rewards_over_trajectories1(
        env, num_trajectories, behav_policy, eval_policy, phi_traj, sparsity
    )

In [None]:
# Seed NumPy's global RNG before any environment is built.
np.random.seed(42)
# Gridworld environment: 5x5 grid, start in one corner, end in the opposite one.
height, width = 5, 5
start, end = (0, 0), (4, 4)

## Dense 2 bad regions, 1 good

In [None]:
# Re-seed NumPy's global RNG and restate the grid settings for this section.
np.random.seed(42)
# Gridworld environment: 5x5 grid, start in one corner, end in the opposite one.
height, width = 5, 5
start, end = (0, 0), (4, 4)

In [None]:
# Dense setting: ordinary cells pay 0.5 with probability 0.9 (sparsity argument).
# Bad cells (1,1),(2,2) pay -2, good cell (3,3) pays 1, the end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [(3,3)], 1, -2, 3, 0.9)
# Evaluation policy favours up/right; behaviour policy is uniform over the four actions.
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# phi_traj=0.3; the sparsity passed here matches the environment's 0.9.
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.9 )

## Sparse 2 bad, 1 good

In [None]:
# Sparse setting: ordinary cells pay 0.5 with probability 0.1.
# Bad cells (1,1),(2,2) pay -2, good cell (3,3) pays 1, the end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [(3,3)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: per-count modify_data pass (helper defined elsewhere in the notebook).
# for i in num_trajectories:
#   print('num_trajectories: ', i)
#   modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1 )

In [None]:
# Same sparse 2-bad/1-good layout, followed by a single modify_data call for 1000 trajectories.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [(3,3)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
# modify_data is defined elsewhere in the notebook; its effect is not visible here.
modify_data(env, 1000, behav_policy, eval_policy , 0.3, 0.1)

## Sparse 2 good, 1 bad

In [None]:
# Sparse setting (probability 0.1 of a 0.5 reward on ordinary cells).
# Bad cell (1,1) pays -2, good cells (2,2),(3,3) pay 1, the end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1)], [(2, 2),(3,3)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: per-count modify_data pass (helper defined elsewhere in the notebook).
# for i in num_trajectories:
#   print('num_trajectories: ', i)
#   modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1 )

In [None]:
# Switch the global NumPy seed to 31 for the following experiment.
np.random.seed(31)
# Gridworld environment: 5x5 grid, start in one corner, end in the opposite one.
height, width = 5, 5
start, end = (0, 0), (4, 4)

In [None]:
# Same 1-bad/2-good layout with sparsity 0.101 instead of 0.1.
# No seed is set in this cell; it relies on the RNG state left by the cells run before it.
env = GridWorld(height, width, start, end, [(1, 1)], [(2, 2),(3,3)], 1, -2, 3, 0.101)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.101)

## Sparse, 1 bad 1 good

In [None]:
# Sparse setting (probability 0.1 of a 0.5 reward on ordinary cells).
# Bad cell (1,1) pays -2, good cell (3,3) pays 1, the end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1)], [(3,3)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: per-count modify_data pass (helper defined elsewhere in the notebook).
# for i in num_trajectories:
#   print('num_trajectories: ', i)
#   modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1 )

### Equal-magnitude rewards for bad and good regions (+2 / -2)

In [None]:
# Same 1-bad/1-good layout, but the good cell now pays 2 so it mirrors the bad cell's -2.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(1, 1)], [(3,3)], 2, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: per-count modify_data pass (helper defined elsewhere in the notebook).
# for i in num_trajectories:
#   print('num_trajectories: ', i)
#   modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1 )

## Dense 2 bad

In [None]:
# Dense setting (probability 0.9 of a 0.5 reward on ordinary cells).
# Bad cells (1,1),(2,2) pay -2; there are no good cells; the end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 1, -2, 3, 0.9)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: per-count modify_data pass (helper defined elsewhere in the notebook).
# for i in num_trajectories:
#   print('num_trajectories: ', i)
#   modify_data(env, i, behav_policy, eval_policy, 0.3,0.9)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.9 )

In [None]:
# Dense setting (sparsity 0.9) with the two bad cells moved to (1,2) and (1,4); no good cells.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 2), (1, 4)], [], 1, -2, 3, 0.9)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.9)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.9 )

In [None]:
# Dense setting with sparsity 0.88; bad cells (1,1),(2,2) pay -2; no good cells.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 1, -2, 3, 0.88)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.88)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.88)

## Dense 2 bad, 0.7 for training

In [None]:
# Dense 2-bad layout (sparsity 0.9) compared with phi_traj=0.7 instead of 0.3.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 1, -2, 3, 0.9)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: direct run_experiment loop (note it used phi_traj 0.3, not 0.7).
# for i in num_trajectories:
#   run_experiment(env,i ,behav_policy, eval_policy, 0.3, 0.9)
#   print(i," trajectories done")
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.7, 0.9 )

## Sparse 2 bad

In [None]:
# Sparse setting (sparsity 0.1); bad cells (1,1),(2,2) pay -2; no good cells; end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

In [None]:
# Same 2-bad layout with sparsity raised to 0.2.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 1, -2, 3, 0.2)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.2)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.2)

In [None]:
# Sparse setting (sparsity 0.1) with bad cells at (1,1) and (3,2); no good cells.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (3, 2)], [], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

In [None]:
# Sparse setting (sparsity 0.1) with bad cells at (1,2) and (2,3); no good cells.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 2), (2, 3)], [], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

In [None]:
# Sparse setting (sparsity 0.1) with bad cells at (2,2) and (3,3); no good cells.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(2, 2), (3, 3)], [], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

## Sparse 3 bad

In [None]:
# Sparse setting (sparsity 0.1) with three bad cells sharing x=2: (2,1),(2,2),(2,3).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(2, 1), (2, 2), (2,3)], [], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

In [None]:
# Sparse setting (sparsity 0.1) with three bad cells on the diagonal: (1,1),(2,2),(3,3).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2), (3,3)], [], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

### Mid Sparsity

In [None]:
# Mid sparsity: ordinary cells pay 0.5 with probability 0.5; bad cells on the diagonal.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 1), (2, 2), (3,3)], [], 1, -2, 3, 0.5)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.5)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.5)

## Sparse 2 good

In [None]:
# Sparse setting (sparsity 0.1) with no bad cells and good cells (reward 1) at (2,1),(2,2).
np.random.seed(42)
env = GridWorld(height, width, start, end, [], [(2, 1), (2, 2)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

In [None]:
# Sparse setting (sparsity 0.1) with no bad cells and good cells (reward 1) at (1,1),(2,2).
np.random.seed(42)
env = GridWorld(height, width, start, end, [], [(1, 1), (2, 2)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

## Sparse 1 good

In [None]:
# Sparse setting (sparsity 0.1) with no bad cells and a single good cell (reward 1) at (1,2).
np.random.seed(42)
env = GridWorld(height, width, start, end, [], [(1, 2)], 1, -2, 3, 0.1)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Call modify_data (defined elsewhere in the notebook) once per trajectory count
# before running the comparison.
for i in num_trajectories:
  print('num_trajectories: ', i)
  modify_data(env, i, behav_policy, eval_policy, 0.3,0.1)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

## Sparse, 1 bad

In [None]:
# Sparsity 0.0: no ordinary cell receives the random 0.5 reward.
# Single bad cell (reward -2) at (2,2); no good cells; the end cell pays 3.
np.random.seed(42)
env = GridWorld(height, width, start, end, [(2, 2)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: per-count modify_data pass (helper defined elsewhere in the notebook).
# for i in num_trajectories:
#   print('num_trajectories: ', i)
#   modify_data(env, i, behav_policy, eval_policy, 0.3,0.0)
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# Sparsity 0.0 (no random 0.5 rewards) with the single bad cell at (2,3).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(2, 3)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# Sparsity 0.0 (no random 0.5 rewards) with the single bad cell at (1,4).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(1, 4)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# Sparsity 0.0 (no random 0.5 rewards) with the single bad cell at (3,4).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(3, 4)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# np.random.seed(42)
# env = GridWorld(height, width, start, end, [(3, 4)], [], 1, -2,3, 0.0)
# eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
# behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
# num_trajectories = [200,400, 600, 800,1000]
# compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# Sparsity 0.0 (no random 0.5 rewards) with the single bad cell at (3,1).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(3, 1)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# Sparsity 0.0 (no random 0.5 rewards) with the single bad cell at (4,2).
np.random.seed(42)
env = GridWorld(height, width, start, end, [(4, 2)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

## Dense, 1 bad, similar

In [None]:
# Dense setting (sparsity 0.8) with a single bad cell at (2,2).
# The evaluation policy (0.36/0.14) is closer to the uniform behaviour policy than 0.4/0.1.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(2, 2)], [], 1, -2,3, 0.8)
eval_policy = {"up": 0.36, "down": 0.14, "left": 0.14, "right": 0.36}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: direct run_experiment loop.
# for i in num_trajectories:
#   run_experiment(env,i ,behav_policy, eval_policy, 0.3, 0.8)
#   print(i," trajectories done")
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.8 )

In [None]:
# Dense setting (sparsity 0.8) with a single bad cell at (4,2) and the 0.36/0.14 evaluation policy.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(4, 2)], [], 1, -2,3, 0.8)
eval_policy = {"up": 0.36, "down": 0.14, "left": 0.14, "right": 0.36}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: direct run_experiment loop.
# for i in num_trajectories:
#   run_experiment(env,i ,behav_policy, eval_policy, 0.3, 0.8)
#   print(i," trajectories done")
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.8 )

## Sparse, 1 bad, similar

In [None]:
# Sparse setting (sparsity 0.01) with a single bad cell at (2,3) and the 0.36/0.14 evaluation policy.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(2, 3)], [], 1, -2,3, 0.01)
eval_policy = {"up": 0.36, "down": 0.14, "left": 0.14, "right": 0.36}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Disabled: direct run_experiment loop.
# for i in num_trajectories:
#   run_experiment(env,i ,behav_policy, eval_policy, 0.3, 0.01)
#   print(i," trajectories done")
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.01 )

# Fix

## Dense same policy

In [None]:
# Dense setting (sparsity 0.8), single bad cell at (2,3); evaluation and behaviour
# policies are both uniform, so the two policies are identical.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(2,3)], [], 1, -2,3, 0.8)
eval_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.8 )

In [None]:
# Same identical-policy experiment with sparsity 0.85.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(2,3)], [], 1, -2,3, 0.85)
eval_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.85 )

## Sparse same policy

In [None]:
# Sparsity 0.0 (no random 0.5 rewards), single bad cell at (2,3), identical uniform policies.
env = GridWorld(height, width, start, end, [(2,3)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

## Sparse 2 bad regions, same policy

In [None]:
# Sparsity 0.0 (no random 0.5 rewards), bad cells at (2,3) and (3,4), identical uniform policies.
env = GridWorld(height, width, start, end, [(2,3),(3,4)], [], 1, -2,3, 0.0)
eval_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.0 )

In [None]:
# Same two bad cells with sparsity 0.1 and identical uniform policies.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(2,3),(3,4)], [], 1, -2,3, 0.1)
eval_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
# Only the two smallest trajectory counts are run here; the full list is kept for reference.
# num_trajectories = [200,400, 600, 800,1000]
num_trajectories = [200,400]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

## Sparse one bad region, very similar policies

In [None]:
# Sparse setting (sparsity 0.1), single bad cell at (2,2); the evaluation policy
# (0.26/0.24) differs from the uniform behaviour policy by only 0.01 per action.
# NOTE(review): this cell does not call np.random.seed, so the randomly placed 0.5
# rewards depend on whatever RNG state earlier cells left behind.
env = GridWorld(height, width, start, end, [(2, 2)], [], 1, -2,3, 0.1)
eval_policy = {"up": 0.26, "down": 0.24, "left": 0.24, "right": 0.26}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]
# Shorter list kept for quick runs.
# num_trajectories = [200,400]
compare_experiments_over_trajectories(env, behav_policy, eval_policy, num_trajectories, 0.3, 0.1)

## Two bad regions

In [None]:
# Two bad cells (reward -2) at (1,1) and (2,2), no good cells, end cell pays 3.
# GridWorld.__init__ requires a `sparsity` argument; the call previously omitted it
# and raised TypeError. 0.0 means no ordinary cell receives the random 0.5 reward.
# NOTE(review): confirm 0.0 is the sparsity intended for this older experiment.
env_bad = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [], 0.5, -2, 3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
num_trajectories = [200,400, 600, 800,1000]

In [None]:
# Single 200-trajectory run on env_bad with phi_traj=0.3.
# NOTE(review): other call sites in this notebook pass a sixth `sparsity` argument
# to run_experiment; confirm this five-argument call still matches its signature.
results = run_experiment(env_bad,200 ,behav_policy, eval_policy, 0.3)

In [None]:
# Run the experiment on env_bad for every trajectory count.
# NOTE(review): other call sites in this notebook pass a sixth `sparsity` argument
# to run_experiment; confirm this five-argument call still matches its signature.
for i in num_trajectories:
  run_experiment(env_bad,i ,behav_policy, eval_policy, 0.3)
  print(i," trajectories done")

In [None]:
# NOTE(review): the plotting function defined in this notebook is
# plot_rewards_over_trajectories1 (six parameters, including sparsity); confirm that
# plot_rewards_over_trajectories with five arguments is still defined.
plot_rewards_over_trajectories(env_bad, num_trajectories, behav_policy, eval_policy, 0.3)

## Single bad region

In [None]:
# Trajectory counts to sweep: 200 to 1000 in steps of 200.
num_trajectories = list(range(200, 1001, 200))

In [None]:
# Single bad cell (reward -2) at (2,2), no good cells, end cell pays 3.
# GridWorld.__init__ requires a `sparsity` argument; the call previously omitted it
# and raised TypeError. 0.0 means no ordinary cell receives the random 0.5 reward.
# NOTE(review): confirm 0.0 is the sparsity intended for this older experiment.
env_bad_one = GridWorld(height, width, start, end, [(2, 2)], [], 0.5, -2, 3, 0.0)
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}

In [None]:
# Run the experiment on env_bad_one for every trajectory count.
# NOTE(review): other call sites in this notebook pass a sixth `sparsity` argument
# to run_experiment; confirm this five-argument call still matches its signature.
for i in num_trajectories:
  run_experiment(env_bad_one,i ,behav_policy, eval_policy, 0.3)
  print(i," trajectories done")

In [None]:
# NOTE(review): the plotting function defined in this notebook is
# plot_rewards_over_trajectories1 (six parameters, including sparsity); confirm that
# plot_rewards_over_trajectories with five arguments is still defined.
plot_rewards_over_trajectories(env_bad_one, num_trajectories ,behav_policy, eval_policy, 0.3)

In [None]:
# Build 1000 policy rollouts under the evaluation policy and report their value.
evaluation_policies = create_policy_set(env_bad_one, run_policy, eval_policy, 1000)
# Keep the computed value: print() returns None, so assigning its result
# would leave true_evaluation empty.
true_evaluation = calc_V_pi_e(evaluation_policies)
print("True Eval: ", true_evaluation)

## Two bad regions One good region

In [None]:
# Bad cells (reward -2) at (1,1),(2,2), good cell (reward 0.5) at (3,3), end cell pays 3.
# GridWorld.__init__ requires a `sparsity` argument; the call previously omitted it
# and raised TypeError. 0.0 means no ordinary cell receives the random 0.5 reward.
# NOTE(review): confirm 0.0 is the sparsity intended for this older experiment.
env_2bad_1good = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [(3,3)], 0.5, -2, 3, 0.0)
eval_policy = {"up": 0.36, "down": 0.14, "left": 0.14, "right": 0.36}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}

In [None]:
# Single 200-trajectory run on env_2bad_1good with phi_traj=0.3.
# NOTE(review): other call sites in this notebook pass a sixth `sparsity` argument
# to run_experiment; confirm this five-argument call still matches its signature.
run_experiment(env_2bad_1good,200, behav_policy, eval_policy, 0.3)

In [None]:
# Trajectory counts to sweep: 200 to 1000 in steps of 200.
num_trajectories = list(range(200, 1001, 200))

In [None]:
# Run the experiment on env_2bad_1good for every trajectory count.
# NOTE(review): other call sites in this notebook pass a sixth `sparsity` argument
# to run_experiment; confirm this five-argument call still matches its signature.
for i in num_trajectories:
  run_experiment(env_2bad_1good,i ,behav_policy, eval_policy, 0.3)
  print(i," trajectories done")

In [None]:
# NOTE(review): the plotting function defined in this notebook is
# plot_rewards_over_trajectories1 (six parameters, including sparsity); confirm that
# plot_rewards_over_trajectories with five arguments is still defined.
plot_rewards_over_trajectories(env_2bad_1good, num_trajectories, behav_policy, eval_policy, 0.3)

In [None]:
# Build 1000 policy rollouts under the evaluation policy and report their value.
evaluation_policies = create_policy_set(env_2bad_1good, run_policy, eval_policy, 1000)
# Keep the computed value: print() returns None, so assigning its result
# would leave true_evaluation empty.
true_evaluation = calc_V_pi_e(evaluation_policies)
print("True Eval: ", true_evaluation)

## Two bad regions One good region similar policies

In [None]:
# Bad cells (reward -2) at (1,1),(2,2), good cell (reward 0.5) at (3,3), end cell pays 3.
# GridWorld.__init__ requires a `sparsity` argument; the call previously omitted it
# and raised TypeError. 0.0 means no ordinary cell receives the random 0.5 reward.
# NOTE(review): confirm 0.0 is the sparsity intended for this older experiment.
env_2bad_1good_similar = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [(3,3)], 0.5, -2, 3, 0.0)
# Here the behaviour policy is the non-uniform 0.36/0.14 one, close to the evaluation policy.
behav_policy = {"up": 0.36, "down": 0.14, "left": 0.14, "right": 0.36}
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}

In [None]:
# Run the experiment on env_2bad_1good_similar for every trajectory count.
# NOTE(review): other call sites in this notebook pass a sixth `sparsity` argument
# to run_experiment; confirm this five-argument call still matches its signature.
for i in num_trajectories:
  run_experiment(env_2bad_1good_similar,i ,behav_policy, eval_policy, 0.3)
  print(i," trajectories done")

In [None]:
# NOTE(review): the plotting function defined in this notebook is
# plot_rewards_over_trajectories1 (six parameters, including sparsity); confirm that
# plot_rewards_over_trajectories with five arguments is still defined.
plot_rewards_over_trajectories(env_2bad_1good_similar, num_trajectories, behav_policy, eval_policy, 0.3)