<a href="https://colab.research.google.com/github/ajagota7/Reward-Shaping/blob/main/gridworld_ope.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating Environment

In [1]:
class GridWorld:
    def __init__(self, height, width, start, end, bad_regions, good_regions):
        self.height = height
        self.width = width
        self.start = start
        self.end = end
        self.bad_regions = bad_regions
        self.good_regions = good_regions

    def reset(self):
        self.agent_position = self.start

    def step(self, action):
        x, y = self.agent_position

        if action == "up" and y < self.height - 1:
            y += 1
        elif action == "down" and y > 0:
            y -= 1
        elif action == "left" and x > 0:
            x -= 1
        elif action == "right" and x < self.width - 1:
            x += 1

        self.agent_position = (x, y)

        if self.agent_position == self.end:
            reward = 3
            done = True
        elif self.agent_position in self.bad_regions:
            reward = -1
            done = False
        elif self.agent_position in self.good_regions:
            reward = 0.5
            done = False
        else:
            reward = 0
            done = False

        return (x, y), reward, done


In [2]:
import numpy as np

class Agent:
    def __init__(self, epsilon=0.0):
        self.epsilon = epsilon

    def select_action(self, policy_func):
        if np.random.uniform() < self.epsilon:
            # Choose a random action
            action = np.random.choice(["up", "down", "left", "right"])
        else:
            # Use the provided policy function to get the best action
            action = policy_func()
        return action

# Define different policy functions outside the class

def random_policy():
    # Choose a random action
    return np.random.choice(["up", "down", "left", "right"])

def behavior_policy():
    action_probs = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
    return np.random.choice(list(action_probs.keys()), p=list(action_probs.values()))

def evaluation_policy():
    action_probs = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
    return np.random.choice(list(action_probs.keys()), p=list(action_probs.values()))

def manhattan_distance(pos1, pos2):
    # Compute the Manhattan distance between two positions
    return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])

# Generating Policy data

In [3]:
# Gridworld environment
height = 5
width  = 5
start = (0,0)
end = (4,4)

In [4]:

env = GridWorld(height, width, start, end, [(1, 1), (2, 2)], [(3,3)])

# Number of episodes
num_episodes = 200

def create_policy_set(env, policy, num_episodes):
  # Create a list to store policies as trajectories
  policies = []

  # Run multiple episodes
  for episode in range(num_episodes):
      # Create a new Agent for each episode to generate a different policy
      agent = Agent(epsilon=0.0)

      # Run an episode
      env.reset()
      done = False
      trajectory = []  # Store the trajectory for the current episode
      cumulative_reward = 0.0  # Initialize cumulative reward
      while not done:
          state = env.agent_position  # Get the current state
          action = agent.select_action(policy)
          next_state, reward, done = env.step(action)

          # Compute cumulative reward
          cumulative_reward += reward

          # Compute feature function values (manhattan distances)
          good_region_distances = [manhattan_distance(state, gr) for gr in env.good_regions]
          bad_region_distances = [manhattan_distance(state, br) for br in env.bad_regions]

          # print("Good Region Distances",good_region_distances)
          # print("Bad Region Distances: ",bad_region_distances)

          # Store the (state, action, reward, next_state) tuple in the trajectory
          trajectory.append((state, action, reward, next_state, cumulative_reward, good_region_distances, bad_region_distances))

          # # Print the episode information
          # print("Episode:", episode + 1)
          # print("State:", state)
          # print("Action:", action)
          # print("Reward:", reward)
          # print("Next State:", next_state)
          # print("Cumulative Reward:", cumulative_reward)
          # print("Done:", done)
          # print("-----")

      # Append the trajectory to the policies list
      policies.append(trajectory)
  return policies


In [5]:
behavior_policies = create_policy_set(env, behavior_policy, 200)

In [8]:

bad_regions = [(1, 1), (2, 2)]
good_regions = [(3, 3)]

env = GridWorld(height, width, start, end, bad_regions, good_regions)

# Create a list to store evaluation policies as trajectories
evaluation_policies = []

# Number of episodes
num_episodes = 200

# Run multiple episodes
for episode in range(num_episodes):
    # Create a new Agent for each episode to generate a different behavior policy
    agent = Agent(epsilon=0.0)

    # Run an episode
    env.reset()
    done = False
    trajectory = []  # Store the trajectory for the current episode
    cumulative_reward = 0.0  # Initialize cumulative reward
    while not done:
        state = env.agent_position  # Get the current state
        action = agent.select_action(evaluation_policy)
        next_state, reward, done = env.step(action)

        # Compute cumulative reward
        cumulative_reward += reward

        # Store the (state, action, reward, next_state) tuple in the trajectory
        trajectory.append((state, action, reward, next_state, cumulative_reward))

        # Print the episode information
        # print("Episode:", episode + 1)
        # print("State:", state)
        # print("Action:", action)
        # print("Reward:", reward)
        # print("Next State:", next_state)
        # print("Cumulative Reward:", cumulative_reward)
        # print("Done:", done)
        # print("-----")

    # Append the trajectory to the behavior policies list
    evaluation_policies.append(trajectory)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode: 166
State: (2, 0)
Action: up
Reward: 0
Next State: (2, 1)
Cumulative Reward: 0.0
Done: False
-----
Episode: 166
State: (2, 1)
Action: up
Reward: -1
Next State: (2, 2)
Cumulative Reward: -1.0
Done: False
-----
Episode: 166
State: (2, 2)
Action: right
Reward: 0
Next State: (3, 2)
Cumulative Reward: -1.0
Done: False
-----
Episode: 166
State: (3, 2)
Action: up
Reward: 0.5
Next State: (3, 3)
Cumulative Reward: -0.5
Done: False
-----
Episode: 166
State: (3, 3)
Action: up
Reward: 0
Next State: (3, 4)
Cumulative Reward: -0.5
Done: False
-----
Episode: 166
State: (3, 4)
Action: right
Reward: 3
Next State: (4, 4)
Cumulative Reward: 2.5
Done: True
-----
Episode: 167
State: (0, 0)
Action: right
Reward: 0
Next State: (1, 0)
Cumulative Reward: 0.0
Done: False
-----
Episode: 167
State: (1, 0)
Action: up
Reward: -1
Next State: (1, 1)
Cumulative Reward: -1.0
Done: False
-----
Episode: 167
State: (1, 1)
Action: right
Reward: 0
Nex

In [71]:
# Initialize a variable to store the sum of all cumulative rewards
total_cumulative_reward = 0.0

# Iterate through the evaluation_policies list to sum up the rewards
for episode_trajectory in evaluation_policies:
    # Get the last tuple (state, action, reward, next_state, cumulative_reward)
    # from the trajectory to get the cumulative reward of the episode
    cumulative_reward_episode = episode_trajectory[-1][-1]

    # Add the episode's cumulative reward to the total cumulative reward
    total_cumulative_reward += cumulative_reward_episode

# Print the total cumulative reward
print("Mean Cumulative Reward of 200 evaluation policies:", total_cumulative_reward/len(evaluation_policies))


Mean Cumulative Reward of 200 evaluation policies: 2.15


In [37]:
evaluation_policies[178][-1][-1]

1.5

In [36]:
cumulative_reward_episode

3.0

# Training Reward Models

## State -> Reward

Training model to predict rewards based on state only

In [None]:
import tensorflow as tf
import numpy as np

# Step 1: Prepare the data
# behavior_policies = [...]  # Replace [...] with your actual behavior_policies list

# Initialize lists to store all 'next_state' and 'reward' values
all_next_states = []
all_rewards = []

# Extract the 'next_state' and 'reward' from the 'behavior_policies' list
for trajectory in behavior_policies:
    # For each trajectory, extract all 'next_state' and 'reward' values
    next_states = [state_action_reward[3] for state_action_reward in trajectory]
    rewards = [state_action_reward[2] for state_action_reward in trajectory]

    # Append the values to the corresponding lists
    all_next_states.extend(next_states)
    all_rewards.extend(rewards)

# Convert 'next_states' and 'rewards' into appropriate formats for training
all_next_states = np.array(all_next_states, dtype=np.float32)
all_rewards = np.array(all_rewards, dtype=np.float32)

# Now, 'all_next_states' contains all the 'next_state' values, and 'all_rewards' contains all the corresponding rewards.


In [None]:
len(all_rewards)

21499

In [None]:
# Step 2: Design the neural network
class RewardPredictorStates(tf.keras.Model):
    def __init__(self, input_shape):
        super(RewardPredictorStates, self).__init__()
        self.dense1 = tf.keras.layers.Dense(32, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return x

# Define hyperparameters for the neural network
input_shape = all_next_states.shape[1:]  # Shape of the input state (excluding batch size)
learning_rate = 0.005
num_epochs = 1000
batch_size = 32

# Create the neural network
reward_predictor_states = RewardPredictorStates(input_shape)

# Compile the model
reward_predictor_states.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                         loss='mean_squared_error')

# Step 3: Train the neural network with early stopping
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)
]

reward_predictor_states.fit(
    all_next_states, all_rewards,
    batch_size=batch_size, epochs=num_epochs,
    callbacks=callbacks, validation_split=0.2, verbose=1
)

Epoch 1/1000


KeyboardInterrupt: ignored

In [None]:
current_state = behavior_policies[0][5][0]
current_state = np.array(current_state, dtype=np.float32)
predicted_reward_current = reward_predictor_states.predict(np.expand_dims(current_state, axis=0))
print("Predicted Reward Current State:", predicted_reward_current[0, 0])

new_state = behavior_policies[0][5][3]  # Replace ... with the new state for which you want to predict the reward
new_state = np.array(new_state, dtype=np.float32)
predicted_reward_next = reward_predictor_states.predict(np.expand_dims(new_state, axis=0))
print("Predicted Reward Next State:", predicted_reward_next[0, 0])

Predicted Reward Current State: -0.03829813
Predicted Reward Next State: -0.006269932


## State -> Cumulative Reward

Reward model based on State -> Cumulative Rewards

In [None]:
import tensorflow as tf
import numpy as np

# Step 1: Prepare the data
# behavior_policies = [...]  # Replace [...] with your actual behavior_policies list

# Initialize lists to store all 'next_state' and 'reward' values
all_next_states = []
all_cum_rewards = []

# Extract the 'next_state' and 'reward' from the 'behavior_policies' list
for trajectory in behavior_policies:
    # For each trajectory, extract all 'next_state' and 'reward' values
    next_states = [state_action_reward[3] for state_action_reward in trajectory]
    cum_rewards = [state_action_reward[4] for state_action_reward in trajectory]

    # Append the values to the corresponding lists
    all_next_states.extend(next_states)
    all_cum_rewards.extend(cum_rewards)

# Convert 'next_states' and 'rewards' into appropriate formats for training
all_next_states = np.array(all_next_states, dtype=np.float32)
all_cum_rewards = np.array(all_cum_rewards, dtype=np.float32)

# Now, 'all_next_states' contains all the 'next_state' values, and 'all_rewards' contains all the corresponding rewards.


In [None]:
len(all_cum_rewards)

21499

In [None]:
# Step 2: Design the neural network
# import tensorflow as tf

class RewardPredictorCumulative(tf.keras.Model):
    def __init__(self, input_shape):
        super(RewardPredictorCumulative, self).__init__()
        self.dense1 = tf.keras.layers.Dense(128, activation='relu', kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(0.01))
        self.batch_norm1 = tf.keras.layers.BatchNormalization()
        self.dense2 = tf.keras.layers.Dense(64, activation='relu', kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(0.01))
        self.batch_norm2 = tf.keras.layers.BatchNormalization()
        self.dense3 = tf.keras.layers.Dense(1, kernel_initializer='he_normal')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.batch_norm1(x)
        x = self.dense2(x)
        x = self.batch_norm2(x)
        x = self.dense3(x)
        return x


# Define hyperparameters for the neural network
input_shape = all_next_states.shape[1:]  # Shape of the input state (excluding batch size)
learning_rate = 0.0001
num_epochs = 1000
batch_size = 32

# Create the neural network
reward_predictor_cumulative = RewardPredictorCumulative(input_shape)

# Compile the model
reward_predictor_cumulative.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                         loss='mean_squared_error')

# Step 3: Train the neural network with early stopping
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)
]

reward_predictor_cumulative.fit(
    all_next_states, all_cum_rewards,
    batch_size=batch_size, epochs=num_epochs,
    callbacks=callbacks, validation_split=0.2, verbose=1
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000


<keras.callbacks.History at 0x7e28c5be80a0>

## State -> Rewards over past 3 timesteps

Cumulative rewards over 3 timesteps

In [None]:
# Discounted sum
# Create a new list to store trajectories with the new data
augmented_behavior_policies = []

# Set the discount factor (gamma)
discount_factor = 0.9  # You can adjust this value as needed (usually between 0 and 1)

# Iterate through each trajectory in behavior_policies
for trajectory in behavior_policies:
    num_timesteps = len(trajectory)
    new_trajectory = []

    # Iterate through each timestep in the trajectory
    for t in range(num_timesteps):
        # Calculate the discounted sum of the past 3 rewards for the past 3 timesteps
        discounted_sum = 0.0
        for i in range(1, min(4, t + 1)):
            discounted_sum += (discount_factor ** (i - 1)) * trajectory[t - i][2]

        # Update the trajectory to include only the discounted sum of the past 3 rewards
        state, action, reward, next_state, cumulative_reward, good_prox, bad_prox = trajectory[t]
        new_trajectory.append((state, discounted_sum, action, reward, next_state, cumulative_reward))

    # Append the modified trajectory to the augmented_behavior_policies list
    augmented_behavior_policies.append(new_trajectory)


In [None]:
# Discounted sum
# Create a new list to store trajectories with the new data
augmented_evaluation_policies = []

# Set the discount factor (gamma)
discount_factor = 0.9  # You can adjust this value as needed (usually between 0 and 1)

# Iterate through each trajectory in evaluation_policies
for trajectory in evaluation_policies:
    num_timesteps = len(trajectory)
    new_trajectory = []

    # Iterate through each timestep in the trajectory
    for t in range(num_timesteps):
        # Calculate the discounted sum of the past 3 rewards for the past 3 timesteps
        discounted_sum = 0.0
        for i in range(1, min(4, t + 1)):
            discounted_sum += (discount_factor ** (i - 1)) * trajectory[t - i][2]

        # Update the trajectory to include only the discounted sum of the past 3 rewards
        state, action, reward, next_state, cumulative_reward, good_prox. bad_prox = trajectory[t]
        new_trajectory.append((state, discounted_sum, action, reward, next_state, cumulative_reward))

    # Append the modified trajectory to the augmented_evaluation_policies list
    augmented_evaluation_policies.append(new_trajectory)


NameError: ignored

In [None]:
import tensorflow as tf
import numpy as np

# Step 1: Prepare the data
def preprocess_nstep_data(policy_data):
  # Initialize lists to store all 'next_state' and 'reward' values
  all_next_states = []
  all_past3_rewards = []

  # Extract the 'next_state' and 'reward' from the 'behavior_policies' list
  for trajectory in policy_data:
      # For each trajectory, extract all 'next_state' and 'reward' values
      next_states = [state_action_reward[4] for state_action_reward in trajectory]
      rewards = [state_action_reward[3] for state_action_reward in trajectory]

      # Append the values to the corresponding lists
      all_next_states.extend(next_states)
      all_past3_rewards.extend(rewards)

  # Convert 'next_states' and 'rewards' into appropriate formats for training
  all_next_states = np.array(all_next_states, dtype=np.float32)
  all_past3_rewards = np.array(all_past3_rewards, dtype=np.float32)
  return all_next_states, all_past3_rewards

In [None]:
# Step 2: Design the neural network
class RewardPredictor3States(tf.keras.Model):
    def __init__(self, input_shape):
        super(RewardPredictor3States, self).__init__()
        self.dense1 = tf.keras.layers.Dense(32, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return x


In [None]:
# Behavior policies training
all_next_states_behav, all_past3_rewards_behav = preprocess_nstep_data(augmented_behavior_policies)

# Define hyperparameters for the neural network
input_shape = all_next_states_behav.shape[1:]  # Shape of the input state (excluding batch size)
learning_rate = 0.005
num_epochs = 1000
batch_size = 32

# Create the neural network
reward_predictor_3states = RewardPredictor3States(input_shape)

# Compile the model
reward_predictor_3states.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                         loss='mean_squared_error')

# Step 3: Train the neural network with early stopping
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)
]

reward_predictor_3states.fit(
    all_next_states_behav, all_past3_rewards_behav,
    batch_size=batch_size, epochs=num_epochs,
    callbacks=callbacks, validation_split=0.2, verbose=1
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000


<keras.callbacks.History at 0x7bc0d40ae080>

In [None]:
current_state = [2,3]
current_state = np.array(current_state, dtype=np.float32)
predicted_reward_current = reward_predictor_3states.predict(np.expand_dims(current_state, axis=0))
print("Predicted Reward Current State:", predicted_reward_current[0, 0])

Predicted Reward Current State: 0.0006133178


In [None]:
# Evaluation policies training
all_next_states_eval, all_past3_rewards_eval = preprocess_nstep_data(augmented_evaluation_policies)

# Define hyperparameters for the neural network
input_shape = all_next_states_eval.shape[1:]  # Shape of the input state (excluding batch size)
learning_rate = 0.005
num_epochs = 1000
batch_size = 32

# Create the neural network
reward_predictor_3states_eval = RewardPredictor3States(input_shape)

# Compile the model
reward_predictor_3states_eval.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                         loss='mean_squared_error')

# Step 3: Train the neural network with early stopping
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)
]

reward_predictor_3states_eval.fit(
    all_next_states_eval, all_past3_rewards_eval,
    batch_size=batch_size, epochs=num_epochs,
    callbacks=callbacks, validation_split=0.2, verbose=1
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x7bc0cd77ffa0>

In [None]:
current_state = [4,3]
current_state = np.array(current_state, dtype=np.float32)
predicted_reward_current = reward_predictor_3states_eval.predict(np.expand_dims(current_state, axis=0))
print("Predicted Reward Current State:", predicted_reward_current[0, 0])

Predicted Reward Current State: 0.00042444468


State, action, Next State -> Reward

In [None]:

# import tensorflow as tf
# import numpy as np

# # Step 1: Prepare the data
# behavior_policies = [...]  # Replace [...] with your actual behavior_policies list

# # Initialize lists to store all 'current_state', 'action', 'next_state', and 'reward' values
# all_current_states = []
# all_actions = []
# all_next_states = []
# all_rewards = []

# # Extract the 'current_state', 'action', 'next_state', and 'reward' from the 'behavior_policies' list
# for trajectory in behavior_policies:
#     for state, action, reward, next_state in trajectory:
#         all_current_states.append(state)
#         all_actions.append(action)
#         all_next_states.append(next_state)
#         all_rewards.append(reward)

# # Convert 'current_states', 'actions', 'next_states', and 'rewards' into appropriate formats for training
# all_current_states = np.array(all_current_states, dtype=np.float32)
# all_actions = np.array(all_actions, dtype=np.float32)
# all_next_states = np.array(all_next_states, dtype=np.float32)
# all_rewards = np.array(all_rewards, dtype=np.float32)

## State + proximity to good/bad regions -> Cumulative Reward

# OPE Calculations

In [6]:
eval_policy = {"up": 0.4, "down": 0.1, "left": 0.1, "right": 0.4}
behav_policy = {"up": 0.25, "down": 0.25, "left": 0.25, "right": 0.25}
def calculate_importance_weights(eval_policy, behav_policy, behavior_policies):
  all_weights = []
  for trajectory in behavior_policies:
    cum_ratio = 1
    cumul_weights = []
    for step in trajectory:
        ratio = eval_policy[step[1]]/behav_policy[step[1]]
        # print("Ratio:",ratio)
        cum_ratio *= ratio
        cumul_weights.append(cum_ratio)
        # print("Cumul:",cum_ratio)
    all_weights.append(cumul_weights)

  return all_weights

## IS

In [133]:
def per_step_IS(behavior_policies, num_trajectories = 0.3):
  all_timesteps = []
  gamma = 0.9
  policy_for_scope,_ = subset_policies(behavior_policies, num_trajectories)
  scope_weights = calculate_importance_weights(eval_policy, behav_policy, policy_for_scope)
  for j in range(len(scope_weights)):
    Timestep_values = []
    for i in range(len(scope_weights[j])-1):
      timestep = gamma**(i)*scope_weights[j][i]*policy_for_scope[j][i][2]
      # print("Timestep: ",timestep)
      Timestep_values.append(timestep)

    all_timesteps.append(Timestep_values)

  V_per_traj = [sum(sublist) for sublist in all_timesteps]

  seed_value = 42
  np.random.seed(seed_value)

  num_trajectories_to_sample = max(1, len(V_per_traj))

  bootstrap_samples = [np.random.choice(V_per_traj, size=num_trajectories_to_sample, replace=True)
                        for _ in range(100)]

  V_per_sample = [sum(sample)/len(bootstrap_samples) for sample in bootstrap_samples]
  V_per_sample = np.array(V_per_sample)

  std_deviation = np.std(V_per_sample)
  quartiles = np.percentile(V_per_sample, [25, 50, 75])
  max_value = np.max(V_per_sample)
  min_value = np.min(V_per_sample)

  return V_per_sample, std_deviation, quartiles, max_value, min_value

In [134]:
IS_per_traj, is_std, is_quartiles, is_max_value, is_min_value = per_step_IS(behavior_policies,0.3)
print(is_std)
print(is_quartiles)
print(is_max_value)
print(is_min_value)

0.1891794197227112
[-1.22241614 -1.11114855 -0.97093305]
-0.7227057605032811
-1.7273244546915811


## SCOPE

In [131]:
import numpy as np
def SCOPE(behavior_policies, beta, num_trajectories = 0.3):
    all_timesteps = []
    gamma = 0.9
    policy_for_scope,_ = subset_policies(behavior_policies, num_trajectories)
    scope_weights = calculate_importance_weights(eval_policy, behav_policy, policy_for_scope)
    for j in range(len(scope_weights)):
        Timestep_values = []
        for i in range(len(scope_weights[j]) - 1):
            features = policy_for_scope[j][i][5] + policy_for_scope[j][i][6]
            features_next = policy_for_scope[j][i + 1][5] + policy_for_scope[j][i + 1][6]
            timestep = gamma ** (i) * scope_weights[j][i] * (policy_for_scope[j][i][2] + gamma * phi(features_next, beta) - phi(features, beta))
            Timestep_values.append(timestep)

        all_timesteps.append(Timestep_values)

    V_per_traj = [sum(sublist) for sublist in all_timesteps]



    seed_value = 42
    np.random.seed(seed_value)

    num_trajectories_to_sample = max(1, len(V_per_traj))

    bootstrap_samples = [np.random.choice(V_per_traj, size=num_trajectories_to_sample, replace=True)
                         for _ in range(100)]

    V_per_sample = [sum(sample)/len(bootstrap_samples) for sample in bootstrap_samples]
    V_per_sample = np.array(V_per_sample)

    std_deviation = np.std(V_per_sample)
    quartiles = np.percentile(V_per_sample, [25, 50, 75])
    max_value = np.max(V_per_sample)
    min_value = np.min(V_per_sample)

    return V_per_sample, std_deviation, quartiles, max_value, min_value
    # return bootstrap_samples


In [112]:
samples = SCOPE(behavior_policies, beta, 0.3)

In [120]:
sum(samples[99])

-147.58609119391252

In [132]:
# all_weights = calculate_importance_weights(eval_policy, behav_policy, behavior_policies)
beta =  [ 0.2609209,   0.47456879, -0.52815694]
# beta = [0,0,0]
V_per_sample, scope_std, scope_quartiles, scope_max_value, scope_min_value = SCOPE(behavior_policies,beta,0.3)
print(scope_std)
print(scope_quartiles)
print(scope_max_value)
print(scope_min_value)

0.15306293298484386
[-1.2602556  -1.17031182 -1.07600677]
-0.8297692802152171
-1.662222105105361


In [108]:
V_per_traj

array([-7.56562059e-01, -4.41777539e-02, -1.09117301e+00, -2.41810257e+00,
       -9.45358157e-01, -7.78874657e-02, -7.62366957e-02, -1.34624032e+00,
       -1.87134195e-01,  1.03075188e-01, -3.45502767e-01, -1.85788253e+00,
       -5.98407132e-01, -1.20016595e+00, -4.23127358e-01, -1.09955063e-01,
       -7.42024759e-02, -5.64272195e-01, -1.60282221e-02, -5.53633931e-01,
       -2.78762477e-01, -3.22443198e-01, -6.64136707e-02, -4.22590246e-01,
       -3.71587773e-01, -4.65464551e-02, -5.85888067e-01, -6.86821322e-01,
       -4.50056457e-01, -3.28122819e-01, -1.30894259e-01, -8.10605919e-02,
       -2.46292724e-01, -6.75891741e-01, -3.57964173e-01, -1.52345762e-01,
       -3.22955939e-01, -3.80209745e-01, -1.88372634e-01, -7.33269820e-02,
       -6.09426216e-02, -3.15248066e-02,  2.94639266e-01, -4.58678143e-02,
       -5.06345949e-02, -5.19557000e+00,  3.16498303e-02,  1.94812430e-01,
       -5.42871917e-02, -2.89305859e+00, -3.62613343e+00, -1.59187260e+00,
       -4.01409921e-01, -

# Variance Preparation and Calculation

In [99]:
def phi(features, beta):
  features = np.array(features)
  beta = np.array(beta)
  phi_s = np.dot(beta,features)
  return phi_s


In [101]:
import random
# gamma = 0.9
# beta = [random.random() for _ in range(3)]
def variance_terms(policy_set,gamma, beta):
  all_weights = calculate_importance_weights(eval_policy, behav_policy, policy_set)
  y_w_r_all = 0
  r_all = 0
  f_a = 0
  for j in range(len(policy_set)):
    y_w_r = 0
    r = 0
    for i in range(len(policy_set[j])):
      features = policy_set[j][i][5]+policy_set[j][i][6]
      y_w_r += gamma**(i)*all_weights[j][i]*policy_set[j][i][2]
      if i>0 & i<len(policy_set):
        r += phi(features, beta)*(all_weights[j][i-1]-all_weights[j][i])
    y_w_r_all += y_w_r
    f_a +=  gamma**(len(policy_set[j]))*all_weights[j][-1]*phi(features,beta) - phi(features, beta) # fix the features part
    r_all += r

  IS = y_w_r_all/len(policy_set)
  R = r_all/len(policy_set)
  F = f_a/len(policy_set)
  return IS, R, F


In [100]:
def subset_policies(policy, percent_to_estimate_phi):
    seed_value = 42
    np.random.seed(seed_value)
    num_policies = len(policy)
    num_policies_to_estimate_phi = int(num_policies * percent_to_estimate_phi)

    policy_for_scope = policy[num_policies_to_estimate_phi:]
    policy_for_phi = policy[:num_policies_to_estimate_phi]

    return policy_for_scope, policy_for_phi

def calc_variance(behavior_policies, gamma, beta, num_bootstrap_samples = 100,num_trajectories = 0.3):
  # Set the seed value (you can use any integer value)
  seed_value = 42
  np.random.seed(seed_value)
  num_trajectories_to_sample = max(1, int(len(behavior_policies) * num_trajectories))

  policy_for_scope, policy_for_phi = subset_policies(behavior_policies, percent_to_estimate_phi=num_trajectories)
  num_trajectories_to_sample = max(1, len(policy_for_phi))

  bootstrap_samples = [np.random.choice(policy_for_phi, size=num_trajectories_to_sample, replace=True)
                         for _ in range(num_bootstrap_samples)]
  IS_all = []
  R_all = []
  F_all = []

  for pol in bootstrap_samples:
    IS, R, F = variance_terms(pol,0.9,beta)
    IS_all.append(IS)
    R_all.append(R)
    F_all.append(F)
  IS_sq = np.mean([num**2 for num in IS_all])
  IS_R_F = 2*np.mean([IS_all[i]*(R_all[i]+F_all[i]) for i in range(len(IS_all))])
  R_sq = np.mean([num**2 for num in R_all])
  IS_sq_all = (np.mean(IS_all))**2
  IS_r_t_f = 2*np.mean(IS_all)*np.mean([R_all[i]+F_all[i] for i in range(len(R_all))])
  R_sq_all = (np.mean(R_all))**2

  variance_scope = IS_sq + IS_R_F + R_sq - IS_sq_all - IS_r_t_f - R_sq_all
  variance_is = IS_sq - IS_sq_all
  return variance_scope, variance_is

In [103]:
scope_set, phi_set = subset_policies(behavior_policies, 0.3)
variance_scope, variance_is = calc_variance(phi_set,0.9,[-0.1,.1,.1], 100, 0.3)
print("Var SCOPE: ",variance_scope)
print("Var IS: ",variance_is)
print("Percent change in variance: ",((variance_scope-variance_is)/variance_is)*100)

  bootstrap_samples = [np.random.choice(policy_for_phi, size=num_trajectories_to_sample, replace=True)


Var SCOPE:  0.9321278090837419
Var IS:  0.18251798747629672
Percent change in variance:  410.70462806017713


# Optimization

In [104]:
import numpy as np
from scipy.optimize import minimize

# Define the objective function to minimize variance_scope
def objective_function(beta):
    scope_set, phi_set = subset_policies(behavior_policies, 0.3)
    variance_scope, variance_is = calc_variance(phi_set, 0.9, beta, 100, 0.3)
    return variance_scope

# Set the initial values of beta
initial_beta = np.array([ 0.2610704,   0.30396575, -0.43850237])

# Lists to store beta and variance_scope values at each iteration
all_betas = []
all_variance_scopes = []

# Callback function to record beta and variance_scope values at each iteration
def callback_function(beta):
    all_betas.append(beta.copy())
    variance_scope = objective_function(beta)
    all_variance_scopes.append(variance_scope)
    print("Iteration:", len(all_betas))
    print("Beta:", beta)
    print("Variance Scope:", variance_scope)
    print("----------")

# Run the optimization with the callback
result = minimize(objective_function, initial_beta, method='L-BFGS-B', callback=callback_function)

# Extract the optimal beta values
optimal_beta = result.x

# Print the result
print("Optimal Beta Values:", optimal_beta)


  bootstrap_samples = [np.random.choice(policy_for_phi, size=num_trajectories_to_sample, replace=True)


Iteration: 1
Beta: [ 0.20643808  0.37225496 -0.40707805]
Variance Scope: 0.07884312743887278
----------
Iteration: 2
Beta: [ 0.20172417  0.37330396 -0.41680254]
Variance Scope: 0.07761681675957388
----------
Iteration: 3
Beta: [ 0.18827351  0.38791832 -0.46826011]
Variance Scope: 0.07467007370176332
----------
Iteration: 4
Beta: [ 0.19546902  0.39854131 -0.47835576]
Variance Scope: 0.07430152131553047
----------
Iteration: 5
Beta: [ 0.25227677  0.4674727  -0.52758759]
Variance Scope: 0.07301274073304509
----------
Iteration: 6
Beta: [ 0.2609776   0.47457761 -0.52808368]
Variance Scope: 0.0729717459656698
----------
Iteration: 7
Beta: [ 0.2609209   0.47456879 -0.52815694]
Variance Scope: 0.07297173623011044
----------
Optimal Beta Values: [ 0.2609209   0.47456879 -0.52815694]


In [10]:
import numpy as np
from scipy.optimize import minimize

# Define the objective function to minimize variance_scope
def objective_function(beta):
    IS_all, R_all, F_all, variance_scope, variance_is = calc_variance(behavior_policies, 0.9, beta)
    return variance_scope

# Set the initial values of beta
initial_beta = np.array([1, -0.1, -0.1])

# Lists to store beta and variance_scope values at each iteration
all_betas = []
all_variance_scopes = []

# Callback function to record beta and variance_scope values at each iteration
def callback_function(beta):
    all_betas.append(beta.copy())
    variance_scope = objective_function(beta)
    all_variance_scopes.append(variance_scope)
    print("Iteration:", len(all_betas))
    print("Beta:", beta)
    print("Variance Scope:", variance_scope)
    print("----------")

# Run the optimization with the callback
result = minimize(objective_function, initial_beta, method='L-BFGS-B', callback=callback_function)

# Extract the optimal beta values
optimal_beta = result.x

# Print the result
print("Optimal Beta Values:", optimal_beta)


  bootstrap_samples = [np.random.choice(behavior_policies, size=len(behavior_policies), replace=True)


Iteration: 1
Beta: [ 0.74558738 -0.0727051  -0.13429517]
Variance Scope: 0.17205980293372852
----------
Iteration: 2
Beta: [ 0.51422013  0.01444041 -0.13118779]
Variance Scope: 0.10132221718839504
----------
Iteration: 3
Beta: [ 0.21974179  0.12674824 -0.12945906]
Variance Scope: 0.06890789685172624
----------
Iteration: 4
Beta: [ 0.22100604  0.15532669 -0.17596816]
Variance Scope: 0.06790719356206737
----------
Iteration: 5
Beta: [ 0.2578635   0.30333719 -0.43552043]
Variance Scope: 0.06528770052678368
----------
Iteration: 6
Beta: [ 0.26104719  0.30397559 -0.43850381]
Variance Scope: 0.0652844236998165
----------
Iteration: 7
Beta: [ 0.2610704   0.30396575 -0.43850237]
Variance Scope: 0.06528442349549637
----------
Optimal Beta Values: [ 0.2610704   0.30396575 -0.43850237]


In [100]:
import numpy as np
from scipy.optimize import minimize

# Define the objective function to minimize variance_scope
def objective_function(beta):
    IS_all, R_all, F_all, variance_scope, variance_is = calc_variance(behavior_policies, 0.9, beta)
    return variance_scope

# Set the initial values of beta
initial_beta = np.array([1, -0.1, -0.1])

# Lists to store beta and variance_scope values at each iteration
all_betas = []
all_variance_scopes = []

# Callback function to record beta and variance_scope values at each iteration
def callback_function(beta):
    all_betas.append(beta.copy())
    variance_scope = objective_function(beta)
    all_variance_scopes.append(variance_scope)
    print("Iteration:", len(all_betas))
    print("Beta:", beta)
    print("Variance Scope:", variance_scope)
    print("----------")

# Run the optimization with the callback
result = minimize(objective_function, initial_beta, method='L-BFGS-B', callback=callback_function)

# Extract the optimal beta values
optimal_beta = result.x

# Print the result
print("Optimal Beta Values:", optimal_beta)


  bootstrap_samples = [np.random.choice(behavior_policies, size=len(behavior_policies), replace=True)


Iteration: 1
Beta: [ 0.95461424 -0.18038518 -0.14714184]
Variance Scope: 0.11818026023735667
----------
Iteration: 2
Beta: [ 0.9016154  -0.156789   -0.15068515]
Variance Scope: 0.10796090614265882
----------
Iteration: 3
Beta: [ 0.46725172  0.10210465 -0.29978589]
Variance Scope: 0.06275082447768895
----------
Iteration: 4
Beta: [ 0.44485279  0.171732   -0.410621  ]
Variance Scope: 0.05877067174584359
----------
Iteration: 5
Beta: [ 0.46031673  0.3296736  -0.71168708]
Variance Scope: 0.05390185593164576
----------
Iteration: 6
Beta: [ 0.46135818  0.32966832 -0.7124576 ]
Variance Scope: 0.05390157377166176
----------
Iteration: 7
Beta: [ 0.46138779  0.32965123 -0.71244844]
Variance Scope: 0.05390157358436144
----------
Optimal Beta Values: [ 0.46138779  0.32965123 -0.71244844]


# Value estimates of IS and SCOPE estimators

In [82]:
all_weights = calculate_importance_weights(eval_policy, behav_policy, behavior_policies)

In [110]:
beta = [-1,1,1]
# beta =  [ 0.24443418,  0.30068975, -0.40465262]
V_SCOPE = SCOPE(all_weights, behavior_policies,beta)
print("SCOPE values estimate: %f and variance: %f" % (V_SCOPE,variance_scope))

SCOPE values estimate: 1.024355 and variance: 0.015076


In [103]:
V = per_step_IS(all_weights, behavior_policies)
print("IS values estimate: %f and variance: %f" % (V,variance_is))

IS values estimate: -0.735629 and variance: 0.029827
