<a href="https://colab.research.google.com/github/arunraja-hub/Preference_Extraction/blob/dqn/dqn_subnetworks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [0]:
# System packages: xvfb and ffmpeg are installed through apt.
!sudo apt-get install -y xvfb ffmpeg
# Python packages. gym, imageio and pyglet are pinned to exact versions.
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
!pip install --upgrade tensorflow-probability
# NOTE(review): tf-agents (like PILLOW, pyvirtualdisplay and
# tensorflow-probability above) is not pinned, so a fresh run may resolve to a
# different release than the one this notebook was written against — consider
# pinning.
!pip install tf-agents

In [0]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

In [0]:
# Opt in to TensorFlow 2.x behaviour through the compat API.
tf.compat.v1.enable_v2_behavior()

# Start an invisible 1400x900 virtual display and keep a handle to it.
display = pyvirtualdisplay.Display(size=(1400, 900), visible=0).start()

# Last expression of the cell: shows the TensorFlow version in use.
tf.version.VERSION

## Hyperparameters

In [0]:
# --- Training schedule -------------------------------------------------------
num_iterations = 2000            # number of passes through the training loop
collect_steps_per_iteration = 1  # environment steps collected per training pass
initial_collect_steps = 1000     # steps reserved for seeding the replay buffer

# --- Replay buffer and sampling ----------------------------------------------
replay_buffer_max_length = 100000
batch_size = 64                  # trajectories sampled per training batch

# --- Optimiser ---------------------------------------------------------------
learning_rate = 1e-3

# --- Logging and evaluation --------------------------------------------------
log_interval = 200               # print the loss every `log_interval` train steps
eval_interval = 1000
num_eval_episodes = 10

## Environment
 

In [0]:
# Gym task id shared by the training and the evaluation environment.
env_name = 'CartPole-v0'

# Two independent Python environments, created in train-then-eval order.
train_py_env, eval_py_env = [suite_gym.load(env_name) for _ in range(2)]

# Wrap each Python environment as a TensorFlow environment.
train_env, eval_env = [
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (train_py_env, eval_py_env)
]

## Agent
At the heart of a DQN Agent is a QNetwork, a neural network model that can learn to predict QValues (expected returns) for all actions, given an observation from the environment.

In [0]:
# Sizes of the fully connected layers passed to the Q-network.
fc_layer_params = (100,)

# Q-network built from the training environment's observation and action specs.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)

# Step counter handed to the agent, and the Adam optimiser that updates q_net.
train_step_counter = tf.Variable(0)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# DQN agent using an element-wise squared loss on the TD errors.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)
agent.initialize()

## Replay Buffer


In [0]:
# Replay buffer holding items shaped like the agent's collect data spec, one
# row per environment in the training batch, capped at
# `replay_buffer_max_length` entries.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    train_env.batch_size,
    max_length=replay_buffer_max_length,
)

## Data Collection

Now execute the random policy in the environment for a few steps, recording the data in the replay buffer.
`collect_step` takes one step in the environment and stores the resulting transition (`time_step`, `action_step`, `next_time_step`) in the replay buffer as a trajectory.

In [0]:
#@test {"skip": true}
def collect_step(environment, policy, buffer):
  """Take one step in `environment` with `policy` and store it in `buffer`.

  Args:
    environment: environment exposing `current_time_step()` and `step(action)`.
    policy: policy exposing `action(time_step)`; its `.action` field is what
      gets applied to the environment.
    buffer: replay buffer exposing `add_batch(trajectory)`.
  """
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  buffer.add_batch(traj)


def collect_data(env, policy, buffer, steps):
  """Call `collect_step(env, policy, buffer)` `steps` times in a row."""
  for _ in range(steps):
    collect_step(env, policy, buffer)


# Seed the replay buffer with `initial_collect_steps` steps of a uniformly
# random policy. Without this the buffer is still empty when the first batch
# is sampled, which fails with "TFUniformReplayBuffer is empty".
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(), train_env.action_spec())
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)

# This loop is so common in RL, that we provide standard implementations.
# For more details see the drivers module.
# https://github.com/tensorflow/agents/blob/master/tf_agents/docs/python/tf_agents/drivers.md




*   batch_size=64
*   possible actions=2
*   observation dimensions=4 (the CartPole state is a continuous 4-vector, not 4 discrete states)






In [0]:
# Sampling pipeline over the replay buffer: every element is a batch of
# `batch_size` two-step trajectories, i.e. tensors shaped [B x 2 x ...].
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2)
dataset = dataset.prefetch(3)
print(dataset)

# Iterator consumed by the cells below.
iterator = iter(dataset)


## Training the agent

In [0]:
#@test {"skip": true}
# NOTE: the former `try: %%time / except: pass` wrapper was removed. `%%time`
# is a cell magic and only takes effect as the very first line of a cell; the
# bare `except` merely hid the resulting error. To time this cell, put
# `%%time` on its first line.

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Sampling from an empty buffer fails with "TFUniformReplayBuffer is empty".
# Top the buffer up to `initial_collect_steps` frames with a random policy
# first; this is a no-op when the buffer has already been seeded.
frames_missing = initial_collect_steps - int(replay_buffer.num_frames())
if frames_missing > 0:
  seed_policy = random_tf_policy.RandomTFPolicy(
      train_env.time_step_spec(), train_env.action_spec())
  collect_data(train_env, seed_policy, replay_buffer, frames_missing)

# Build the sampling pipeline once. It reads from the live replay buffer, so
# it does not have to be recreated after every collected step.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)
iterator = iter(dataset)

for _iteration in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _collect in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy, replay_buffer)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

## Subnetworks

In [0]:
# QNet Structure
for v in q_net.variables:
    print(v.name, v.shape)

In [0]:
# Potential Implementation for Preference Extraction
'''
    Take RL agent Q network A
    
    For each neuron n of output layer   --- 2 neuron action output
    Get subnetwork A’ as a copy of A but with single neuron n as output layer   --- single action output?
    Assign each weight in A’ a score (initialise scores with Kaiming uniform)     --- Kaiming uniform?
    Modify single neuron output layer in the following way
    q = value of Q-head n
    q = sigmoid((q - n_mean) / n_std)
    loss = binary_cross_entropy(q, preference)
    In forward run batch of (observation, preference) pairs in A’ using top k% scores
    In backward pass update scores by subtracting: learning rate * derivative of loss with respect to weight head neuron * weight * activation of weight tail neuron
    Run for 100 epochs
    Obtain best subnetwork A’’ from A’
    Predict preferences using ensemble of subnetworks (one for each Q-head)
    Ensemble voting can be done by majority or by most-confident vote
    Ensemble voting can be used to do active reward modelling
    Start with a training pairs (obs, prefs)
    Predict prefs for b >> a observations
    Query labels for |a| observations in b for which we have max std of predictions
'''

In [0]:
# Run A for t observations and store mean and std of each Q-head in the output layer
q_heads_values = []
for _ in range(num_iterations):
    experience, unused_info = next(iterator)
    logits, _ = q_net.call(observation=experience.observation)
    # Collapse every leading dimension so each row holds the Q-values of one
    # observation. Deriving the shape from `logits` instead of hard-coding
    # (64*2, 2) keeps this correct if the batch size, the number of sampled
    # steps or the number of actions changes.
    logits = np.asarray(logits)
    q_heads_values.append(logits.reshape((-1, logits.shape[-1])))

# One row per observation, one column per Q-head.
q_heads_values = np.concatenate(q_heads_values)
# Per-head statistics, taken over all observations.
q_heads_means = q_heads_values.mean(axis=0)
q_heads_stds = q_heads_values.std(axis=0)

In [0]:
from copy import deepcopy

In [0]:
# For each neuron n of output layer

for qHix in [0, 1]:  # qHix: index of the Q-head being isolated

    # Get subnetwork A’ as a copy of A but with single neuron n as output layer
    q_head_net = deepcopy(q_net)
    q_head_net_W = q_head_net.get_weights()
    # Keep entry `qHix` of the LAST weight array, replace that array with
    # zeros, then write the kept entry back and load the result into the copy.
    # NOTE(review): get_weights() presumably lists each layer's kernel before
    # its bias, in which case [-1] is the output layer's bias vector and the
    # output kernel ([-2]) is left untouched, so the other Q-head would not
    # actually be masked — confirm against the printed variable list.
    q_head_net_W_qHix = q_head_net_W[-1][qHix]
    q_head_net_W[-1] = np.zeros(shape=q_head_net_W[-1].shape)
    q_head_net_W[-1][qHix] = q_head_net_W_qHix
    q_head_net.set_weights(q_head_net_W)

    # Assign each weight in A’ a score (initialise scores with Kaiming uniform)